diff --git a/ArchR.Rproj b/ArchR.Rproj new file mode 100644 index 00000000..21a4da08 --- /dev/null +++ b/ArchR.Rproj @@ -0,0 +1,17 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 00000000..3b15d9ac --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,39 @@ +Package: ArchR +Type: Package +Date: 2019-10-17 +Title: Analyzing regulatory chromatin in R +Version: 0.1.4 +Author: Jeffrey Granja [aut, cre], Ryan Corces [aut] +Maintainer: Jeffrey Granja +Description: This package is designed to streamline scATAC analyses in R. +License: GPL (>= 2) +LinkingTo: Rcpp +LazyData: TRUE +RoxygenNote: 6.1.1 +Encoding: UTF-8 +Imports: + Rcpp (>= 0.12.16), + matrixStats, + plyr, + SummarizedExperiment, + Matrix, + nabor, + motifmatchr, + chromVAR, + uwot, + ggrepel, + Rsamtools, + gtable, + grid, + gridExtra, + Biostrings, + ComplexHeatmap, +Depends: + ggplot2, + data.table, + rhdf5, + magrittr, + S4Vectors (>= 0.9.25), + BiocGenerics, + GenomicRanges, + diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..f7da5db4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 jgranja24 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial 
portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 00000000..44d7f493 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,135 @@ +# Generated by roxygen2: do not edit by hand + +export("%bcin%") +export("%bcni%") +export("%ni%") +export(.ArchRLogo) +export(.addMatToArrow) +export(.availableCells) +export(.availableChr) +export(.availableSeqnames) +export(.batchlapply) +export(.centerRollMean) +export(.checkPath) +export(.computeROC) +export(.confusionMatrix) +export(.fileExtension) +export(.fixPlotSize) +export(.getAssay) +export(.getColSums) +export(.getFeatureDF) +export(.getFragsFromArrow) +export(.getGroupMatrix) +export(.getMatFromArrow) +export(.getMatrixValues) +export(.getMetadata) +export(.getPartialMatrix) +export(.getQuantiles) +export(.getRowSums) +export(.groupMeans) +export(.groupSds) +export(.groupSums) +export(.h5read) +export(.initializeMat) +export(.isProtectedArray) +export(.mergeParams) +export(.messageDiffTime) +export(.normalizeCols) +export(.nullGeneAnnotation) +export(.nullGenomeAnnotation) +export(.quantileCut) +export(.requirePackage) +export(.rowScale) +export(.rowZscores) +export(.safeSubset) +export(.safelapply) +export(.sampleName) +export(.summarizeArrowContent) +export(.suppressAll) +export(.validArrow) +export(.validBSgenome) +export(.validGRanges) +export(.validOrgDb) +export(.validTxDb) +export(ArchRProject) +export(ArchRRegionTrack) +export(ArchR_palettes) +export(ComputeEmbedding) +export(FilterCells) +export(IdentifyClusters) 
+export(IterativeLSI) +export(VisualizeEmbedding) +export(VisualizeGroups) +export(addBackgroundPeaks) +export(addCellColData) +export(addDemuxletResults) +export(addDeviationsMatrix) +export(addDoubletScores) +export(addFeatureMatrix) +export(addGeneScoreMatrix) +export(addGroupCoverages) +export(addMotifAnnotations) +export(addPeakMatrix) +export(addPeakSet) +export(addReproduciblePeakSet) +export(addSampleColData) +export(addSeqLengths) +export(addTileMatrix) +export(alignCellsToTrajectory) +export(availableFeatures) +export(columnOverlaps) +export(computeCoAccessibility) +export(computeKNN) +export(computeLSI) +export(constructGRanges) +export(createArrowFiles) +export(extendGRanges) +export(featureEnrichment) +export(getAnnotation) +export(getArrowFiles) +export(getBlacklist) +export(getCellColData) +export(getCellNames) +export(getChromLengths) +export(getChromSizes) +export(getEmbedding) +export(getExons) +export(getFragmentsFromArrow) +export(getGeneAnnotation) +export(getGenes) +export(getGenome) +export(getGenomeAnnotation) +export(getMatches) +export(getMatrixFromArrow) +export(getOutputDirectory) +export(getPeakSet) +export(getPositions) +export(getReducedDims) +export(getSampleColData) +export(getSampleNames) +export(getTSS) +export(ggAlignPlots) +export(ggHex) +export(ggLine) +export(ggOneToOne) +export(ggPoint) +export(ggViolin) +export(keepFilteredChromosomes) +export(markerFeatures) +export(markerHeatmap) +export(mergeGRanges) +export(nonOverlappingGRanges) +export(overlappingBP) +export(overlapsMany) +export(paletteContinuous) +export(paletteDiscrete) +export(plotFootprint) +export(plotPDF) +export(plotPeakCallSummary) +export(projectLSI) +export(shuffleGRanges) +export(subsetSeqnames) +export(summarizeFootprints) +export(theme_ArchR) +importFrom(Rcpp,sourceCpp) +useDynLib(ArchR) diff --git a/R/.DS_Store b/R/.DS_Store new file mode 100644 index 00000000..5008ddfc Binary files /dev/null and b/R/.DS_Store differ diff --git a/R/.Rapp.history 
b/R/.Rapp.history new file mode 100644 index 00000000..e69de29b diff --git a/R/AllClasses.R b/R/AllClasses.R new file mode 100644 index 00000000..a3e6b958 --- /dev/null +++ b/R/AllClasses.R @@ -0,0 +1,110 @@ +#' @useDynLib ArchR +#' @importFrom Rcpp sourceCpp +NULL + +setClassUnion("characterOrNull", c("character", "NULL")) +setClassUnion("GRangesOrNull", c("GRanges", "NULL")) + +setClass("ArchRProject", + representation( + projectMetadata = "SimpleList", + sampleColData = "DataFrame", + sampleMetadata = "SimpleList", + cellColData = "DataFrame", + cellMetadata = "SimpleList", #Where clustering output will go to + reducedDims = "SimpleList", #Where clustering output will go to + embeddings = "SimpleList", #Where clustering output will go to + peakSet = "GRangesOrNull", + annotations = "SimpleList", #MotifMatches ETC go here + geneAnnotation = "SimpleList", #genes exons TSS + genomeAnnotation = "SimpleList" #genome chromSizes BSgenome blacklist + ) +) + +setMethod("show", "ArchRProject", + function(object) { + scat <- function(fmt, vals=character(), exdent=2, n = 5, ...){ + vals <- ifelse(nzchar(vals), vals, "''") + lbls <- paste(S4Vectors:::selectSome(vals, maxToShow = n), collapse=" ") + txt <- sprintf(fmt, length(vals), lbls) + cat(strwrap(txt, exdent=exdent, ...), sep="\n") + } + .ArchRLogo(ascii = "Package") + cat("class:", class(object), "\n") + cat("outputDirectory:", object@projectMetadata$outputDirectory, "\n") + scat("samples(%d): %s\n", rownames(object@sampleColData)) + scat("sampleColData names(%d): %s\n", names(object@sampleColData)) + scat("cellColData names(%d): %s\n", names(object@cellColData)) + scat("numberOfCells(%d): %s\n", nrow(object@cellColData)) + scat("medianTSS(%d): %s\n", median(object@cellColData$TSSEnrichment)) + scat("medianFrags(%d): %s\n", median(object@cellColData$nFrags)) + } +) + +#' @export +ArchRProject <- function( + ArrowFiles=NULL, + sampleNames=NULL, + outputDirectory = "ArchR_Results", + copyArrows = FALSE, + geneAnnotation 
= NULL, + genomeAnnotation = NULL, + showLogo = TRUE){ + + if(is.null(ArrowFiles)){ + stop("Need to Provide Arrow Files!") + } + + #Validate + message("Validating Arrows...") + ArrowFiles <- unlist(lapply(ArrowFiles, .validArrow)) + + if(is.null(sampleNames)){ + message("Getting SampleNames...") + sampleNames <- unlist(lapply(seq_along(ArrowFiles), function(x) .sampleName(ArrowFiles[x]))) + } + + if(any(duplicated(sampleNames))){ + stop("Error cannot have duplicate sampleNames, please add sampleNames that will overwrite the current sample name in Arrow file!") + } + + if(length(sampleNames) != length(ArrowFiles)) stop("Samples is not equal to input ArrowFiles!") + + dir.create(outputDirectory,showWarnings=FALSE) + sampleDirectory <- file.path(normalizePath(outputDirectory),"InputArrows") + dir.create(sampleDirectory,showWarnings=FALSE) + + if(copyArrows){ + message("Copying ArrowFiles to Ouptut Directory!") + cf <- file.copy(ArrowFiles, file.path(sampleDirectory, paste0(sampleNames, ".arrow"))) + ArrowFiles <- file.path(sampleDirectory, paste0(sampleNames, ".arrow")) + } + + #Sample Information + sampleColData <- DataFrame(row.names = sampleNames, ArrowFiles = ArrowFiles) + sampleMetadata <- SimpleList(lapply(sampleNames, function(x) SimpleList())) + names(sampleMetadata) <- sampleNames + + #Cell Information + metadataList <- lapply(ArrowFiles, .getMetadata) + intCols <- Reduce("intersect",lapply(metadataList,colnames)) + cellColData <- lapply(metadataList, function(x) x[,intCols]) %>% Reduce("rbind",.) 
+ + proj <- new("ArchRProject", + projectMetadata = SimpleList(outputDirectory = normalizePath(outputDirectory)), + sampleColData = sampleColData, + sampleMetadata = sampleMetadata, + cellColData = cellColData, + cellMetadata = SimpleList(), + reducedDims = SimpleList(), + embeddings = SimpleList(), + annotations = SimpleList(), + geneAnnotation = geneAnnotation, + genomeAnnotation = genomeAnnotation) + if(showLogo){ + .ArchRLogo(ascii = "Logo") + } + proj + +} + diff --git a/R/ArchRBrowser.R b/R/ArchRBrowser.R new file mode 100644 index 00000000..a8c45fba --- /dev/null +++ b/R/ArchRBrowser.R @@ -0,0 +1,676 @@ +#' Plot ArchR Region Track +#' +#' This function will plot the coverage at an input region +#' +#' @param ArchRProj ArchRProject +#' @param region GRanges region that will be plotted in (if more that one first will be selected) +#' @param groupBy use groupings for bulk/scTrack +#' @param useGroups select a subset of groups for plotting +#' @param useCoverages use group coverages for track plotting +#' @param plotSummary summary of region track to be plotted +#' @param sizes sizes corresponding to plotSummary +#' @param features GRanges features to be plotted (ie getPeakSet(ArchRProj)) +#' @param geneSymbol if region is null plotting can be centered at gene start site corresponding to the gene symbol +#' @param upstream bp upstream of geneStart to extend +#' @param downstream bp downstream of geneStart to extend +#' @param tileSize with of tiles to plot bulk/scTrack +#' @param normMethod normMethod normalization column in cellColData to normalize bulkTrack +#' @param threads number of threads for parallel execution +#' @param ylim y-limits for bulkTrack +#' @param baseSize size of font in plot +#' @param borderWidth border width in plot +#' @param tickWidth axis tick width in plot +#' @param geneAnno geneAnnotation for geneTrack +#' @param title verbose sections +#' @param ... 
additional args +#' @export +ArchRRegionTrack <- function( + ArchRProj, + region = NULL, + groupBy = "Clusters", + useGroups = NULL, + useCoverages = FALSE, + plotSummary = c("bulkTrack", "featureTrack", "geneTrack"), + sizes = c(10, 0.5, 4), + features = NULL, + geneSymbol = NULL, + upstream = 50000, + downstream = 50000, + tileSize = 100, + normMethod = "ReadsInTSS", + threads = 1, + ylim = NULL, + baseSize = 7, + borderWidth = 0.4, + tickWidth = 0.4, + facetbaseSize = 7, + geneAnno = getGeneAnnotation(ArchRProj), + title = "", + ... + ){ + + tstart <- Sys.time() + + ########################################################## + # Get Region Where Plot Will Occur (GenomicRanges) + ########################################################## + .messageDiffTime("Validating Region", tstart) + if(is.null(region)){ + if(!is.null(geneSymbol)){ + region <- geneAnno$genes + region <- region[which(tolower(mcols(region)$symbol) == tolower(geneSymbol))] + region <- resize(region, 1, "start") + strand(region) <- "*" + region <- extendGRanges(region, upstream = upstream, downstream = downstream) + } + } + region <- .validGRanges(region)[1] + plotList <- list() + + ########################################################## + # Bulk Tracks + ########################################################## + if("bulktrack" %in% tolower(plotSummary)){ + .messageDiffTime("Adding Bulk Tracks", tstart) + plotList$bulktrack <- .bulkTracks( + ArchRProj = ArchRProj, + region = region, + tileSize = tileSize, + groupBy = groupBy, + threads = threads, + ylim = ylim, + baseSize = baseSize, + borderWidth = borderWidth, + tickWidth = tickWidth, + facetbaseSize = facetbaseSize, + normMethod = normMethod, + geneAnno = geneAnno, + title = title, + useGroups = useGroups, + useCoverages = useCoverages, + tstart = tstart) + theme(plot.margin = unit(c(0.35, 0.75, 0.35, 0.75), "cm")) + } + + ########################################################## + # Feature Tracks + 
########################################################## + if("featuretrack" %in% tolower(plotSummary)){ + .messageDiffTime("Adding Feature Tracks", tstart) + if(!is.null(features)){ + plotList$featuretrack <- .featureTracks( + features = features, + region = region, + hideX = TRUE, + title = "Peaks") + theme(plot.margin = unit(c(0.1, 0.75, 0.1, 0.75), "cm")) + } + } + + ########################################################## + # Gene Tracks + ########################################################## + if("genetrack" %in% tolower(plotSummary)){ + .messageDiffTime("Adding Gene Tracks", tstart) + plotList$genetrack <- .geneTracks( + geneAnnotation = geneAnno, + region = region, + title = "Genes") + theme(plot.margin = unit(c(0.1, 0.75, 0.1, 0.75), "cm")) + } + + ########################################################## + # Time to plot + ########################################################## + plotSummary <- tolower(plotSummary) + sizes <- sizes[order(plotSummary)] + plotSummary <- plotSummary[order(plotSummary)] + + nullSummary <- unlist(lapply(seq_along(plotSummary), function(x) is.null(eval(parse(text=paste0("plotList$", plotSummary[x])))))) + if(any(nullSummary)){ + sizes <- sizes[-which(nullSummary)] + } + + .messageDiffTime("Plotting", tstart) + .suppressAll(ggAlignPlots(plotList = plotList, sizes=sizes)) + +} + +####################################################### +# Bulk Aggregated ATAC Track Methods +####################################################### + +.bulkTracks <- function( + ArchRProj, + region = NULL, + tileSize = 100, + groupBy = "Clusters", + useGroups = NULL, + normMethod = "ReadsInTSS", + threads = 1, + ylim = NULL, + baseSize = 7, + borderWidth = 0.4, + tickWidth = 0.4, + facetbaseSize = 7, + geneAnno = getGeneAnnotation(ArchRProj), + title = "", + useCoverages = TRUE, + tstart = NULL, + ... 
+ ){ + + .requirePackage("ggplot2") + + if(is.null(tstart)){ + tstart <- Sys.time() + } + + if(useCoverages){ + df <- .groupRegionSumCoverages( + ArchRProj = ArchRProj, + groupBy = groupBy, + normMethod = normMethod, + region = region, + tileSize = tileSize, + verbose = verbose, threads = threads #forward the thread count this function received + ) + }else{ + df <- .groupRegionSumArrows( + ArchRProj = ArchRProj, + groupBy = groupBy, + normMethod = normMethod, + region = region, + tileSize = tileSize, + verbose = verbose + ) + } + + ###################################################### + # Plot Track + ###################################################### + if(!is.null(ylim)){ + ylim <- quantile(df$y, ylim) + df$y[df$y < ylim[1]] <- ylim[1] + df$y[df$y > ylim[2]] <- ylim[2] + }else{ + ylim <- c(0,quantile(df$y, probs=c(0.999))) + df$y[df$y < ylim[1]] <- ylim[1] + df$y[df$y > ylim[2]] <- ylim[2] + } + uniqueGroups <- gtools::mixedsort(unique(paste0(df$group))) + df$group <- factor(df$group, levels = uniqueGroups) + title <- paste0(as.character(seqnames(region)),":", start(region)-1, "-", end(region), " ", title) + pal <- suppressWarnings(paletteDiscrete(values = uniqueGroups)) + + #Plot Track + p <- ggplot(df, aes_string("x","y", color = "group", fill = "group")) + + geom_area(stat = "identity") + + facet_wrap(facets = ~group, strip.position = 'right', ncol = 1) + + ylab(sprintf("Coverage (Normalized ATAC Insertions Range %s - %s by %s)", round(min(ylim),2), round(max(ylim),2), normMethod)) + + scale_color_manual(values = pal) + + scale_fill_manual(values = pal) + + scale_x_continuous(limits = c(start(region), end(region)), expand = c(0,0)) + + scale_y_continuous(limits = ylim, expand = c(0,0)) + + theme_ArchR(baseSize = baseSize, + baseRectSize = borderWidth, + baseLineSize = tickWidth, + legendPosition = "right", + axisTickCm = 0.1) + + theme(panel.spacing= unit(0, "lines"), + axis.title.x=element_blank(), + axis.text.y=element_blank(), + axis.ticks.y=element_blank(), + strip.text = element_text( + size = facetbaseSize, + color = 
"black", + margin = margin(0,0.35,0,0.35, "cm")), + strip.text.y = element_text(angle = 0), + strip.background = element_rect(color="black")) + + guides(fill = FALSE, colour = FALSE) + ggtitle(title) + + p + +} + +############################################################################## +# Create Average Tracks from Coverages +############################################################################## +.groupRegionSumCoverages <- function(ArchRProj, groupBy, useGroups = NULL, region, tileSize, normMethod, verbose, threads = 1){ #threads (default 1) is handed to .groupRegionCoverages; it used to be an unbound name + + coverageMetadata <- .getCoverageMetadata( + ArchRProj = ArchRProj, + groupBy = groupBy, + useGroups = useGroups + ) + + cellGroups <- .getCoverageParams( + ArchRProj = ArchRProj, + groupBy = groupBy + )[["cellGroups"]] %>% unlist + + groupRegionRle <- .groupRegionCoverages( + coverageMetadata = coverageMetadata, + region = region, + tileSize = tileSize, + buffer = tileSize * 5, + threads = threads + ) + groupNames <- names(groupRegionRle) + + #Normalization + g <- names(unlist(cellGroups, use.names = TRUE)) + if(tolower(normMethod) == "readsintss"){ + v <- getCellColData(ArchRProj, normMethod, drop = FALSE)[unlist(cellGroups),] + groupNormFactors <- unlist(lapply(split(v, g), sum)) + }else if(tolower(normMethod) == "nfrags"){ + v <- getCellColData(ArchRProj, normMethod, drop = FALSE)[unlist(cellGroups),] + groupNormFactors <- unlist(lapply(split(v, g), sum)) + }else if(tolower(normMethod) == "ncells"){ + groupNormFactors <- table(g) + }else{ + stop("Norm Method Not Recognized : ", normMethod) + } + + #Scale with Norm Factors + scaleFactors <- 10^4 / groupNormFactors + + #Normalize + groupRegionRle <- lapply(seq_along(groupRegionRle), function(x){ + groupRegionRle[[x]] * scaleFactors[names(groupRegionRle)[x]] + }) + + #Group And Average + groupRegionRle <- split(groupRegionRle, coverageMetadata$Group) + groupRegionList <- lapply(seq_along(groupRegionRle), function(x){ + Reduce("+", groupRegionRle[[x]]) / length(groupRegionRle[[x]]) + }) + 
names(groupRegionList) <- names(groupRegionRle) + + #Tile Region + tileSize <- floor(tileSize / 2) + regionTiles <- seq(trunc(start(region) / tileSize) - 1, trunc(end(region) / tileSize) + 1) * tileSize + + plotDF <- lapply(seq_along(groupRegionList), function(x){ + data.frame(x = regionTiles, y = as.vector(groupRegionList[[x]][regionTiles]), group = names(groupRegionList)[x]) + }) %>% Reduce("rbind", .) + + plotDF + +} + +.groupRegionCoverages <- function(coverageMetadata, region, tileSize = 100, buffer = 1000, threads = 1){ + + region <- .validGRanges(region[1]) + coverageFiles <- coverageMetadata$File + names(coverageFiles) <- coverageMetadata$Name + + covList <- .safelapply(seq_along(coverageFiles), function(x){ + .getCoverageFromRegion(coverageFiles[x], region, tileSize, buffer) + }, threads = threads) %>% {as(.,"RleList")} + names(covList) <- names(coverageFiles) + + covList + +} + +.getCoverageFromRegion <- function(coverageFile, region, tileSize, buffer){ + chr <- as.character(seqnames(region)) + cov <- Rle( + lengths = h5read(coverageFile, paste0("Coverage/",chr,"/Lengths")), + values = h5read(coverageFile, paste0("Coverage/",chr,"/Values")) + ) + w <- sum(runLength(cov)) + idx <- cumsum(runLength(cov)) %>% {which(. >= start(region) - buffer & . 
<= end(region) + buffer)} + runValue(cov)[-idx] <- 0 + covRanges <- ranges(cov) + mcols(covRanges)$values <- runValue(cov) + covRanges <- covRanges[mcols(covRanges)$values > 0] + start(covRanges) <- trunc(start(covRanges) / tileSize) * tileSize + end(covRanges) <- (trunc(end(covRanges) / tileSize) + 1) * tileSize - 1 + o <- coverage(covRanges, weight = mcols(covRanges)$values, width = w) +} + + +############################################################################## +# Create Average Tracks from Arrows +############################################################################## +.groupRegionSumArrows <- function(ArchRProj, groupBy, region, tileSize, normMethod, verbose, threads = 1){ #threads (default 1) is handed to .safelapply; it used to be an unbound name + + #Group Info + cellGroups <- getCellColData(ArchRProj, groupBy, drop = TRUE) + tabGroups <- table(cellGroups) + cellsBySample <- split(rownames(getCellColData(ArchRProj)), getCellColData(ArchRProj, "Sample", drop = TRUE)) + groupsBySample <- split(cellGroups, getCellColData(ArchRProj, "Sample", drop = TRUE)) + uniqueGroups <- gtools::mixedsort(unique(cellGroups)) + + #Tile Region + regionTiles <- seq(trunc(start(region) / tileSize), trunc(end(region) / tileSize) + 1) * tileSize + ArrowFiles <- getArrowFiles(ArchRProj) + groupMat <- .safelapply(seq_along(ArrowFiles), function(i){ + gmi <- .regionSumArrows( + ArrowFile = ArrowFiles[i], + region = region, + regionTiles = regionTiles, + tileSize = tileSize, + cellNames = cellsBySample[[names(ArrowFiles)[i]]], + cellGroups = groupsBySample[[names(ArrowFiles)[i]]], + uniqueGroups = uniqueGroups + ) + }, threads = threads) %>% Reduce("+" , .) 
+ + #Plot DF + df <- data.frame(which(groupMat > 0, arr.ind=TRUE)) + df$y <- groupMat[cbind(df[,1], df[,2])] + + #Minus 1 Tile Size + dfm1 <- df + dfm1$row <- dfm1$row - 1 + dfm1$y <- 0 + + #Plus 1 Size + dfp1 <- df + dfp1$row <- dfp1$row + 1 + dfp1$y <- 0 + + #Create plot DF + df <- rbind(df, dfm1, dfp1) + df <- df[!duplicated(df[,1:2]),] + df <- df[df$row > 0 & df$row <= length(regionTiles),] #drop padding rows that fall outside regionTiles on either side + df$x <- regionTiles[df$row] + df$group <- uniqueGroups[df$col] + + #Add In Ends + dfs <- data.frame( + col = seq_along(uniqueGroups), + row = 1, + y = 0, + x = start(region), + group = uniqueGroups + ) + + dfe <- data.frame( + col = seq_along(uniqueGroups), + row = length(regionTiles), + y = 0, + x = end(region), + group = uniqueGroups + ) + + #Final output + plotDF <- rbind(df,dfs,dfe) + plotDF <- plotDF[order(plotDF$group,plotDF$x),] #order the combined frame so the end anchors added above are kept + plotDF <- plotDF[,c("x", "y", "group")] + + #Normalization + g <- getCellColData(ArchRProj, groupBy, drop = TRUE) + if(tolower(normMethod) == "readsintss"){ + v <- getCellColData(ArchRProj, normMethod, drop = TRUE) + groupNormFactors <- unlist(lapply(split(v, g), sum)) + }else if(tolower(normMethod) == "nfrags"){ + v <- getCellColData(ArchRProj, normMethod, drop = TRUE) + groupNormFactors <- unlist(lapply(split(v, g), sum)) + }else if(tolower(normMethod) == "ncells"){ + groupNormFactors <- table(g) + }else{ + stop("Norm Method Not Recognized : ", normMethod) + } + + #Scale with Norm Factors + scaleFactors <- 10^4 / groupNormFactors + matchGroup <- match(paste0(plotDF$group), names(scaleFactors)) + plotDF$y <- plotDF$y * as.vector(scaleFactors[matchGroup]) + + return(plotDF) + +} + +.regionSumArrows <- function(ArrowFile, region, regionTiles, tileSize, cellNames, cellGroups, uniqueGroups){ + + cellFragsRegion <- .getFragsFromArrow( + ArrowFile = ArrowFile, + chr = paste0(seqnames(region)), + cellNames = cellNames, + out = "GRanges" + ) %>% subsetByOverlaps(., region, ignore.strand = FALSE) + + #Starts + ts <- match(trunc(start(cellFragsRegion)/tileSize) * tileSize, regionTiles, nomatch = 0) + ids <- which(ts > 0) + + #Ends + te <- match(trunc(end(cellFragsRegion)/tileSize) * tileSize, regionTiles, nomatch = 
0) + ids <- which(ts > 0) + + #Ends + te <- match(trunc(start(cellFragsRegion)/tileSize) * tileSize, regionTiles, nomatch = 0) + ide <- which(te > 0) + + #Match + matchID <- S4Vectors::match(mcols(cellFragsRegion)$RG, cellNames) + + #Sparse Matrix + mat <- Matrix::sparseMatrix( + i = c(ts[ids], te[ide]), + j = c(matchID[ids], matchID[ide]), + x = rep(1, length(ids) + length(ide)), + dims = c(length(regionTiles), length(cellNames)) + ) + colnames(mat) <- cellNames + + mat@x[mat@x > 0] <- 1 + + #Create Group Matrix + groupMat <- matrix(0, nrow = length(regionTiles), ncol = length(uniqueGroups)) + colnames(groupMat) <- uniqueGroups + uniqueGroups <- uniqueGroups[uniqueGroups %in% unique(cellGroups)] + for(i in seq_along(uniqueGroups)){ + groupMat[,uniqueGroups[i]] <- Matrix::rowSums(mat[,which(cellGroups == uniqueGroups[i]),drop=FALSE]) + } + + return(groupMat) + +} + +####################################################### +# Gene Tracks +####################################################### +.geneTracks <- function( + geneAnnotation, + region, + baseSize = 9, + borderWidth = 0.4, + title = "Genes", + geneWidth = 2, + exonWidth = 4, + labelSize = 2, + colorMinus = "dodgerblue2", + colorPlus = "red", + ... 
+ ){ + + .requirePackage("ggplot2") + .requirePackage("ggrepel") + + #only take first region + region <- ArchR::.validGRanges(region) + region <- subsetSeqnames(region[1],as.character(seqnames(region[1]))) + + genes <- sort(sortSeqlevels(geneAnnotation$genes), ignore.strand = TRUE) + exons <- sort(sortSeqlevels(geneAnnotation$exons), ignore.strand = TRUE) + genesO <- data.frame(subsetByOverlaps(genes, region, ignore.strand = TRUE)) + + if(nrow(genesO) > 0){ + + #Identify Info for Exons and Genes + exonsO <- data.frame(subsetByOverlaps(exons, region, ignore.strand = TRUE)) + exonsO <- exonsO[which(exonsO$symbol %in% genesO$symbol),] + genesO$facet = title + genesO$start <- matrixStats::rowMaxs(cbind(genesO$start, start(region))) + genesO$end <- matrixStats::rowMins(cbind(genesO$end, end(region))) + + #Collapse Iteratively + #backwards iteration so that the last value chosen is the lowest cluster possible to fit in. + genesO$cluster <- 0 + for(i in seq_len(nrow(genesO))){ + if(i==1){ + genesO$cluster[i] <- 1 + }else{ + for(j in seq_len(max(genesO$cluster))){ + jEnd <- rev(genesO$end)[match(rev(seq_len(max(genesO$cluster)))[j], rev(genesO$cluster))] + if(genesO$start[i] > jEnd + median(genesO$width)){ + genesO$cluster[i] <- rev(genesO$cluster)[match(rev(seq_len(max(genesO$cluster)))[j],rev(genesO$cluster))] + } + } + if(genesO$cluster[i]==0){ + genesO$cluster[i] <- genesO$cluster[i-1] + 1 + } + } + } + exonsO$cluster <- genesO$cluster[match(exonsO$symbol, genesO$symbol)] + pal <- c("-"=colorMinus,"+"=colorPlus,"*"=colorPlus) + + p <- ggplot(data = genesO, aes(color = strand, fill = strand)) + + facet_grid(facet~.) 
+ + ################################################# + #Limits + ################################################# + ylim(c(0.5, max(genesO$cluster) + 0.5)) + + scale_x_continuous(limits = c(start(region), end(region)), expand = c(0,0)) + + ################################################# + #Segment for Not Minus Stranded + ################################################# + geom_segment(data = genesO[which(as.character(genesO$strand)!="-"),], + aes(x = start, xend = end, y = cluster, yend = cluster, color = strand),size=geneWidth) + + ################################################# + #Segment for Minus Stranded + ################################################# + geom_segment(data = genesO[which(as.character(genesO$strand)=="-"),], + aes(x = end, xend = start, y = cluster, yend = cluster, color = strand),size=geneWidth) + + ################################################# + #Segement for Exons + ################################################# + geom_segment(data = exonsO, aes(x = start, xend = end, y = cluster, + yend = cluster, color = strand),size=exonWidth) + + ################################################# + #Colors + ################################################# + scale_color_manual(values = pal, guide = FALSE) + + scale_fill_manual(values = pal) + + ################################################# + #Theme + ################################################# + theme_ArchR(baseSize = baseSize, baseLineSize = borderWidth, baseRectSize = borderWidth) + + theme(axis.title.x=element_blank(), axis.text.x=element_blank(),axis.ticks.x=element_blank()) + + theme(axis.title.y=element_blank(), axis.text.y=element_blank(),axis.ticks.y=element_blank()) + + theme(legend.text = element_text(size = baseSize), strip.text.y = element_text(angle = 0)) + + guides(fill = guide_legend(override.aes = list(colour = NA, shape = "c", size=3)), color = FALSE) + + theme(legend.position="bottom") + + theme(legend.title=element_text(size=5), 
legend.text=element_text(size=7), + legend.key.size = unit(0.75,"line"), legend.background = element_rect(color =NA), strip.background = element_blank()) + + #Add Labels if There are Genes with this orientation! + if(length(which(genesO$strand!="-")) > 0){ + p <- p + ggrepel::geom_label_repel(data=genesO[which(genesO$strand!="-"),], + aes(x = start, y = cluster, label = symbol, color = strand, fill = NA), + segment.color = "grey", nudge_x = -0.01*(end(region) - start(region)), nudge_y = -0.25, + size = labelSize, direction = "x") + } + + #Add Labels if There are Genes with this orientation! + if(length(which(genesO$strand=="-")) > 0){ + p <- p + ggrepel::geom_label_repel(data=genesO[which(genesO$strand=="-"),], + aes(x = end, y = cluster, label = symbol, color = strand, fill = NA), + segment.color = "grey", nudge_x = +0.01*(end(region) - start(region)), nudge_y = 0.25, + size = labelSize, direction = "x") + } + + p <- p + theme(legend.justification = c(0, 1), + legend.background = element_rect(colour = NA, fill = NA), legend.position="none") + + }else{ + + #create empty plot + df <- data.frame(facet = "GeneTrack", start = 0, end = 0, strand = "*", symbol = "none") + pal <- c("*"=colorPlus) + p <- ggplot(data = df, aes(start, end, fill = strand)) + geom_point() + + facet_grid(facet~.) 
+ + theme_ArchR(baseSize = baseSize, baseLineSize = borderWidth, baseRectSize = borderWidth) + + scale_color_manual(values = pal) + + scale_x_continuous(limits = c(start(region), end(region)), expand = c(0,0)) + + theme(axis.title.x=element_blank(), axis.text.x=element_blank(),axis.ticks.x=element_blank()) + + theme(axis.title.y=element_blank(), axis.text.y=element_blank(),axis.ticks.y=element_blank()) + + } + + return(p) + +} + +####################################################### +# Feature Tracks +####################################################### +.featureTracks <- function( + features, + region, + title = "FeatureTrack", + pal = NULL, + baseSize = 9, + featureWidth = 2, + borderWidth = 0.4, + hideX = FALSE, + hideY = TRUE, + ... + ){ + + .requirePackage("ggplot2") + + #only take first region + region <- ArchR::.validGRanges(region) + region <- subsetSeqnames(region[1],as.character(seqnames(region[1]))) + + if(!inherits(features,"GRangesList") & !inherits(features,"GenomicRangesList")){ + features <- ArchR::.validGRanges(features) + featureList <- GenomicRanges::GenomicRangesList(features) + names(featureList) <- "FeatureTrack" + }else{ + featureList <- features #input is already a list of ranges, so use it as the feature list + } + + featureO <- lapply(seq_along(featureList), function(x){ + featurex <- featureList[[x]] + namex <- names(featureList)[x] + mcols(featurex) <- NULL + sub <- subsetByOverlaps(featurex, region, ignore.strand = TRUE) + if(length(sub) > 0){ + data.frame(sub, name = namex) + }else{ + empty <- GRanges(as.character(seqnames(region[1])), ranges = IRanges(0,0)) #placeholder range at position 0 so a track with no overlaps still contributes a row + data.frame(empty, name = namex) + } + + }) + + featureO <- Reduce("rbind", featureO) + featureO$facet <- title + + if(is.null(pal)){ + pal <- paletteDiscrete(set = "stallion", featureO$name) + } + + p <- ggplot(data = featureO, aes(color = name)) + + facet_grid(facet~.) 
+ + geom_segment(data = featureO, aes(x = start, xend = end, y = name, yend = name, color = name), size=featureWidth) + + ylab("") + xlab("") + + scale_x_continuous(limits = c(start(region), end(region)), expand = c(0,0)) + + scale_color_manual(values = pal) + + theme(legend.text = element_text(size = baseSize)) + + theme_ArchR(baseSize = baseSize, baseLineSize = borderWidth, baseRectSize = borderWidth) + + guides(color = FALSE, fill = FALSE) + theme(strip.text.y = element_text(angle = 0), strip.background = element_blank()) + + if(hideX){ + p <- p + theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) + } + + if(hideY){ + p <- p + theme(axis.title.y=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) + } + + return(p) + +} + + diff --git a/R/ArchRProjectMethods.R b/R/ArchRProjectMethods.R new file mode 100644 index 00000000..b1e6e568 --- /dev/null +++ b/R/ArchRProjectMethods.R @@ -0,0 +1,844 @@ +########################################################################################## +# Validation Methods +########################################################################################## + +.validArchRProject <- function(ArchRProj, ...){ + if(!inherits(ArchRProj, "ArchRProject")){ + stop("Not a valid ArchRProject as input!") + }else{ + ArchRProj + } +} + +.validGeneAnnotation <- function(geneAnnotation, ...){ + if(!inherits(ArchRProj, "ArchRProject")){ + stop("Not a valid ArchRProject as input!") + }else{ + ArchRProj + } +} + +.validGenomeAnnotation <- function(genomeAnnotation, ...){ + if(!inherits(ArchRProj, "ArchRProject")){ + stop("Not a valid ArchRProject as input!") + }else{ + ArchRProj + } +} + +########################################################################################## +# Output Directory +########################################################################################## + +#' Get outputDirectory in ArchRProject +#' +#' This function gets outputDirectory from 
#' ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param ... additional args
#' @export
getOutputDirectory <- function(ArchRProj, ...){
  # Validate first so a non-project input fails with a clear error.
  proj <- .validArchRProject(ArchRProj)
  proj@projectMetadata$outputDirectory
}

##########################################################################################
# Sample Methods
##########################################################################################

#' Get ArrowFiles in ArchRProject
#'
#' Returns the ArrowFiles column of sampleColData, named by the row names
#' of sampleColData.
#'
#' @param ArchRProj ArchRProject
#' @param ... additional args
#' @export
getArrowFiles <- function(ArchRProj, ...){
  sampleData <- .validArchRProject(ArchRProj)@sampleColData
  arrows <- sampleData$ArrowFiles
  names(arrows) <- rownames(sampleData)
  arrows
}

#' Get sampleNames in ArchRProject
#'
#' Returns the row names of sampleColData.
#'
#' @param ArchRProj ArchRProject
#' @param ... additional args
#' @export
getSampleNames <- function(ArchRProj, ...){
  rownames(.validArchRProject(ArchRProj)@sampleColData)
}

#' Get sampleColData in ArchRProject
#'
#' This function gets sampleColData in ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param select select a subset of column names from sampleColData
#' @param drop drop if selecting only one column name
#' @param ...
#' additional args
#' @export
getSampleColData <- function(ArchRProj, select = NULL, drop = FALSE, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  scd <- ArchRProj@sampleColData
  if(!is.null(select)){
    missingCols <- select[select %ni% colnames(scd)]
    if(length(missingCols) > 0){
      # Separate the names so several missing columns stay readable.
      stop("select Not Found in Colnames of sampleColData:\n", paste(missingCols, collapse = ", "))
    }
    scd <- scd[,select,drop=drop]
  }
  return(scd)
}

#' Add information to sampleColData in ArchRProject
#'
#' This function adds new data to sampleColData in ArchRProject. The column is
#' first filled with NA, then the rows named in samples receive data.
#'
#' @param ArchRProj ArchRProject
#' @param data data to add to sampleColData
#' @param name new column name in sampleColData if already exists set force = TRUE to override
#' @param samples names of samples corresponding to data (defaults to all sample names)
#' @param force if name already exists in sampleColData set force = TRUE to override
#' @param ... additional args
#' @export
addSampleColData <- function(ArchRProj, data = NULL, name = NULL, samples = getSampleNames(ArchRProj), force = FALSE, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  if(is.null(samples)){
    stop("Error samples must be provided")
  }
  if(is.null(data)){
    stop("Error data must be provided")
  }
  if(is.null(name)){
    stop("Error name is required for new column name!")
  }
  if(length(samples) != length(data)){
    stop("Error samples has to equal length of data!")
  }
  if(name %in% colnames(getSampleColData(ArchRProj))){
    if(force){
      message("Overriding previous entry for ", name)
    }else{
      # Previously this only printed a message and then overwrote the column anyway.
      stop("Error previous entry for ", name, ", Set force = TRUE to override!")
    }
  }
  ArchRProj@sampleColData[,name] <- NA
  ArchRProj@sampleColData[samples,name] <- data
  return(ArchRProj)
}

##########################################################################################
# Cell Methods
##########################################################################################

#' Get cellNames in ArchRProject
#'
#' This function gets cellNames in ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param ... additional args
#' @export
getCellNames <- function(ArchRProj, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  cnames <- rownames(ArchRProj@cellColData)
  return(cnames)
}

#' Get cellColData in ArchRProject
#'
#' This function gets cellColData in ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param select select a subset of column names from cellColData can put in a string function
#' @param drop drop if selecting only one column name
#' @param ... additional args
#' @export
getCellColData <- function(ArchRProj, select = NULL, drop = FALSE, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  ccd <- data.frame(ArchRProj@cellColData)
  if(!is.null(select)){
    # NOTE(review): every entry of select is parsed and evaluated as R code
    # against cellColData - do not pass untrusted strings.
    cols <- lapply(seq_along(select), function(x){
      tryCatch({
        dplyr::mutate(ccd, tmpNewCol123=eval(parse(text=select[x])))[,"tmpNewCol123"]
      }, error = function(e){
        stop("select Not Found in Colnames of cellColData:\n", conditionMessage(e))
      })
    })
    # Combine as data.frame columns so each keeps its own type; cbind-ing the
    # vectors coerced mixed character/numeric selections to a single type.
    names(cols) <- paste0("V", seq_along(cols))
    ccd2 <- DataFrame(data.frame(cols, stringsAsFactors = FALSE))
    colnames(ccd2) <- select
    rownames(ccd2) <- rownames(ccd)
    ccd <- ccd2
  }
  ccd <- DataFrame(ccd)
  if(drop){
    ccd <- ccd[,,drop=drop]
  }
  return(ccd)
}

#' Add information to cellColData in ArchRProject
#'
#' This function adds new data to cellColData in ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param data data to add to cellColData
#' @param name new column name in cellColData if already exists set force = TRUE to override
#' @param cells names of cells corresponding to data
#' @param force if name already exists in cellColData set force = TRUE to override
#' @param ...
#' additional args
#' @export
addCellColData <- function(ArchRProj, data = NULL, name = NULL, cells = getCellNames(ArchRProj), force = FALSE, ...){

  ArchRProj <- .validArchRProject(ArchRProj)

  if(is.null(cells)){
    stop("Error cells must be provided")
  }

  if(is.null(data)){
    stop("Error data must be provided")
  }

  if(is.null(name)){
    stop("Error name is required for new column name!")
  }

  if(length(cells) != length(data)){
    stop("Error cells has to equal length of data!")
  }

  if(name %in% colnames(getCellColData(ArchRProj))){
    if(force){
      message("Overriding previous entry for ", name)
    }else{
      # Previously this only printed a message and then overwrote the column anyway.
      stop("Error previous entry for ", name, ", Set force = TRUE to override!")
    }
  }

  # Rows not named in cells stay NA.
  ArchRProj@cellColData[,name] <- NA
  ArchRProj@cellColData[cells,name] <- data

  return(ArchRProj)
}

##########################################################################################
# PeakSet Methods
##########################################################################################

#' Get PeakSet from ArchRProject
#'
#' This function gets peakSet from an ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param ... additional args
#' @export
getPeakSet <- function(ArchRProj, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  return(ArchRProj@peakSet)
}

#' Add PeakSet to ArchRProject
#'
#' This function adds a peakSet to an ArchRProject. Peaks are given a
#' per-seqname running index in the idx metadata column and are then sorted.
#'
#' @param ArchRProj ArchRProject
#' @param peakSet peakSet as a GRanges
#' @param force force overriding peakSet in ArchRProject
#' @param ... additional args
#' @export
addPeakSet <- function(ArchRProj, peakSet, force = FALSE, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  if(is.null(ArchRProj@peakSet) || force){
    #Index The Peak Set
    peakSet <- lapply(split(peakSet, seqnames(peakSet)), function(x){
      mcols(x)$idx <- seq_along(x)
      x
    }) %>% Reduce("c", .) %>% sortSeqlevels %>% sort
    ArchRProj@peakSet <- peakSet
  }else{
    stop("Error peakSet exists! Set force=TRUE to override!")
  }
  return(ArchRProj)
}

##########################################################################################
# Genome Annotation Methods
##########################################################################################

#' Get genomeAnnotation from ArchRProject
#'
#' This function gets genomeAnnotation in ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param ... additional args
#' @export
getGenomeAnnotation <- function(ArchRProj, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  return(ArchRProj@genomeAnnotation)
}

#' Get blacklist from ArchRProject
#'
#' This function gets the blacklist as a GRanges from genomeAnnotation in ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param ... additional args
#' @export
getBlacklist <- function(ArchRProj, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  return(ArchRProj@genomeAnnotation$blacklist)
}

#' Get genome from ArchRProject
#'
#' This function gets the genome from genomeAnnotation in ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param ... additional args
#' @export
getGenome <- function(ArchRProj, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  return(ArchRProj@genomeAnnotation$genome)
}

#' Get chromSizes from ArchRProject
#'
#' This function gets chromosome lengths as GRanges from genomeAnnotation in ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param ... additional args
#' @export
getChromSizes <- function(ArchRProj, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  return(ArchRProj@genomeAnnotation$chromSizes)
}

#' Get chromLengths from ArchRProject
#'
#' This function gets chromosome lengths as a vector from genomeAnnotation in ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param ...
#' additional args
#' @export
getChromLengths <- function(ArchRProj, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  cS <- ArchRProj@genomeAnnotation$chromSizes
  # The end coordinate of each chromSizes range is used as that chromosome's length.
  cL <- end(cS)
  names(cL) <- paste0(seqnames(cS))
  return(cL)
}

#' Empty genome annotation: genome "none" with zero-length chromSizes and blacklist.
#' @export
.nullGenomeAnnotation <- function(){
  genome <- "none"
  chromSizes <- GRanges()
  blacklist <- GRanges()
  SimpleList(blacklist = blacklist, genome = genome, chromSizes = chromSizes)
}

##########################################################################################
# Gene Annotation Methods
##########################################################################################

#' Get geneAnnotation from ArchRProject
#'
#' This function gets geneAnnotation in ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param ... additional args
#' @export
getGeneAnnotation <- function(ArchRProj, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  return(ArchRProj@geneAnnotation)
}

#' Get TSS from ArchRProject
#'
#' This function gets TSS from geneAnnotation in ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param ... additional args
#' @export
getTSS <- function(ArchRProj, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  return(ArchRProj@geneAnnotation$TSS)
}

#' Get Genes from ArchRProject
#'
#' This function gets genes from geneAnnotation in ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param symbols gene symbols to subset (case-insensitive); NULL returns all genes
#' @param ... additional args
#' @export
getGenes <- function(ArchRProj, symbols = NULL, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  genes <- ArchRProj@geneAnnotation$genes
  # Only filter when symbols are given; matching against NULL dropped every gene.
  if(!is.null(symbols)){
    genes <- genes[which(tolower(genes$symbol) %in% tolower(symbols))]
  }
  return(genes)
}

#' Get Exons from ArchRProject
#'
#' This function gets exons from geneAnnotation in ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param symbols gene symbols to subset
#' @param ...
#' additional args
#' @export
getExons <- function(ArchRProj, symbols = NULL, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  exons <- ArchRProj@geneAnnotation$exons
  # Only filter when symbols are given; matching against NULL dropped every exon.
  if(!is.null(symbols)){
    exons <- exons[which(tolower(exons$symbol) %in% tolower(symbols))]
  }
  return(exons)
}

#' Empty gene annotation: zero-length genes, exons and TSS that keep a symbol column.
#' @export
.nullGeneAnnotation <- function(){
  # Build one range so the symbol column exists, then drop it.
  genes <- GRanges("chr1", IRanges(1,1), symbol = "a")
  genes <- genes[-1]
  exons <- genes
  TSS <- genes
  SimpleList(genes = genes, exons = exons, TSS = TSS)
}

##########################################################################################
# Dimensionality Reduction / Embedding Methods
##########################################################################################

#' Get Reduced Dimensions from ArchRProject
#'
#' This function gets reduced dimensions from an ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param reducedDims reduced dimensions name in ArchRProject
#' @param return "mat" or "matrix" returns the first element of the stored entry,
#'   anything else returns the whole stored entry
#' @param ... additional args
#' @export
getReducedDims <- function(ArchRProj, reducedDims = "TileLSI", return = "matrix", ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  if(reducedDims %ni% names(ArchRProj@reducedDims)){
    stop("reducedDims not in computed reduced dims!")
  }
  if(tolower(return) %in% c("mat", "matrix")){
    out <- ArchRProj@reducedDims[[reducedDims]][[1]]
  }else{
    out <- ArchRProj@reducedDims[[reducedDims]]
  }
  return(out)
}

#' Get Embedding from ArchRProject
#'
#' This function gets an embedding from an ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param embedding embedding name in ArchRProject
#' @param return return embedding as df or all info
#' @param ...
#' additional args
#' @export
getEmbedding <- function(ArchRProj, embedding = "IterativeLSI", return = "df", ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  if(!(embedding %in% names(ArchRProj@embeddings))){
    stop("embedding not in computed embeddings!")
  }
  stored <- ArchRProj@embeddings[[embedding]]
  # "df" selects the first element of the stored entry; anything else returns it whole.
  if(tolower(return)=="df"){
    return(stored[[1]])
  }
  return(stored)
}

##########################################################################################
# Annotation Methods
##########################################################################################

#' Get Annotation from ArchRProject
#'
#' Returns one entry of the annotations slot, selected by name, or the first
#' entry when name is NULL.
#'
#' @param ArchRProj ArchRProject
#' @param name name of annotations
#' @param ... additional args
#' @export
getAnnotation <- function(ArchRProj, name = NULL, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  if(is.null(name)){
    return(ArchRProj@annotations[[1]])
  }
  if(name %ni% names(ArchRProj@annotations)){
    stop("Name is not in Annotations!")
  }
  ArchRProj@annotations[[name]]
}

#' Get Annotation Positions from ArchRProject
#'
#' This function gets annotation positions from an ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param name name of annotations
#' @param annoName name to subset with annotations
#' @param ...
#' additional args
#' @export
getPositions <- function(ArchRProj, name = NULL, annoName = NULL, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  if(is.null(name)){
    name <- 1
  }else{
    if(name %ni% names(ArchRProj@annotations)){
      stop("Name is not in Annotations!")
    }
  }
  anno <- ArchRProj@annotations[[name]]
  # The annotation stores a file path under an element whose name contains "positions".
  idx <- grep("positions", names(anno), ignore.case=TRUE)
  if(length(idx)==0){
    stop("Annotation does not contain positions!")
  }
  positions <- readRDS(anno[[idx]])
  if(!is.null(annoName)){
    # annoName is used as a grep pattern, so it may select several entries.
    idx <- grep(annoName, names(positions), ignore.case=TRUE)
    if(length(idx)==0){
      stop("Positions do not contain annoName!")
    }
    positions <- positions[idx]
  }
  positions
}

#' Get Annotation Matches from ArchRProject
#'
#' This function gets annotation matches from an ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param name name of annotations
#' @param annoName name to subset with annotations
#' @param ... additional args
#' @export
getMatches <- function(ArchRProj, name = NULL, annoName = NULL, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  if(is.null(name)){
    name <- 1
  }else{
    if(name %ni% names(ArchRProj@annotations)){
      stop("Name is not in Annotations!")
    }
  }
  anno <- ArchRProj@annotations[[name]]
  # The annotation stores a file path under an element whose name contains "matches".
  idx <- grep("matches", names(anno), ignore.case=TRUE)
  if(length(idx)==0){
    # The original message wrongly said "positions" here.
    stop("Annotation does not contain matches!")
  }
  matches <- readRDS(anno[[idx]])
  if(!is.null(annoName)){
    # annoName is used as a grep pattern over the column names.
    idx <- grep(annoName, colnames(matches), ignore.case=TRUE)
    if(length(idx)==0){
      stop("Matches do not contain annoName!")
    }
    matches <- matches[, idx, drop=FALSE]
  }
  matches
}

#' Add Motif Annotations to ArchRProject
#'
#' This function adds motif postions and matches to an ArchRProject
#'
#' @param ArchRProj ArchRProject
#' @param motifSet motifSet JASPAR : JASPAR2016, JASPAR2018; chromVARmotifs : human, mouse, encode, homer
#' @param name of annotations to store as in ArchRProject
#' @param species species
#' relevant to dataset (default will guess based on getGenome)
#' @param collection JASPAR collection (default = CORE)
#' @param cutOff pvalue cutoff for motif search (see motifmatchr)
#' @param w width to consider for motif (see motifmatchr)
#' @param ... additional args
#' @export
addMotifAnnotations <- function(
  ArchRProj = NULL,
  motifSet = "JASPAR2018",
  name = "Motif",
  species = NULL,
  collection = "CORE",
  cutOff = 5e-05,
  w = 7,
  ...
  ){

  .requirePackage("motifmatchr", installInfo='BiocManager::install("motifmatchr")')
  ArchRProj <- .validArchRProject(ArchRProj)

  # For JASPAR sets, guess the species from the genome name when not supplied.
  if(grepl("JASPAR",motifSet) && is.null(species)){
    genomeName <- getGenomeAnnotation(ArchRProj)$genome
    if(grepl("hg19|hg38", genomeName, ignore.case = TRUE)){
      species <- "Homo sapiens"
    }
    if(grepl("mm9|mm10", genomeName, ignore.case = TRUE)){
      species <- "Mus musculus"
    }
  }

  #############################################################
  # Get PWM List adapted from chromVAR!
  #############################################################
  tstart <- Sys.time()
  .messageDiffTime(paste0("Getting Motif Set, Species : ", species), tstart)

  if(tolower(motifSet)=="jaspar2018"){
    .requirePackage("JASPAR2018",installInfo='BiocManager::install("JASPAR2018")')
    args <- list(species = species, collection = collection, ...)
    motifs <- TFBSTools::getMatrixSet(JASPAR2018::JASPAR2018, args)
    obj <- .summarizeJASPARMotifs(motifs)
    motifs <- obj$motifs
    motifSummary <- obj$motifSummary
  }else if(tolower(motifSet)=="jaspar2016"){
    # The install hint used to point at JASPAR2018.
    .requirePackage("JASPAR2016",installInfo='BiocManager::install("JASPAR2016")')
    args <- list(species = species, collection = collection, ...)
    motifs <- TFBSTools::getMatrixSet(JASPAR2016::JASPAR2016, args)
    obj <- .summarizeJASPARMotifs(motifs)
    motifs <- obj$motifs
    motifSummary <- obj$motifSummary
  }else if(tolower(motifSet)=="human"){
    .requirePackage("chromVARmotifs",installInfo='devtools::install_github("GreenleafLab/chromVARmotifs")')
    data("human_pwms_v2")
    motifs <- human_pwms_v2
    obj <- .summarizeChromVARMotifs(motifs)
    motifs <- obj$motifs
    motifSummary <- obj$motifSummary
  }else if(tolower(motifSet)=="mouse"){
    .requirePackage("chromVARmotifs",installInfo='devtools::install_github("GreenleafLab/chromVARmotifs")')
    data("mouse_pwms_v2")
    motifs <- mouse_pwms_v2
    obj <- .summarizeChromVARMotifs(motifs)
    motifs <- obj$motifs
    motifSummary <- obj$motifSummary
  }else if(tolower(motifSet)=="encode"){
    .requirePackage("chromVARmotifs",installInfo='devtools::install_github("GreenleafLab/chromVARmotifs")')
    data("encode_pwms")
    motifs <- encode_pwms
    obj <- .summarizeChromVARMotifs(motifs)
    motifs <- obj$motifs
    motifSummary <- obj$motifSummary
  }else if(tolower(motifSet)=="homer"){
    .requirePackage("chromVARmotifs",installInfo='devtools::install_github("GreenleafLab/chromVARmotifs")')
    data("homer_pwms")
    motifs <- homer_pwms
    obj <- .summarizeChromVARMotifs(motifs)
    motifs <- obj$motifs
    motifSummary <- obj$motifSummary
  }else{
    stop("Error MotifSet Not Recognized!")
  }

  #############################################################
  # Get BSgenome Information!
  #############################################################
  # The stored genome string is used both as a package name and as the
  # name of the object to evaluate.
  genome <- ArchRProj@genomeAnnotation$genome
  .requirePackage(genome)
  BSgenome <- eval(parse(text = genome))
  BSgenome <- .validBSgenome(BSgenome)

  #############################################################
  # Calculate Motif Positions
  #############################################################
  .messageDiffTime("Finding Motif Positions with motifmatchr!", tstart)
  peakSet <- ArchRProj@peakSet
  motifPositions <- motifmatchr::matchMotifs(
    pwms = motifs,
    subject = peakSet,
    genome = BSgenome,
    out = "positions",
    p.cutoff = cutOff,
    w = w
  )

  #############################################################
  # Motif Overlap Matrix
  #############################################################
  .messageDiffTime("Creating Motif Overlap Matrix", tstart)
  allPositions <- unlist(motifPositions)
  overlapMotifs <- findOverlaps(peakSet, allPositions, ignore.strand=TRUE)
  # Rows are peaks, columns are motifs; an entry is TRUE where a motif position overlaps the peak.
  motifMat <- Matrix::sparseMatrix(
    i = queryHits(overlapMotifs),
    j = match(names(allPositions),names(motifPositions))[subjectHits(overlapMotifs)],
    x = rep(TRUE, length(overlapMotifs)),
    dims = c(length(peakSet), length(motifPositions))
  )
  colnames(motifMat) <- names(motifPositions)
  motifMat <- SummarizedExperiment::SummarizedExperiment(assays=SimpleList(matches = motifMat), rowRanges = peakSet)
  .messageDiffTime("Finished Getting Motif Info!", tstart)

  out <- SimpleList(
    motifSummary = motifSummary,
    motifMatches = motifMat,
    motifPositions = motifPositions,
    motifList = motifs,
    date = Sys.Date()
  )

  # recursive = TRUE so this also works when the output directory does not exist yet.
  annoDir <- file.path(getOutputDirectory(ArchRProj), "Annotations")
  dir.create(annoDir, showWarnings=FALSE, recursive=TRUE)
  savePositions <- file.path(annoDir, paste0(name,"-Positions-In-Peaks.rds"))
  saveMatches <- file.path(annoDir, paste0(name,"-Matches-In-Peaks.rds"))

  # Only the file paths of positions/matches are stored in the project.
  ArchRProj@annotations[[name]]$Name <- name
  ArchRProj@annotations[[name]]$motifs <- motifs
  ArchRProj@annotations[[name]]$motifSummary <- motifSummary
  ArchRProj@annotations[[name]]$Positions <- savePositions
  ArchRProj@annotations[[name]]$Matches <- saveMatches

  saveRDS(out, file.path(annoDir, paste0(name,"-In-Peaks-Summary.rds")), compress = FALSE)
  saveRDS(out$motifPositions, savePositions, compress = FALSE)
  saveRDS(out$motifMatches, saveMatches, compress = FALSE)

  return(ArchRProj)

}

# Name each JASPAR motif "<make.names(name)>_<index>" and build a summary
# DataFrame (name, ID, strand, symbol, family, alias) with one row per motif.
# Returns list(motifs = renamed motifs, motifSummary = DataFrame).
.summarizeJASPARMotifs <- function(motifs){

  motifNames <- lapply(seq_along(motifs), function(x){
    namex <- make.names(motifs[[x]]@name)
    # Drop a single trailing "." left by make.names.
    if(substr(namex,nchar(namex),nchar(namex))=="."){
      namex <- substr(namex,1,nchar(namex)-1)
    }
    namex <- paste0(namex, "_", x)
    namex
  }) %>% unlist(.)

  motifDF <- lapply(seq_along(motifs), function(x){
    data.frame(
      row.names = motifNames[x],
      name = motifs[[x]]@name[[1]],
      ID = motifs[[x]]@ID,
      strand = motifs[[x]]@strand,
      symbol = ifelse(!is.null(motifs[[x]]@tags$symbol[1]), motifs[[x]]@tags$symbol[1], NA) ,
      family = ifelse(!is.null(motifs[[x]]@tags$family[1]), motifs[[x]]@tags$family[1], NA),
      alias = ifelse(!is.null(motifs[[x]]@tags$alias[1]), motifs[[x]]@tags$alias[1], NA),
      stringsAsFactors = FALSE
    )
  }) %>% Reduce("rbind", .) %>% DataFrame

  names(motifs) <- motifNames

  out <- list(motifs = motifs, motifSummary = motifDF)

  return(out)

}

# Name each chromVAR motif "<make.names(name)>_<index>" and build a summary
# DataFrame (name, ID, strand, tags) with one row per motif.
# Returns list(motifs = renamed motifs, motifSummary = DataFrame).
.summarizeChromVARMotifs <- function(motifs){

  motifNames <- lapply(seq_along(motifs), function(x){
    namex <- make.names(motifs[[x]]@name)
    # Drop a single trailing "." left by make.names.
    if(substr(namex,nchar(namex),nchar(namex))=="."){
      namex <- substr(namex,1,nchar(namex)-1)
    }
    namex <- paste0(namex, "_", x)
    namex
  }) %>% unlist(.)

  motifDF <- lapply(seq_along(motifs), function(x){
    data.frame(
      row.names = motifNames[x],
      name = motifs[[x]]@name[[1]],
      ID = motifs[[x]]@ID,
      strand = motifs[[x]]@strand,
      tags = motifs[[x]]@tags,
      stringsAsFactors = FALSE
    )
  }) %>% Reduce("rbind", .) %>% DataFrame

  names(motifs) <- motifNames

  out <- list(motifs = motifs, motifSummary = motifDF)

  return(out)

}

##########################################################################################
# Additional Methods
##########################################################################################

#' Return Available Features for a given Matrix in ArrowFiles within an ArchRProject
#'
#' This function will identify available features for a matrix and return them for downstream
#' plotting utils.
#'
#' @param ArchRProj ArchRProject
#' @param useMatrix Matrix Name as in Arrow Files (ie TileMatrix, GeneScoreMatrix, ...)
#' @param select select a specific name with grep
#' @param ignore.case ignore case when searching with select
#' @param ...
#' additional args
#' @export
availableFeatures <- function(ArchRProj, useMatrix = "GeneScoreMatrix", select = NULL, ignore.case = TRUE, ...){
  ArchRProj <- .validArchRProject(ArchRProj)
  fdf <- .getFeatureDF(getArrowFiles(ArchRProj), useMatrix)
  if(is.null(select)){
    # Prefix with seqnames only when feature names are not unique on their own.
    if(any(duplicated(paste0(fdf$name)))){
      paste0(fdf$seqnames,":",fdf$name)
    }else{
      fdf$name
    }
  }else{
    grepNames <- grep(select, fdf$name, value = TRUE, ignore.case = ignore.case)
    if(any(duplicated(grepNames))){
      grepIdx <- grep(select, fdf$name, ignore.case = ignore.case)
      grepNames <- paste0(fdf$seqnames[grepIdx],":",fdf$name[grepIdx])
    }
    # Matrices whose seqnames are "deviations" and "z" are returned in reverse order.
    if(all(c("deviations", "z") %in% unique(paste0(fdf$seqnames)))){
      grepNames <- rev(grepNames)
    }
    grepNames
  }

}

#' Plot PDF in outputDirectory of ArchRProject
#'
#' This function opens a PDF device in the Plots folder of an ArchRProject's
#' output directory (or in ./Plots when no project is given).
#'
#' @param name name of PDF file
#' @param width width of PDF in inches
#' @param height height of PDF in inches
#' @param ArchRProj ArchRProject
#' @param addDOC add date of creation to end of plot file name
#' @param useDingbats use dingbats characters for plotting
#' @param ... additional args to pdf
#' @export
plotPDF <- function(name, width = 8, height = 8, ArchRProj = NULL, addDOC = TRUE, useDingbats = FALSE, ...){
  # Strip only a trailing ".pdf"; the unanchored pattern also removed it mid-name.
  name <- gsub("\\.pdf$", "", name)
  if(is.null(ArchRProj)){
    outDir <- "Plots"
  }else{
    ArchRProj <- .validArchRProject(ArchRProj)
    outDir <- file.path(getOutputDirectory(ArchRProj), "Plots")
  }
  # recursive = TRUE so this also works when the output directory does not exist yet.
  dir.create(outDir, showWarnings = FALSE, recursive = TRUE)
  if(addDOC){
    # Take date and time from one clock reading, formatted explicitly rather
    # than by splitting as.character(Sys.time()).
    now <- Sys.time()
    filename <- file.path(outDir, paste0(name, "_Date-", format(now, "%Y-%m-%d"), "_Time-", format(now, "%H-%M-%S"), ".pdf"))
  }else{
    filename <- file.path(outDir, paste0(name, ".pdf"))
  }
  pdf(filename, width = width, height = height, useDingbats = useDingbats, ...)
}





diff --git a/R/ArrowMethods.R b/R/ArrowMethods.R
new file mode 100644
index 00000000..18a1f2ac
--- /dev/null
+++ b/R/ArrowMethods.R
@@ -0,0 +1,174 @@
####################################################################
# Hidden Helper Utils for Arrow Files
####################################################################

#' Stop unless the file's "Class" entry reads "Arrow"; returns ArrowFile unchanged.
#' @export
.validArrow <- function(ArrowFile){
  o <- h5closeAll()
  if(h5read(ArrowFile,"Class")!="Arrow"){
    stop("Not Valid Arrow!")
  }
  o <- h5closeAll()
  return(ArrowFile)
}

#' Stop if matrixName (case-insensitive) is one of the predefined matrix names;
#' otherwise return it unchanged.
#' @export
.isProtectedArray <- function(matrixName){
  protectedArrays <- tolower(c("peakmatrix", "tilematrix", "genescorematrix"))
  if(tolower(matrixName) %in% protectedArrays){
    stop(sprintf("Error %s cannot be used as this conflicts with another predefined matrix function!", matrixName))
  }
  matrixName
}

#' List the entries directly under /subGroup (excluding names containing "Info")
#' and require them to be identical across all ArrowFiles.
#' @export
.availableSeqnames <- function(ArrowFiles, subGroup = "Fragments"){
  o <- h5closeAll()
  seqList <- lapply(seq_along(ArrowFiles), function(x){
    seqnames <- h5ls(ArrowFiles[x]) %>% {.[.$group==paste0("/",subGroup),]$name}
    seqnames <- seqnames[!grepl("Info", seqnames)]
    seqnames
  })
  if(!all(unlist(lapply(seq_along(seqList), function(x) identical(seqList[[x]],seqList[[1]]))))){
    stop("Not All Seqnames Identical!")
  }
  o <- h5closeAll()
  return(seqList[[1]])
}

#' Seqnames from .availableSeqnames that contain "chr" (case-insensitive); stops if none.
#' @export
.availableChr <- function(ArrowFiles, subGroup = "Fragments"){
  seqnames <- .availableSeqnames(ArrowFiles, subGroup)
  seqnames <- seqnames[grep("chr", seqnames, ignore.case = TRUE)]
  if(length(seqnames) == 0){
    stop("No Chr Found in ArrowFiles!")
  }
  return(seqnames)
}

#' Cell names of one Arrow file as "<sample>#<cell>". Without subGroup they come
#' from Metadata/CellNames (optionally restricted to PassQC == 1); with subGroup
#' they come from <subGroup>/Info/CellNames.
#' @export
.availableCells <- function(ArrowFile, subGroup = NULL, passQC = TRUE){
  if(is.null(subGroup)){
    o <- h5closeAll()
    cellNames <- h5read(ArrowFile, "Metadata/CellNames")
    if(passQC){
      # If PassQC cannot be read, every cell is treated as passing.
      passQC <- tryCatch({
        h5read(ArrowFile, "Metadata/PassQC")
      }, error = function(x){
        rep(1, length(cellNames))
      })
      cellNames <- cellNames[which(passQC==1)]
    }
    sampleName <- h5read(ArrowFile, paste0("Metadata/Sample"))
    o <- h5closeAll()
  }else{
    o <- h5closeAll()
    cellNames <- h5read(ArrowFile, paste0(subGroup, "/Info/CellNames"))
    sampleName <- h5read(ArrowFile, paste0("Metadata/Sample"))
    o <- h5closeAll()
  }
  return(paste0(sampleName,"#",cellNames))
}

#' Read Metadata/Sample from an Arrow file.
#' @export
.sampleName <- function(ArrowFile){
  o <- h5closeAll()
  sampleName <- h5read(ArrowFile, paste0("Metadata/Sample"))
  o <- h5closeAll()
  return(sampleName)
}

#' Summarise the h5ls listing of an Arrow file as a list keyed by top-level
#' group; groups with nested content are further split by second-level group.
#' @export
.summarizeArrowContent <- function(ArrowFile){

  o <- h5closeAll()

  #Get Contents of ArrowFile
  h5DF <- h5ls(ArrowFile)

  #Re-Organize Content Info
  # Logical indexing: x[-which(cond), ] drops every row when nothing matches.
  h5DF <- h5DF[h5DF$group != "/",]
  groups <- stringr::str_split(h5DF$group, pattern = "/", simplify=TRUE)[,2]
  groupList <- split(h5DF, groups)

  #Split Nested Lists
  groupList2 <- lapply(seq_along(groupList), function(x){
    groupDFx <- groupList[[x]]
    groupx <- gsub(paste0("/", names(groupList)[x]),"",groupDFx$group)
    if(all(groupx=="")){
      groupDFx
    }else{
      subDF <- groupDFx[groupx != "",]
      split(subDF, stringr::str_split(subDF$group, pattern = "/", simplify=TRUE)[,3])
    }
  })
  names(groupList2) <- names(groupList)


  o <- h5closeAll()

  return(groupList2)

}

#' Load the Metadata entries that have the same dim as CellNames into a
#' DataFrame with "<sample>#<cell>" row names, a Sample column and columns
#' sorted by name.
#' @export
.getMetadata <- function(ArrowFile){

  o <- h5closeAll()

  #Get Contents of ArrowFile
  sampleName <- h5read(ArrowFile, paste0("Metadata/Sample"))
  arrowMD <- .summarizeArrowContent(ArrowFile)$Metadata

  #Which are same dimensions as cell names
  arrowMD <- arrowMD[which(arrowMD$dim == arrowMD$dim[arrowMD$name=="CellNames"]),]

  #Load these into a S4 DataFrame
  md <- lapply(seq_len(nrow(arrowMD)), function(x){
    dfx <- DataFrame(h5read(ArrowFile, paste0(arrowMD$group[x],"/",arrowMD$name[x])))
    colnames(dfx) <- arrowMD$name[x]
    dfx
  }) %>% Reduce("cbind", .)

  #Correct CellNames
  md$CellNames <- paste0(sampleName,"#",md$CellNames)
  md$Sample <- Rle(sampleName, nrow(md))
  rownames(md) <- md$CellNames
  md <- md[, colnames(md) != "CellNames"]
  md <- md[,order(colnames(md))]

  o <- h5closeAll()

  return(md)
}

#' Read <subGroup>/Info/FeatureDF from the first Arrow file, require every other
#' file to hold an identical table, and return it with rows grouped by seqnames.
#' @export
.getFeatureDF <- function(ArrowFiles, subGroup = "TileMatrix"){

  .helpFeatureDF <- function(ArrowFile, subGroup){
    o <- h5closeAll()
    featureDF <- DataFrame(h5read(ArrowFile, paste0(subGroup,"/Info/FeatureDF")))
    featureDF$seqnames <- Rle(as.character(featureDF$seqnames))
    o <- h5closeAll()
    return(featureDF)
  }

  fdf <- .helpFeatureDF(ArrowFiles[1], subGroup = subGroup)

  if(length(ArrowFiles) > 1){
    ArrowFiles <- ArrowFiles[-1]
    checkIdentical <- lapply(seq_along(ArrowFiles), function(x){
      fdfx <- .helpFeatureDF(ArrowFiles[x], subGroup = subGroup)
      identical(fdfx, fdf)
    }) %>% unlist %>% all
    if(!checkIdentical){
      stop("Error not all FeatureDF for asssay is the same!")
    }
  }

  #Re-Order for Split Check!
  newOrder <- split(seq_len(nrow(fdf)), fdf$seqnames) %>% {lapply(seq_along(.), function(x) .[[x]])} %>% Reduce("c", .)
  fdf[newOrder,]

}

diff --git a/R/ArrowRead.R b/R/ArrowRead.R
new file mode 100644
index 00000000..67ccca51
--- /dev/null
+++ b/R/ArrowRead.R
@@ -0,0 +1,559 @@
####################################################################
# Reading fragments from Arrow Files
####################################################################

#' Read Fragments from Arrow
#'
#' This function reads the fragments stored in an Arrow File, one seqname at a
#' time, and returns them merged into a single object
#'
#' @param ArrowFile path to an Arrow File
#' @param chr seqnames to read (default NULL reads every seqname in the Arrow File)
#' @param cellNames cell names accepted for subsetting fragments
#' @param method read method passed on to the internal readers
#' @param verbose print progress messages
#' @param ...
additional params +#' @export +getFragmentsFromArrow <- function( + ArrowFile, + chr = NULL, + cellNames = NULL, + method = "fast", + verbose = TRUE, + ...){ + + ArrowFile <- .validArrow(ArrowFile) + + if(is.null(chr)){ + chr <- .availableSeqnames(ArrowFile, subGroup = "Fragments") + } + + if(any(chr %ni% .availableSeqnames(ArrowFile, subGroup = "Fragments"))){ + stop("Error Chromosome not in ArrowFile!") + } + + tstart <- Sys.time() + out <- lapply(seq_along(chr), function(x){ + .messageDiffTime(sprintf("Reading Chr %s of %s", x, length(chr)), tstart, verbose = verbose) + .getFragsFromArrow(ArrowFile = ArrowFile, chr = chr[x], out = "GRanges", method = method) + }) %>% GenomicRangesList + + .messageDiffTime("Merging", tstart, verbose = verbose) + + out <- .suppressAll(unlist(out)) + + out + +} + +#' @export +.getFragsFromArrow <- function( + ArrowFile, + chr = NULL, + out = "GRanges", + cellNames = NULL, + method = "fast", + ...){ + + if(is.null(chr)){ + stop("Need to provide chromosome to read!") + } + + o <- h5closeAll() + ArrowFile <- .validArrow(ArrowFile) + + if(chr %ni% .availableSeqnames(ArrowFile)){ + stop("Error Chromosome not in ArrowFile!") + } + + #Get Sample Name + sampleName <- .h5read(ArrowFile, paste0("Metadata/Sample"), method = method) + + o <- h5closeAll() + nFrags <- h5ls(ArrowFile, recursive = TRUE) %>% + {.[.$group==paste0("/Fragments/",chr) & .$name == "Ranges",]$dim} %>% + {gsub(" x 2","",.)} %>% as.integer + + if(nFrags==0){ + output <- IRanges(start = 1, end = 1) + mcols(output)$RG <- c("tmp") + output <- output[-1,] + if(tolower(out)=="granges"){ + output <- GRanges(seqnames = chr, ranges(output), RG = mcols(output)$RG) + } + return(output) + } + + + if(is.null(cellNames) | tolower(method) == "fast"){ + + output <- .h5read(ArrowFile, paste0("Fragments/",chr,"/Ranges"), method = method) %>% + {IRanges(start = .[,1], width = .[,2])} + mcols(output)$RG <- Rle( + values = paste0(sampleName, "#", .h5read(ArrowFile, 
paste0("Fragments/",chr,"/RGValues"), method = method)), + lengths = .h5read(ArrowFile, paste0("Fragments/",chr,"/RGLengths"), method = method) + ) + if(!is.null(cellNames)){ + output <- output[BiocGenerics::which(mcols(output)$RG %bcin% cellNames)] + } + + }else{ + + if(!any(cellNames %in% .availableCells(ArrowFile))){ + + stop("None of input cellNames are in ArrowFile availableCells!") + + }else{ + + barRle <- Rle(h5read(ArrowFile, paste0("Fragments/",chr,"/RGValues")), h5read(ArrowFile, paste0("Fragments/",chr,"/RGLengths"))) + barRle@values <- paste0(sampleName, "#", barRle@values) + idx <- BiocGenerics::which(barRle %bcin% cellNames) + if(length(idx) > 0){ + output <- h5read(ArrowFile, paste0("Fragments/",chr,"/Ranges"), index = list(idx, 1:2)) %>% + {IRanges(start = .[,1], width = .[,2])} + mcols(output)$RG <- barRle[idx] + }else{ + output <- IRanges(start = 1, end = 1) + mcols(output)$RG <- c("tmp") + output <- output[-1,] + } + } + + } + + o <- h5closeAll() + + if(tolower(out)=="granges"){ + if(length(output) > 0){ + output <- GRanges(seqnames = chr, ranges(output), RG = mcols(output)$RG) + }else{ + output <- IRanges(start = 1, end = 1) + mcols(output)$RG <- c("tmp") + output <- GRanges(seqnames = chr, ranges(output), RG = mcols(output)$RG) + output <- output[-1,] + } + } + + return(output) +} + +#################################################################### +# Reading Matrices/Arrays from Arrow Files +#################################################################### + +#' Read a Matrix from an Arrow File +#' +#' Loads the sparse matrix stored under useMatrix in one ArrowFile and returns it as a SummarizedExperiment +#' @param ArrowFile ArrowFile to read (validated with .validArrow) +#' @param useMatrix matrix name to get from Arrow +#' @param useSeqnames use a subset of seqnames for matrix +#' @param cellNames optional character vector of cell names ("sampleName#barcode") to keep as columns +#' @param verbose currently unused +#' @param binarize binarize the matrix on read; NULL (default) lets .getMatFromArrow decide from the stored matrix class +#' @param ArchRProj optional ArchRProject; when given and useMatrix is "PeakMatrix" its peak set is used as rowRanges +#' @param ... 
additional params +#' @export +getMatrixFromArrow <- function( + ArrowFile, + useMatrix = "GeneScoreMatrix", + useSeqnames = NULL, + cellNames = NULL, + verbose = TRUE, binarize = NULL, ArchRProj = NULL, + ...){ + + ArrowFile <- .validArrow(ArrowFile) + + seqnames <- .availableSeqnames(ArrowFile, subGroup = useMatrix) + featureDF <- .getFeatureDF(ArrowFile, subGroup = useMatrix) + + if(!is.null(useSeqnames)){ + seqnames <- seqnames[seqnames %in% useSeqnames] + } + + if(length(seqnames) == 0){ + stop("No seqnames available!") + } + + featureDF <- featureDF[BiocGenerics::which(featureDF$seqnames %bcin% seqnames), ] + + mat <- .getMatFromArrow( + ArrowFile = ArrowFile, + featureDF = featureDF, + cellNames = cellNames, + useMatrix = useMatrix, + binarize = binarize, + useIndex = FALSE + ) + + if(all(c("z", "deviations") %in% seqnames)){ + mat <- as(split(mat, featureDF$seqnames), "SimpleList") + featureDF <- featureDF[featureDF$seqnames=="deviations", "name", drop = FALSE] + }else{ + mat <- SimpleList(mat) + names(mat) <- useMatrix + } + + colData <- .getMetadata(ArrowFile) + + if(useMatrix == "PeakMatrix" && !is.null(ArchRProj)){ + se <- SummarizedExperiment( + assays = mat, + rowRanges = getPeakSet(ArchRProj), + colData = colData[colnames(mat[[1]]),,drop=FALSE] + ) + }else{ + se <- SummarizedExperiment( + assays = mat, + rowData = featureDF, + colData = colData[colnames(mat[[1]]),,drop=FALSE] + ) + } + + se + +} + +#' @export +.getMatFromArrow <- function( + ArrowFile, + featureDF = NULL, + binarize = NULL, + cellNames = NULL, + useMatrix = "TileMatrix", + useIndex = FALSE, + threads = 1, + ... 
+ ){ + + if(is.null(featureDF)){ + featureDF <- .getFeatureDF(ArrowFile, useMatrix) + } + + if(any(c("seqnames","idx") %ni% colnames(featureDF))){ + stop("Need to provide featureDF with columns seqnames and idx!") + } + + #Add RowNames for Check at the end + rownames(featureDF) <- paste0("f", seq_len(nrow(featureDF))) + + o <- h5closeAll() + + matClass <- h5read(ArrowFile, paste0(useMatrix,"/Info/Class")) + if(matClass %ni% c("Sparse.Binary.Matrix", "Sparse.Integer.Matrix", "Sparse.Double.Matrix")){ + stop("Arrow Mat is not a valid Sparse Matrix!") + } + if(is.null(binarize)){ + if(matClass == "Sparse.Binary.Matrix"){ + binarize <- TRUE + }else{ + binarize <- FALSE + } + } + if(matClass == "Sparse.Binary.Matrix"){ + if(!binarize){ + stop("Sparse Matrix in Arrow is Binarized! Set binarize = TRUE to use matrix!") + } + } + + matColNames <- paste0(.sampleName(ArrowFile), "#", h5read(ArrowFile, paste0(useMatrix,"/Info/CellNames"))) + if(!is.null(cellNames)){ + idxCols <- which(matColNames %in% cellNames) + }else{ + idxCols <- seq_along(matColNames) + } + + seqnames <- unique(featureDF$seqnames) + + mat <- .safelapply(seq_along(seqnames), function(x){ + + seqnamex <- seqnames[x] + featureDFx <- featureDF[BiocGenerics::which(featureDF$seqnames %bcin% seqnamex),] + idxRows <- featureDFx$idx + + j <- Rle( + values = h5read(ArrowFile, paste0(useMatrix,"/",seqnamex,"/jValues")), + lengths = h5read(ArrowFile, paste0(useMatrix,"/",seqnamex,"/jLengths")) + ) + + #Match J + matchJ <- S4Vectors::match(j, idxCols, nomatch = 0) + idxJ <- BiocGenerics::which(matchJ > 0) + if(useIndex){ + i <- h5read(ArrowFile, paste0(useMatrix,"/",seqnamex,"/i"), index = list(idxJ, 1)) + }else{ + i <- h5read(ArrowFile, paste0(useMatrix,"/",seqnamex,"/i"))[idxJ] + } + j <- matchJ[idxJ] + + #Match I + matchI <- match(i, idxRows, nomatch = 0) + idxI <- which(matchI > 0) + i <- i[idxI] + j <- j[idxI] + i <- matchI[idxI] + + if(!binarize){ + x <- h5read(ArrowFile, 
paste0(useMatrix,"/",seqnamex,"/x"))[idxJ][idxI] + }else{ + x <- rep(1, length(j)) + } + + mat <- Matrix::sparseMatrix( + i=as.vector(i), + j=j, + x=x, + dims = c(length(idxRows), length(idxCols)) + ) + rownames(mat) <- rownames(featureDFx) + + return(mat) + + }, threads = threads) %>% Reduce("rbind", .) + + o <- h5closeAll() + + colnames(mat) <- matColNames[idxCols] + + #Double Check Order! + mat <- mat[rownames(featureDF), , drop = FALSE] + rownames(mat) <- NULL + + return(mat) + +} + + +#################################################################### +# Helper read functioning +#################################################################### +#' @export +.getGroupMatrix <- function( + ArrowFiles, + featureDF, + groupList, + threads = 1, + useIndex = FALSE, + verbose = TRUE, + useMatrix = "TileMatrix", + tstart = NULL, + ... + ){ + + ######################################### + # Time Info + ######################################### + if(is.null(tstart)){ + tstart <- Sys.time() + } + + ######################################### + # Construct Matrix + ######################################### + seqnames <- unique(featureDF$seqnames) + rownames(featureDF) <- paste0("f", seq_len(nrow(featureDF))) + cellNames <- unlist(groupList, use.names = FALSE) + + mat <- .safelapply(seq_along(seqnames), function(x){ + + .messageDiffTime(sprintf("Constructing Group Matrix %s of %s", x, length(seqnames)), tstart, verbose = verbose) + + #Construct Matrix + seqnamex <- seqnames[x] + featureDFx <- featureDF[BiocGenerics::which(featureDF$seqnames %bcin% seqnamex), ] + + matChr <- matrix(0, nrow = nrow(featureDFx), ncol = length(groupList)) + colnames(matChr) <- names(groupList) + rownames(matChr) <- rownames(featureDFx) + + for(y in seq_along(ArrowFiles)){ + + maty <- .getMatFromArrow( + ArrowFile = ArrowFiles[y], + useMatrix = useMatrix, + featureDF = featureDFx, + cellNames = cellNames, + useIndex = useIndex + ) + + for(z in seq_along(groupList)){ + + #Check Cells In Group + 
cellsGroupz <- groupList[[z]] + idx <- BiocGenerics::which(colnames(maty) %in% cellsGroupz) + + #If In Group Add RowSums To Running Total (A Group Can Span Several ArrowFiles) + if(length(idx) > 0){ + matChr[,z] <- matChr[,z]+Matrix::rowSums(maty[,idx,drop=FALSE]) + } + + } + + } + + matChr + + }, threads = threads) %>% Reduce("rbind", .) + + mat <- mat[rownames(featureDF), , drop = FALSE] + + .messageDiffTime("Successfully Created Group Matrix", tstart, verbose = verbose) + + return(mat) + +} + +#' @export +.getPartialMatrix <- function( + ArrowFiles, + featureDF, + cellNames, + progress = TRUE, + threads = 1, + useMatrix = "TileMatrix", + doSampleCells = FALSE, + sampledCellNames = NULL, + tmpPath = tempfile(), + useIndex = FALSE, + tstart = NULL, + verbose = TRUE, + ... + ){ + + ######################################### + # Time Info + ######################################### + if(is.null(tstart)){ + tstart <- Sys.time() + } + + ######################################### + # Construct Matrix + ######################################### + + mat <- .safelapply(seq_along(ArrowFiles), function(x){ + + .messageDiffTime(sprintf("Getting Partial Matrix %s of %s", x, length(ArrowFiles)), tstart, verbose = verbose) + + o <- h5closeAll() + matx <- .getMatFromArrow( + ArrowFile = ArrowFiles[x], + featureDF = featureDF, + cellNames = cellNames, + useMatrix = useMatrix, + useIndex = useIndex + ) + + if(doSampleCells){ + + #Save Temporary Matrix (basename keeps the file next to tmpPath even when ArrowFiles[x] has a directory component) + outx <- paste0(tmpPath, "-", basename(ArrowFiles[x]), "-temp-mat.rds") + saveRDS(matx, outx, compress = FALSE) + + #Sample Matrix + matx <- matx[, which(colnames(matx) %in% sampledCellNames),drop = FALSE] + + return(list(mat = matx, out = outx)) + + }else{ + + return(matx) + + } + + }, threads = threads) + + + if(doSampleCells){ + + matFiles <- lapply(mat, function(x) x[[2]]) %>% Reduce("c", .) + mat <- lapply(mat, function(x) x[[1]]) %>% Reduce("cbind", .) 
+ mat <- mat[,sampledCellNames] + + .messageDiffTime("Successfully Created Partial Matrix", tstart, verbose = verbose) + + return(list(mat = mat, matFiles = matFiles)) + + }else{ + + mat <- Reduce("cbind", mat) + mat <- mat[,cellNames] + + .messageDiffTime("Successfully Created Partial Matrix", tstart, verbose = verbose) + + return(mat) + + } + + +} + +######################################################################## +# Compute Summary Statistics! +######################################################################## + +#' @export +.getRowSums <- function(ArrowFiles, seqnames, useMatrix, verbose = TRUE, tstart = NULL, filter0 = FALSE, threads = 1){ + if(is.null(tstart)){ + tstart <- Sys.time() + } + #Compute RowSums + rowSumsDF <- .safelapply(seq_along(seqnames), function(x){ + o <- h5closeAll() + chr <- seqnames[x] + for(y in seq_along(ArrowFiles)){ + if(y == 1){ + sumy <- h5read(ArrowFiles[y], paste0(useMatrix, "/", chr, "/rowSums")) + }else{ + sumy1 <- h5read(ArrowFiles[y], paste0(useMatrix, "/", chr, "/rowSums")) + #The way we designed sparse matrix holds true that the rows are in order every tile even in rS = 0! + if(length(sumy1) > length(sumy)){ + sumy1[seq_along(sumy)] <- sumy1[seq_along(sumy)] + sumy + sumy <- sumy1 + }else{ + sumy[seq_along(sumy1)] <- sumy[seq_along(sumy1)] + sumy1 + } + } + } + #Return Setup In Feature DF Format (seqnames, idx columns) + DataFrame(seqnames = Rle(chr, lengths = length(sumy)), idx = seq_along(sumy), value = as.vector(sumy)) + }, threads = threads) %>% Reduce("rbind", .) 
+ if(filter0){ + rowSumsDF <- rowSumsDF[rowSumsDF$value > 0, ] + } + .messageDiffTime("Successfully Created RowSums DataFrame", tstart, verbose = verbose) + return(rowSumsDF) +} + + +#' @export +.getColSums <- function(ArrowFile, chrToRun, useMatrix, verbose = TRUE, tstart = NULL, threads = 1){ + if(is.null(tstart)){ + tstart <- Sys.time() + } + #Compute ColSums + cS <- .safelapply(seq_along(chrToRun), function(x){ + o <- h5closeAll() + h5read(ArrowFile, paste0(useMatrix, "/", chrToRun[x], "/colSums")) + }, threads = threads) %>% Reduce("rbind", .) %>% colSums + .messageDiffTime("Successfully Computed colSums", tstart, verbose = verbose) + return(cS) +} + +# h5read implementation for optimal reading +#' @export +.h5read <- function(file, name, method = "fast", index = NULL, start = NULL, block = NULL, count = NULL, ...){ + if(tolower(method) == "fast" & is.null(index) & is.null(start) & is.null(block) & is.null(count)){ + fid <- H5Fopen(file) + dapl <- H5Pcreate("H5P_DATASET_ACCESS") + did <- .Call("_H5Dopen", fid@ID, name, dapl@ID, PACKAGE='rhdf5') + res <- .Call("_H5Dread", did, NULL, NULL, NULL, TRUE, 0L, FALSE, fid@native, PACKAGE='rhdf5') + invisible(.Call("_H5Dclose", did, PACKAGE='rhdf5')) + }else{ + res <- h5read(file = file, name = name, index = index, start = start, block = block, count = count, ...) 
+ } + o <- h5closeAll() + return(res) +} + + + diff --git a/R/ArrowWrite.R b/R/ArrowWrite.R new file mode 100644 index 00000000..334e116c --- /dev/null +++ b/R/ArrowWrite.R @@ -0,0 +1,194 @@ +#' @export +.initializeMat <- function( + ArrowFile, + Group, + Class = "Double", + cellNames, + featureDF, + params, + date = Sys.Date(), + force = FALSE, + ...){ + + #Add Group Entry of SparseMatrix Format + #This Includes the following format + # + # Info + # - Class - Sparse.Integer.Matrix = Sparse Matrix with Integer Entries + # - Sparse.Binary.Matrix = Sparse Matrix with Binary ie no x values + # - Sparse.Double.Matrix = Sparse Matrix with Double/Numeric Entries + # - CellNames ie Colnames + # - FeatureDF dataframe that describes the rows of each seqname + # - Params Params that are used for construction to be checked when comparing Arrows + # - Date Date of Creation + # Chr1 + # - i, j (as an Rle), x, and rowSums,colSums,rowVars,etc. + # Chr2 + # Chr3 + # ... + # + + if(!suppressMessages(h5createGroup(ArrowFile, paste0(Group)))){ + if(force){ + h5delete(ArrowFile, paste0(Group)) + h5createGroup(ArrowFile, paste0(Group)) + }else{ + stop("Matrix Group Already Exists! 
Set force = TRUE to overwrite!") + } + } + o <- h5createGroup(ArrowFile, paste0(Group, "/Info")) + + if(tolower(Class)=="binary"){ + + o <- h5write(obj = "Sparse.Binary.Matrix", file = ArrowFile, name = paste0(Group, "/Info/Class")) + + }else if(tolower(Class)=="integer"){ + + o <- h5write(obj = "Sparse.Integer.Matrix", file = ArrowFile, name = paste0(Group, "/Info/Class")) + + }else if(tolower(Class)=="double"){ + + o <- h5write(obj = "Sparse.Double.Matrix", file = ArrowFile, name = paste0(Group, "/Info/Class")) + + }else{ + + stop("Matrix Class Not Supported!") + + } + + ########## + # Cell Names in Arrow + ########## + splitNames <- stringr::str_split(cellNames, pattern = "#", simplify=TRUE) + if(ncol(splitNames) > 2){ + stop("Found error with cell names containing multiple # characters!") + }else{ + cellNames <- splitNames[,ncol(splitNames)] + } + o <- h5write(obj = cellNames, file = ArrowFile, name = paste0(Group,"/Info/CellNames")) + + ########## + # FeatureDF in Arrow + ########## + df <- data.frame(featureDF, stringsAsFactors = FALSE) + stopifnot(all(c("seqnames","idx") %in% colnames(featureDF))) + o <- h5write(obj = df, file = ArrowFile, name = paste0(Group,"/Info/FeatureDF")) + + ########## + # Parameters for Matrix for Validity in Arrow + ########## + o <- h5write(obj = params, file = ArrowFile, name = paste0(Group,"/Info/Params")) + + ########## + # Date of Creation + ########## + o <- h5write(obj = paste0(date), file = ArrowFile, name = paste0(Group,"/Info/Date")) + + return(0) + +} + +#' @export +.addMatToArrow <- function( + mat, + ArrowFile, + Group, + binarize = FALSE, + addRowSums = FALSE, + addColSums = FALSE, + addRowVars = FALSE, + addRowMeans = FALSE, + ...){ + + stopifnot(inherits(mat, "dgCMatrix")) + + checkCells <- .availableCells(ArrowFile, dirname(Group)) + if(!identical(colnames(mat), checkCells)){ + stop("CellNames in Matrix Group do not Match CellNames in Matrix Being Written!") + } + + #Create Group + o <- h5closeAll() + o <- 
h5createGroup(ArrowFile, Group) + + #Convert Columns to Rle + j <- Rle(findInterval(seq(mat@x)-1,mat@p[-1]) + 1) + + #Info + lengthRle <- length(j@lengths) + lengthI <- length(mat@i) + + #Create Data Set + o <- .suppressAll(h5createDataset(ArrowFile, paste0(Group,"/i"), storage.mode = "integer", + dims = c(lengthI, 1), level = 0)) + + o <- .suppressAll(h5createDataset(ArrowFile, paste0(Group,"/jLengths"), storage.mode = "integer", + dims = c(lengthRle, 1), level = 0)) + + o <- .suppressAll(h5createDataset(ArrowFile, paste0(Group,"/jValues"), storage.mode = "integer", + dims = c(lengthRle, 1), level = 0)) + + #Write Data Set + o <- .suppressAll(h5write(obj = mat@i + 1, file = ArrowFile, name = paste0(Group,"/i"))) + o <- .suppressAll(h5write(obj = j@lengths, file = ArrowFile, name = paste0(Group,"/jLengths"))) + o <- .suppressAll(h5write(obj = j@values, file = ArrowFile, name = paste0(Group,"/jValues"))) + + #If binary dont store x + if(!binarize){ + + o <- .suppressAll(h5createDataset(ArrowFile, paste0(Group, "/x"), storage.mode = "double", + dims = c(lengthI, 1), level = 0)) + + o <- .suppressAll(h5write(obj = mat@x, file = ArrowFile, name = paste0(Group, "/x"))) + + }else{ + + mat@x[mat@x > 0] <- 1 + + } + + if(addColSums){ + cS <- Matrix::colSums(mat) + o <- .suppressAll(h5createDataset(ArrowFile, paste0(Group, "/colSums"), storage.mode = "double", + dims = c(ncol(mat), 1), level = 0)) + o <- .suppressAll(h5write(obj = cS, file = ArrowFile, name = paste0(Group, "/colSums"))) + + } + + if(addRowSums){ + rS <- Matrix::rowSums(mat) + o <- .suppressAll(h5createDataset(ArrowFile, paste0(Group, "/rowSums"), storage.mode = "double", + dims = c(nrow(mat), 1), level = 0)) + o <- .suppressAll(h5write(obj = rS, file = ArrowFile, name = paste0(Group, "/rowSums"))) + + } + + if(addRowMeans){ + rM <- Matrix::rowMeans(mat) + o <- .suppressAll(h5createDataset(ArrowFile, paste0(Group, "/rowMeans"), storage.mode = "double", + dims = c(nrow(mat), 1), level = 0)) + o <- 
.suppressAll(h5write(obj = rM, file = ArrowFile, name = paste0(Group, "/rowMeans"))) + + } + + if(addRowVars){ + if(!addRowMeans){ + rM <- Matrix::rowMeans(mat) + } + rV <- ArchR:::computeSparseRowVariances(mat@i + 1, mat@x, rM, n = ncol(mat)) + o <- .suppressAll(h5createDataset(ArrowFile, paste0(Group, "/rowVars"), storage.mode = "double", + dims = c(nrow(mat), 1), level = 0)) + o <- .suppressAll(h5write(obj = rV, file = ArrowFile, name = paste0(Group, "/rowVars"))) + + } + + #Clean Up Memorys + rm(j,mat) + gc() + + o <- h5closeAll() + + return(0) + +} + diff --git a/R/ColorPalettes.R b/R/ColorPalettes.R new file mode 100644 index 00000000..be0adcb7 --- /dev/null +++ b/R/ColorPalettes.R @@ -0,0 +1,157 @@ +#' List of palettes to be used in plots +#' @export +ArchR_palettes <- list( + + #Disclosure I have put here palettes that have been implemented in others + #I do not claim to have made these rather they look good + #Note all palettes in continous should contain discrete but not vice versa + #Ordered to be a discrete palette ie colors are ordered to be optimal if selected 2,3,4,5,6 not just left to right... 
+ + #--------------------------------------------------------------- + # Primarily Discrete Palettes + #--------------------------------------------------------------- + + #20-colors + stallion = c("1"="#D51F26","2"="#272E6A","3"="#208A42","4"="#89288F","5"="#F47D2B", "6"="#FEE500","7"="#8A9FD1","8"="#C06CAB","19"="#E6C2DC", + "10"="#90D5E4", "11"="#89C75F","12"="#F37B7D","13"="#9983BD","14"="#D24B27","15"="#3BBCA8", "16"="#6E4B9E","17"="#0C727C", "18"="#7E1416","9"="#D8A767","20"="#3D3D3D"), + + calm = c("1"="#7DD06F", "2"="#844081", "3"="#688EC1", "4"="#C17E73", "5"="#484125", "6"="#6CD3A7", "7"="#597873","8"="#7B6FD0", "9"="#CF4A31", "10"="#D0CD47", + "11"="#722A2D", "12"="#CBC594", "13"="#D19EC4", "14"="#5A7E36", "15"="#D4477D", "16"="#403552", "17"="#76D73C", "18"="#96CED5", "19"="#CE54D1", "20"="#C48736"), + + kelly = c("1"="#FFB300", "2"="#803E75", "3"="#FF6800", "4"="#A6BDD7", "5"="#C10020", "6"="#CEA262", "7"="#817066", "8"="#007D34", "9"="#F6768E", "10"="#00538A", + "11"="#FF7A5C", "12"="#53377A", "13"="#FF8E00", "14"="#B32851", "15"="#F4C800", "16"="#7F180D", "17"="#93AA00", "18"="#593315", "19"="#F13A13", "20"="#232C16"), + + #16-colors + bear = c("1"="#faa818", "2"="#41a30d","3"="#fbdf72", "4"="#367d7d", "5"="#d33502", "6"="#6ebcbc", "7"="#37526d", + "8"="#916848", "9"="#f5b390", "10"="#342739", "11"="#bed678","12"="#a6d9ee", "13"="#0d74b6", + "14"="#60824f","15"="#725ca5", "16"="#e0598b"), + + #15-colors + iron_man = c("9"='#371377',"3"='#7700FF',"2"='#9E0142',"10"='#FF0080', "14"='#DC494C',"12"="#F88D51","1"="#FAD510","8"="#FFFF5F","4"='#88CFA4', + "13"='#238B45',"5"="#02401B", "7"="#0AD7D3","11"="#046C9A", "6"="#A2A475", "15"='grey35'), + + #12-colors + paired = c("9"="#A6CDE2","1"="#1E78B4","3"="#74C476","12"="#34A047","11"="#F59899","2"="#E11E26", + "10"="#FCBF6E","4"="#F47E1F","5"="#CAB2D6","8"="#6A3E98","6"="#FAF39B","7"="#B15928"), + + #11-colors + grove = 
c("11"="#1a1334","9"="#01545a","1"="#017351","6"="#03c383","8"="#aad962","2"="#fbbf45","10"="#ef6a32","3"="#ed0345","7"="#a12a5e","5"="#710162","4"="#3B9AB2"), + + #7-colors + summer_night = c("1"="#2a7185", "2"="#a64027", "3"="#fbdf72","4"="#60824f","5"="#9cdff0","6"="#022336","7"="#725ca5"), + + #5-colors + zissou = c("1"="#3B9AB2", "4"="#78B7C5", "3"="#EBCC2A", "5"="#E1AF00", "2"="#F21A00"), #wesanderson + darjeeling = c("1"="#FF0000", "2"="#00A08A", "3"="#F2AD00", "4"="#F98400", "5"="#5BBCD6"), #wesanderson + rushmore = c("1"="#E1BD6D", "5"="#EABE94", "2"="#0B775E", "4"="#35274A" , "3"="#F2300F"), #wesanderson + captain = c("1"="grey","2"="#A1CDE1","3"="#12477C","4"="#EC9274","5"="#67001E"), + + #--------------------------------------------------------------- + # Primarily Continuous Palettes + #--------------------------------------------------------------- + + #10-colors + horizon = c("1"='#000075',"4"='#2E00FF', "6"='#9408F7', "10"='#C729D6', "8"='#FA4AB5', "3"='#FF6A95', "7"='#FF8B74', "5"='#FFAC53', "9"='#FFCD32', "2"='#FFFF60'), + + #9-colors + horizon_extra =c("1"="#000436","4"="#021EA9","6"="#1632FB","8"="#6E34FC","3"="#C732D5","9"="#FD619D","7"="#FF9965","5"="#FFD32B","2"="#FFFC5A"), + viridis = c("1"="#352A86","2"="#343DAE","3"="#0262E0","4"="#1389D2","5"="#2DB7A3","6"="#A5BE6A","7"="#F8BA43","8"="#F6DA23","9"="#F8FA0D"), + samba_night = c("6"='#1873CC',"2"='#1798E5',"8"='#00BFFF',"5"='#4AC596',"1"='#00CC00',"4"='#A2E700',"9"='#FFFF00',"7"='#FFD200',"3"='#FFA500'), #buencolors + solar_extra = c("5"='#3361A5', "7"='#248AF3', "1"='#14B3FF', "8"='#88CEEF', "9"='#C1D5DC', "4"='#EAD397', "3"='#FDB31A',"2"= '#E42A2A', "6"='#A31D1D'), #buencolors + white_purple = c("9"='#f7fcfd',"6"='#e0ecf4',"8"='#bfd3e6',"5"='#9ebcda',"2"='#8c96c6',"4"='#8c6bb1',"7"='#88419d',"3"='#810f7c',"1"='#4d004b'), + white_blue = c("9"='#fff7fb',"6"='#ece7f2',"8"='#d0d1e6',"5"='#a6bddb',"2"='#74a9cf',"4"='#3690c0',"7"='#0570b0',"3"='#045a8d',"1"='#023858'), + white_red = 
c("1"="white", "2"="red"), + white_blue_purple = c("1"="#E6E7E8","2"="#3A97FF","3"="#8816A7"), + + #7-colors + green_blue = c("4"='#e0f3db',"7"='#ccebc5',"2"='#a8ddb5',"5"='#4eb3d3',"3"='#2b8cbe',"6"='#0868ac',"1"='#084081'), + + #6-colors + beach = c("4"="#87D2DB","1"="#5BB1CB","6"="#4F66AF","3"="#F15F30","5"="#F7962E","2"="#FCEE2B"), + + #5-colors + coolwarm = c("1"="#4858A7", "4"="#788FC8", "5"="#D6DAE1", "3"="#F49B7C", "2"="#B51F29"), + fireworks = c("5"="white","2"="#2488F0","4"="#7F3F98","3"="#E22929","1"="#FCB31A") + +) + +#' Optimized discrete color palette +#' +#' This function assesses the number of inputs and returns a tailored color palette for aesthetics +#' @param set continuous palette name or number +#' @param values is a vector containing the sample names used in the plot which will be given a color +#' @param reverse return reversed values +#' @param returnStructure return structure palette +#' @export +paletteDiscrete <- function(set = "stallion", values, reverse = FALSE, returnStructure = FALSE, ...){ + + #check + if(is.numeric(set)){ + stopifnot(set > 0 & set <= length(ArchR_palettes)) + name <- names(ArchR_palettes)[set] + }else{ + stopifnot(set %in% names(ArchR_palettes)) + name <- set + } + + n <- length(unique(values)) + pal <- ArchR_palettes[[set]] + palOrdered <- pal[gtools::mixedsort(names(pal))] #mixed sort gets 1,2,3,4...10,11,12 + + if(n > length(palOrdered)){ + message("Length of unique values greater than palette, interpolating...") + #since there are more than we supplied the goal is to interpolate and its important to interpolate in the order of the continous? 
+ palOut <- colorRampPalette(pal)(n) + }else{ + palOut <- palOrdered[seq_len(n)] + } + + if(reverse){ + palOut <- rev(palOut) + } + + names(palOut) <- unique(values) + + if(returnStructure){ + out <- base::structure(palOut, class = "palette", name = name) + }else{ + out <- palOut + } + + return(out) + +} + +#' Continuous Color Palette +#' +#' @param set continuous palette name or number +#' @param n number for gradient +#' @param reverse return reversed values +#' @param returnStructure return structure palette +#' @export +paletteContinuous <- function(set = "solar_extra", n = 256, reverse = FALSE, returnStructure = FALSE){ + + #check + if(is.numeric(set)){ + stopifnot(set > 0 & set <= length(ArchR_palettes)) + }else{ + stopifnot(set %in% names(ArchR_palettes)) + } + + pal <- ArchR_palettes[[set]] + palOut <- colorRampPalette(pal)(n) + + if(reverse){ + palOut <- rev(palOut) + } + + if(returnStructure){ + out <- base::structure(palOut, class = "palette", name = if(is.numeric(set)) names(ArchR_palettes)[set] else set) + }else{ + out <- palOut + } + + return(out) + +} + diff --git a/R/ComputeEmbedding.R b/R/ComputeEmbedding.R new file mode 100644 index 00000000..84303817 --- /dev/null +++ b/R/ComputeEmbedding.R @@ -0,0 +1,378 @@ +#' Compute Embedding from Reduced Dimensions in ArchR Project +#' +#' This function computes an embedding (UMAP, TUMAP, Rtsne or FIt-SNE) from the reduced +#' dimensions stored in an ArchRProject and stores the result in the project +#' +#' @param ArchRProj ArchRProject +#' @param reducedDims name of the reducedDims entry in ArchRProj to embed +#' @param embedding embedding type (umap, tumap, rtsne, fftrtsne or fit-tsne; matched case-insensitively) +#' @param embeddingOut name under which the embedding is stored in ArchRProj@embeddings; +#' NULL uses the value of embedding +#' @param saveModel for umap and tumap, request the model and nearest neighbors from uwot +#' (sets ret_model and ret_nn to TRUE) 
+#' @param seed seed passed to set.seed before the embedding is run +#' @param force overwrite an embedding that already exists under embeddingOut; when FALSE +#' an existing name is an error +#' @param embeddingParams named list of arguments for the embedding function; it is merged +#' with the per-method defaults through .mergeParams +#' @param ... additional args +#' +#' @details The coordinates are stored as a data.frame whose columns are named +#' <reducedDims>#<METHOD>_Dimension_<i> and whose rownames are the rownames of the +#' reduced dimensions matrix +#' +#' @return the ArchRProject with ArchRProj@embeddings[[embeddingOut]] set to a +#' SimpleList holding df (the embedding data.frame) and params (the parameters used) +#' +#' @export +ComputeEmbedding <- function( + ArchRProj = NULL, + reducedDims = "IterativeLSI", + embedding = "UMAP", + embeddingOut = NULL, + saveModel = TRUE, + seed = 1, + force = FALSE, + embeddingParams = list(), + ... + ){ + + if(is.null(embeddingOut)){ + embeddingOut <- embedding + } + + if(embeddingOut %in% names(ArchRProj@embeddings)){ + if(!force){ + stop("Embedding Already Exists! Either set force = TRUE or use a different name!") + } + } + + ############################################################################################# + # Default Parameters for Input Embeddings! 
+ ############################################################################################# + if(tolower(embedding)=="umap"){ + defaultEmbeddingParams <- list( + n_neighbors = 40, + min_dist = 0.4, + metric = "euclidean", + n_threads = floor(detectCores()/2), + verbose = TRUE + ) + }else if(tolower(embedding)=="tumap"){ + defaultEmbeddingParams <- list( + n_neighbors = 40, + min_dist = 0.4, + metric = "euclidean", + n_threads = floor(detectCores()/2), + verbose = TRUE + ) + }else if(tolower(embedding)=="rtsne"){ + defaultEmbeddingParams <- list( + perplexity = 50, + num_threads = floor(detectCores()/2), + verbose = TRUE + ) + }else if(tolower(embedding)=="fit-tsne" | toupper(embedding)=="fftrtsne"){ + defaultEmbeddingParams <- list( + perplexity = 50, + num_threads = floor(detectCores()/2), + verbose = TRUE + ) + }else{ + defaultEmbeddingParams <- list() + } + + #Merge Parameters + embeddingParams <- .mergeParams(embeddingParams, defaultEmbeddingParams) + + ############################################################################################# + # Run Embedding + ############################################################################################# + #Seed + set.seed(seed) + + if(tolower(embedding)=="umap"){ + + .requirePackage("uwot") + embeddingParams$X <- ArchRProj@reducedDims[[reducedDims]][[1]] + if(saveModel){ + embeddingParams$ret_nn <- TRUE + embeddingParams$ret_model <- TRUE + }else{ + embeddingParams$ret_nn <- FALSE + embeddingParams$ret_model <- FALSE + } + uwot_umap <- do.call(uwot::umap, embeddingParams) + if(saveModel){ + dfEmbedding <- data.frame(uwot_umap[[1]]) + }else{ + dfEmbedding <- data.frame(uwot_umap) + } + colnames(dfEmbedding) <- paste0(reducedDims,"#UMAP_Dimension_",seq_len(ncol(dfEmbedding))) + rownames(dfEmbedding) <- rownames(ArchRProj@reducedDims[[reducedDims]][[1]]) + + }else if(tolower(embedding)=="tumap"){ + + .requirePackage("uwot") + embeddingParams$X <- ArchRProj@reducedDims[[reducedDims]][[1]] + if(saveModel){ + 
embeddingParams$ret_nn <- TRUE
+      embeddingParams$ret_model <- TRUE
+    }else{
+      embeddingParams$ret_nn <- FALSE
+      embeddingParams$ret_model <- FALSE
+    }
+    # NOTE(review): the "tumap" branch calls uwot::umap, same as the "umap" branch - confirm this is intended
+    uwot_umap <- do.call(uwot::umap, embeddingParams)
+    if(saveModel){
+      dfEmbedding <- data.frame(uwot_umap[[1]])
+    }else{
+      dfEmbedding <- data.frame(uwot_umap)
+    }
+    colnames(dfEmbedding) <- paste0(reducedDims,"#TUMAP_Dimension_",seq_len(ncol(dfEmbedding)))
+    rownames(dfEmbedding) <- rownames(ArchRProj@reducedDims[[reducedDims]][[1]])
+
+  }else if(tolower(embedding)=="rtsne"){
+
+    .requirePackage("Rtsne")
+    embeddingParams$X <- ArchRProj@reducedDims[[reducedDims]][[1]]
+    embeddingParams$pca <- FALSE
+    Rtsne_tsne <- do.call(Rtsne::Rtsne, embeddingParams)
+    dfEmbedding <- data.frame(Rtsne_tsne$Y)
+    colnames(dfEmbedding) <- paste0(reducedDims,"#RTSNE_Dimension_",seq_len(ncol(dfEmbedding)))
+    rownames(dfEmbedding) <- rownames(ArchRProj@reducedDims[[reducedDims]][[1]])
+
+  }else if(tolower(embedding)=="fftrtsne" | tolower(embedding)=="fit-tsne"){
+
+    embeddingParams$X <- ArchRProj@reducedDims[[reducedDims]][[1]]
+    embeddingParams$pca <- FALSE
+    fftrtsne_tsne <- do.call(.fftRtsne, embeddingParams)
+    dfEmbedding <- data.frame(fftrtsne_tsne)
+    colnames(dfEmbedding) <- paste0(reducedDims,"#FITTSNE_Dimension_",seq_len(ncol(dfEmbedding)))
+    rownames(dfEmbedding) <- rownames(ArchRProj@reducedDims[[reducedDims]][[1]])
+
+  }else{
+
+    stop("Embedding Method Not Currently Supported!")
+
+  }
+
+  #############################################################################################
+  # Add Embedding to Project
+  #############################################################################################
+  embeddingParams$X <- NULL
+  ArchRProj@embeddings[[embeddingOut]] <- SimpleList(df = dfEmbedding, params = embeddingParams)
+  return(ArchRProj)
+
+}
+
+# Run the external FIt-SNE binary on a numeric matrix and read the embedding back.
+#
+# The function serialises X and all tuning parameters into a binary file (data_path),
+# calls the executable at fast_tsne_path with (version, data_path, result_path, nthreads),
+# and then parses result_path.
+#
+# X               numeric matrix, one row per observation (checked with is.matrix)
+# dims            number of output dimensions (positive whole number)
+# perplexity      perplexity; 0 switches to perplexity_list, a negative value makes
+#                 search_k depend on K instead
+# fast_tsne_path  path to the executable; when NULL it is taken from FAST_TSNE_SCRIPT_DIR
+#                 if that object exists, otherwise looked up on the PATH
+# data_path, result_path  exchange files; temp files are used when NULL
+# nthreads        thread count passed to the binary
+# get_costs       if TRUE return list(Y = embedding, costs = per-iteration costs)
+# ...             absorbs arguments the binary has no equivalent for (the caller in this
+#                 file passes `pca`, `verbose` and `num_threads` through do.call);
+#                 `num_threads` is used as `nthreads` when `nthreads` is left at 0
+# Remaining arguments are written unchanged into the parameter block of the input file.
+#
+# Returns the n x dims embedding matrix, or the list described under get_costs.
+# Stops if the executable is missing, an argument fails validation or the binary
+# returns a non-zero status.
+.fftRtsne <- function(
+  X,
+  dims = 2,
+  perplexity = 30,
+  theta = 0.5,
+  max_iter = 1000,
+  fft_not_bh = TRUE,
+  ann_not_vptree = TRUE,
+  stop_early_exag_iter = 250,
+  exaggeration_factor = 12.0,
+  no_momentum_during_exag = FALSE,
+  start_late_exag_iter = -1.0,
+  late_exag_coeff = 1.0,
+  mom_switch_iter = 250,
+  momentum = 0.5,
+  final_momentum = 0.8,
+  learning_rate = 200,
+  n_trees = 50,
+  search_k = -1,
+  rand_seed = -1,
+  nterms = 3,
+  intervals_per_integer = 1,
+  min_num_intervals = 50,
+  K = -1,
+  sigma = -30,
+  initialization = NULL,
+  data_path = NULL,
+  result_path = NULL,
+  load_affinities = NULL,
+  fast_tsne_path = NULL,
+  nthreads = 0,
+  perplexity_list = NULL,
+  get_costs = FALSE,
+  df = 1.0,
+  ...
+  ){
+
+  tstart <- Sys.time()
+  .messageDiffTime("Running FIt-SNE version 1.1.0 from https://github.com/KlugerLab/FIt-SNE/", tstart, addHeader = TRUE)
+  #First command line argument of the binary; must be defined because it is used in system2() below
+  version_number <- '1.1.0'
+
+  #Callers going through do.call() hand over Rtsne-style arguments (pca, verbose, num_threads).
+  #Without `...` these abort with "unused argument"; honour num_threads and ignore the rest.
+  extraArgs <- list(...)
+  if (nthreads == 0 && !is.null(extraArgs$num_threads)) {
+    nthreads <- extraArgs$num_threads
+  }
+
+  .messageDiffTime("Checking Input", tstart)
+  if (is.null(fast_tsne_path)) {
+    if (exists("FAST_TSNE_SCRIPT_DIR")) {
+      if (.Platform$OS.type == "unix") {
+        fast_tsne_path <- file.path(FAST_TSNE_SCRIPT_DIR, "bin", "fast_tsne")
+      } else {
+        fast_tsne_path <- file.path(FAST_TSNE_SCRIPT_DIR, "bin", "FItSNE.exe")
+      }
+    } else {
+      #Fall back to whatever is on the PATH ("" when nothing is found, caught below)
+      fast_tsne_path <- unname(Sys.which("fast_tsne"))
+    }
+  }
+
+  if (is.null(data_path)) {
+    data_path <- tempfile(pattern = 'fftRtsne_data_', fileext = '.dat')
+  }
+  if (is.null(result_path)) {
+    result_path <- tempfile(pattern = 'fftRtsne_result_', fileext = '.dat')
+  }
+  fast_tsne_path <- normalizePath(fast_tsne_path)
+  if (!file_test('-x', fast_tsne_path)) {
+    stop(fast_tsne_path, " does not exist or is not executable; check your fast_tsne_path parameter")
+  }
+
+  is.wholenumber <- function(x, tol = .Machine$double.eps^0.5) abs(x - round(x)) < tol
+  if (!is.numeric(theta) || (theta < 0.0) || (theta > 1.0) ) {
+    stop("Incorrect theta.")
+  }
+  #Check the type first so that nrow(X) below is meaningful
+  if (!is.matrix(X)) {
+    stop("Input X is not a matrix")
+  }
+  if (nrow(X) - 1 < 3 * perplexity){
+    stop("Perplexity is too large.")
+  }
+  if (!(max_iter > 0)) {
+    stop("Incorrect number of iterations.")
+  }
+  if (!is.wholenumber(stop_early_exag_iter) || stop_early_exag_iter < 0) {
+    stop("stop_early_exag_iter should be a positive integer")
+  }
+  if (!is.numeric(exaggeration_factor)) {
+    stop("exaggeration_factor should be numeric")
+  }
+  if (!is.numeric(df)) {
+    stop("df should be numeric")
+  }
+  if (!is.wholenumber(dims) || dims <= 0) {
+    stop("Incorrect dimensionality.")
+  }
+
+  #Derive search_k from the perplexity (or K) unless it was given explicitly
+  if (search_k == -1) {
+    if (perplexity > 0) {
+      search_k <- n_trees * perplexity * 3
+    } else if (perplexity == 0) {
+      search_k <- n_trees * max(perplexity_list) * 3
+    } else {
+      search_k <- n_trees * K
+    }
+  }
+
+  #Translate the logical switches into the integer codes written to the input file
+  if (fft_not_bh) {
+    nbody_algo <- 2
+  } else {
+    nbody_algo <- 1
+  }
+
+  if (is.null(load_affinities)) {
+    load_affinities <- 0
+  } else {
+    if (load_affinities == 'load') {
+      load_affinities <- 1
+    } else if (load_affinities == 'save') {
+      load_affinities <- 2
+    } else {
+      load_affinities <- 0
+    }
+  }
+
+  if (ann_not_vptree) {
+    knn_algo <- 1
+  } else {
+    knn_algo <- 2
+  }
+  #Transposing before flattening writes the values of X row by row
+  tX <- as.numeric(t(X))
+
+  #The order of the writeBin() calls below defines the file layout read by the binary
+  .messageDiffTime("Writing Data for FIt-SNE", tstart)
+  f <- file(data_path, "wb")
+  n <- nrow(X)
+  D <- ncol(X)
+  writeBin(as.integer(n), f, size = 4)
+  writeBin(as.integer(D), f, size = 4)
+  writeBin(as.numeric(theta), f, size = 8) #theta
+  writeBin(as.numeric(perplexity), f, size = 8)
+
+  if (perplexity == 0) {
+    writeBin(as.integer(length(perplexity_list)), f, size = 4)
+    writeBin(perplexity_list, f)
+  }
+
+  writeBin(as.integer(dims), f, size = 4)
+  writeBin(as.integer(max_iter), f, size = 4)
+  writeBin(as.integer(stop_early_exag_iter), f, size = 4)
+  writeBin(as.integer(mom_switch_iter), f, size = 4)
+  writeBin(as.numeric(momentum), f, size = 8)
+  writeBin(as.numeric(final_momentum), f, size = 8)
+  writeBin(as.numeric(learning_rate), f, size = 8)
+  writeBin(as.integer(K), f, size = 4) #K
+  writeBin(as.numeric(sigma), f, size = 8) #sigma
+  writeBin(as.integer(nbody_algo), f, size = 4) #not barnes hut
+  writeBin(as.integer(knn_algo), f, size = 4)
+  writeBin(as.numeric(exaggeration_factor), f, size = 8) #compexag
+  writeBin(as.integer(no_momentum_during_exag), f, size = 4)
+  writeBin(as.integer(n_trees), f, size = 4)
+  writeBin(as.integer(search_k), f, size = 4)
+  writeBin(as.integer(start_late_exag_iter), f, size = 4)
+  writeBin(as.numeric(late_exag_coeff), f, size = 8)
+
+  writeBin(as.integer(nterms), f, size = 4)
+  writeBin(as.numeric(intervals_per_integer), f, size = 8)
+  writeBin(as.integer(min_num_intervals), f, size = 4)
+  writeBin(tX, f)
+  writeBin(as.integer(rand_seed), f, size = 4)
+  writeBin(as.numeric(df), f, size = 8)
+  writeBin(as.integer(load_affinities), f, size = 4)
+  if (!is.null(initialization)) { writeBin( c(t(initialization)), f) }
+  close(f)
+
+  .messageDiffTime("Executing system FIt-SNE", tstart)
+  flag <- system2(command = fast_tsne_path,
+    args = c(version_number, data_path, result_path, nthreads))
+  if (flag != 0) {
+    stop('tsne call failed')
+  }
+  #Result file: n, d, then n * d doubles, optionally followed by the cost vector
+  f <- file(result_path, "rb")
+  n <- readBin(f, integer(), n = 1, size = 4)
+  d <- readBin(f, integer(), n = 1, size = 4)
+  Y <- readBin(f, numeric(), n = n * d)
+  Y <- t(matrix(Y, nrow = d))
+  if (get_costs) {
+    readBin(f, integer(), n = 1, size = 4)
+    costs <- readBin(f, numeric(), n = max_iter, size = 8)
+    Yout <- list(Y = Y, costs = costs)
+  } else {
+    Yout <- Y
+  }
+  close(f)
+  file.remove(data_path)
+  file.remove(result_path)
+
+  .messageDiffTime("Successfully finished FIt-SNE", tstart)
+
+  return(Yout)
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/R/CreateArrow.R b/R/CreateArrow.R
new file mode 100644
index 00000000..9235bcfa
--- /dev/null
+++ b/R/CreateArrow.R
@@ -0,0 +1,1097 @@
+#' Create Arrow Files
+#'
+#' This function will create Arrow Files from input files
+#' for downstream analysis
+#'
+#' @param inputFiles input files (tabixFile, bamFile or textFile)
+#' @param sampleNames sample names corresponding to input files
+#' @param outputNames output names prefix (ie PBMC -> PBMC.arrow)
+#' @param geneAnno geneAnnotation input for TSS Scores etc.
+#' @param genomeAnno genomeAnnotation input for ChromSizes Nucleotide Information etc.
+#' @param filterFrags min fragments per cell to be filtered for analyses such as tileMat etc.
+#' @param filterTSS min TSS Score per cell to be filtered for analyses such as tileMat etc.
+#' @param removeFilteredCells remove fragments from cells that do not pass filterFrags and filterTSS
+#' @param minFrags min fragments per cell to be immediately filtered
+#' @param outDir out directory for QC information from sample to be plotted / saved
+#' @param nucLength nucleosome length for id'ing fragments as sub-, mono-, or multi-nucleosome spanning
+#' @param TSSParams TSS parameters for computing TSS scores
+#' @param excludeChr exclude these chromosomes from analysis downstream (does not apply to fragments)
+#' @param nChunk number of chunks per chromosome when reading in input files
+#' @param bcTag barcode tag location in bam file (see ScanBam in Rsamtools)
+#' @param gsubExpression regular expression whose matches are removed (via gsub) from the barcodes read from tabix / bam input; NULL leaves barcodes untouched
+#' @param bamFlag list of bam flags for reading in fragments from input files (see ScanBam in Rsamtools)
+#' @param offsetPlus Tn5 offset of "+" stranded insertion (see Buenrostro 2013)
+#' @param offsetMinus Tn5 offset of "-" stranded insertion (see Buenrostro 2013)
+#' @param addTileMat addTileMatrix to ArrowFiles
+#' @param TileMatParams additional parameters to pass to addTileMatrix (see addTileMatrix)
+#' @param addGeneScoreMat addGeneScoreMatrix to ArrowFiles
+#' @param GeneScoreMatParams additional parameters to pass to addGeneScoreMatrix (see addGeneScoreMatrix)
+#' @param force force creation of arrow files if already exist
+#' @param threads number threads for parallel execution
+#' @param parallelParam parallel parameters for batch style execution
+#' @param verboseHeader forwarded as the `verbose` argument of the per-step progress messages
+#' @param verboseAll forwarded as the `addHeader` argument of the progress messages and as `verbose` of the finer-grained ones
+#' @param ... additional args
+#' @export
+createArrowFiles <- function(
+  inputFiles = NULL,
+  sampleNames = NULL,
+  outputNames = paste0("./", sampleNames),
+  geneAnno = NULL,
+  genomeAnno = NULL,
+  filterFrags = 1000,
+  filterTSS = 4,
+  removeFilteredCells = TRUE,
+  minFrags = 500,
+  outDir = "QualityControl",
+  nucLength = 147,
+  TSSParams = list(),
+  excludeChr = c("chrM", "chrY"),
+  nChunk = 5,
+  bcTag = "qname",
+  gsubExpression = NULL,
+  bamFlag = NULL,
+  offsetPlus = 4,
+  offsetMinus = -5,
+  addTileMat = TRUE,
+  TileMatParams = list(),
+  addGeneScoreMat = TRUE,
+  GeneScoreMatParams = list(),
+  force = FALSE,
+  threads = 1,
+  parallelParam = NULL,
+  verboseHeader = TRUE,
+  verboseAll = FALSE,
+  ...
+  ){
+
+  #QC plots are written into outDir; an already existing directory is not an error
+  dir.create(outDir, showWarnings = FALSE)
+
+  #Add args to list
+  #Collect the current value of every formal argument of this function by name
+  args <- list()
+  args <- append(args, mget(names(formals()),sys.frame(sys.nframe()))) #as.list(match.call())
+  #One job per input file; .createArrow receives the job index as its first argument
+  args$X <- seq_along(inputFiles)
+  args$FUN <- .createArrow
+  #Never request more threads than there are input files
+  args$threads <- min(args$threads, length(inputFiles))
+  args$registryDir <- file.path(outDir, "CreateArrowsRegistry")
+
+  #Run With Parallel or lapply
+  outList <- .batchlapply(args)
+
+  #Each job returns the path of the Arrow file it created
+  return(unlist(outList))
+
+}
+
+#Main Function!
+#Builds one Arrow file for input i: reads the fragments, adds QC metadata,
+#optionally removes filtered cells and adds the Tile / GeneScore matrices.
+#Returns the path of the Arrow file (paste0(outputNames[i], ".arrow")).
+.createArrow <- function(
+  i,
+  inputFiles = NULL,
+  sampleNames = NULL,
+  #NOTE(review): this default refers to `sampleName`, which is not a parameter; it is a
+  #local assigned in the body before outputNames is first used - confirm this is intended
+  outputNames = paste0("./", sampleName),
+  nChunk = 3,
+  offsetPlus = 4,
+  offsetMinus = -5,
+  geneAnno = NULL,
+  genomeAnno = NULL,
+  minFrags = 500,
+  removeFilteredCells = TRUE,
+  filterFrags = 1000,
+  filterTSS = 4,
+  excludeChr = c("chrM", "chrY"),
+  gsubExpression = NULL,
+  bcTag = "qname",
+  bamFlag = NULL,
+  outDir = "QualityControl",
+  nucLength = 147,
+  TSSParams = list(),
+  addTileMat = TRUE,
+  TileMatParams = list(),
+  addGeneScoreMat = TRUE,
+  GeneScoreMatParams = list(),
+  force = FALSE,
+  verboseHeader = TRUE,
+  verboseAll = FALSE,
+  tstart = NULL,
+  ...
+  ){
+
+  #Reference time for all progress messages of this job
+  if(is.null(tstart)){
+    tstart <- Sys.time()
+  }
+
+  #Pick the entries belonging to job i
+  inputFile <- inputFiles[i]
+  sampleName <- sampleNames[i]
+  outputName <- outputNames[i]
+  prefix <- sprintf("(%s : %s of %s)", sampleName, i, length(inputFiles))
+
+  .requirePackage("rhdf5")
+  .requirePackage("Rsamtools")
+
+  #An existing Arrow file is only replaced when force = TRUE
+  ArrowFile <- paste0(outputName, ".arrow")
+  if(file.exists(ArrowFile)){
+    if(force){
+      rmf <- file.remove(ArrowFile)
+    }else{
+      stop("Error file already exists!")
+    }
+  }
+
+  #############################################################
+  #Determine Arrow Construction Method
+  #############################################################
+  fe <- .fileExtension(inputFile)
+  if(fe == "gz" | fe == "tsv" | fe == "txt"){
+    if(.isTabix(inputFile)){
+      readMethod <- "tabix"
+    }else{
+      readMethod <- "tsv"
+    }
+  }else if(fe == "bam"){
+    readMethod <- "bam"
+  }else{
+    stop(sprintf("Read Method for %s Not Recognized!", fe))
+  }
+
+  .messageDiffTime(sprintf("%s Reading In Fragments from inputFiles (readMethod = %s)", prefix, readMethod), tstart, verbose = verboseHeader, addHeader = verboseAll)
+
+  if(tolower(readMethod) == "tsv"){
+
+    out <- .tsvToArrow(tsvFile = inputFile, outArrow = ArrowFile,
+      chromSizes = genomeAnno$chromSizes, genome = genomeAnno$genome,
+      minFrags = minFrags, sampleName = sampleName, prefix = prefix,
+      verboseHeader = verboseHeader, verboseAll = verboseAll, tstart = tstart, ...)
+
+  }else if(tolower(readMethod)=="tabix"){
+
+    #Two steps: chunked dump into a temporary HDF5 file, then conversion to the Arrow file
+    tmp <- .tabixToTmp(tabixFile = inputFile, chromSizes = genomeAnno$chromSizes, nChunk = nChunk,
+      gsubExpression = gsubExpression, prefix = prefix, verboseHeader = verboseHeader,
+      verboseAll = verboseAll, tstart = tstart, ...)
+
+    out <- .tmpToArrow(tmpFile = tmp, outArrow = ArrowFile, genome = genomeAnno$genome,
+      minFrags = minFrags, sampleName = sampleName, prefix = prefix,
+      verboseHeader = verboseHeader, verboseAll = verboseAll, tstart = tstart, ...)
+
+  }else if(tolower(readMethod)=="bam"){
+
+    #`prefix` must be supplied exactly once; a repeated named argument aborts the call with
+    #"formal argument matched by multiple actual arguments"
+    tmp <- .bamToTmp(bamFile = inputFile, chromSizes = genomeAnno$chromSizes, bamFlag = bamFlag,
+      bcTag = bcTag, gsubExpression = gsubExpression, nChunk = nChunk,
+      offsetPlus = offsetPlus, offsetMinus = offsetMinus, prefix = prefix,
+      verboseHeader = verboseHeader, verboseAll = verboseAll, tstart = tstart, ...)
+
+    out <- .tmpToArrow(tmpFile = tmp, outArrow = ArrowFile, genome = genomeAnno$genome,
+      minFrags = minFrags, sampleName = sampleName, prefix = prefix,
+      verboseHeader = verboseHeader, verboseAll = verboseAll, tstart = tstart, ...)
+
+  }else{
+
+    stop(sprintf("Read Method : %s Not Recognized!", readMethod))
+
+  }
+  gc()
+
+  #############################################################
+  #Compute Fragment Information!
+  #############################################################
+  .messageDiffTime(sprintf("%s Adding Fragment Summary", prefix), tstart, verbose = verboseHeader, addHeader = verboseAll)
+
+  #fragSummary[[1]] = per cell summary, fragSummary[[2]] = fragment size histogram
+  fragSummary <- .fastFragmentInfo(ArrowFile = ArrowFile, cellNames = .availableCells(ArrowFile), nucLength = nucLength)
+  Metadata <- fragSummary[[1]]
+  #Plotting is best effort: an error is reported and the pipeline continues
+  plot <- tryCatch({
+
+    dir.create(outDir, showWarnings = FALSE)
+    pdf(file.path(outDir,paste0(sampleName,"-Fragment_Size_Distribution.pdf")),width=3,height=2,onefile=FALSE)
+    plotDF <- data.frame(
+      x = seq_along(fragSummary[[2]]),
+      percent = 100 * fragSummary[[2]]/sum(fragSummary[[2]])
+    )
+    gg <- ggplot(plotDF, aes(x = x, y = percent)) + theme_ArchR() +
+      geom_line(col = "darkblue", size = 0.25) +
+      coord_cartesian(xlim = c(1,750), ylim = c(0, max(plotDF$percent) * 1.1), expand = FALSE) +
+      xlab("Size of Fragments (bp) \n") +
+      ylab("Fragments (%)") +
+      ggtitle("Fragment Size Distribution")
+    print(gg)
+    dev.off()
+
+  }, error = function(x) {
+
+    .messageDiffTime("Continuing through after error ggplot for Fragment Size Distribution", tstart)
+    print(x)
+    message("\n")
+
+  })
+  gc()
+
+  #############################################################
+  #Compute TSS Enrichment Scores Information!
+  #############################################################
+  .messageDiffTime(sprintf("%s Adding TSS Enrichment Scores", prefix), tstart, verbose = verboseHeader, addHeader = verboseAll)
+  TSSParams$TSS <- geneAnno$TSS
+  TSSParams$ArrowFile <- ArrowFile
+  TSSParams$cellNames <- Metadata$cellNames
+  TSSOut <- do.call(.fastTSSEnrichment, TSSParams)
+  Metadata$TSSEnrichment <- TSSOut$tssScores
+  Metadata$ReadsInTSS <- TSSOut$tssReads
+
+  #Filter
+  #Keep is 1 for cells passing both thresholds and 0 otherwise
+  Metadata$Keep <- 1*(Metadata$nFrags >= filterFrags & Metadata$TSSEnrichment >= filterTSS)
+  message(paste0(sampleName, " : Number of Cells Pass Filter = ", sum(Metadata$Keep)))
+  message(paste0(sampleName, " : Median Frags = ", median(Metadata$nFrags[Metadata$Keep==1])))
+  message(paste0(sampleName, " : Median TSS Enrichment = ", median(Metadata$TSSEnrichment[Metadata$Keep==1])))
+
+  plot <- tryCatch({
+
+    ggtitle <- sprintf("%s\n%s\n%s",
+        paste0(sampleName, " : Number of Cells Pass Filter = ", sum(Metadata$Keep)),
+        paste0("Median Frags = ", median(Metadata$nFrags[Metadata$Keep==1])),
+        paste0("Median TSS Enrichment = ", median(Metadata$TSSEnrichment[Metadata$Keep==1]))
+      )
+
+    pdf(file.path(outDir,paste0(sampleName,"-TSS_by_Unique_Frags.pdf")),width=6,height=6,onefile=FALSE)
+    gg <- ggPoint(
+      x = log10(Metadata$nFrags),
+      y = Metadata$TSSEnrichment,
+      colorDensity = TRUE,
+      continuousSet = "samba_night",
+      xlabel = "Log 10 (Unique Fragments)",
+      ylabel = "TSS Enrichment",
+      title = ggtitle,
+      rastr = TRUE) +
+      geom_hline(yintercept=filterTSS, lty = "dashed", size = 0.5) +
+      geom_vline(xintercept=log10(filterFrags), lty = "dashed", size = 0.5)
+    print(gg)
+    dev.off()
+
+  }, error = function(x) {
+
+    .messageDiffTime("Continuing through after error ggplot for TSS by Frags", tstart)
+    print(x)
+    message("\n")
+
+  })
+
+  #Add To Metadata
+  .messageDiffTime("Adding Metadata!", tstart, addHeader = TRUE)
+  #Sanity Check Here to Make Sure!
+  #The cell order stored in the Arrow file has to match the order of the Metadata rows
+  stopifnot(
+    identical(
+      paste0(h5read(ArrowFile, "Metadata/CellNames")),
+      paste0(stringr::str_split(Metadata$cellNames, pattern = "#", simplify = TRUE)[,2])
+    )
+  )
+  o <- h5write(obj = Metadata$Keep, file = ArrowFile, name = "Metadata/PassQC")
+  o <- h5write(obj = Metadata$nFrags, file = ArrowFile, name = "Metadata/nFrags")
+  o <- h5write(obj = Metadata$nMonoFrags, file = ArrowFile, name = "Metadata/nMonoFrags")
+  o <- h5write(obj = Metadata$nDiFrags, file = ArrowFile, name = "Metadata/nDiFrags")
+  o <- h5write(obj = Metadata$nMultiFrags, file = ArrowFile, name = "Metadata/nMultiFrags")
+  o <- h5write(obj = (Metadata$nDiFrags + Metadata$nMultiFrags) / Metadata$nMonoFrags, file = ArrowFile, name = "Metadata/NucleosomeRatio")
+  o <- h5write(obj = Metadata$TSSEnrichment, file = ArrowFile, name = "Metadata/TSSEnrichment")
+  o <- h5write(obj = Metadata$ReadsInTSS, file = ArrowFile, name = "Metadata/ReadsInTSS")
+  gc()
+
+  #############################################################
+  # Remove Cells That Are Filtered?
+  #############################################################
+
+  #Indices of the cells passing QC. Computed unconditionally because the TileMatrix step
+  #below needs them even when removeFilteredCells = FALSE.
+  idx <- which(Metadata$Keep == 1)
+
+  if(removeFilteredCells){
+    .messageDiffTime(sprintf("%s Removing Fragments from Filtered Cells", prefix), tstart, verbose = verboseHeader, addHeader = verboseAll)
+    o <- .filterCellsFromArrow(inArrow = ArrowFile, cellNames = Metadata$cellNames[idx])
+    o <- h5write(obj = Metadata$Keep[idx], file = ArrowFile, name = "Metadata/PassQC")
+    o <- h5write(obj = Metadata$nFrags[idx], file = ArrowFile, name = "Metadata/nFrags")
+    o <- h5write(obj = Metadata$nMonoFrags[idx], file = ArrowFile, name = "Metadata/nMonoFrags")
+    o <- h5write(obj = Metadata$nDiFrags[idx], file = ArrowFile, name = "Metadata/nDiFrags")
+    o <- h5write(obj = Metadata$nMultiFrags[idx], file = ArrowFile, name = "Metadata/nMultiFrags")
+    o <- h5write(obj = ((Metadata$nDiFrags + Metadata$nMultiFrags) / Metadata$nMonoFrags)[idx], file = ArrowFile, name = "Metadata/NucleosomeRatio")
+    o <- h5write(obj = Metadata$TSSEnrichment[idx], file = ArrowFile, name = "Metadata/TSSEnrichment")
+    o <- h5write(obj = Metadata$ReadsInTSS[idx], file = ArrowFile, name = "Metadata/ReadsInTSS")
+  }
+
+
+  #############################################################
+  # Create Tile Matrix
+  #############################################################
+  if(addTileMat){
+    .messageDiffTime(sprintf("%s Adding TileMatrix", prefix), tstart, verbose = verboseHeader, addHeader = verboseAll)
+    TileMatParams$ArrowFile <- ArrowFile
+    TileMatParams$cellNames <- Metadata$cellNames[idx]
+    chromLengths <- end(genomeAnno$chromSizes)
+    names(chromLengths) <- paste0(seqnames(genomeAnno$chromSizes))
+    TileMatParams$chromLengths <- chromLengths
+    TileMatParams$blacklist <- genomeAnno$blacklist
+    TileMatParams$force <- TRUE
+    TileMatParams$excludeChr <- excludeChr
+    tileMat <- do.call(.addTileMat, TileMatParams)
+    gc()
+  }
+
+  #############################################################
+  # Add Gene Score Matrix
+  #############################################################
+  if(addGeneScoreMat){
+    .messageDiffTime(sprintf("%s Adding GeneScoreMatrix", prefix), tstart, verbose = verboseHeader, addHeader = verboseAll)
+    GeneScoreMatParams$ArrowFile <- ArrowFile
+    GeneScoreMatParams$genes <- geneAnno$genes
+    GeneScoreMatParams$cellNames <- Metadata$cellNames[which(Metadata$Keep==1)]
+    GeneScoreMatParams$blacklist <- genomeAnno$blacklist
+    GeneScoreMatParams$force <- TRUE
+    GeneScoreMatParams$excludeChr <- excludeChr
+    geneScoreMat <- do.call(.addGeneScoreMat, GeneScoreMatParams)
+    gc()
+  }
+
+  o <- h5closeAll()
+
+  .messageDiffTime(sprintf("%s Finished Creating ArrowFile", prefix), tstart, verbose = verboseHeader, addHeader = verboseAll)
+
+
+  ArrowFile <- paste0(outputName, ".arrow")
+
+  return(ArrowFile)
+
+}
+
+#########################################################################################################
+# QC Methods
+#########################################################################################################
+
+#Per cell fragment statistics for an Arrow file.
+#Returns list(dfSummary, fragDistribution): dfSummary has one row per entry of cellNames
+#(cellNames, nFrags, nMonoFrags, nDiFrags, nMultiFrags), fragDistribution is the histogram
+#of fragment widths 1..1000 summed over all chromosomes.
+.fastFragmentInfo <- function(
+  ArrowFile,
+  cellNames = .availableCells(ArrowFile),
+  nucLength = 147,
+  ...){
+
+  #Info to get
+  matNuc <- matrix(0, nrow = length(cellNames), ncol = 3)
+  nFrags <- rep(0, length(cellNames))
+  fragDist <- rep(0, 1000)
+
+  chrArrow <- .availableChr(ArrowFile)
+  for(x in seq_along(chrArrow)){
+
+    #Read in frags
+    fragx <- .getFragsFromArrow(ArrowFile = ArrowFile, chr = chrArrow[x], out = "IRanges", cellNames = cellNames)
+    #Replace the barcode strings of the Rle by their index in cellNames
+    mcols(fragx)$RG@values <- S4Vectors::match(mcols(fragx)$RG@values, cellNames)
+    nFrags <- nFrags + S4Vectors:::tabulate(mcols(fragx)$RG, nbins = length(cellNames))
+
+    #Get Distributions
+    fragDist <- fragDist + tabulate(width(fragx), nbins = 1000)
+    #Size class: 1 = shorter than nucLength, 2 = one to two nucLength, 3 = anything longer
+    w <- trunc(width(fragx)/nucLength) + 1
+    w[w > 3] <- 3
+
+    #Get Nuc Info
+    matNuc <- matNuc + ArchR:::tabulate2dCpp(
+      w, xmin = 1, xmax = 3,
+      as.integer(mcols(fragx)$RG), ymin = 1, ymax = length(cellNames)
+    )
+
+  }
+
+  df <-
DataFrame(matNuc)
+  colnames(df) <- c("nMonoFrags", "nDiFrags", "nMultiFrags")
+  df$cellNames <- cellNames
+  df$nFrags <- nFrags
+  df <- df[,c("cellNames","nFrags","nMonoFrags", "nDiFrags", "nMultiFrags")]
+
+  out <- list(dfSummary = df, fragDistribution = fragDist)
+  return(out)
+
+}
+
+#Per cell TSS enrichment score.
+#
+# TSS        GRanges of transcription start sites (only the strand-agnostic start is used)
+# ArrowFile  Arrow file to count insertions from
+# cellNames  cells to score; defaults to all cells available in the Arrow file
+# window     width (bp) of the window centred on each TSS
+# norm       width (bp) of each of the two flanking background regions
+# flank      distance (bp) from the TSS to the outer edge of the flanking regions
+# minNorm    lower bound applied to the per-bp flank signal before dividing
+#
+#Returns list(tssScores, tssReads): tssScores is the rounded enrichment per cell,
+#tssReads the number of insertions counted inside the TSS windows per cell.
+.fastTSSEnrichment <- function(
+  TSS,
+  ArrowFile,
+  cellNames = NULL,
+  window = 101,
+  norm = 100,
+  flank = 2000,
+  minNorm = 1,
+  ...){
+
+  tstart <- Sys.time()
+
+  #Validate
+  ArrowFile <- .validArrow(ArrowFile)
+  TSS <- .validGRanges(TSS)
+
+  if(is.null(cellNames)){
+    cellNames <- .availableCells(ArrowFile)
+  }
+
+  #Create Window and Flank
+  #Collapse every TSS to a single unstranded position and drop duplicates
+  TSS <- resize(TSS, 1, fix = "start")
+  strand(TSS) <- "*"
+  TSS <- unique(TSS)
+  tssWindow <- resize(TSS, window, "center")
+  tssWindow$type <- "window"
+  tssFlank <- c(
+    #Positive Flank
+    GRanges(seqnames(TSS), IRanges(end(TSS) + flank - norm + 1, end(TSS) + flank)),
+    #Negative Flank
+    GRanges(seqnames(TSS), IRanges(start(TSS) - flank, start(TSS) - flank + norm - 1))
+  )
+  tssFlank$type <- "flank"
+  tssFeatures <- c(tssWindow, tssFlank)
+
+  #Count
+  .messageDiffTime("Counting Around TSS!", tstart)
+
+  countList <- .fastFeatureCounts(feature = tssFeatures, ArrowFile = ArrowFile, cellNames = cellNames)
+
+  #Normalize per BP
+  cWn <- countList$nWindow / window
+  cFn <- countList$nFlank / norm
+
+  #Compute scores
+  tssScores <- 2 * cWn / (pmax(cFn[names(cWn)], minNorm)) #Multiply 2 because enrichment over average 2 flanks
+  names(tssScores) <- cellNames
+  tssScores <- round(tssScores, 3)
+
+  .messageDiffTime("Computed TSS Scores!", tstart)
+
+  #tssReads is stored by the caller as "ReadsInTSS", so hand back the insertion count
+  #itself and not the per-bp normalised value cWn
+  return(list(tssScores=tssScores, tssReads=countList$nWindow))
+
+}
+
+#Count Tn5 insertions (both fragment ends) per cell inside "window" and "flank" features.
+#feature must carry an mcols column `type` with the values "window" / "flank".
+#Returns list(nWindow, nFlank), two numeric vectors named by cellNames.
+.fastFeatureCounts <- function(feature, ArrowFile, cellNames){
+
+  tstart1 <- Sys.time()
+  #Only chromosomes present in the Arrow file are processed
+  featureList <- split(feature, seqnames(feature))
+  chrArrow <- .availableChr(ArrowFile)
+  featureList <- featureList[chrArrow]
+  if(length(featureList)==0){
+    stop("Error No Overlap in Chromosomes and TSS Chromosomes!")
+  }
+
+  #Count Vector
+  nWindow <- rep(0, length(cellNames))
+  names(nWindow) <- cellNames
+
+  nFlank <- rep(0, length(cellNames))
+  names(nFlank) <- cellNames
+
+  #Count
+  #message(sprintf("Counting Insertions, %s minutes elapsed...", round(difftime(Sys.time(), tstart1, units = "mins"),3)))
+  #pb <- txtProgressBar(min=0,max=100,initial=0,style=3)
+
+  for(x in seq_along(featureList)){
+
+    #setTxtProgressBar(pb,round(x*100/length(featureList),0))
+
+    ###############################################################################
+    # Get Fragments
+    ###############################################################################
+    featurex <- featureList[[x]]
+    fragments <- .getFragsFromArrow(ArrowFile = ArrowFile, chr = names(featureList)[x], out = "IRanges", cellNames = cellNames)
+    #Encode barcodes as index into cellNames and feature types as 1 (window) / 2 (flank)
+    mcols(fragments)$RG@values <- match(mcols(fragments)$RG@values, cellNames)
+    mcols(featurex)$typeIdx <- match(mcols(featurex)$type, c("window", "flank"))
+
+    ###############################################################################
+    # Count Each Insertion
+    ###############################################################################
+    #y = 1 counts fragment starts, y = 2 counts fragment ends
+    for(y in seq_len(2)){
+
+      if(y==1){
+        temp <- IRanges(start(fragments), width=1)
+      }else if(y==2){
+        temp <- IRanges(end(fragments), width=1)
+      }
+      stopifnot(length(temp) == length(fragments))
+
+      o <- findOverlaps(ranges(featurex),temp)
+      remove(temp)
+      gc()
+
+      #2 x nCells table: row 1 = window hits, row 2 = flank hits
+      mat <- ArchR:::tabulate2dCpp(
+        x = as.vector(mcols(fragments)$RG[subjectHits(o)]),
+        xmin = 1,
+        xmax = length(cellNames),
+        y = mcols(featurex)$typeIdx[queryHits(o)],
+        ymin = 1,
+        ymax = 2
+      )
+
+      #Add To
+      nWindow <- nWindow + mat[1, ]
+      nFlank <- nFlank + mat[2, ]
+      rm(o, mat)
+
+    }
+
+    rm(fragments)
+    gc()
+
+  }
+
+  message("\n")
+  .messageDiffTime("Finished Counting Insertions", tstart1)
+
+  out <- list(nWindow = nWindow, nFlank = nFlank)
+
+  return(out)
+
+}
+
+#########################################################################################################
+# Methods to Turn Input File into a Temp File
that can then be Efficiently converted to an Arrow!
+#########################################################################################################
+#TRUE if `file` can be opened as a tabix file, or can be indexed as one (bed format)
+#on the fly; FALSE if both attempts fail.
+.isTabix <- function(file){
+  tryCatch({
+    TabixFile(file)
+    TRUE
+  }, error = function(x){
+    tryCatch({
+      message("Attempting to index ", file," as tabix...")
+      indexTabix(file, format = "bed")
+      TRUE
+    }, error = function(y){
+      FALSE
+    })
+  })
+}
+
+#Dump a tabix fragment file chunk by chunk into a temporary HDF5 file.
+#
+# tabixFile       tabix indexed fragment file (columns 2-4 are read as start, end, barcode)
+# tmpFile         HDF5 file to create
+# chromSizes      GRanges of chromosome sizes; each range is split into nChunk tiles
+# nChunk          number of tiles per chromosome
+# gsubExpression  regular expression removed from the barcodes, NULL to keep them as is
+# printEvery      overwritten below (0.25 if verboseAll, 1 otherwise); minutes between progress messages
+# prefix, tstart, verboseHeader, verboseAll  progress message settings
+#
+#Every non-empty tile is stored under Fragments/<chr>#chunk<k> as Ranges (start, width),
+#RGLengths and RGValues (run-length encoded barcodes). Returns tmpFile.
+.tabixToTmp <- function(
+  tabixFile,
+  tmpFile = paste0(tempfile(),".h5"),
+  chromSizes,
+  nChunk = 3,
+  gsubExpression = NULL,
+  printEvery = 1,
+  verboseHeader = TRUE,
+  verboseAll = FALSE,
+  prefix = "",
+  tstart = NULL,
+  ...
+  ){
+
+  .requirePackage("Rsamtools")
+
+  #######################################################################################################
+  # We will dump a chunked genome into an Hdf5 file in a memory efficient manner!
+  #######################################################################################################
+
+  if(is.null(tstart)){
+    tstart <- Sys.time()
+  }
+
+  if(verboseAll){
+    printEvery <- 0.25
+  }else{
+    printEvery <- 1
+  }
+
+  nextPrint <- printEvery
+  o <- h5closeAll()
+  o <- h5createFile(tmpFile)
+  o <- h5createGroup(tmpFile, paste0("Fragments"))
+  o <- h5createGroup(tmpFile, paste0("Metadata"))
+  o <- h5write(obj = "Arrow", file = tmpFile, name = "Class")
+  o <- h5write(obj = "tmp", file = tmpFile, name = "Metadata/Sample")
+
+  tileChromSizes <- unlist(tile(chromSizes, nChunk))
+  mcols(tileChromSizes)$chunkName <- paste0(seqnames(tileChromSizes),"#chunk",seq_along(tileChromSizes))
+  for(x in seq_along(tileChromSizes)){
+
+    if(as.numeric(difftime(Sys.time(),tstart,units="mins")) > nextPrint){
+      #The 3 is the digits argument of round(), not an extra sprintf() argument
+      .messageDiffTime(sprintf("%s Reading TabixFile %s Percent", prefix, round(100*x/length(tileChromSizes), 3)), tstart,
+        verbose = verboseHeader, addHeader = verboseAll)
+      nextPrint <- nextPrint + printEvery
+    }
+
+    #Tiles without records make read.table() fail; that is mapped to NULL, i.e. an empty table
+    dt <- Rsamtools::scanTabix(tabixFile, param = tileChromSizes[x])[[1]] %>%
+      textConnection %>%
+      {tryCatch(read.table(.), error = function(e) NULL)} %>%
+      {data.table(V2=.$V2 + 1, V3=.$V3, V4=.$V4)}
+
+    #Care for Break Points
+    #A fragment spanning a tile border is returned for both tiles; keep it only where it starts
+    dt <- dt[dt$V2 >= start(tileChromSizes[x]),]
+
+    if(nrow(dt) > 0){
+      if(!is.null(gsubExpression)){
+        #Barcodes live in column V4 of dt
+        dt$V4 <- gsub(gsubExpression, "", dt$V4)
+      }
+
+      #Order by bc
+      setkey(dt, V4)
+      dt <- dt[order(V4)]
+      RG <- Rle(paste0(dt$V4))
+
+      chrTmp <- mcols(tileChromSizes)$chunkName[x]
+      chrPos <- paste0("Fragments/",chrTmp,"/Ranges")
+      chrRGLengths <- paste0("Fragments/",chrTmp,"/RGLengths")
+      chrRGValues <- paste0("Fragments/",chrTmp,"/RGValues")
+      lengthRG <- length(RG@lengths)
+      o <- h5createGroup(tmpFile, paste0("Fragments/",chrTmp))
+      o <- .suppressAll(h5createDataset(tmpFile, chrPos, storage.mode = "integer", dims = c(nrow(dt), 2), level = 0))
+      o <- .suppressAll(h5createDataset(tmpFile, chrRGLengths, storage.mode = "integer", dims = c(lengthRG, 1), level = 0))
+      o <- .suppressAll(h5createDataset(tmpFile, chrRGValues, storage.mode = "character",
+        dims = c(lengthRG, 1), level = 0, size = nchar(RG@values[1]) + 1))
+      o <- h5write(obj = cbind(dt$V2,dt$V3-dt$V2), file = tmpFile, name = chrPos)
+      o <- h5write(obj = RG@lengths, file = tmpFile, name = chrRGLengths)
+      o <- h5write(obj = RG@values, file = tmpFile, name = chrRGValues)
+
+      rm(dt, RG)
+      gc()
+    }
+  }
+
+  return(tmpFile)
+
+}
+
+.bamToTmp <- function(
+  bamFile,
+  tmpFile = paste0(tempfile(),".h5"),
+  chromSizes,
+  bamFlag = NULL,
+  nChunk = 3,
+  bcTag = "qname",
+  gsubExpression = NULL,
+  offsetPlus = 4,
+  offsetMinus = -5,
+  verboseHeader = TRUE,
+  verboseAll = FALSE,
+  prefix = "",
+  tstart = NULL,
+  ...){
+
+  .requirePackage("Rsamtools")
+
+  #######################################################################################################
+  # We will dump a chunked genome into an Hdf5 file in a memory efficient manner!
+  #######################################################################################################
+  #Only plus-strand reads of proper pairs are scanned, whatever else the caller requests
+  if(is.null(bamFlag)){
+    bamFlag <- scanBamFlag(isMinusStrand = FALSE, isProperPair  = TRUE)
+  }else if(inherits(bamFlag, "list") | inherits(bamFlag, "SimpleList")){
+    bamFlag$isMinusStrand <- FALSE
+    bamFlag$isProperPair <- TRUE
+    bamFlag <- do.call(scanBamFlag, bamFlag)
+  }else{
+    stop("bamFlag must be a list or null!")
+  }
+
+  if(is.null(tstart)){
+    tstart <- Sys.time()
+  }
+
+  #Minutes between two progress messages
+  if(verboseAll){
+    printEvery <- 0.25
+  }else{
+    printEvery <- 1
+  }
+
+  nextPrint <- printEvery
+
+  o <- h5closeAll()
+  o <- h5createFile(tmpFile)
+  o <- h5createGroup(tmpFile, paste0("Fragments"))
+  o <- h5createGroup(tmpFile, paste0("Metadata"))
+  o <- h5write(obj = "Arrow", file = tmpFile, name = "Class")
+  o <- h5write(obj = "tmp", file = tmpFile, name = "Metadata/Sample")
+
+  #Each chromosome is split into nChunk tiles that are scanned one after another
+  tileChromSizes <- unlist(tile(chromSizes, nChunk))
+  mcols(tileChromSizes)$chunkName <- paste0(seqnames(tileChromSizes),"#chunk",seq_along(tileChromSizes))
+  for(x in seq_along(tileChromSizes)){
+
+    if(as.numeric(difftime(Sys.time(),tstart,units="mins")) > nextPrint){
+      #The 3 is the digits argument of round(), not an extra sprintf() argument
+      .messageDiffTime(sprintf("%s Reading BamFile %s Percent", prefix, round(100*x/length(tileChromSizes), 3)), tstart,
+        verbose = verboseHeader, addHeader = verboseAll)
+      nextPrint <- nextPrint + printEvery
+    }
+
+    #If barcode is stored in read name use qname
+    #Else look for barcode tag such as RG
+    if(tolower(bcTag)=="qname"){
+
+      scanChunk <- scanBam(bamFile,
+        param = ScanBamParam(
+          flag = bamFlag,
+          what = c("qname", "pos", "isize"),
+          which = tileChromSizes[x]
+        ))[[1]]
+
+      if(!is.null(gsubExpression)){
+        scanChunk$qname <- gsub(gsubExpression, "", scanChunk$qname)
+      }
+
+      #Create Data Table for faster indexing
+      #Fragment = read position plus absolute insert size, shifted by the Tn5 offsets
+      dt <- data.table(
+          start = scanChunk$pos + offsetPlus,
+          end = scanChunk$pos + abs(scanChunk$isize) - 1 + offsetMinus,
+          RG = scanChunk$qname
+        )
+
+    }else{
+
+      scanChunk <- scanBam(bamFile,
+        param = ScanBamParam(
+          flag = bamFlag,
+          what = c("pos", "isize"),
+          tag = bcTag,
+          which = tileChromSizes[x]
+        ))[[1]]
+
+      if(!is.null(gsubExpression)){
+        scanChunk$tag[[bcTag]] <- gsub(gsubExpression, "", scanChunk$tag[[bcTag]])
+      }
+
+      #Create Data Table for faster indexing
+      dt <- data.table(
+          start = scanChunk$pos + offsetPlus,
+          end = scanChunk$pos + abs(scanChunk$isize) - 1 + offsetMinus,
+          RG = scanChunk$tag[[bcTag]]
+        )
+    }
+
+    #Clean Up Memory
+    rm(scanChunk)
+
+    #Care for Break Points
+    #A fragment spanning a tile border is returned for both tiles; keep it only where it starts
+    dt <- dt[dt$start >= start(tileChromSizes[x]),]
+
+    if(nrow(dt) > 0){
+
+      #Order by bc
+      setkey(dt, RG)
+      dt <- dt[order(RG)]
+      RG <- Rle(dt$RG)
+
+      #Stored per tile: Ranges (start, width) plus the run-length encoded barcodes
+      chrTmp <- mcols(tileChromSizes)$chunkName[x]
+      chrPos <- paste0("Fragments/",chrTmp,"/Ranges")
+      chrRGLengths <- paste0("Fragments/",chrTmp,"/RGLengths")
+      chrRGValues <- paste0("Fragments/",chrTmp,"/RGValues")
+      lengthRG <- length(RG@lengths)
+      o <- h5createGroup(tmpFile, paste0("Fragments/",chrTmp))
+      o <- .suppressAll(h5createDataset(tmpFile, chrPos, storage.mode = "integer", dims = c(nrow(dt), 2), level = 0))
+      o <- .suppressAll(h5createDataset(tmpFile, chrRGLengths, storage.mode = "integer", dims = c(lengthRG, 1), level = 0))
+      o <- .suppressAll(h5createDataset(tmpFile, chrRGValues, storage.mode = "character",
+        dims = c(lengthRG, 1), level = 0, size = nchar(RG@values[1]) + 1))
+      o <- h5write(obj = cbind(dt$start,dt$end-dt$start), file = tmpFile, name = chrPos)
+      o <- h5write(obj = RG@lengths, file = tmpFile, name = chrRGLengths)
+      o <- h5write(obj = RG@values, file = tmpFile, name = chrRGValues)
+
+      rm(dt, RG)
+      gc()
+
+    }
+
+  }
+
+  return(tmpFile)
+
+}
+
+
+#########################################################################################################
+# Methods to temp file to arrow!
#########################################################################################################

# .tmpToArrow
#
# Converts the temporary HDF5 fragment file `tmpFile` into the final Arrow file `outArrow`.
#
# Steps performed below:
#   1. Create `outArrow` and write "Class", "Metadata/Sample" and "Metadata/Date".
#   2. Sum the per-chunk RGLengths for every barcode and keep barcodes with at least
#      `minFrags` fragments; the kept barcodes are written to "Metadata/CellNames".
#   3. For every chromosome (chunk name before the "#"), read all of its chunks, keep
#      fragments of passing barcodes, order them by barcode rank and write
#      Ranges / RGLengths / RGValues under "Fragments/<chr>".
#
# Arguments:
#   tmpFile       path of the temporary HDF5 file holding chunked fragments
#   outArrow      path of the Arrow file to create
#   genome        not referenced in this function body
#   minFrags      minimum number of fragments a barcode needs to be kept
#   sampleName    value written to "Metadata/Sample"
#   verboseHeader passed as `verbose` for the start / finish messages
#   verboseAll    passed as `verbose` for per-step messages and as `addHeader`
#   tstart        start time used by .messageDiffTime; defaults to Sys.time()
#   prefix        text prepended to every progress message
#
# Returns `outArrow`.
.tmpToArrow <- function(
  tmpFile,
  outArrow,
  genome,
  minFrags = 500,
  sampleName,
  verboseHeader = TRUE,
  verboseAll = FALSE,
  tstart = NULL,
  prefix = "",
  ...
  ){

  if(is.null(tstart)){
    tstart <- Sys.time()
  }

  .messageDiffTime(sprintf("%s Creating ArrowFile", prefix), tstart, verbose = verboseHeader, addHeader = verboseAll)

  #Create the output file skeleton (return values of the rhdf5 calls are discarded into `o`)
  o <- h5closeAll()
  o <- h5createFile(outArrow)
  o <- h5write(obj = "Arrow", file = outArrow, name = "Class")
  o <- h5createGroup(outArrow, paste0("Metadata"))
  o <- h5write(obj = sampleName, file = outArrow, name = "Metadata/Sample")
  o <- h5write(obj = paste0(Sys.Date()), file = outArrow, name = "Metadata/Date")
  o <- h5createGroup(outArrow, paste0("Fragments"))

  #Get Info
  chunkNames <- .availableChr(tmpFile)

  #######################################################################################################
  # First we will count the number of occurrences per barcode!
  #######################################################################################################
  .messageDiffTime(sprintf("%s Counting Unique Barcodes", prefix), tstart, verbose = verboseAll)
  o <- h5closeAll()
  h5DF <- h5ls(tmpFile, recursive = TRUE)
  dtList <- lapply(seq_along(chunkNames), function(x){
    chrTmp <- chunkNames[x]
    #Number of barcode runs in this chunk, parsed from the "<n> x 1" dim string reported by h5ls
    nRG <- h5DF %>%
      {.[.$group==paste0("/Fragments/",chrTmp) & .$name == "RGLengths",]$dim} %>%
      {gsub(" x 1","",.)} %>% as.integer
    if(nRG > 0){
      dt <- data.table(
        values = h5read(tmpFile, paste0("Fragments/",chrTmp,"/RGValues")),
        lengths = h5read(tmpFile, paste0("Fragments/",chrTmp,"/RGLengths"))
        )
    }else{
      dt <- NULL
    }
    dt
  })
  names(dtList) <- chunkNames
  dt <- Reduce("rbind", dtList)
  #Total fragments per barcode across all chunks (result column is named V1)
  dt <- dt[, sum(lengths.V1),by=list(values.V1)]

  #Order to reduce number of hyperslabs
  dt <- dt[order(V1,decreasing=TRUE)]
  bcPass <- BStringSet(dt$values.V1[dt$V1 >= minFrags])
  rm(dt)
  gc()

  #Add To Metadata
  o <- h5write(obj = as.character(bcPass), file = outArrow, name = "Metadata/CellNames")

  #######################################################################################################
  # Second we will dump the chunks into an Arrow File!
  #######################################################################################################
  #Chunk names are split on "#"; the first field is used as the chromosome name
  chunkChr <- stringr::str_split(chunkNames, pattern = "#", simplify=TRUE)[,1]
  currentChunk <- 0 #NOTE(review): assigned but not read anywhere in this function
  uniqueChr <- sort(unique(chunkChr))

  for(x in seq_along(uniqueChr)){

    .messageDiffTime(sprintf("%s Adding Chromosome %s of %s", prefix, x, length(uniqueChr)), tstart, verbose = verboseAll)

    #Determine Ranges and RG Pre-Allocation
    ix <- BiocGenerics::which(chunkChr==uniqueChr[x])
    chunkNamex <- chunkNames[ix]
    dtListx <- dtList[ix] #NOTE(review): assigned but not read anywhere in this function

    #Chr
    chr <- uniqueChr[x]

    #Read in Fragments!
    fragments <- lapply(seq_along(chunkNamex), function(x){
      .getFragsFromArrow(tmpFile, chr = chunkNamex[x], out = "IRanges")
    }) %>% Reduce("c", .)
    #Keep only the second "#"-separated field of each RG value
    #(presumably strips a sample prefix added by .getFragsFromArrow - TODO confirm)
    mcols(fragments)$RG@values <- stringr::str_split(mcols(fragments)$RG@values, pattern = "#", simplify=TRUE)[,2]

    #Order RG RLE based on bcPass
    fragments <- fragments[mcols(fragments)$RG %bcin% bcPass]
    fragments <- fragments[order(S4Vectors::match(mcols(fragments)$RG, bcPass))]
    lengthRG <- length(mcols(fragments)$RG@lengths)

    #HDF5 Write
    chrPos <- paste0("Fragments/",chr,"/Ranges")
    chrRGLengths <- paste0("Fragments/",chr,"/RGLengths")
    chrRGValues <- paste0("Fragments/",chr,"/RGValues")
    o <- h5createGroup(outArrow, paste0("Fragments/",chr))
    o <- .suppressAll(h5createDataset(outArrow, chrPos, storage.mode = "integer", dims = c(length(fragments), 2), level = 0))
    o <- .suppressAll(h5createDataset(outArrow, chrRGLengths, storage.mode = "integer", dims = c(lengthRG, 1), level = 0))
    #String datasets are fixed width: size is the longest barcode plus one
    o <- .suppressAll(h5createDataset(outArrow, chrRGValues, storage.mode = "character", dims = c(lengthRG, 1), level = 0,
            size = max(nchar(mcols(fragments)$RG@values)) + 1))
    #Ranges are stored as (start, width) pairs
    o <- h5write(obj = cbind(start(fragments),width(fragments)), file = outArrow, name = chrPos)
    o <- h5write(obj = mcols(fragments)$RG@lengths, file = outArrow, name = chrRGLengths)
    o <- h5write(obj = mcols(fragments)$RG@values, file = outArrow, name = chrRGValues)

    #Free Some Memory!
    rm(fragments)
    gc()

  }

  .messageDiffTime(sprintf("%s Finished Constructing ArrowFile", prefix), tstart, verbose = verboseHeader, addHeader = verboseAll)

  return(outArrow)

}

#########################################################################################################
# Methods to turn input file directly to arrow! These may not be memory friendly!
#########################################################################################################

# .tsvToArrow
#
# Reads a whole fragment TSV (columns 1-4: chr, start, end, barcode) into memory and writes it
# to a new Arrow (HDF5) file.
#
# Arguments:
#   tsvFile    path of the tab separated fragment file
#   outArrow   path of the Arrow file to create
#   chromSizes ranges object whose seqnames() define the chromosomes to keep
#   genome     not referenced in this function body
#   minFrags   minimum number of fragments a barcode needs to be kept
#   sampleName value written to "Metadata/Sample"
#
# Returns `outArrow`. Stops when a seqname of `chromSizes` has no fragments.
.tsvToArrow <- function(
  tsvFile,
  outArrow,
  chromSizes,
  genome,
  minFrags = 500,
  sampleName,
  ...){

  tstart <- Sys.time()
  o <- h5closeAll()
  o <- h5createFile(outArrow)
  o <- h5write(obj = "Arrow", file = outArrow, name = "Class")
  o <- h5createGroup(outArrow, paste0("Metadata"))
  o <- h5write(obj = paste0(Sys.Date()), file = outArrow, name = "Metadata/Date")
  o <- h5write(obj = sampleName, file = outArrow, name = "Metadata/Sample")
  o <- h5createGroup(outArrow, paste0("Fragments"))

  #############################################################
  #Read in TSV File...
  #############################################################
  .messageDiffTime("Reading full inputTSV with data.table::fread", tstart, addHeader = TRUE)
  dt <- fread(tsvFile, sep = "\t", select = c(1,2,3,4))
  setkey(dt, V4) #Set Key
  dt <- dt[order(dt$V4),] #Sort Data.table
  dt <- DataFrame(chr = Rle(dt$V1), start = dt$V2, end = dt$V3, RG = Rle(dt$V4))

  #Order to reduce number of hyperslabs
  #match() gives each row the rank of its barcode; the rows have to be sorted by that rank.
  #Using the ranks directly as row indices would only ever select the first rows of the table.
  reOrderRG <- dt$RG@values[order(dt$RG@lengths, decreasing=TRUE)]
  dt <- dt[order(S4Vectors::match(dt$RG, reOrderRG)),]
  gc()

  #############################################################
  #Filter Minimum because this would not be worth keeping at all!
  #############################################################
  idx <- BiocGenerics::which(dt$RG %bcin% dt$RG@values[dt$RG@lengths >= minFrags])
  .messageDiffTime(sprintf("Filtering Fragments less than %s Fragments (%s)", minFrags, 1 - round(length(idx) / nrow(dt),3)), tstart, addHeader = TRUE)
  dt <- dt[idx,]
  remove(idx)
  gc()

  #Add To Metadata
  o <- h5write(obj = as.character(dt$RG@values), file = outArrow, name = "Metadata/CellNames")

  #############################################################
  #Keep Those only in ChromSizes
  #############################################################
  uniqueChr <- unique(paste0(dt$chr@values))
  dt <- dt[BiocGenerics::which(dt$chr %bcin% paste0(seqnames(chromSizes))),]

  #############################################################
  #Check all chromSizes represented...
  #############################################################
  if(nrow(dt) == 0 | !all(paste0(seqnames(chromSizes)) %in% uniqueChr)){
    notIn <- paste0(seqnames(chromSizes)[BiocGenerics::which(seqnames(chromSizes) %bcni% uniqueChr)])
    #Collapse so that several missing seqnames produce one readable message
    stop(sprintf("Error no fragments in all seqnames of chromSizes (%s) are you sure this is the correct genome?",
      paste(notIn, collapse = ", ")))
  }

  #############################################################
  #Write Fragments
  #############################################################
  dt$start <- dt$start + 1

  #Only chromosomes that survived the chromSizes filter still have fragments to write
  writeChr <- unique(paste0(dt$chr@values))

  for(i in seq_along(writeChr)){

    .messageDiffTime(sprintf("Writing Chromosome %s of %s to Arrow File!", i, length(writeChr)), tstart)
    chri <- writeChr[i]
    dti <- dt[BiocGenerics::which(dt$chr==chri),]
    chrPos <- paste0("Fragments/",chri,"/Ranges")
    chrRGLengths <- paste0("Fragments/",chri,"/RGLengths")
    chrRGValues <- paste0("Fragments/",chri,"/RGValues")
    lengthRG <- length(dti$RG@lengths)
    o <- h5createGroup(outArrow, paste0("Fragments/",chri))
    o <- .suppressAll(h5createDataset(outArrow, chrPos, storage.mode = "integer", dims = c(nrow(dti), 2), level = 0))
    o <- .suppressAll(h5createDataset(outArrow, chrRGLengths, storage.mode = "integer", dims = c(lengthRG, 1), level = 0))
    #Fixed width strings: size by the longest barcode, not by whichever barcode happens to be first
    o <- .suppressAll(h5createDataset(outArrow, chrRGValues, storage.mode = "character", dims = c(lengthRG, 1), level = 0,
            size = max(nchar(dti$RG@values)) + 1))
    #Ranges are stored as (start, end - start) pairs
    o <- h5write(obj = cbind(dti$start,dti$end-dti$start), file = outArrow, name = chrPos)
    o <- h5write(obj = dti$RG@lengths, file = outArrow, name = chrRGLengths)
    o <- h5write(obj = dti$RG@values, file = outArrow, name = chrRGValues)

    rm(dti)
    gc()

  }

  .messageDiffTime("Finished Constructing Arrow File!", tstart)

  #Clean Up
  rm(dt)
  gc()

  return(outArrow)

}


#########################################################################################################
# Filtering bad fragments!
#########################################################################################################

# .filterCellsFromArrow
#
# Rewrites `inArrow` so that it only contains fragments of the cells in `cellNames`.
# A new Arrow file is built under a temporary name in the working directory, the original
# file is removed and the new file is moved to the original path.
#
# Arguments:
#   inArrow   path of the Arrow file to filter (replaced in place)
#   cellNames cell names to keep; the part after "#" is what gets stored in the file
#
# Returns `inArrow`.
.filterCellsFromArrow <- function(inArrow, cellNames){

  tstart <- Sys.time()
  outArrow <- paste0(tempfile(tmpdir="."), ".arrow")
  print(outArrow)

  o <- h5closeAll()
  o <- h5createFile(outArrow)
  o <- h5write(obj = "Arrow", file = outArrow, name = "Class")
  o <- h5createGroup(outArrow, paste0("Metadata"))
  o <- h5write(obj = paste0(Sys.Date()), file = outArrow, name = "Metadata/Date")
  o <- h5write(obj = .sampleName(inArrow), file = outArrow, name = "Metadata/Sample")
  o <- h5write(obj = paste0(stringr::str_split(cellNames, pattern = "#", simplify = TRUE)[,2]), file = outArrow, name = "Metadata/CellNames")
  o <- h5createGroup(outArrow, paste0("Fragments"))

  allChr <- .availableChr(inArrow)

  for(i in seq_along(allChr)){

    chr <- allChr[i]
    fragments <- .getFragsFromArrow(inArrow, chr = chr)
    fragments <- fragments[BiocGenerics::which(mcols(fragments)$RG %bcin% cellNames)]
    #Store only the part of the RG value after "#"
    mcols(fragments)$RG@values <- stringr::str_split(mcols(fragments)$RG@values, pattern = "#", simplify= TRUE)[,2]
    lengthRG <- length(mcols(fragments)$RG@lengths)

    #HDF5 Write
    chrPos <- paste0("Fragments/",chr,"/Ranges")
    chrRGLengths <- paste0("Fragments/",chr,"/RGLengths")
    chrRGValues <- paste0("Fragments/",chr,"/RGValues")
    o <- h5createGroup(outArrow, paste0("Fragments/",chr))
    o <- .suppressAll(h5createDataset(outArrow, chrPos, storage.mode = "integer", dims = c(length(fragments), 2), level = 0))
    o <- .suppressAll(h5createDataset(outArrow, chrRGLengths, storage.mode = "integer", dims = c(lengthRG, 1), level = 0))
    o <- .suppressAll(h5createDataset(outArrow, chrRGValues, storage.mode = "character", dims = c(lengthRG, 1), level = 0,
            size = max(nchar(mcols(fragments)$RG@values)) + 1))
    o <- h5write(obj = cbind(start(fragments),width(fragments)), file = outArrow, name = chrPos)
    o <- h5write(obj = mcols(fragments)$RG@lengths, file = outArrow, name = chrRGLengths)
    o <- h5write(obj = mcols(fragments)$RG@values, file = outArrow, name = chrRGValues)

    #Free Some Memory!
    rm(fragments)
    gc()

  }

  #Remove old Arrow
  rmf <- file.remove(inArrow)
  out <- .fileRename(from = outArrow, to = inArrow)

  .messageDiffTime("Finished Constructing Filtered Arrow File!", tstart)

  return(inArrow)

}

# .fileRename
#
# Moves the file `from` to the path `to`.
#
# file.rename() reports failure by returning FALSE together with a warning rather than by
# throwing, so its result is checked explicitly. When it fails the file is copied and the
# source removed instead, which avoids building a shell command from unquoted paths.
#
# Returns TRUE on success. Stops when `from` does not exist or the file cannot be moved.
.fileRename <- function(from, to){

  if(!file.exists(from)){
    stop("Input file does not exist!")
  }

  renamed <- tryCatch(
    suppressWarnings(file.rename(from, to)),
    error = function(e) FALSE
  )

  if(!isTRUE(renamed)){

    copied <- tryCatch(
      suppressWarnings(file.copy(from, to, overwrite = TRUE)),
      error = function(e) FALSE
    )

    if(!isTRUE(copied)){
      stop("File Moving/Renaming Failed!")
    }

    #The copy succeeded, the source is no longer needed
    rmf <- suppressWarnings(file.remove(from))

  }

  TRUE

}






diff --git a/R/DoubletsScores.R b/R/DoubletsScores.R
new file mode 100644
index 00000000..42ebd0e1
--- /dev/null
+++ b/R/DoubletsScores.R
@@ -0,0 +1,497 @@
#' Add Doublet Scores to Arrows/ArchRProject
#'
#' This function for each sample will independently assign inferred doublet information
#' to each cell for removing strong heterotypic doublet based clusters downstream.
#'
#' @param input ArchRProject or ArrowFiles
#' @param useMatrix matrix name for performing analyses (PeakMatrix or TileMatrix)
#' @param k number of cells nearby a simulated doublet to consider
#' @param nTrials number of trials to simulate doublets in thousands
#' @param knnMethod dimension reduction to use for KNN (UMAP or SVD)
#' @param UMAPParams list of parameters to pass to uwot::umap
#' @param LSIParams list of parameters to pass to IterativeLSI
#' @param useClusters if TRUE clusters are identified first and doublets are simulated between pairs of clusters
#' @param outDir directory to plot and store results from analyses (replaced by the project output directory when input is an ArchRProject)
#' @param threads number of threads for parallel execution
#' @param parallelParam parallel parameters for batch style execution
#' @param verboseHeader verbose sections
#' @param verboseAll verbose sections and subsections
#' @param ... additional args
#' @export
addDoubletScores <- function(
  input,
  useMatrix = "TileMatrix",
  k = 200,
  nTrials = 100,
  knnMethod = "UMAP",
  UMAPParams = list(),
  LSIParams = list(),
  useClusters = FALSE,
  outDir = "QualityControl",
  threads = 1,
  parallelParam = NULL,
  verboseHeader = TRUE,
  verboseAll = FALSE,
  ...
  ){

  if(tolower(useMatrix) %ni% c("peakmatrix","tilematrix")){
    #The format needs a %s, otherwise the offending name is never shown
    stop(sprintf("Supported Matrix Names at the moment are PeakMatrix and TileMatrix : %s", useMatrix))
  }

  if(inherits(input, "ArchRProject")){

    ArrowFiles <- getArrowFiles(input)
    allCells <- rownames(getCellColData(input))
    outDir <- getOutputDirectory(input)

  }else if(inherits(input, "character")){

    ArrowFiles <- input
    allCells <- NULL

  }else{

    stop("Error Unrecognized Input!")

  }

  if(!all(file.exists(ArrowFiles))){
    stop("Error Input Arrow Files do not all exist!")
  }

  #Add args to list
  #Collect the current values of all formals of this function, then add the per-file inputs
  args <- mget(names(formals()),sys.frame(sys.nframe()))#as.list(match.call())
  args$ArrowFiles <- ArrowFiles
  args$allCells <- allCells
  args$X <- seq_along(ArrowFiles)
  args$FUN <- .addDoubScores
  args$registryDir <- file.path(outDir, "AddDoubletsRegistry")

  #Run With Parallel or lapply
  outList <- .batchlapply(args, sequential = TRUE)
  names(outList) <- names(ArrowFiles)

  #Return Output
  if(inherits(input, "ArchRProject")){

    #Cells without a result keep -1
    input@cellColData[,"doubletScore"] <- -1
    input@cellColData[,"doubletEnrichment"] <- -1

    for(i in seq_along(outList)){
      input@cellColData[names(outList[[i]]$doubletScore), "doubletScore"] <- outList[[i]]$doubletScore
      input@cellColData[names(outList[[i]]$doubletEnrich), "doubletEnrichment"] <- outList[[i]]$doubletEnrich
    }

    return(input)

  }else{

    return(outList)

  }

}

# .addDoubScores
#
# Computes doublet scores for the i-th Arrow file: runs IterativeLSI and UMAP on the sample,
# simulates doublets and projects them into that space, writes a summary PDF and RDS into
# `outDir`, and stores the per cell scores in the Arrow file under
# "Metadata/DoubletScore" and "Metadata/DoubletEnrichment".
#
# Arguments:
#   i             index into `ArrowFiles`
#   ArrowFiles    paths of all Arrow files being processed
#   useMatrix     matrix name handed to IterativeLSI
#   allCells      cell names to restrict to; NULL uses .availableCells(ArrowFile, useMatrix)
#   UMAPParams    list merged with defaults and passed to uwot::umap
#   LSIParams     list of parameters passed to IterativeLSI
#   nTrials, k, nSample, knnMethod  passed to .simulateProjectDoublets
#   outDir        directory for the summary PDF / RDS (created when missing)
#   useClusters   if TRUE run IdentifyClusters and simulate doublets between clusters
#   subThreads    number of threads for LSI, UMAP, matrix extraction and simulation
#   verboseHeader, verboseAll  verbosity switches
#
# Returns a SimpleList with `doubletScore` and `doubletEnrich`.
.addDoubScores <- function(
  i,
  ArrowFiles,
  useMatrix = "TileMatrix",
  allCells = NULL,
  UMAPParams = list(),
  LSIParams = list(sampleCells = NULL),
  nTrials = 100,
  k = 200,
  nSample = 1000,
  knnMethod = "UMAP",
  outDir = "QualityControl",
  useClusters = FALSE,
  subThreads = 1,
  verboseHeader = TRUE,
  verboseAll = FALSE,
  ...
  ){

  tstart <- Sys.time()
  ArrowFile <- ArrowFiles[i]
  .messageDiffTime(sprintf("Computing Doublet Scores %s of %s!", i, length(ArrowFiles)), tstart, addHeader = TRUE)

  #################################################
  # 1. Create ArchRProject For Iterative LSI
  #################################################
  proj <- suppressMessages(ArchRProject(
    ArrowFiles = ArrowFile,
    sampleNames = .sampleName(ArrowFile),
    outputDirectory = tempdir(),
    copyArrows = FALSE,
    showLogo = FALSE,
    geneAnnotation = .nullGeneAnnotation(), #this doesnt matter just needs to be valid
    genomeAnnotation = .nullGenomeAnnotation() #this doesnt matter just needs to be valid
  ))
  if(is.null(allCells)){
    proj@cellColData <- proj@cellColData[.availableCells(ArrowFile, useMatrix),]
  }else{
    proj@cellColData <- proj@cellColData[which(rownames(proj@cellColData) %in% allCells),]
  }

  #################################################
  # 2. Compute Iterative LSI
  #################################################
  LSIParams$ArchRProj <- proj
  LSIParams$saveIterations <- FALSE
  LSIParams$useMatrix <- useMatrix
  LSIParams$threads <- subThreads
  LSIParams$verboseHeader <- verboseHeader
  LSIParams$verboseAll <- verboseAll
  proj <- do.call(IterativeLSI, LSIParams)
  if(useClusters){
    proj <- IdentifyClusters(proj)
  }

  #################################################
  # 3. Get LSI Partial Matrix For Simulation
  #################################################
  .messageDiffTime("Constructing Partial Matrix for Projection", tstart, addHeader = verboseHeader)
  LSI <- proj@reducedDims[["IterativeLSI"]]
  featureDF <- LSI$LSIFeatures
  mat <- .getPartialMatrix(
    ArrowFiles = getArrowFiles(proj),
    featureDF = featureDF,
    threads = subThreads,
    cellNames = rownames(getCellColData(proj)),
    doSampleCells = FALSE,
    verbose = verboseAll
  )
  cellNames <- rownames(getCellColData(proj))

  #################################################
  # 4. Run UMAP for LSI-Projection
  #################################################
  .messageDiffTime("Running LSI UMAP", tstart, addHeader = verboseHeader)
  set.seed(1) # Always do this prior to UMAP
  UMAPParams <- .mergeParams(UMAPParams, list(n_neighbors=40, min_dist=0.4, metric="euclidean", verbose=FALSE))
  UMAPParams$X <- LSI$matSVD
  UMAPParams$ret_nn <- TRUE
  UMAPParams$ret_model <- TRUE
  UMAPParams$n_threads <- subThreads
  uwotUmap <- do.call(uwot::umap, UMAPParams)

  #################################################
  # 5. Simulate and Project Doublets
  #################################################
  .messageDiffTime("Simulating and Projecting Doublets", tstart, addHeader = verboseHeader)
  simDoublets <- .simulateProjectDoublets(
    mat = mat,
    LSI = LSI,
    clusters = if(useClusters) getCellColData(proj, "Clusters", drop = TRUE) else NULL,
    sampleRatio1 = c(1/2),
    sampleRatio2 = c(1/2),
    nTrials = nTrials,
    nSample = nSample,
    k = k,
    uwotUmap = uwotUmap,
    knnMethod = knnMethod,
    seed = 1,
    #This function has no `threads` argument; the thread count lives in `subThreads`
    threads = subThreads
  )

  #################################################
  # 6. Plot / Save Results
  #################################################

  #Create Plot DF
  df <- data.frame(row.names = rownames(LSI$matSVD), uwotUmap[[1]], type = "experiment")
  df[,"score"] <- 0
  df[,"enrichment"] <- 0
  df[names(simDoublets$doubletScore),"score"] <- simDoublets$doubletScore
  df[names(simDoublets$doubletScore),"enrichment"] <- simDoublets$doubletEnrich

  doubUMAP <- simDoublets$doubletUMAP
  dfDoub <- data.frame(
    row.names = paste0("doublet_", seq_len(nrow(doubUMAP))),
    ArchR:::getDensity(doubUMAP[,1], doubUMAP[,2]),
    type = "simulated_doublet"
  )
  dfDoub <- dfDoub[order(dfDoub$density), , drop = FALSE]
  dfDoub$color <- dfDoub$density

  #pdf() and saveRDS() cannot write into a directory that does not exist
  dir.create(outDir, showWarnings = FALSE, recursive = TRUE)

  #Plot Doublet Summary
  pdf(file.path(outDir, paste0(.sampleName(ArrowFile), "-Doublet-Summary.pdf")), width = 6, height = 6)

  #Plot Doublet Density
  xlim <- range(df$X1) %>% extendrange(f = 0.05)
  ylim <- range(df$X2) %>% extendrange(f = 0.05)

  #Rasterise the points when ggrastr is installed, otherwise use plain points
  if(requireNamespace("ggrastr", quietly = TRUE)){
    pointLayer <- ggrastr::geom_point_rast
  }else{
    message("ggrastr is not available for rastr of points, continuing without rastr!")
    pointLayer <- geom_point
  }

  pdensity <- ggplot() +
    pointLayer(data = df, aes(x=X1,y=X2),color="lightgrey", size = 0.5) +
    pointLayer(data = dfDoub, aes(x=x,y=y,colour=color), size = 0.5) +
    scale_colour_gradientn(colors = paletteContinuous(set = "white_blue_purple")) +
    xlab("UMAP Dimension 1") + ylab("UMAP Dimension 2") +
    guides(fill = FALSE) + theme_ArchR(base_size = 6) +
    theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
          axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
    coord_equal(ratio = diff(xlim)/diff(ylim), xlim = xlim, ylim = ylim, expand = FALSE) +
    ggtitle("Doublet Density Overlayed") + theme(legend.direction = "horizontal",
    legend.box.background = element_rect(color = NA))

  print(.fixPlotSize(pdensity, plotWidth = unit(6, "in"), plotHeight = unit(6, "in")))

  #Plot Doublet Score
  pscore <- ggPoint(
    x = df[,1],
    y = df[,2],
    color = df$score,
    xlim = xlim,
    ylim = ylim,
    discrete = FALSE,
    size = 0.5,
    xlab = "UMAP Dimension 1",
    ylab = "UMAP Dimension 2",
    continuousSet = "white_blue_purple",
    title = "Doublet Scores -log10(FDR)",
    rastr = TRUE,
    baseSize = 6
    ) + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
              axis.text.y = element_blank(), axis.ticks.y = element_blank())

  print(.fixPlotSize(pscore, plotWidth = unit(6, "in"), plotHeight = unit(6, "in")))

  #Plot Enrichment Summary
  penrich <- ggPoint(
    x = df[,1],
    y = df[,2],
    color = df$enrichment,
    xlim = xlim,
    ylim = ylim,
    discrete = FALSE,
    size = 0.5,
    xlab = "UMAP Dimension 1",
    ylab = "UMAP Dimension 2",
    continuousSet = "white_blue_purple",
    title = "Doublet Enrichment",
    rastr = TRUE
    ) + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
              axis.text.y = element_blank(), axis.ticks.y = element_blank())

  print(.fixPlotSize(penrich, plotWidth = unit(6, "in"), plotHeight = unit(6, "in")))

  dev.off()

  summaryList <- SimpleList(
    originalDataUMAP = df,
    simulatedDoubletUMAP = dfDoub
    )
  saveRDS(summaryList, file.path(outDir, paste0(.sampleName(ArrowFile), "-Doublet-Summary.rds")))

  #################################################
  # 7. Add Info To Arrow!
  #################################################
  allCells <- .availableCells(ArrowFile, passQC = FALSE)

  #Cells that were not scored keep -1
  allDoubletScores <- rep(-1, length(allCells))
  names(allDoubletScores) <- allCells
  allDoubletScores[names(simDoublets$doubletScore)] <- simDoublets$doubletScore

  allDoubletEnrichment <- rep(-1, length(allCells))
  names(allDoubletEnrichment) <- allCells
  allDoubletEnrichment[names(simDoublets$doubletEnrich)] <- simDoublets$doubletEnrich

  o <- h5closeAll()
  h5write(allDoubletScores, file = ArrowFile, "Metadata/DoubletScore")
  h5write(allDoubletEnrichment, file = ArrowFile, "Metadata/DoubletEnrichment")
  o <- h5closeAll()

  out <- SimpleList(doubletScore = simDoublets$doubletScore, doubletEnrich = simDoublets$doubletEnrich)

  return(out)

}

# .simulateProjectDoublets
#
# Simulates doublets by adding down-sampled columns of `mat` for two sets of sampled cells,
# projects them with projectLSI() and uwot::umap_transform(), and counts for every real cell
# how often it is among the k nearest neighbours of a simulated doublet.
#
# Arguments:
#   mat           sparse matrix with one column per cell
#   LSI           LSI result; `LSI$matSVD` has one row per cell
#   uwotUmap      uwot model; its first element is the embedding of the real cells
#   clusters      NULL to pair random cells, otherwise cluster labels (one per column of `mat`);
#                 doublets are then simulated for every pair of clusters
#   sampleRatio1, sampleRatio2  fraction of non-zero entries kept from each parent (same length)
#   nTrials       number of simulation rounds when `clusters` is NULL
#   nSample       number of doublets per round
#   k             number of neighbours for the KNN search
#   knnMethod     "UMAP" or "SVD" (case insensitive)
#   seed          seed set before simulating
#   threads       threads for .safelapply and umap_transform
#
# Returns a SimpleList with `doubletUMAP`, `doubletScore` (-log10 FDR) and `doubletEnrich`.
.simulateProjectDoublets <- function(
  mat,
  LSI,
  uwotUmap,
  clusters = NULL,
  sampleRatio1 = c(0.5),
  sampleRatio2 = c(0.5),
  nTrials = 100,
  nSample = 1000,
  k = 200,
  knnMethod = "UMAP",
  seed = 1,
  threads = 16
  ){

  #Set a random (1 - sampleRatio) share of the non-zero entries to zero
  .sampleSparseMat <- function(mat, sampleRatio = 0.5){
    total <- length(mat@x)
    sampleTo <- floor(total * (1-sampleRatio))
    mat@x[sample(seq_len(total), sampleTo)] <- 0
    mat <- drop0(mat)
    mat
  }

  #Draw n elements of `pool` with replacement.
  #sample(pool, ...) would sample from 1:pool when `pool` is a single number.
  .resample <- function(pool, n){
    pool[sample.int(length(pool), n, replace = TRUE)]
  }

  #One round of simulated doublets: parents come from `pool1` and `pool2`
  .simulateFromPools <- function(pool1, pool2){

    lapply(seq_along(sampleRatio1), function(x){

      idx1 <- .resample(pool1, nSample)
      idx2 <- .resample(pool2, nSample)

      #Simulated Doublet
      simulatedMat <- .sampleSparseMat(mat = mat[,idx1], sampleRatio = sampleRatio1[x]) +
                      .sampleSparseMat(mat = mat[,idx2], sampleRatio = sampleRatio2[x])

      #Project LSI
      suppressMessages(projectLSI(simulatedMat, LSI))

    }) %>% Reduce("rbind", .)

  }

  set.seed(seed)

  if(is.null(clusters)){

    allIdx <- seq_len(ncol(mat))

    simLSI <- .safelapply(seq_len(nTrials), function(y){

      if(y %% 5 == 0){
        gc()
      }

      .simulateFromPools(allIdx, allIdx)

    }, threads = threads) %>% Reduce("rbind", .)

  }else{

    #Every unordered pair of clusters
    comClust <- combn(unique(clusters), 2)

    simLSI <- .safelapply(seq_len(ncol(comClust)), function(y){

      if(y %% 5 == 0){
        gc()
      }

      .simulateFromPools(
        which(clusters==comClust[1,y]),
        which(clusters==comClust[2,y])
      )

    }, threads = threads) %>% Reduce("rbind", .)

  }


  #Project UMAP
  set.seed(1) # Always do this prior to UMAP
  umapProject <- data.frame(uwot::umap_transform(as.matrix(simLSI), uwotUmap, verbose = FALSE, n_threads = threads))

  #Compute KNN
  if(toupper(knnMethod) == "SVD"){

    knnDoub <- computeKNN(LSI$matSVD, simLSI, k)

  }else if(toupper(knnMethod) == "UMAP"){

    knnDoub <- computeKNN(uwotUmap[[1]], umapProject, k)

  }else{

    stop("Error KNN Method Not Recognized!")

  }

  #Compile KNN Sums
  countKnn <- rep(0, nrow(LSI$matSVD))
  names(countKnn) <- rownames(LSI$matSVD)

  tabDoub <- table(as.vector(knnDoub))
  countKnn[as.integer(names(tabDoub))] <- countKnn[as.integer(names(tabDoub))] + tabDoub

  #P-Values
  #Probability of at least the observed count if every cell were equally likely to be a neighbour
  pvalBinomDoub <- pbinom(countKnn - 1, sum(countKnn), 1 / nrow(LSI$matSVD), lower.tail = FALSE)
  names(pvalBinomDoub) <- names(countKnn)

  #Adjust
  padjBinomDoub <- p.adjust(pvalBinomDoub, method = "fdr")

  #Convert To Scores
  #The lower bound keeps -log10() finite when the adjusted p-value is 0
  doubletScore <- -log10(pmax(padjBinomDoub, 4.940656e-324))
  doubletEnrich <- (countKnn / sum(countKnn)) / (1 / nrow(LSI$matSVD))

  out <- SimpleList(doubletUMAP = umapProject, doubletScore = doubletScore, doubletEnrich = doubletEnrich)

  out

}

#' Add Demuxlet Results to ArchR Project
#'
#' This function will read in .best file
output from demuxlet and add the +#' classifications into the cellColData for the ArchR Project +#' +#' @param ArchRProj ArchR Project +#' @param bestFiles paths to best files +#' @param sampleNames sampleNames corresponding to best files (match those in ArchRProj) +#' @export +addDemuxletResults <- function(ArchRProj, bestFiles, sampleNames){ + + .requirePackage("readr") + + if(!all(sampleNames %in% rownames(getSampleColData(ArchRProj)))){ + samples <- sampleNames[sampleNames %ni% rownames(getSampleColData(ArchRProj))] + warning(sprintf("Sample %s not in sampleNames of ArchRProj!", samples)) + } + + ccd <- getCellColData(ArchRProj) + ccd[ , "DemuxletClassify"] <- "NotClassified" + + for(x in seq_along(bestFiles)){ + best <- .suppressAll(data.frame(readr::read_tsv(bestFiles[x]))) + classification <- stringr::str_split(best$BEST, pattern = "-", simplify=TRUE)[,1] + cellNames <- paste0(sampleNames[x], "#", best$BARCODE) + idx <- which(cellNames %in% rownames(ccd)) + ccd[ cellNames[idx], "DemuxletClassify"] <- classification[idx] + } + + ArchRProj@cellColData <- ccd + ArchRProj + +} + + diff --git a/R/FilterCells.R b/R/FilterCells.R new file mode 100644 index 00000000..7465664c --- /dev/null +++ b/R/FilterCells.R @@ -0,0 +1,83 @@ +#' Extend Filter then Normalize Scores for Summits +#' @param df dataframe where first column is sample names 2nd column is group information and 3rd column is MACS2 summit files +#' @param genome mm9, hg19 character or BSgenome object +#' @param blacklist regions to blacklist +#' @param extend how to extend summits (summit +- extend) +#' @param scorePerMillion normalized Score-per-million minimum to keep +#' @param selectionRules string with a formula containing n (majority = (n+1)/2, multiple samples = 2) +#' @export +FilterCells <- function(ArchRProj, filterList){ + + ccd <- getCellColData(ArchRProj) + + cellsPF <- lapply(seq_along(filterList), function(x){ + + if(names(filterList[x]) %ni% colnames(ccd)){ + stop(names(filterList[x]), " is 
not in colnames of cellColData") + } + + vx <- ccd[,names(filterList[x])] + + if(inherits(filterList[[x]], "numeric")){ + + cutLow <- filterList[[x]][1] + cutHigh <- if(is.na(filterList[[x]][2])) Inf else filterList[[x]][2] + idx <- rownames(ccd)[vx >= cutLow & vx <= cutHigh] + + }else if(inherits(filterList[[x]], "character")){ + + idx <- rownames(ccd)[vx %bcin% filterList[[x]]] + + }else if(inherits(filterList[[x]], "list") | inherits(filterList[[x]], "SimpleList")){ + + idx <- lapply(seq_along(filterList[[x]]), function(y){ + + if(names(filterList[[x]][y]) %ni% ccd$Sample){ + stop(names(filterList[[x]][y]), " is not in sampleNames of ArchR Project") + } + + ccdy <- ccd[BiocGenerics::which(ccd$Sample == names(filterList[[x]][y])),] + vy <- ccdy[,names(filterList[x])] + + if(inherits(filterList[[x]][[y]], "numeric")){ + + cutLow <- filterList[[x]][[y]][1] + cutHigh <- if(is.na(filterList[[x]][[y]][2])) Inf else filterList[[x]][[y]][2] + rownames(ccdy)[vy >= cutLow & vy <= cutHigh] + + }else if(inherits(filterList[[x]][[y]], "character")){ + + rownames(ccdy)[vy %bcin% filterList[[x]][[y]]] + + }else{ + + stop(names(filterList[x]), " is not a numeric, character or list!") + + } + + + }) %>% Reduce("c", .) + + }else{ + + stop(names(filterList[x]), " is not a numeric, character or list!") + + } + + idx + + }) %>% Reduce("intersect", .) 
+ + + if(length(cellsPF) == 0){ + stop("0 Cells passing filter please consider less stringent thresholds!") + } + + ArchRProj@cellColData <- ccd[cellsPF,] + + ArchRProj + +} + + + diff --git a/R/Footprinting.R b/R/Footprinting.R new file mode 100644 index 00000000..3e94b026 --- /dev/null +++ b/R/Footprinting.R @@ -0,0 +1,421 @@ +#' Plot Group Footprints +#' +#' This function will create an Arrow Files from input files +#' for downstream analysis +#' +#' @param input ArchRProject or previous Footprint Summarized Experiment +#' @param positions sample names corresponding to input files +#' @param groupBy output names prefix (ie PBMC -> PBMC.arrow) +#' @param useGroups geneAnnotation input for TSS Scores etc. +#' @param pal genomeAnnotation input for ChromSizes Nucleotide Information etc. +#' @param flank min fragments per cell to be filtered for analyses such as tileMat etc. +#' @param flankNorm min TSS Score per cell to be filtered for analyses such as tileMat etc. +#' @param smoothWindow remove fragments corresponding to cells pass filterFrags and filterTSS +#' @param nTop min fragments per cell to be immediately filtered +#' @param normMethod normalization method for footprint plot relative to bias +#' @param threads number of therads for parallel execution +#' @param verboseHeader verbose sections +#' @param verboseAll verbose sections and subsections +#' @param ... additional args +#' @export +plotFootprint <- function( + input = NULL, + positions = NULL, + groupBy = "Clusters", + useGroups = NULL, + pal = NULL, + flank = 250, + flankNorm = 50, + smoothWindow = 10, + nTop = NULL, + normMethod = "none", + threads = 16, + verboseHeader = TRUE, + verboseAll = FALSE, + ... 
+ ){ + + tstart <- Sys.time() + if(inherits(input, "ArchRProject")){ + + #Validate Positions + if(!inherits(positions, "GenomicRangesList") & !inherits(positions, "list") & !inherits(positions, "SimpleList")){ + stop("Positions is not a list!") + } + positions <- as(positions, "list") + valid <- lapply(positions, function(x) inherits(x, "GRanges")) %>% unlist %>% all + if(!valid){ + stop("Positions is not a list of GenomicRanges!") + } + + #If wanted can subset top positions + if(!is.null(nTop)){ + posNames <- names(positions) + positions <- lapply(seq_along(positions), function(x){ + positions[[x]][head(order(mcols(positions[[x]])$score, decreasing=TRUE), nTop)] + }) + names(positions) <- posNames + } + + #Get Footprints + .messageDiffTime("Summarizing Footprints", tstart, addHeader = verboseAll) + seFoot <- summarizeFootprints( + ArchRProj = input, + positions = positions, + groupBy = groupBy, + useGroups = useGroups, + flank = flank, + threads = threads, + verboseHeader = verboseHeader, + verboseAll = verboseAll + ) + + }else{ + + if(inherits(input, "SummarizedExperiment")){ + seFoot <- input + rm(input) + gc() + if(!is.null(useGroups)){ + if(sum(SummarizedExperiment::colData(seFoot)[,1] %in% useGroups) == 0){ + stop("No Groups found matching useGroups!") + } + seFoot <- seFoot[,SummarizedExperiment::colData(seFoot)[,1] %in% useGroups] + } + } + + } + + for(i in seq_along(seFoot@assays)){ + print(.ggFootprint(seFoot, names(seFoot@assays)[i], pal = pal, smoothWindow = smoothWindow, flank = flank, flankNorm = flankNorm)) + } + + seFoot + +} + +.ggFootprint <- function(seFoot, name, pal, smoothWindow, flank, flankNorm, baseSize = 6, normMethod, ...){ + + #Get Footprint Info + rowDF <- SummarizedExperiment::rowData(seFoot) + footMat <- .getAssay(seFoot[BiocGenerics::which(rowDF[,2]=="footprint"),], name) + biasMat <- .getAssay(seFoot[BiocGenerics::which(rowDF[,2]=="bias"),], name) + footDF <- rowDF[BiocGenerics::which(rowDF[,2]=="footprint"),] + biasDF <- 
rowDF[BiocGenerics::which(rowDF[,2]=="bias"),] + + #Smooth Foot and Bias Mat because of sparsity + footMat <- apply(footMat, 2, function(x) .centerRollMean(x, smoothWindow)) + biasMat <- apply(biasMat, 2, function(x) .centerRollMean(x, smoothWindow)) + + #Normalize Foot and Bias Mat + idx <- which(abs(footDF$x) >= flank - flankNorm) + footMat <- t(t(footMat) / colMeans(footMat[idx, ,drop=FALSE])) + biasMat <- t(t(biasMat) / colMeans(biasMat[idx, ,drop=FALSE])) + + #Norm Foot By Bias + if(tolower(normMethod) == "none"){ + }else if(tolower(normMethod) == "subtract"){ + footMat <- footMat - biasMat + }else if(tolower(normMethod) == "divide"){ + footMat <- footMat / biasMat + }else{ + stop("normMethod not recognized!") + } + + #Get Mean and SD for each Assay + footMatMean <- .groupMeans(footMat, SummarizedExperiment::colData(seFoot)$Group) + footMatSd <- .groupSds(footMat, SummarizedExperiment::colData(seFoot)$Group) + biasMatMean <- .groupMeans(biasMat, SummarizedExperiment::colData(seFoot)$Group) + biasMatSd <- .groupSds(biasMat, SummarizedExperiment::colData(seFoot)$Group) + + #Create Plot Data Frames + plotFootDF <- lapply(seq_len(ncol(footMatMean)), function(x){ + data.frame( + x = footDF$x, + mean = footMatMean[,x], + sd = footMatSd[,x], + group = colnames(footMatMean)[x] + ) + }) %>% Reduce("rbind",. ) + plotFootDF$group <- factor(paste0(plotFootDF$group), levels = unique(gtools::mixedsort(paste0(plotFootDF$group)))) + + plotBiasDF <- lapply(seq_len(ncol(biasMatMean)), function(x){ + data.frame( + x = biasDF$x, + mean = biasMatMean[,x], + sd = biasMatSd[,x], + group = colnames(biasMatMean)[x] + ) + }) %>% Reduce("rbind",. 
) + plotBiasDF$group <- factor(paste0(plotBiasDF$group), levels = unique(gtools::mixedsort(paste0(plotBiasDF$group)))) + + #Plot GG + if(is.null(pal)){ + pal <- paletteDiscrete(values=SummarizedExperiment::colData(seFoot)$Group) + } + + plotMax <- plotFootDF[order(plotFootDF$mean,decreasing=TRUE),] + plotMax <- plotMax[abs(plotMax$x) <= flank - flankNorm,] + plotMax <- plotMax[!duplicated(plotMax$group),] + plotMax <- plotMax[seq_len(ceiling(nrow(plotMax) / 4)), ] + plotMax$x <- 25 + + ggFoot <- ggplot(plotFootDF, aes(x = x, y = mean, color = group)) + + geom_line() + + scale_color_manual(values = pal) + + scale_fill_manual(values = pal) + + geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd, linetype = NA, fill = group), alpha = 0.4) + + xlab("Distance to motif center (BP)") + + coord_cartesian( + expand = FALSE, + ylim = c(quantile(plotFootDF$mean, 0.0001), 1.15*quantile(plotFootDF$mean, 0.999)), + xlim = c(min(plotFootDF$x),max(plotFootDF$x)) + ) + theme_ArchR(baseSize = baseSize) + ggtitle(name) + + guides(fill = FALSE) + + guides(color = FALSE) + ylab("Footprint Normalized Mean") + + ggrepel::geom_label_repel(data = plotMax, aes(label = group), size = 3, xlim = c(75, NA)) + + ggBias <- ggplot(plotBiasDF, aes(x = x, y = mean, color = group)) + + geom_line() + + scale_color_manual(values = pal) + + scale_fill_manual(values = pal) + + geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd, linetype = NA, fill = group), alpha = 0.4) + + xlab("Distance to motif center (BP)") + + coord_cartesian( + expand = FALSE, + ylim = c(quantile(plotBiasDF$mean, 0.0001), 1.05*quantile(plotBiasDF$mean, 0.999)), + xlim = c(min(plotBiasDF$x),max(plotBiasDF$x)) + ) + theme_ArchR(baseSize = baseSize) + ylab("Bias Normalized Mean") + + theme(legend.position = "bottom", legend.box.background = element_rect(color = NA)) + + ggAlignPlots(ggFoot, .ggSmallLegend(ggBias), sizes=c(2,1)) + +} + +.ggSmallLegend <- function(gg, pointSize = 2, baseSize = 5, spaceLegend = 0.1) { + 
#https://stackoverflow.com/questions/52297978/decrease-overal-legend-size-elements-and-text + gg + + guides(shape = guide_legend(override.aes = list(size = pointSize)), + color = guide_legend(override.aes = list(size = pointSize))) + + theme(legend.title = element_text(size = baseSize), + legend.text = element_text(size = baseSize), + legend.key.size = unit(spaceLegend, "lines")) +} + +##################################################################################################### +# Summarize Footprints into a Summarized Experiment for Plotting +##################################################################################################### + +#' @export +summarizeFootprints <- function( + ArchRProj = NULL, + positions, + groupBy = "Clusters", + useGroups = NULL, + flank = 250, + threads = 16, + force = FALSE, + verboseHeader = TRUE, + verboseAll = FALSE, + ... + ){ + + if(verboseAll){ + verboseHeader <- TRUE + } + + tstart <- Sys.time() + + ##################################################### + # Compute Kmer Frequency Table + ##################################################### + coverageMetadata <- .getCoverageMetadata(ArchRProj = ArchRProj, groupBy = groupBy) + coverageParams <- .getCoverageParams(ArchRProj = ArchRProj, groupBy = groupBy) + kmerLength <- coverageParams$kmerLength + + if(!is.null(useGroups)){ + if(sum(coverageMetadata[,1] %in% useGroups) == 0){ + stop("No Groups found matching useGroups!") + } + coverageMetadata <- coverageMetadata[coverageMetadata[,1] %in% useGroups,] + } + + genome <- getGenome(ArchRProj) + .requirePackage(genome) + .requirePackage("Biostrings") + BSgenome <- eval(parse(text = genome)) + BSgenome <- .validBSgenome(BSgenome) + + .messageDiffTime("Computing Kmer Bias Table", tstart) + kmerTableList <- .kmerPositionFrequency(positions, genome = BSgenome, flank = flank, k = kmerLength, threads = 1, verbose = verboseAll) + + ##################################################### + # Compute Footprints + 
##################################################### + .messageDiffTime("Computing Footprint", tstart) + footprintList <- .computeFootprints(positions, coverageMetadata$File, flank = flank, threads = threads, verbose = verboseAll) + + ##################################################### + # Compute Bias For Footprints + ##################################################### + .messageDiffTime("Computing Footprint Bias", tstart) + footprintBiasList <- .computeFootprintsBias(kmerTableList, coverageMetadata$File, threads = threads, verbose = verboseAll) + + ##################################################### + # Summarize into SE + ##################################################### + footAssay <- lapply(seq_along(positions), function(x){ + footMat <- lapply(seq_along(footprintList), function(y){ + footprintList[[y]][,x] + }) %>% Reduce("cbind", .) + colnames(footMat) <- coverageMetadata$Name + biasMat <- lapply(seq_along(footprintBiasList), function(y){ + footprintBiasList[[y]][,x] + }) %>% Reduce("cbind", .) 
+ colnames(biasMat) <- coverageMetadata$Name + rbind(footMat, biasMat) + }) %>% SimpleList + names(footAssay) <- names(positions) + + #Clean GC + rm(footprintList, footprintBiasList) + gc() + + rowData <- DataFrame( + x = c(seq(-flank, flank), seq(-flank, flank)), + type = c(rep("footprint", flank*2+1),rep("bias", flank*2+1)) + ) + + se <- SummarizedExperiment::SummarizedExperiment( + assays = footAssay, + colData = coverageMetadata, + rowData = rowData + ) + + metadata(se)$Params <- SimpleList(kmerLength=kmerLength,flank=flank,date=Sys.Date()) + + return(se) + +} + +.computeFootprintsBias <- function(kmerTableList, coverageFiles, threads = 8, verbose = TRUE){ + tstart <- Sys.time() + out <- .safelapply(seq_along(coverageFiles), function(i){ + .messageDiffTime(sprintf("Computing Footprints Bias %s of %s:", i, length(coverageFiles)),tstart,verbose=verbose) + .computeFootprintsBiasSingle(kmerTableList, coverageFiles[i]) + }, threads = threads) %>% SimpleList + return(out) +} + +.computeFootprintsBiasSingle <- function(kmerTableList, coverageFile){ + kmerTableList <- as(kmerTableList, "list") + oe <- h5read(coverageFile, "KmerBias/ObservedKmers") / h5read(coverageFile, "KmerBias/ExpectedKmers") + names(oe) <- h5read(coverageFile, "KmerBias/Kmer") + biasDF <- lapply(seq_along(kmerTableList), function(x){ + bias <- colSums(as.matrix(kmerTableList[[x]]) * as.vector(oe[rownames(kmerTableList[[x]])])) + bias <- bias / sum(bias) + bias + }) %>% Reduce("cbind", .) 
%>% data.frame
  gc()
  biasDF
}

# Compute footprints for every coverage file.
# Calls .computeFootprintsSingle once per entry of `coverageFiles` through
# .safelapply (using `threads`) and returns the results as a SimpleList, one
# element per coverage file in input order.
.computeFootprints <- function(featureList, coverageFiles, flank = 250, threads = 8, verbose = TRUE){
  tstart <- Sys.time()
  out <- .safelapply(seq_along(coverageFiles), function(i){
    .computeFootprintsSingle(featureList, coverageFiles[i], flank, gc = TRUE,
      pre = sprintf("Computing Footprints %s of %s:", i, length(coverageFiles)),
      tstart = tstart,
      verbose = verbose
    )
  }, threads = threads) %>% SimpleList
  return(out)
}

# Compute footprints for one coverage file.
#
# featureList  : list-like of ranges; each element is resized to its 1-bp center
#                and split by seqnames before being handed to the Rcpp routine.
# coverageFile : file read by .getCoverageRle.
# flank        : half-window; the full window is 2 * flank + 1.
# gc           : when TRUE, run gc() after every 25th feature set.
# pre          : prefix for progress messages.
# tstart       : start time forwarded to .messageDiffTime.
# verbose      : forwarded to .messageDiffTime.
#
# Returns a data.frame made by cbind-ing the per-feature results.
.computeFootprintsSingle <- function(featureList, coverageFile, flank = 250, gc = FALSE, pre = "", tstart, verbose = TRUE){
  window <- 2 * flank + 1
  featureList <- as(featureList, "list")
  nFeatures <- length(featureList)
  allChr <- lapply(featureList, function(x) unique(as.character(seqnames(x)))) %>% unlist %>% unique %>% sort
  cov <- .getCoverageRle(coverageFile, allChr)
  # Long lists report roughly every quarter; short lists report first and last only.
  reportEvery <- max(floor(nFeatures * .25), 1)
  footprintDF <- lapply(seq_along(featureList), function(x){
    featurex <- split(resize(featureList[[x]],1,"center"), seqnames(featureList[[x]]))
    outx <- ArchR:::rleSumsStranded(cov, featurex, window, as.integer) #Rcpp
    # `gc` here is the logical argument; the call gc() still resolves to the function.
    if(gc && x %% 25 == 0){
      gc()
    }
    if(nFeatures > 10){
      report <- x %% reportEvery == 0
    }else{
      report <- x == 1 || x == nFeatures
    }
    if(report){
      # Percent is computed directly from x / nFeatures. The previous formula
      # could print values above 100 or Inf, and passed the rounding digits to
      # sprintf instead of round.
      .messageDiffTime(sprintf("%s %s Percent Completed", pre, round(100 * x / nFeatures, 1)), tstart, verbose=verbose)
    }
    outx
  }) %>% Reduce("cbind",.) %>% data.frame
  gc()
  footprintDF
}

# Read per-chromosome coverage from `coverageFile`.
# For each name in `allChr`, reads "Coverage/<chr>/Lengths" and
# "Coverage/<chr>/Values" with h5read and builds an Rle; the result is an
# RleList named by `allChr`.
.getCoverageRle <- function(coverageFile, allChr){
  cov <- lapply(seq_along(allChr), function(x){
    Rle(
      lengths = h5read(coverageFile, paste0("Coverage/",allChr[x],"/Lengths")),
      values = h5read(coverageFile, paste0("Coverage/",allChr[x],"/Values"))
    )
  }) %>% {as(.,"RleList")}
  names(cov) <- allChr
  cov
}

# Build one k-mer position-frequency table per element of `featureList`.
# Each feature set is resized to (2 * flank + 1 + k) bp around its center,
# extracted from `genome`, and views whose width differs from that size are
# dropped before the Rcpp routine is called. Returns a SimpleList named like
# `featureList`.
.kmerPositionFrequency <- function(featureList, genome, flank = 250, k = 6, threads = 8, verbose = TRUE){

  tstart <- Sys.time()
  genome <- .validBSgenome(genome)
  window <- 2*flank + 1

  kmerList <- .safelapply(seq_along(featureList), function(i){
    .messageDiffTime(sprintf("Computing Kmer Tables for %s of %s features", i, length(featureList)), tstart, verbose=verbose)
    bsv <- BSgenomeViews(genome , resize(featureList[[i]], window + k, "center"))
    bsv <- bsv[width(bsv) == window + k] #none that are trimmed!
    #BSgenome is already stranded
    #kmerPositionFrequencyCpp is Rcpp export for getting kmer position frequencies from strings
    kmerTable <- ArchR:::kmerPositionFrequencyCpp(as.character(bsv), rep(1L,length(bsv)), window, k, .getKmers(k)) #Rcpp
    return(kmerTable)
  }, threads = threads) %>% SimpleList
  names(kmerList) <- names(featureList)

  .messageDiffTime("Finished Computing Kmer Tables", tstart)

  return(kmerList)
}

# Enumerate every string of length `k` over `letters`.
# Starts from the empty string and appends each letter k times, so k = 0
# returns '' and the result has length(letters)^k entries.
.getKmers <-function(k, letters = c('A','C','G','T')){
  kmers = ''
  for (i in seq_len(k)) {
    kmers <- unlist(lapply(kmers, function(x) paste0(x, letters)))
  }
  return(kmers)
}



diff --git a/R/GRangesUtils.R b/R/GRangesUtils.R new file mode 100644 index 00000000..10592d86 --- /dev/null +++ b/R/GRangesUtils.R @@ -0,0 +1,293 @@
#--------------------------------------------------------------------------------------------
# Helper Functions for GenomicRanges
#--------------------------------------------------------------------------------------------

#' Filters unwanted chr mainly underscores
#' @param x GRanges or something
#'   with seqlevels
#' @param remove remove vector
#' @param underscore remove all underscores?
#' @param standard keep standard chromosomes
#' @param pruning.mode passed to keepStandardChromosomes and to the seqlevels replacement
#' @export
keepFilteredChromosomes <- function(x, remove = c("chrM"), underscore = TRUE, standard = TRUE, pruning.mode="coarse"){
  #first we remove all non standard chromosomes
  if(standard){
    x <- GenomeInfoDb::keepStandardChromosomes(x, pruning.mode = pruning.mode)
  }
  #Then check for underscores or specified remove
  seqNames <- seqlevels(x)
  chrRemove <- c()
  #first we remove all chr with an underscore
  if(underscore){
    chrRemove <- c(chrRemove, which(grepl("_", seqNames)))
  }
  #next we remove all chr specified in remove
  chrRemove <- c(chrRemove, which(seqNames %in% remove))
  if(length(chrRemove) > 0){
    chrKeep <- seqNames[-chrRemove]
  }else{
    chrKeep <- seqNames
  }
  #this function restores seqlevels
  seqlevels(x, pruning.mode=pruning.mode) <- chrKeep
  return(x)
}

#' Instead of counting overlaps get columns like max score or etc in query
#' @param query granges query
#' @param subject granges subject
#' @param colname mcols(gr)[[colname]] cannot be null
#' @param ignore.strand passed to findOverlaps
#' @param decreasing for order
#' @return vector with one entry per query range; 0 where the query has no overlap
#' @export
columnOverlaps <- function(query, subject, colname = "score", ignore.strand = TRUE, decreasing = TRUE){
  #First get overlaps
  o <- data.frame(findOverlaps(query, subject, ignore.strand = ignore.strand))
  #Then append information
  o$col <- mcols(subject)[[colname]][o[,2]]
  #Order it by the factor to rank
  o <- o[order(o$col, decreasing = decreasing),]
  #Deduplicate
  o <- o[!duplicated(o$queryHits),]
  #Initialize
  val <- rep(0, length(query))
  #Fill Values
  val[o[,1]] <- o$col
  return(val)
}

#' Select a non-overlapping subset of ranges, ranked by a metadata column
#'
#' Ranges are repeatedly clustered into overlapping islands; the top-ranked range
#' of each island is kept and everything overlapping a kept range is discarded,
#' until no ranges remain.
#' @param gr ranges to filter; must carry the column named by `by` in mcols(gr)
#' @param by name of the mcols column used for ranking
#' @param decreasing passed to order() when ranking within an island
#' @param verbose print progress dots and the selected/total counts
#' @return the selected ranges, sorted
#' @export
nonOverlappingGRanges <- function(gr, by = "score",
  decreasing = TRUE, verbose = FALSE){

  stopifnot(by %in% colnames(mcols(gr)))
  gr <- .validGRanges(gr)

  #-----------
  # Cluster GRanges into islands using reduce and then select based on input
  #-----------
  # NOTE: `filter` is accepted but not referenced inside the helper.
  clusterGRanges <- function(gr, filter = TRUE, by = "score", decreasing = TRUE){
    gr <- sort(sortSeqlevels(gr))
    r <- GenomicRanges::reduce(gr, min.gapwidth=0L, ignore.strand=TRUE)
    o <- findOverlaps(gr,r, ignore.strand = TRUE)
    mcols(gr)$cluster <- subjectHits(o)
    gr <- gr[order(mcols(gr)[,by], decreasing = decreasing),]
    gr <- gr[!duplicated(mcols(gr)$cluster),]
    gr <- sort(sortSeqlevels(gr))
    mcols(gr)$cluster <- NULL
    return(gr)
  }

  if(verbose){
    message("Converging", appendLF = FALSE)
  }
  i <- 0
  grConverge <- gr
  # Start from an empty selection so an empty input returns an empty result
  # instead of failing on an undefined grAll after the loop.
  grAll <- gr[0]
  while(length(grConverge) > 0){
    if(verbose){
      message(".", appendLF = FALSE)
    }
    i <- i + 1
    grSelect <- clusterGRanges(
      gr = grConverge,
      filter = TRUE,
      by = by,
      decreasing = decreasing)

    grConverge <- subsetByOverlaps(
      grConverge,
      grSelect,
      invert=TRUE,
      ignore.strand = TRUE) #blacklist selected gr

    if(i == 1){ #if i=1 then set gr_all to clustered
      grAll <- grSelect

    }else{
      grAll <- c(grAll, grSelect)
    }

  }
  message(sprintf("Converged after %s iterations!", i))

  if(verbose){
    message("\nSelected ", length(grAll), " from ", length(gr))
  }
  grAll <- sort(sortSeqlevels(grAll))

  return(grAll)

}

#' Subset by Seqnames
#' @param gr grange
#' @param seqNames seqnames to keep
#' @param useNames if TRUE set seqlevels to `seqNames`, otherwise to the seqnames still present
#' @export
subsetSeqnames <- function(gr, seqNames, useNames = FALSE){
  gr <- .validGRanges(gr)
  gr <- gr[which(as.character(seqnames(gr)) %in% seqNames),]
  if(useNames){
    seqlevels(gr) <- seqNames
  }else{
    seqlevels(gr) <- as.character(unique(seqnames(gr)))
  }
  return(gr)
}

#' Add Seqlengths to genomic ranges
#' @param gr see .validGRanges
#' @param genome see .validBSgenome
#' @export
addSeqLengths <- function(gr, genome){
  gr <- .validGRanges(gr)
  # Use the dotted validator, matching .validGRanges above and the other
  # BSgenome call sites in the package.
  genome <- .validBSgenome(genome)
stopifnot(all(as.character(seqnames(gr)) %in% as.character(seqnames(genome)))) + seqlengths(gr) <- seqlengths(genome)[as.character(names(seqlengths(gr)))] + return(gr) +} + +#' Shuffle Genomic Ranges +#' @param subject see validGRanges +#' @param genome see validBSgenome +#' @param n nPermutations +#' @param shuffleChr shuffle across chromosomes randomly vs using previous knowledge of chromosome distribution +#' @export +shuffleGRanges <- function(subject, genome, n, shuffleChr=TRUE){ + #adapted from ChIPseeker's shuffle + cs <- getChromSizes(genome) + seqL <- seqlengths(cs) + seqL <- seqL[sort(names(seqL))] + sub <- subsetSeqnames(subject, seqNames = names(seqL)) #change + sub <- sub[order(as.character(seqnames(sub)))] + #stopifnot(identical(unique(as.character(seqnames(sub))), names(seqL))) + w <- width(sub) + name <- mcols(sub)$name + seqN <- as.character(seqnames(sub)) + if(shuffleChr){ + expected <- round(length(sub)*as.numeric(seqL)/sum(as.numeric(seqL))) + names(expected) <- names(seqL) + #hackish rounding correction + diff <- sum(length(sub))-sum(expected) + r <- sample(length(expected),1) + expected[r] <- expected[r] + diff + subPerChr <- expected + }else{ + subPerChr <- table(seqN) + } + pb <- txtProgressBar(min=0,max=100,initial=0,style=3) + grL <- lapply(seq_len(n), function(x){ + setTxtProgressBar(pb,round(x*100/n,0)) + rand <- sample(length(w)) + ws <- w[rand] + ns <- name[rand] + d <- lapply(seq_along(subPerChr), function(i){ + st_i <- sample(seqL[i],subPerChr[i]) + return(data.frame(seq = names(subPerChr)[i], start = st_i)) + }) %>% data.table::rbindlist(.) 
%>% data.frame + gr <- GRanges(seqnames=d[,1], ranges=IRanges(d[,2], width=ws), strand="*", name = ns) + suppressWarnings(seqlengths(gr) <- seqL) + gr <- trim(gr) + return(gr) + }) + grL <- GenomicRangesList(grL) + return(grL) +} + +#' Merge Genomic Ranges +#' @param gr see validGRanges +#' @param ignore.strand ignore strandedness for merging +#' @export +mergeGRanges <- function(gr, ignore.strand = TRUE){ + gr <- .validGRanges(gr) + grR <- reduce(gr,min.gapwidth=0L,ignore.strand = ignore.strand) + o <- DataFrame(findOverlaps(grR, gr,ignore.strand = ignore.strand)) + o$start <- start(gr[o$subjectHits]) + o$end <- end(gr[o$subjectHits]) + o$chr <- seqnames(gr[o$subjectHits]) + os <- o[order(o$start,decreasing = FALSE),] %>% {.[!duplicated(.$queryHits),c("queryHits", "start")]} + oe <- o[order(o$end,decreasing = TRUE),] %>% {.[!duplicated(.$queryHits),c("queryHits", "end")]} + oc <- o[!duplicated(o$queryHits),c("queryHits", "chr")] + df <- merge(merge(oc, os, by = "queryHits"),oe, by = "queryHits") + mGR <- GRanges(df[,2], ranges = IRanges(df[,3], df[,4])) %>% sortSeqlevels %>% sort + return(mGR) +} + +#' Merge Genomic Ranges +#' @param query see validGRanges +#' @param subject see validGRanges +#' @param ignore.strand ignore strandedness for overlaps +#' @export +extendGRanges <- function(x, upstream, downstream){ + #https://bioinformatics.stackexchange.com/questions/4390/expand-granges-object-different-amounts-upstream-vs-downstream + isMinus <- BiocGenerics::which(strand(x) == "-") + isOther <- BiocGenerics::which(strand(x) != "-") + #Forward + start(x)[isOther] <- start(x)[isOther] - upstream + end(x)[isOther] <- end(x)[isOther] + downstream + #Reverse + end(x)[isMinus] <- end(x)[isMinus] + upstream + start(x)[isMinus] <- start(x)[isMinus] - downstream + return(x) +} + +#' Merge Genomic Ranges +#' @param query see validGRanges +#' @param subject see validGRanges +#' @param ignore.strand ignore strandedness for overlaps +#' @export +overlappingBP <- 
function(query, subject, ignore.strand = TRUE){ + query <- .validGRanges(query) + subject <- .validGRanges(subject) + o <- findOverlaps(query, subject, ignore.strand = ignore.strand) + overlaps <- pintersect(query[queryHits(o)], subject[subjectHits(o)]) + percentOverlap <- width(overlaps) / width(subject[subjectHits(o)]) + l <- unlist(lapply(split(percentOverlap, subjectHits(o)),function(x)sum(x))) + perOverlap <- sum(l*width(subject[unique(subjectHits(o))]))/sum(width(subject)) + nBP <- perOverlap * sum(width(subject)) + type <- c("queryBP", "sharedBP", "subjectBP") + nBases <- c(sum(width(query))-nBP, nBP, sum(width(subject))-nBP) + return(data.frame(type,nBases)) +} + +#' Overlaps Many includes information from mcols(gr) +#' @param query see validGRanges +#' @param subject see validGRanges +#' @param by column in subject to split overlaps by +#' @param ignore.strand ignore strandedness for overlaps +#' @export +overlapsMany <- function(query, subject, by, ignore.strand = TRUE){ + o <- DataFrame(findOverlaps(query, subject, ignore.strand = ignore.strand)) + o$name <- mcols(subject)[o$subjectHits,by] + o$id <- match(o$name, unique(o$name)) + sparse <- Matrix::sparseMatrix( + i=o[,1], + j=o[,4], + x=rep(TRUE,nrow(o)), + dims=c(length(query),length(unique(o$name))) + ) + colnames(sparse) <- unique(o$name) + return(sparse) +} + +#' Construct GRanges seqnames start end accounting for ends before starts (adding strandedness) +#' @param seqnames seqnames of GRanges +#' @param start start of GRanges +#' @param end end of GRanges +#' @param ignore.strand ignore strandedness for overlaps +#' @export +constructGRanges <- function(seqnames, start, end, ignore.strand = TRUE){ + df <- data.frame(seqnames, start, end) + idx <- which(df[,2] > df[,3]) + df[idx,2:3] <- df[idx,3:2] + if(!ignore.strand){ + strand <- rep("+",nrow(df)) + strand[idx] <- "-" + }else{ + strand <- rep("*",nrow(df)) + } + gr <- GRanges(df[,1], IRanges(df[,2],df[,3]), strand = strand) + return(gr) +} + diff 
--git a/R/GgplotHelper.R b/R/GgplotHelper.R new file mode 100644 index 00000000..dca9a0ba --- /dev/null +++ b/R/GgplotHelper.R @@ -0,0 +1,587 @@
#' GG Plot Scatter
#'
#' Scatter plot of y against x, optionally colored by a discrete or continuous
#' variable or by 2D point density. Non-finite points are dropped with a message.
#'
#' @param x numeric x values
#' @param y numeric y values, same length as x
#' @param color optional per-point values used for coloring
#' @param discrete treat `color` as discrete (converted to a factor)
#' @param discreteSet palette set passed to paletteDiscrete when `pal` is NULL
#' @param labelMeans label each discrete group at its mean x/y position
#' @param continuousSet palette set passed to paletteContinuous when `pal` is NULL
#' @param pal optional color palette overriding the sets above
#' @param colorDensity color points by getDensity(x, y); forces a continuous scale
#' @param size geom_point size
#' @param xlim x limits; default is the data range widened by `extend`
#' @param ylim y limits; default is the data range widened by `extend`
#' @param extend fraction passed to extendrange
#' @param xlabel xlabel
#' @param randomize plot points in random order
#' @param seed seed set before shuffling when `randomize` is TRUE
#' @param ylabel ylabel
#' @param title ggtitle
#' @param alpha geom_point alpha
#' @param baseSize base_font size
#' @param ratioYX plot aspect ratio, applied on top of the axis ranges
#' @param labelType "ggrepel"/"repel" or "shadow"/"shadowtext"
#' @param bgColor background color for shadow text labels
#' @param fgColor optional label color
#' @param labelSize label size
#' @param addFit optional method passed to geom_smooth; also appends correlations to the title
#' @param nullColor point color when no coloring is requested
#' @param rastr rasterize points with ggrastr when it is installed
#' @param dpi raster resolution
#' @export
ggPoint <- function(x, y, color = NULL, discrete = TRUE, discreteSet = "stallion",
    labelMeans = FALSE, continuousSet = "solar_extra", pal = NULL, colorDensity = FALSE,
    size = 1, xlim = NULL, ylim = NULL, extend = 0.05, xlabel = "x", randomize = FALSE, seed = 1,
    ylabel = "y", title = "", alpha = 1, baseSize = 6, ratioYX = 1,
    labelType = "ggrepel", bgColor = "white", fgColor = NULL, labelSize = 1.5,
    addFit = NULL, nullColor = "lightGrey", rastr = FALSE, dpi = 300){

    stopifnot(is.numeric(x))
    stopifnot(is.numeric(y))
    stopifnot(length(y)==length(x))

    df <- data.frame(x = x, y = y)
    include <- which(is.finite(x) & is.finite(y))
    if(length(include) != length(x)){
      message("Some values are not finite! Excluding these points!")
      df <- df[include,]
      x <- x[include]
      y <- y[include]
      if(!is.null(color)){
        color <- color[include]
      }
    }

    # The plotting order is built after filtering so it can never index past
    # nrow(df). Building it from the unfiltered length added NA rows whenever
    # non-finite points were dropped.
    if(randomize){
      set.seed(seed)
      idx <- sample(seq_along(x), length(x))
    }else{
      idx <- seq_along(x)
    }

    if (is.null(xlim)) {
        xlim <- range(df$x) %>% extendrange(f = extend)
    }
    if (is.null(ylim)) {
        ylim <- range(df$y) %>% extendrange(f = extend)
    }
    ratioXY <- ratioYX * diff(xlim)/diff(ylim)

    #Plot
    library(ggplot2)

    if (is.null(color) && !colorDensity) {

      p <- ggplot(df[idx,], aes(x = x, y = y)) + coord_equal(ratio = ratioXY, xlim = xlim,
          ylim = ylim, expand = F) + xlab(xlabel) + ylab(ylabel) +
          ggtitle(title) + theme_ArchR(baseSize = baseSize)

      if(rastr){
        if(!requireNamespace("ggrastr", quietly = TRUE)){
          message("ggrastr is not available for rastr of points, continuing without rastr!")
          p <- p + geom_point(size = size, alpha = alpha, color = nullColor)
        }else{
          .requirePackage("ggrastr")
          p <- p + geom_point_rast(size = size, raster.dpi = dpi, alpha = alpha, color = nullColor)
        }
      }else{
        p <- p + geom_point(size = size, alpha = alpha, color = nullColor)
      }

    }else {

        if(colorDensity){
          discrete <- FALSE
          df <- getDensity(x, y, n = 100, sample = NULL) #change
          df <- df[order(df$density), ,drop=FALSE]
          df$color <- df$density
        }else if(discrete){
          stopifnot(length(color) == nrow(df))
          df$color <- factor(color, levels = sort(unique(color)))
        }else {
          stopifnot(length(color) == nrow(df))
          df$color <- color
        }
        p <- ggplot(df[idx,], aes(x = x, y = y, color = color)) + coord_equal(ratio = ratioXY, xlim = xlim,
            ylim = ylim, expand = F) + xlab(xlabel) + ylab(ylabel) +
            ggtitle(title) + theme_ArchR(baseSize = baseSize) +
            theme(legend.direction="horizontal" , legend.box.background = element_rect(color = NA))

        if(rastr){

          if(!requireNamespace("ggrastr", quietly = TRUE)){
            message("ggrastr is not available for rastr of points, continuing without rastr!")
            p <- p + geom_point(size = size, alpha = alpha)
          }else{
            .requirePackage("ggrastr")
            p <- p + geom_point_rast(size = size, raster.dpi = dpi, alpha = alpha)
          }

        }else{

            p <- p + geom_point(size = size, alpha = alpha)

        }

        if (discrete) {

            if (!is.null(pal)) {
                p <- p + scale_color_manual(values = pal)
            }else {
                p <- p + scale_color_manual(values = paletteDiscrete(set = discreteSet,
                  values = sort(unique(color))))
            }

            if (labelMeans) {
                # One row per group: mean x, mean y and the group's color value.
                dfMean <- split(df, df$color) %>% lapply(., function(x) {
                  data.frame(x = mean(x[, 1]), y = mean(x[, 2]),
                    color = x[1, 3])
                }) %>% Reduce("rbind", .)

                if(tolower(labelType) == "repel" | tolower(labelType) == "ggrepel"){

                  if(!is.null(fgColor)){
                    p <- p + ggrepel::geom_label_repel(data = dfMean, aes(x, y, label = color), color = fgColor, size = labelSize)
                  }else{
                    p <- p + ggrepel::geom_label_repel(data = dfMean, aes(x, y, label = color), size = labelSize)
                  }

                }else if(tolower(labelType) == "shadow" | tolower(labelType) == "shadowtext"){

                  if(!is.null(fgColor)){
                    p <- p + geom_shadowtext(data = dfMean, aes(x, y, label = color), color = fgColor, bg.colour = bgColor, size = labelSize)
                  }else{
                    p <- p + geom_shadowtext(data = dfMean, aes(x, y, label = color), bg.colour = bgColor, size = labelSize)
                  }

                }else{
                  stop("Error unrecognized label type!")
                }
            }

        }else{

            if (!is.null(pal)) {
                p <- p + scale_colour_gradientn(colors = pal)
            }else {
                p <- p + scale_colour_gradientn(colors = paletteContinuous(set = continuousSet))
            }
        }

    }
    if (!is.null(addFit)) {
        p <- p + geom_smooth(data = df, aes(color = NULL), method = addFit,
            color = "black") + ggtitle(paste0(title, "\nPearson = ",
            round(cor(df$x, df$y), 3), "\nSpearman = ", round(cor(df$x,
                df$y, method = "spearman"), 3)))
    }
    return(p)
}

#' GG Plot One to One Heatscatter
#'
#' @param x x
#' @param y y
#' @param size geom_point size
#' @param alpha geom_point alpha
#' @param xlabel xlabel
#' @param ylabel ylabel
#' @param title ggtitle
#' @param min xmin quantile [0,1]
#' @param max xmax quantile [0,1]
#' @param nPlot
#'   number of points to plot
#' @param nKernel n for MASS::kde2d default = 100
#' @param densityMax quantile at which point densities are capped, forwarded to getDensity
#' @param extend fraction passed to extendrange when computing the shared axis limits
#' @param baseSize base_font size default is 12
#' @param pal continuous color palette to use
#' @export
#'
ggOneToOne <- function (x, y, nPlot = 100 * 10^3,
  nKernel = 100, size = 2,
  xlabel = "x", ylabel = "y", title = "Sample Correlation",
  min = 0.1, max = 0.9999,
  densityMax = 0.95, extend = 0.05,
  alpha = 1, baseSize = 12,
  pal = paletteContinuous(set = "viridis")){

  #Check is Numeric
  stopifnot(is.numeric(x))
  stopifnot(is.numeric(y))

  #Check for NA
  idx <- which(!is.na(x) & !is.na(y) & !is.infinite(x) & !is.infinite(y))
  x <- x[idx]
  y <- y[idx]

  #Shared limits for both axes
  lim <- quantile(c(x, y), c(min, max)) %>% extendrange(f = extend)

  #Calculate Correlations
  pearson <- round(cor(x, y, method = "pearson", use = "complete"), 3)
  spearman <- round(cor(x, y, method = "spearman", use = "complete"), 3)
  title <- sprintf("%s \nPearson = %s , Spearman = %s", title, pearson, spearman)

  #Get Density
  message("adding denisty...")
  # densityMax is forwarded so the argument takes effect; the default equals
  # getDensity's own default.
  df <- getDensity(x, y, n = nKernel, sample = nPlot, densityMax = densityMax) #change
  df <- df[order(df[, "density"]), ]

  #GGPlot
  message("plotting...")
  # ggPoint is the scatter helper defined in this file; every argument below is
  # one of its parameters.
  gg <- ggPoint(
      x = df$x,
      y = df$y,
      color = df$density,
      pal = pal,
      xlabel = xlabel,
      ylabel = ylabel,
      discrete = FALSE,
      xlim = lim,
      ylim = lim,
      size = size,
      alpha = alpha,
      title = title,
      baseSize = baseSize
    ) + geom_abline(slope = 1, intercept = 0, lty = "dashed")
  return(gg)
}

#modified from http://slowkow.com/notes/ggplot2-color-by-density/
# Returns data.frame(x, y, density): the MASS::kde2d estimate looked up at each
# point's grid cell, capped at the `densityMax` quantile of the unique
# densities. When `sample` is not NULL the rows are passed through nSample.
getDensity <- function(x, y, n = 100, sample = NULL, densityMax = 0.95){
  df <- data.frame(x=x,y=y)
  dens <- MASS::kde2d(x = x, y = y, n = n)
  ix <- findInterval(x, dens$x)
  iy <- findInterval(y, dens$y)
  ii <- cbind(ix, iy)
  df$density <- dens$z[ii]
  df$density[df$density > quantile(unique(df$density),densityMax)] <- quantile(unique(df$density),densityMax) #make sure the higher end doesnt bias colors
  if(!is.null(sample)){
    df <- nSample(df,sample,type="r")
  }
  return(df)
}

#' GG Violin Plot
#'
#' @param x categorical values to each y value
#' @param y numeric values
#' @param base_size not referenced in the body; the theme uses `baseSize`
#' @param xlabel xlabel
#' @param ylabel ylabel
#' @param points overlay a random subset of points (needs ggrastr)
#' @param baseSize base font size passed to theme_ArchR
#' @param ratioYX aspect ratio passed to coord_fixed
#' @param sampleRatio fraction of rows drawn when `points` is TRUE
#' @param size size of boxplot lines
#' @param title ggtitle
#' @param pal color palette see paletteDiscrete for examples
#' @export
#'
ggViolin <- function (x, y, base_size = 12, xlabel = NULL, ylabel = NULL, points = FALSE, baseSize = 6, ratioYX = 1,
  sampleRatio = 0.1, size = 1, title = "", pal = paletteDiscrete(values=x, set = "stallion")) {
  stopifnot(!is.numeric(x))
  stopifnot(is.numeric(y))
  # Kept from the original; the per-group summaries that used these names were
  # never read and have been removed.
  names(y) <- x
  df <- data.frame(x, y)
  df$x <- factor(df$x, gtools::mixedsort(unique(paste0(df$x))))
  p <- ggplot(df, aes_string(x = "x", y = "y", color = "x")) + coord_fixed(ratioYX, expand = TRUE) +
    geom_violin(aes_string(fill="x"), alpha = 0.35) +
    geom_boxplot(size = size, outlier.size = 0, outlier.stroke = 0, fill = NA) +
    scale_color_manual(values = pal, guide = FALSE) +
    scale_fill_manual(values = pal, guide = FALSE) +
    theme_ArchR(xText90 = TRUE, baseSize = baseSize) +
    ggtitle(title)

  if(points){
    if(requireNamespace("ggrastr", quietly = TRUE)){
      .requirePackage("ggrastr")
      p <- p + ggrastr::geom_quasirandom_rast(data = df[sample(seq_len(nrow(df)), floor(nrow(df) * sampleRatio)),], alpha = 1,
            aes(x = x, y = y, color = x, fill = x),
            size = 0.5, dodge.width=1)
    }else{
      message("ggrastr is not available for rastr of points, continuing without points!")
    }
  }

  if (!is.null(xlabel)) {
    p <- p + xlab(xlabel)
  }
  if (!is.null(ylabel)) {
    p <- p + ylab(ylabel)
  }
  return(p)
}


#' GG Violin Plot
#'
#' @param x categorical values to each y value
#' @param y numeric values
#' @param
xlabel xlabel +#' @param ylabel ylabel +#' @param base_size base_size of theme +#' @param size size of barplot lines +#' @param pal color palette see paletteDiscrete for examples +#' @export +#' +ggHex <- function(x, y, color, extend = 0.05, ratioYX = 1, xlim = NULL, ylim = NULL, + bins = 150, pal = paletteContinuous(set = "solar_extra"), title = "", baseSize = 12, + xlabel = "x" , ylabel ="y", fun = "median", ...){ + + df <- data.frame(x = x, y = y) + include <- which(is.finite(x) & is.finite(y)) + + if(length(include) != length(x)){ + message("Some values are not finite! Excluding these points!") + df <- df[include,] + if(!is.null(color)){ + color <- color[include] + } + } + df$color <- color + + if (is.null(xlim)) { + xlim <- range(df$x) %>% extendrange(f = extend) + } + if (is.null(ylim)) { + ylim <- range(df$y) %>% extendrange(f = extend) + } + ratioXY <- ratioYX * diff(xlim)/diff(ylim) + + p <- ggplot() + + stat_summary_hex(data = df, aes(x=x,y=y,z=color), fun = fun, bins = bins, color = NA) + + scale_fill_gradientn(colors = pal) + + xlab(xlabel) + + ylab(ylabel) + + ggtitle(title) + + theme_ArchR(baseSize = baseSize) + + coord_equal(ratio = ratioXY, xlim = xlim, ylim = ylim, expand = FALSE) + + theme(legend.direction="horizontal", legend.box.background = element_rect(color = NA)) + + p + +} + + +#' GG Plot One to One Heatscatter +#' +#' @param x x +#' @param y y +#' @param size geom_point size +#' @param alpha geom_point alpha +#' @param xlabel xlabel +#' @param ylabel ylabel +#' @param title ggtitle +#' @param min xmin quantile [0,1] +#' @param max xmax quantile [0,1] +#' @param plot_n number of points to plot +#' @param kernel_n n for MASS::kde2d default = 100 +#' @param plot_n number of points to plot +#' @param baseSize base_font size +#' @param pal continuous color palette to use +#' @export +ggLine <- function(x, y, color = NULL, discrete = TRUE, discreteSet = "stallion", + continuousSet = "solar_extra", pal = NULL, size = 1, xlim = NULL, ylim = NULL, 
+ extend = 0.05, xlabel = "x", ylabel = "y", title = "", + alpha = 1, baseSize = 6, ratioYX = 1, + nullColor = "lightGrey"){ + + stopifnot(is.numeric(x)) + stopifnot(is.numeric(y)) + stopifnot(length(y)==length(x)) + + df <- data.frame(x = x, y = y) + include <- which(is.finite(x) & is.finite(y)) + if(length(include) != length(x)){ + message("Some values are not finite! Excluding these points!") + df <- df[include,] + x <- x[include] + y <- y[include] + if(!is.null(color)){ + color <- color[include] + } + } + if (is.null(xlim)) { + xlim <- range(df$x) %>% extendrange(f = extend) + } + if (is.null(ylim)) { + ylim <- range(df$y) %>% extendrange(f = extend) + } + ratioXY <- ratioYX * diff(xlim)/diff(ylim) + + #Plot + library(ggplot2) + + if (is.null(color)) { + + p <- ggplot(df, aes(x = x, y = y)) + coord_equal(ratio = ratioXY, xlim = xlim, + ylim = ylim, expand = F) + xlab(xlabel) + ylab(ylabel) + + ggtitle(title) + theme_ArchR(baseSize = baseSize) + + p <- p + geom_line(size = size, alpha = alpha, color = nullColor) + + }else { + + if(discrete){ + stopifnot(length(color) == nrow(df)) + df$color <- factor(color, levels = sort(unique(color))) + }else { + stopifnot(length(color) == nrow(df)) + df$color <- color + } + p <- ggplot(df, aes(x = x, y = y, color = color)) + + coord_equal(ratio = ratioXY, xlim = xlim, + ylim = ylim, expand = F) + xlab(xlabel) + ylab(ylabel) + + ggtitle(title) + theme_ArchR(baseSize = baseSize) + + theme(legend.direction="horizontal" , legend.box.background = element_rect(color = NA)) + + p <- p + geom_line(size = size, alpha = alpha) + + if (discrete) { + if (!is.null(pal)) { + p <- p + scale_color_manual(values = pal) + }else { + p <- p + scale_color_manual(values = paletteDiscrete(set = discreteSet, + values = sort(unique(color)))) + } + + }else { + if (!is.null(pal)) { + p <- p + scale_colour_gradientn(colors = pal) + }else { + p <- p + scale_colour_gradientn(colors = paletteContinuous(set = continuousSet)) + } + } + } + + return(p) +} + 
#' Align GG Plots
#'
#' Converts the plots to grobs (unless grobs are supplied), binds them into one
#' gtable and draws it on a new grid page.
#' @param ... ggplots
#' @param sizes sizes are a vector or list of values for each ggplot ie c(1,1) for two plots
#' @param type v,vertical or h,horizontal
#' @param plotList optional list of ggplots used instead of `...`
#' @param grobList optional list of grobs; when given, `...` and `plotList` are ignored
#' @export
#'
ggAlignPlots <- function(..., sizes, type = "v",  plotList = NULL, grobList = NULL){

  #http://stackoverflow.com/a/21503904

  .requirePackage("gtable")

  if(is.null(grobList)){

    if(is.null(plotList)){
      plotList <- list(...)
    }

    ## test that only passing plots
    stopifnot(do.call(all, lapply(plotList, inherits, "gg")))

    gl <- lapply(plotList, ggplotGrob)

  }else{

    gl <- grobList
    rm(grobList)
    gc()

  }

  #if ncols do not match fill with empty gtables_add_cols
  if(type == "v" | type == "vertical"){
    maxCol <- max(unlist(lapply(gl, ncol)))
    gl <- lapply(gl, function(x){
      while(ncol(x) < max(maxCol)){
        x <- gtable::gtable_add_cols(x, unit(1, "null"))
      }
      return(x)
    })
  }

  # Fold the grobs left to right: rbind for vertical layouts, cbind otherwise.
  combined <- Reduce(function(x, y)
    if(type == "v" | type == "vertical"){
      gtable:::rbind_gtable(x,y,"first")
    }else{
      gtable:::cbind_gtable(x,y,"first")
    }, gl[-1], gl[[1]])

  if(type == "v" | type == "vertical"){
    # Every column takes the widest width found across the stacked grobs.
    combined$widths <- do.call(grid::unit.pmax, lapply(gl, "[[", "widths"))
    #remove vertical spaces from background layout
    combined$heights[combined$layout$t[grepl("background", combined$layout$name)][-1]] <- grid::unit(rep(0,length(combined$heights[combined$layout$t[grepl("background", combined$layout$name)][-1]])), "cm")
    if(!missing(sizes)){
      # Each plot's size is split evenly across its panel rows.
      sList <- lapply(seq_along(gl), function(x){
        orig <- gl[[x]]$heights[gl[[x]]$layout$t[grepl("panel", gl[[x]]$layout$name)]]
        new <- rep(sizes[[x]]/length(orig),length(orig))
        return(new)
      })
      s <- grid::unit(unlist(sList), "null")
      combined$heights[combined$layout$t[grepl("panel", combined$layout$name)]] <- s
    }
  }else if(type == "h" | type == "horizontal"){
    # Every row takes the tallest height found across the side-by-side grobs.
    combined$heights <- do.call(grid::unit.pmax, lapply(gl, "[[", "heights"))
    if(!missing(sizes)){
      # Each plot's size is split evenly across its panel columns.
      sList <- lapply(seq_along(gl), function(x){
        orig <- gl[[x]]$widths[gl[[x]]$layout$l[grepl("panel", gl[[x]]$layout$name)]]
        new <- rep(sizes[[x]]/length(orig),length(orig))
        return(new)
      })
      s <- grid::unit(unlist(sList), "null")
      combined$widths[combined$layout$l[grepl("panel", combined$layout$name)]] <- s
    }
  }else{
    stop("Unrecognized type ", type)
  }
  grid::grid.newpage()
  grid::grid.draw(combined)

}

#' ggplot2 default theme for ArchR
#'
#' This function returns a ggplot2 theme that is black bordered with black font.
#'
#' @param color color of theme
#' @param baseSize is the size of the font for the axis text and title
#' @param baseFamily is family for font (NOTE(review): not referenced in the body)
#' @param baseLineSize is the size of line
#' @param baseRectSize is the size of rectangle boxes
#' @param plotMarginCm plot margin in cm
#' @param legendPosition where is the legend default bottom (NOTE(review): not referenced in the body)
#' @param legendTextSize legend text size
#' @param axisTickCm axis tick length in cm
#' @param xText90 rotate x axis text 90 degrees
#' @param yText90 rotate y axis text 90 degrees
#' @param ... accepted but not referenced in the body
#' @export
theme_ArchR <- function(
  color = "black",
  baseSize = 6,
  baseFamily = "",
  baseLineSize = 0.5,
  baseRectSize = 0.5,
  plotMarginCm = 1,
  legendPosition = "bottom",
  legendTextSize = 5,
  axisTickCm = 0.1,
  xText90 = FALSE,
  yText90 = FALSE,
  ...
  ){
  theme <- theme_bw() + theme(
      axis.text = element_text(color = color, size = baseSize),
      axis.title = element_text(color = color, size = baseSize),
      title = element_text(color = color, size = baseSize),
      plot.margin = unit(c(plotMarginCm, plotMarginCm, plotMarginCm, plotMarginCm), "cm"),
      panel.background = element_rect(fill = "transparent", colour = NA),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_rect(fill = NA, color = color, size = (4/3) * baseRectSize * as.numeric(grid::convertX(grid::unit(1, "points"), "mm"))),
      axis.ticks.length = unit(axisTickCm, "cm"),
      axis.ticks = element_line(color = color, size = baseLineSize * (4/3) * as.numeric(grid::convertX(grid::unit(1, "points"), "mm"))),
      legend.key = element_rect(fill = "transparent", colour = NA),
      legend.text = element_text(color = color, size = legendTextSize),
      legend.background = element_rect(fill = "transparent"),
      legend.box.background = element_rect(fill = "transparent"),
      strip.text = element_text(size = baseSize, color="black"),
      plot.background = element_rect(fill = "transparent", color = NA)
    )
  if(xText90){
    theme <- theme %+replace% theme(axis.text.x = element_text(angle = 90, hjust = 1))
  }
  if(yText90){
    theme <- theme %+replace% theme(axis.text.y = element_text(angle = 90, vjust = 1))
  }
  return(theme)
}



diff --git a/R/GroupCoverages.R b/R/GroupCoverages.R new file mode 100644 index 00000000..5fe3f9ca --- /dev/null +++ b/R/GroupCoverages.R @@ -0,0 +1,578 @@
#' Add Group Coverages to ArchR Project
#'
#' This function will merge cells within each group into an insertion
#' coverage file
#'
#' @param ArchRProj ArchRProject
#' @param groupBy group cells by this column in cellColData
#' @param useLabels use sample labels to create sample guided subgroupings as pseudo replicates
#' @param minCells minimum cells per group for coverage files
#' @param maxCells maximum cells per group for
coverage files +#' @param maxFragments maximum fragments per group for coverage files (this prevents large files created for optimizing memory) +#' @param minReplicates minimum replicates for group for coverage files +#' @param maxReplicates maximum replicates for group for coverage files +#' @param sampleRatio sampling ratio for pseudo replicates when needed +#' @param kmerLength kmer length for adding Tn5 bias estimation +#' @param threads number of threads +#' @param parallelParam parallel parameters for batch style execution +#' @param force force creating coverage files if existed +#' @param verboseHeader verbose sections +#' @param verboseAll verbose sections and subsections +#' @param ... additional args +#' @export +addGroupCoverages <- function( + ArchRProj, + groupBy = "Clusters", + useLabels = TRUE, + minCells = 40, + maxCells = 500, + maxFragments = 25*10^6, + minReplicates = 2, + maxReplicates = 5, + sampleRatio = 0.8, + kmerLength = 6, + threads = 16, + parallelParam = "mclapply", + force = FALSE, + verboseHeader = TRUE, + verboseAll = FALSE, + ... 
+ ){ + + if(verboseAll){ + verboseHeader <- TRUE + } + + tstart <- Sys.time() + Params <- SimpleList( + groupBy=groupBy, + minCells=minCells, + maxCells=maxCells, + minReplicates=minReplicates, + sampleRatio=sampleRatio, + kmerLength = kmerLength + ) + + if(is.null(ArchRProj@projectMetadata$GroupCoverages)){ + ArchRProj@projectMetadata$GroupCoverages <- SimpleList() + } + + if(!is.null(ArchRProj@projectMetadata$GroupCoverages[[groupBy]])){ + if(!force){ + stop("Group Coverages Already Computed, Set force = TRUE to continue!") + } + } + + ##################################################### + #Groups + ##################################################### + cellNames <- rownames(getCellColData(ArchRProj)) + groups <- getCellColData(ArchRProj, groupBy, drop = TRUE) + if(any(is.na(groups))){ + cellNames <- cellNames[!is.na(groups)] + groups <- groups[!is.na(groups)] + } + uniqueGroups <- gtools::mixedsort(unique(groups)) + tableGroups <- table(groups)[uniqueGroups] + + ##################################################### + #Create Cell Groups + ##################################################### + cellGroups <- lapply(seq_along(uniqueGroups), function(x){ + subColDat <- getCellColData(ArchRProj)[which(groups==uniqueGroups[x]),] + cellNamesx <- rownames(subColDat) + if(length(cellNamesx) < minCells){ + return(NULL) + } + if(useLabels){ + sampleLabelsx <- paste0(subColDat$Sample) + }else{ + sampleLabelsx <- NULL + } + outListx <- .identifyGroupsForPseudoBulk( + cells = cellNamesx, + sampleLabels = sampleLabelsx, + useLabels = useLabels, + minCells = minCells, + maxCells = maxCells, + minReplicates = minReplicates, + sampleRatio = sampleRatio + ) + if(is.null(outListx)){ + return(NULL) + } + if(is.null(names(outListx))){ + names(outListx) <- paste0("Rep", seq_along(outListx)) + }else if(any(names(outListx)=="")){ + names(outListx)[which(names(outListx)=="")] <- paste0("Rep", which(names(outListx)=="")) + } + outListx + }) %>% SimpleList + names(cellGroups) <- 
uniqueGroups + Params$cellGroups <- cellGroups + + ##################################################### + #Check For Max Fragments! + ##################################################### + it <- 0 + for(i in seq_along(cellGroups)){ + for(j in seq_along(cellGroups[[i]])){ + if(sum(getCellColData(ArchRProj, "nFrags")[cellGroups[[i]][[j]],]) > maxFragments){ + it <- it + 1 + nFrags <- getCellColData(ArchRProj, "nFrags")[cellGroups[[i]][[j]],] + cells <- cellGroups[[i]][[j]][order(nFrags)] + nFrags <- nFrags[order(nFrags)] + cellGroups[[i]][[j]] <- cells[which(cumsum(nFrags) < maxFragments)] + } + } + } + if(it > 0){ + .messageDiffTime(sprintf("Further Sampled %s Groups above the Max Fragments!", it), tstart) + } + + ##################################################### + # Arguments for Coverages + ##################################################### + + dir.create(file.path(getOutputDirectory(ArchRProj), "GroupCoverages"), showWarnings = FALSE) + dir.create(file.path(getOutputDirectory(ArchRProj), "GroupCoverages", groupBy), showWarnings = FALSE) + + args <- list() + args$X <- seq_along(unlist(cellGroups)) + args$FUN <- .createCoverages + args$cellGroups <- unlist(cellGroups) + args$genome <- getGenome(ArchRProj) + args$kmerLength <- kmerLength + args$ArrowFiles <- getArrowFiles(ArchRProj) + args$availableChr <- .availableSeqnames(getArrowFiles(ArchRProj)) + args$chromLengths <- getChromLengths(ArchRProj) + args$cellsInArrow <- split(rownames(getCellColData(ArchRProj)), getCellColData(ArchRProj)$Sample) + args$covDir <- file.path(getOutputDirectory(ArchRProj), "GroupCoverages", groupBy) + args$parallelParam <- parallelParam + args$threads <- threads + args$verbose <- verboseAll + args$tstart <- tstart + args$registryDir <- file.path(getOutputDirectory(ArchRProj), "GroupCoverages", "batchRegistry") + + ##################################################### + # Batch Apply to Create Insertion Coverage Files + ##################################################### + + 
#Disable Hdf5 File Locking + h5disableFileLocking() + + #Batch Apply + .messageDiffTime(sprintf("Creating Coverage Files!"), tstart, addHeader = verboseAll) + batchOut <- .batchlapply(args) + coverageFiles <- lapply(seq_along(batchOut),function(x) batchOut[[x]]$covFile) %>% unlist + nCells <- lapply(seq_along(batchOut),function(x) batchOut[[x]]$nCells) %>% unlist + nFragments <- lapply(seq_along(batchOut),function(x) batchOut[[x]]$nFragments) %>% unlist + + #Enable Hdf5 File Locking + h5enableFileLocking() + + #Add To Project + coverageMetadata <- DataFrame( + Group = stringr::str_split(names(unlist(cellGroups)), pattern = "\\.", simplify=TRUE)[,1], + Name = names(unlist(cellGroups)), + File = coverageFiles, + nCells = nCells, + nInsertions = nFragments * 2 + ) + + ##################################################### + # Compute Kmer Bias for each coverage file! + ##################################################### + .messageDiffTime(sprintf("Adding Kmer Bias to Coverage Files!"), tstart, addHeader = verboseAll) + o <- .addKmerBiasToCoverage( + coverageMetadata = coverageMetadata, + genome = getGenome(ArchRProj), + kmerLength = kmerLength, + threads = threads, + verbose = verboseAll + ) + + ArchRProj@projectMetadata$GroupCoverages[[groupBy]] <- SimpleList(Params = Params, coverageMetadata = coverageMetadata) + + ArchRProj + +} + +##################################################################################################### +# Creating Insertion (1bp) Coverage Hdf5 Files for downstream group analyses +##################################################################################################### + +.createCoverages <- function( + i, + cellGroups, + kmerBias = NULL, + kmerLength = 5, + genome, + ArrowFiles, + cellsInArrow, + availableChr, + chromLengths, + covDir, + tstart, + verbose = TRUE, + ... + ){ + + #Cells + cellGroupi <- cellGroups[[i]] + + #Dealing with sampling w/o replacement! + tableGroupi <- table(cellGroupi) + + #Coverage File! 
+ covFile <- file.path(covDir, paste0(names(cellGroups)[i], ".insertions.coverage.h5")) + rmf <- .suppressAll(file.remove(covFile)) + + #Create Hdf5 File! + o <- h5createFile(covFile) + o <- h5createGroup(covFile, paste0("Coverage")) + o <- h5createGroup(covFile, paste0("Metadata")) + o <- h5write(obj = "ArrowCoverage", file = covFile, name = "Class") + + o <- h5createGroup(covFile, paste0("Coverage/Info")) + o <- h5write(as.character(cellGroupi), covFile, "Coverage/Info/CellNames") + + #We need to dump all the cells into a coverage file + nFragDump <- 0 + nCells <- c() + for(k in seq_along(availableChr)){ + + if(k %% 3 == 0){ + .messageDiffTime(sprintf("Group %s of %s, Read Fragments %s of %s!", i, + length(cellGroups), k, length(availableChr)), tstart, verbose = verbose) + } + + it <- 0 + for(j in seq_along(ArrowFiles)){ + cellsInI <- sum(cellsInArrow[[names(ArrowFiles)[j]]] %in% cellGroupi) + if(cellsInI > 0){ + it <- it + 1 + if(it == 1){ + fragik <- .getFragsFromArrow(ArrowFiles[j], chr = availableChr[k], out = "GRanges", cellNames = cellGroupi) + }else{ + fragik <- c(fragik, .getFragsFromArrow(ArrowFiles[j], chr = availableChr[k], out = "GRanges", cellNames = cellGroupi)) + } + } + } + + #Dealing with sampling w/o replacement! 
+ matchRG <- as.vector(S4Vectors::match(mcols(fragik)$RG, names(tableGroupi))) + fragik <- rep(fragik, tableGroupi[matchRG]) + nCells <- c(nCells, unique(runValue(mcols(fragik)$RG))) + + #Compute Rle Coverage + covk <- coverage(IRanges(start = c( start(fragik), end(fragik) ), width = 1), width = chromLengths[availableChr[k]]) + nFragDump <- nFragDump + length(fragik) + rm(fragik) + + #Write To Hdf5 + chrLengths <- paste0("Coverage/",availableChr[k],"/Lengths") + chrValues <- paste0("Coverage/",availableChr[k],"/Values") + lengthRle <- length(covk@lengths) + o <- h5createGroup(covFile, paste0("Coverage/",availableChr[k])) + o <- .suppressAll(h5createDataset(covFile, chrLengths, storage.mode = "integer", dims = c(lengthRle, 1), level = 0)) + o <- .suppressAll(h5createDataset(covFile, chrValues, storage.mode = "integer", dims = c(lengthRle, 1), level = 0)) + o <- h5write(obj = covk@lengths, file = covFile, name = chrLengths) + o <- h5write(obj = covk@values, file = covFile, name = chrValues) + + gc() + + } + + if(length(unique(cellGroupi)) != length(unique(nCells))){ + stop("Not all cells (", length(unique(cellGroupi)), ") were found for coverage creation (", length(unique(nCells)), ")!") + } + + out <- list(covFile = covFile, nCells = length(cellGroupi), nFragments = nFragDump) + + return(out) + +} + +##################################################################################################### +# Creating Groups of Cells For Pseudobulk Coverage Files +##################################################################################################### + +.identifyGroupsForPseudoBulk <- function( + cells, sampleLabels = NULL, useLabels = TRUE, + minCells = 50, maxCells = 500, filterGroups = FALSE, + minReplicates = 2, maxReplicates = NULL, sampleRatio = 0.8){ + + .leastOverlapCells <- function(x, n = 2, nSample = 0.8 * length(l), iterations = 100, replace = FALSE){ + set.seed(1) + maxMat <- matrix(0, nrow = length(x), ncol = n) + for(i in seq_len(iterations)){ 
+ currentMat <- matrix(0, nrow = length(x), ncol = n) + for(j in seq_len(n)){ + currentMat[sample(seq_along(x), nSample, replace = replace), j] <- 1 + } + disti <- max(dist(t(currentMat), method = "euclidean")) + if(i==1){ + maxMat <- currentMat + maxDist <- disti + }else{ + if(disti > maxDist){ + maxMat <- currentMat + maxDist <- disti + } + } + } + out <- lapply(seq_len(ncol(maxMat)), function(i){ + x[which(maxMat[,i]==1)] + }) + return(out) + } + + if(is.null(sampleLabels)){ + sampleLabels <- rep("A", length(cells)) + }else{ + if(length(cells) != length(sampleLabels)){ + stop("Length of cells need to be same length as sample labels!") + } + } + nCells <- length(cells) + nCellsPerSample <- table(sampleLabels) + nCellsPerSample <- nCellsPerSample[sample(seq_along(nCellsPerSample), length(nCellsPerSample))] + #Samples Passing Min Filter + samplesPassFilter <- sum(nCellsPerSample >= minCells) + samplesThatCouldBeMergedToPass <- floor(sum(nCellsPerSample[nCellsPerSample < minCells]) / minCells) + if(nCells >= minCells * minReplicates & useLabels){ + ############################################################ + # Identifying High-Quality peaks when Cells and Fragments are abundant + ############################################################ + #Samples Passing Min Filter + samplesPassFilter <- sum(nCellsPerSample >= minCells) + samplesThatCouldBeMergedToPass <- floor(sum(nCellsPerSample[nCellsPerSample < minCells]) / minCells) + #First Group Cells By Sample + cellGroups <- split(cells, sampleLabels) + #Identify Samples That Pass Min Cells + samples <- names(nCellsPerSample)[nCellsPerSample > minCells] + cellGroupsPass <- cellGroups[samples] + #Samples That Do Not Pass + if(!all(names(cellGroups) %in% names(cellGroupsPass))){ + cellGroupsNotPass <- cellGroups[names(cellGroups) %ni% samples] + }else{ + cellGroupsNotPass <- list() + } + if(samplesPassFilter >= minReplicates){ + ############################################################ + # If there are at least 
minReplicates with > minCells + ############################################################ + #If we look at the remaining cells and see that there are enough to make an additional replicate + nCellsRemaining <- length(unlist(cellGroupsNotPass)) + if(nCellsRemaining >= minCells){ + cellGroupsPass$Other <- unlist(cellGroupsNotPass) + } + }else if(samplesPassFilter + samplesThatCouldBeMergedToPass >= minReplicates){ + cellsRemaining <- unlist(cellGroupsNotPass, use.names = FALSE) + nGroupsRemaining <- minReplicates - samplesPassFilter + cellsRemaining <- sample(cellsRemaining, length(cellsRemaining)) + cellGroupsSample <- split(cellsRemaining, ceiling(seq_along(cellsRemaining)/ ceiling(length(cellsRemaining)/nGroupsRemaining))) + #Add to Groups Pass QC + if(samplesPassFilter == 0){ + cellGroupsPass <- cellGroupsSample + names(cellGroupsPass) <- paste0("Rep", seq_along(cellGroupsPass)) + }else{ + cellGroupsPass <- append(cellGroupsPass, cellGroupsSample) + } + }else{ + cellsRemaining <- unlist(cellGroupsNotPass, use.names = FALSE) + nCellsRemaining <- length(cellsRemaining) + cellsNeeded <- minCells * (minReplicates - samplesPassFilter) - nCellsRemaining + cellsFromPass <- sample(unlist(cellGroupsPass, use.names = FALSE), cellsNeeded) + cellGroupsPass <- lapply(cellGroupsPass, function(x){ + x[x %ni% cellsFromPass] + }) + cellGroupsPass[[length(cellGroupsPass) + 1]] <- c(cellsRemaining, cellsFromPass) + names(cellGroupsPass) <- paste0("Rep", seq_along(cellGroupsPass)) + } + }else{ + ############################################################ + # Identifying High-Quality peaks when Cells and Fragments are not abundant + ############################################################ + if(nCells >= minCells / sampleRatio){ + ############################################################ + # When there are more cells than min cells but not enough for robust reproducibility + ############################################################ + cellGroupsPass <- 
.leastOverlapCells(x = cells, n = minReplicates, nSample = minCells) #length(cells) * sampleRatio) + }else{ + ############################################################ + # Sampling With Replacement, Not Super Desirable + ############################################################ + if(filterGroups){ + return(NULL) + }else{ + cellGroupsPass <- .leastOverlapCells(x = cells, n = minReplicates, nSample = minCells, replace = TRUE) #length(cells) * sampleRatio) + } + } + } + cellGroupsPass <- as(cellGroupsPass, "SimpleList") + + for(i in seq_along(cellGroupsPass)){ + if(length(cellGroupsPass[[i]]) > maxCells){ + cellGroupsPass[[i]] <- sample(cellGroupsPass[[i]], maxCells) + } + } + if(!is.null(maxReplicates)){ + if(length(cellsGroupPass) > maxReplicates){ + cellsGroupPass <- cellsGroupPass[seq_len(maxReplicates)] + } + } + + return(cellGroupsPass) + +} + +##################################################################################################### +# Add Kmer Tn5 Bias Values to Each Coverage File! 
+##################################################################################################### +.addKmerBiasToCoverage <- function(coverageMetadata, genome, kmerLength, threads, verbose = TRUE, tstart = NULL){ + + .requirePackage(genome) + .requirePackage("Biostrings") + BSgenome <- eval(parse(text = genome)) + BSgenome <- .validBSgenome(BSgenome) + + if(is.null(tstart)){ + tstart <- Sys.time() + } + + coverageFiles <- coverageMetadata$File + names(coverageFiles) <- coverageMetadata$Name + availableChr <- .availableSeqnames(coverageFiles, "Coverage") + + biasList <- .safelapply(seq_along(availableChr), function(x){ + .messageDiffTime(sprintf("Computing Kmer Bias Chr %s of %s!", x, length(availableChr)), tstart, verbose=verbose) + chrBS <- BSgenome[[availableChr[x]]] + exp <- Biostrings::oligonucleotideFrequency(chrBS, width = kmerLength) + obsList <- lapply(seq_along(coverageFiles), function(y){ + obsx <- .getCoverageInsertionSites(coverageFiles[y], availableChr[x]) %>% + {BSgenome::Views(chrBS, IRanges(start = . 
- floor(kmerLength/2), width = kmerLength))} %>% + {Biostrings::oligonucleotideFrequency(., width = kmerLength, simplify.as="collapsed")} + gc() + obsx + }) %>% SimpleList + names(obsList) <- names(coverageFiles) + SimpleList(expected = exp, observed = obsList) + }, threads = threads) %>% SimpleList + names(biasList) <- availableChr + + #Summarize Bias + for(i in seq_along(biasList)){ + if(i == 1){ + expAll <- biasList[[i]]$expected + obsAll <- biasList[[i]]$observed + }else{ + expAll <- expAll + biasList[[i]]$expected + for(j in seq_along(obsAll)){ + obsAll[[j]] <- obsAll[[j]] + biasList[[i]]$observed[[names(obsAll)[j]]] + } + } + } + + #Write Bias to Coverage Files + for(i in seq_along(coverageFiles)){ + obsAlli <- obsAll[[names(coverageFiles)[i]]] + if(!identical(names(expAll), names(obsAlli))){ + stop("Kmer Names in Exp and Obs not Identical!") + } + o <- h5createGroup(coverageFiles[i], "KmerBias") + o <- h5createGroup(coverageFiles[i], "KmerBias/Info") + o <- h5write(obj = genome, file = coverageFiles[i], name = "KmerBias/Info/Genome") + o <- h5write(obj = kmerLength, file = coverageFiles[i], name = "KmerBias/Info/KmerLength") + o <- h5write(obj = paste0(names(obsAlli)), file = coverageFiles[i], name = "KmerBias/Kmer") + o <- h5write(obj = obsAlli, file = coverageFiles[i], name = "KmerBias/ObservedKmers") + o <- h5write(obj = expAll, file = coverageFiles[i], name = "KmerBias/ExpectedKmers") + + } + + return(0) + +} + +##################################################################################################### +# Get Coverage Metadata and Params from ArchR Project +##################################################################################################### + +.getCoverageMetadata <- function(ArchRProj, groupBy, useGroups = NULL){ + coverageMetadata <- ArchRProj@projectMetadata$GroupCoverages[[groupBy]]$coverageMetadata + if(is.null(coverageMetadata)){ + stop("No Coverage Metadata found for : ", groupBy) + } + if(!is.null(useGroups)){ + 
if(sum(coverageMetadata[,1] %in% useGroups) == 0){ + stop("No Groups found matching useGroups!") + } + coverageMetadata <- coverageMetadata[coverageMetadata[,1] %in% useGroups,] + } + coverageMetadata +} + +.getCoverageParams <- function(ArchRProj, groupBy, useGroups = NULL){ + coverageParams <- ArchRProj@projectMetadata$GroupCoverages[[groupBy]]$Params + if(is.null(coverageParams)){ + stop("No Coverage Metadata found for : ", groupBy) + } + coverageParams +} + +##################################################################################################### +# Create Coverage Rle List of all chr +##################################################################################################### + +.getCoverageRle <- function(coverageFile, allChr){ + cov <- lapply(seq_along(allChr), function(x){ + Rle( + lengths = h5read(coverageFile, paste0("Coverage/",allChr[x],"/Lengths")), + values = h5read(coverageFile, paste0("Coverage/",allChr[x],"/Values")) + ) + }) %>% {as(., "RleList")} + names(cov) <- allChr + cov +} + +##################################################################################################### +# Get All Non-Zero Insertion Sites and N +##################################################################################################### + +.getCoverageInsertionSites <- function(coverageFile, chr){ + cov <- Rle( + lengths = h5read(coverageFile, paste0("Coverage/", chr, "/Lengths")), + values = h5read(coverageFile, paste0("Coverage/", chr, "/Values")) + ) + rV <- runValue(cov) + cov <- ranges(cov) + mcols(cov)$values <- rV + cov <- cov[mcols(cov)$values > 0] + cov <- unlist(start(slidingWindows(rep(cov, mcols(cov)$values), width = 1, step = 1))) + cov +} + +##################################################################################################### +# Write Coverage To Bed File for MACS2 +##################################################################################################### + +.writeCoverageToBed <- 
function(coverageFile, out, excludeChr = NULL){ + rmf <- .suppressAll(file.remove(out)) + allChr <- .availableSeqnames(coverageFile, "Coverage") + if(!is.null(excludeChr)){ + allChr <- allChr[allChr %ni% excludeChr] + } + if(length(allChr)==0){ + stop("No Chromosomes in Coverage after Excluding Chr!") + } + for(x in seq_along(allChr)){ + .getCoverageInsertionSites(coverageFile, allChr[x]) %>% + {data.frame(seqnames = allChr[x], start = . - 1L, end = .)} %>% + {data.table::fwrite(., out, sep = "\t", col.names = FALSE, append = TRUE)} + } + out +} + + + diff --git a/R/HelperUtils.R b/R/HelperUtils.R new file mode 100644 index 00000000..34774bd2 --- /dev/null +++ b/R/HelperUtils.R @@ -0,0 +1,511 @@ +########################################################################################## +# Validation Methods +########################################################################################## + +#' @export +.validBSgenome <- function(genome = NULL, masked = FALSE){ + stopifnot(!is.null(genome)) + if(inherits(genome, "BSgenome")){ + return(genome) + }else if(is.character(genome)){ + return(BSgenome::getBSgenome(genome, masked = masked)) + }else{ + stop("Cannot validate BSgenome options are a valid BSgenome or character for getBSgenome") + } +} + +#' @export +.validTxDb <- function(TxDb = NULL){ + stopifnot(!is.null(TxDb)) + if(inherits(TxDb, "TxDb")){ + return(TxDb) + }else if(is.character(TxDb)){ + return(getTxDb(TxDb)) #change + }else{ + stop("Cannot validate TxDb options are a valid TxDb or character for getTxDb") + } +} + +#' @export +.validOrgDb <- function(OrgDb = NULL){ + stopifnot(!is.null(OrgDb)) + if(inherits(OrgDb, "OrgDb")){ + return(OrgDb) + }else if(is.character(OrgDb)){ + return(getOrgDb(OrgDb)) #change + }else{ + stop("Cannot validate OrgDb options are a valid OrgDb or character for getOrgDb") + } +} + +#' @export +.validGRanges <- function(gr = NULL){ + stopifnot(!is.null(gr)) + if(inherits(gr, "GRanges")){ + return(gr) + }else{ + stop("Error 
cannot validate genomic range!") + } +} + +########################################################################################## +# S4Vectors/BiocGenerics Within Methods +########################################################################################## + +#' Negated Value Matching +#' +#' This function is the reciprocal of %in% +#' See match funciton in base R +#' x %ni% table +#' +#' @param x x search within table +#' @param table to search x in +#' @export +"%ni%" <- function(x, table) !(match(x, table, nomatch = 0) > 0) + +#Mainly used for Rle matching generic handling +#' @export +'%bcin%' <- function(x, table) S4Vectors::match(x, table, nomatch = 0) > 0 + +#Mainly used for Rle matching generic handling +#' @export +'%bcni%' <- function(x, table) !(S4Vectors::match(x, table, nomatch = 0) > 0) + +########################################################################################## +# Helper Intermediate Methods +########################################################################################## + +#' @export +.mergeParams <- function(paramInput, paramDefault){ + for(i in seq_along(paramDefault)){ + if(!(names(paramDefault)[i] %in% names(paramInput))){ + paramInput[[names(paramDefault)[i]]] <- paramDefault[[i]] + } + } + return(paramInput) +} + +#' @export +.requirePackage <- function(x, load = TRUE, installInfo = NULL){ + if(x %in% rownames(installed.packages())){ + if(load){ + suppressPackageStartupMessages(require(x, character.only = TRUE)) + }else{ + return(0) + } + }else{ + if(!is.null(installInfo)){ + stop(paste0("Required package : ", x, " is not installed/found!\n Package Can Be Installed : ", installInfo)) + }else{ + stop(paste0("Required package : ", x, " is not installed/found!")) + } + } +} + +#' @export +.messageDiffTime <- function(main = "", t1 = NULL, verbose = TRUE, addHeader = FALSE, t2 = Sys.time(), units = "mins", header = "###########", tail = "elapsed since start...", precision = 3){ + if(verbose){ + timeStamp <- 
tryCatch({ + dt <- abs(round(difftime(t2, t1, units = units),precision)) + if(addHeader){ + message(sprintf("%s\n%s : %s, %s %s %s\n%s", header, Sys.time(), main, dt, units, tail, header)) + }else{ + message(sprintf("%s : %s, %s %s %s", Sys.time(), main, dt, units, tail)) + } + }, error = function(x){ + message("Time Error : ", x) + }) + } + return(0) +} + +########################################################################################## +# Lapply Methods +########################################################################################## + +#' @export +.safelapply <- function(..., threads = 1, preschedule = FALSE){ + + if(tolower(.Platform$OS.type) == "windows"){ + threads <- 1 + } + + if(threads > 1){ + + o <- mclapply(..., mc.cores = threads, mc.preschedule = preschedule) + + for(i in seq_along(o)){ + if(inherits(o[[i]], "try-error")){ + stop(o) + } + } + + }else{ + + o <- lapply(...) + + } + + o + +} + +#' @export +.batchlapply <- function(args, sequential = FALSE){ + + if(is.null(args$tstart)){ + args$tstart <- Sys.time() + } + + #Determine Parallel Backend + if(inherits(args$parallelParam, "BatchtoolsParam")){ + + .messageDiffTime("Batch Execution w/ BatchTools through BiocParallel!", args$tstart) + + require(BiocParallel) + + args$parallelParam <- btParam + #Unlink registry Directory + if(dir.exists(args$registryDir)){ + #Clean Up Registry + unlink(args$registryDir, recursive = TRUE)# Delete registry directory + } + + #Set Up Registry For Runnning + args$parallelParam$registryargs <- batchtoolsRegistryargs( + file.dir = args$registryDir, + work.dir = getwd(), + packages = character(0L), + namespaces = character(0L), + source = character(0L), + load = character(0L) + ) + + #Register + BPPARAM <- args$parallelParam + register(BPPARAM) + + #Add To Args + args$BPPARAM <- BPPARAM + + if("..." 
%in% names(args)){ + args["..."] <- NULL + } + + #Run + outlist <- do.call(bplapply, args) + + }else{ + + .messageDiffTime("Batch Execution w/ safelapply!", args$tstart) + if(sequential){ + args$subThreads <- args$threads + args$threads <- 1 + }else{ + if(args$threads > length(args$X)){ + args$subThreads <- floor( (args$threads - length(args$X) ) / length(args$X)) + args$threads <- length(args$X) + }else{ + args$subThreads <- 1 + } + } + outlist <- do.call(.safelapply, args) + + } + + return(outlist) + +} + +########################################################################################## +# Stat/Summary Methods +########################################################################################## + +#' @export +.rowZscores <- function(m,min=-2,max=2,limit=FALSE){ + z <- sweep(m - rowMeans(m), 1, matrixStats::rowSds(m),`/`) + if(limit){ + z[z > max] <- max + z[z < min] <- min + } + return(z) +} + +#' @export +.computeROC <- function(labels, scores, name="ROC"){ + calcAUC <- function(TPR, FPR){ + # http://blog.revolutionanalytics.com/2016/11/calculating-auc.html + dFPR <- c(diff(FPR), 0) + dTPR <- c(diff(TPR), 0) + out <- sum(TPR * dFPR) + sum(dTPR * dFPR)/2 + return(out) + } + labels <- labels[order(scores, decreasing=TRUE)] + df <- data.frame( + False_Positive_Rate = cumsum(!labels)/sum(!labels), + True_Positive_Rate = cumsum(labels)/sum(labels) + ) + df$AUC <- round(calcAUC(df$True_Positive_Rate,df$False_Positive_Rate),3) + df$name <- name + return(df) +} + +#' @export +.getQuantiles <- function(v, len = length(v)){ + if(length(v) < len){ + v2 <- rep(0, len) + v2[seq_along(v)] <- v + }else{ + v2 <- v + } + p <- trunc(rank(v2))/length(v2) + if(length(v) < len){ + p <- p[seq_along(v)] + } + return(p) +} + +#' @export +.rowScale <- function(mat, min = NULL, max = NULL){ + if(!is.null(min)){ + rMin <- min + }else{ + rMin <- matrixStats::rowMins(mat) + } + if(!is.null(max)){ + rMax <- max + }else{ + rMax <- matrixStats::rowMaxs(mat) + } + rScale <- rMax 
- rMin + matDiff <- mat - rMin + matScale <- matDiff/rScale + out <- list(mat=matScale, min=rMax, max=rMin) + return(out) +} + +#' @export +.quantileCut <- function(x, lo = 0.025, hi = 0.975){ + q <- quantile(x, probs = c(lo,hi)) + x[x < q[1]] <- q[1] + x[x > q[2]] <- q[2] + return(x) +} + +#' @export +.normalizeCols <- function(mat, colSm = NULL, scaleTo = NULL){ + if(is.null(colSm)){ + colSm <- Matrix::colSums(mat) + } + if(!is.null(scaleTo)){ + mat@x <- scaleTo * mat@x / rep.int(colSm, Matrix::diff(mat@p)) + }else{ + mat@x <- mat@x / rep.int(colSm, Matrix::diff(mat@p)) + } + return(mat) +} + +#' @export +.confusionMatrix <- function(i,j){ + ui <- unique(i) + uj <- unique(j) + m <- Matrix::sparseMatrix( + i = match(i, ui), + j = match(j, uj), + x = rep(1, length(i)), + dims = c(length(ui), length(uj)) + ) + rownames(m) <- ui + colnames(m) <- uj + m +} + +#' @export +.safeSubset <- function(mat, subsetRows = NULL, subsetCols = NULL){ + + if(!is.null(subsetRows)){ + idxNotIn <- which(subsetRows %ni% rownames(mat)) + if(length(idxNotIn) > 0){ + subsetNamesNotIn <- subsetRows[idxNotIn] + matNotIn <- Matrix::sparseMatrix(i=1,j=1,x=0,dims=c(length(idxNotIn), ncol = ncol(mat))) + rownames(matNotIn) <- subsetNamesNotIn + mat <- rbind(mat, matNotIn) + } + mat <- mat[subsetRows,] + } + + if(!is.null(subsetCols)){ + idxNotIn <- which(subsetCols %ni% colnames(mat)) + if(length(idxNotIn) > 0){ + subsetNamesNotIn <- subsetCols[idxNotIn] + matNotIn <- Matrix::sparseMatrix(i=1,j=1,x=0,dims=c(nrow(mat), ncol = length(idxNotIn))) + colnames(matNotIn) <- subsetNamesNotIn + mat <- cbind(mat, matNotIn) + } + mat <- mat[,subsetCols] + } + + mat + +} + +#' @export +.groupMeans <- function(mat, groups=NULL, na.rm = TRUE, sparse = FALSE){ + stopifnot(!is.null(groups)) + stopifnot(length(groups)==ncol(mat)) + gm <- lapply(unique(groups), function(x){ + if(sparse){ + Matrix::rowMeans(mat[,which(groups==x),drop=F], na.rm=na.rm) + }else{ + rowMeans(mat[,which(groups==x),drop=F], na.rm=na.rm) 
+ } + }) %>% do.call("cbind",.) #do.call keeps a 1-column matrix when there is a single group + colnames(gm) <- unique(groups) + return(gm) +} + +#' @export +.groupSums <- function(mat, groups=NULL, na.rm = TRUE, sparse = FALSE){ + stopifnot(!is.null(groups)) + stopifnot(length(groups)==ncol(mat)) + gm <- lapply(unique(groups), function(x){ + if(sparse){ + Matrix::rowSums(mat[,which(groups==x),drop=F], na.rm=na.rm) + }else{ + rowSums(mat[,which(groups==x),drop=F], na.rm=na.rm) + } + }) %>% do.call("cbind",.) #do.call keeps a 1-column matrix when there is a single group + colnames(gm) <- unique(groups) + return(gm) +} + +#' @export +.groupSds <- function(mat, groups = NULL, na.rm = TRUE, sparse = FALSE){ + stopifnot(!is.null(groups)) + stopifnot(length(groups)==ncol(mat)) + gs <- lapply(unique(groups), function(x){ + if (sparse){ + matrixStats::rowSds(as.matrix(mat[, which(groups == x), drop = F]), na.rm = na.rm) + }else{ + matrixStats::rowSds(mat[, which(groups == x), drop = F], na.rm = na.rm) + } + }) %>% do.call("cbind",.) #do.call keeps a 1-column matrix when there is a single group + colnames(gs) <- unique(groups) + return(gs) +} + +#' @export +.centerRollMean <- function(v, k){ + o1 <- data.table::frollmean(v, k, align = "right", na.rm = FALSE) + if(k%%2==0){ + o2 <- c(rep(o1[k], floor(k/2)-1), o1[-seq_len(k-1)], rep(o1[length(o1)], floor(k/2))) + }else if(k%%2==1){ + o2 <- c(rep(o1[k], floor(k/2)), o1[-seq_len(k-1)], rep(o1[length(o1)], floor(k/2))) + }else{ + stop("Error!") + } + o2 +} + +########################################################################################## +# Miscellaneous Methods +########################################################################################## + +#' @export +.suppressAll <- function(expr){ + suppressPackageStartupMessages(suppressMessages(suppressWarnings(expr))) +} + +#' @export +.getAssay <- function(se, assayName = NULL){ + assayNames <- function(se){ + names(SummarizedExperiment::assays(se)) + } + if(is.null(assayName)){ + o <- SummarizedExperiment::assay(se) + }else if(assayName %in% assayNames(se)){ + o <- SummarizedExperiment::assays(se)[[assayName]] + }else{ + stop(sprintf("assayName 
'%s' is not in assayNames of se : %s", assayName, paste(assayNames(se),collapse=", "))) + } + return(o) +} + +#' Get File Extension +#' @param x character string referring to a file you want to get the extension from +#' @export +.fileExtension <- function (x){ + pos <- regexpr("\\.([[:alnum:]]+)$", x) + ifelse(pos > -1L, substring(x, pos + 1L), "") +} + +#' Check path for utility +#' @param u utility that you want to check is in path +#' @param path check on top of path a custom path +#' @param error if TRUE stop when a utility is not found, if FALSE return FALSE instead +#' @export +.checkPath <- function(u=NULL, path=NULL, error = TRUE){ + if(is.null(u)){ + out <- TRUE + } + out <- lapply(u, function(x){ #no inner error argument, so the caller's error flag is honored + if (Sys.which(x) == "") { + if(!is.null(path) && file.exists(file.path(path,x))){ + o <- TRUE + }else{ + if(error){ + stop(x, " not found in path, please add ", x, " to path!") + }else{ + o <- FALSE + } + } + }else{ + o <- TRUE + } + return(o) + }) %>% unlist %>% all + return(out) +} + +#' This function returns ascii archr LOGO or arrow etc. +#' @param ascii logo, arrow, target +#' @export +.ArchRLogo <- function(ascii = "Logo"){ + Ascii <- list( + Package = c(" + ___ .______ ______ __ __ .______ + / \\\ | _ \\\ / || | | | | _ \\\ + / ^ \\\ | |_) | | ,----'| |__| | | |_) | + / /_\\\ \\\ | / | | | __ | | / + / _____ \\\ | |\\\ \\\\___ | `----.| | | | | |\\\ \\\\___. + /__/ \\__\\ | _| `._____| \\______||__| |__| | _| `._____| + "), + + #modified from cyu@athena.mit.edu + Logo = c(" + / | + / \\\ + . / |. + \\\\\\ / |. + \\\\\\ / `|. + \\\\\\ / |. 
+ \\\ / |\\\ + \\\\#####\\\ / || + ==###########> / || + \\\\##==......\\\ / || + ______ = =|__ /__ || \\\\\\\ + ,--' ,----`-,__ ___/' --,-`-===================##========> + \\\ ' ##_______ _____ ,--,__,=##,__ /// + , __== ___,-,__,--'#' ===' `-' | ##,-/ + -,____,---' \\\\####\\\\________________,--\\\\_##,/ + ___ .______ ______ __ __ .______ + / \\\ | _ \\\ / || | | | | _ \\\ + / ^ \\\ | |_) | | ,----'| |__| | | |_) | + / /_\\\ \\\ | / | | | __ | | / + / _____ \\\ | |\\\ \\\\___ | `----.| | | | | |\\\ \\\\___. + /__/ \\__\\ | _| `._____| \\______||__| |__| | _| `._____| + ") + ) + message(Ascii[[ascii]]) +} + diff --git a/R/IdentifyClusters.R b/R/IdentifyClusters.R new file mode 100644 index 00000000..3b5ce85d --- /dev/null +++ b/R/IdentifyClusters.R @@ -0,0 +1,335 @@ +#' Identify Clusters for Single Cell Data +#' +#' This function will identify clusters for single cell reduced dimensions supplied or from and ArchRProject +#' +#' @param input ArchRProject or matrix for cluster identification +#' @param reducedDims reducedDims of ArchRProject if provided +#' @param name name of cluster column if input is ArchRProject +#' @param method supported methods are Seurat and LouvainJaccard +#' @param dimsToUse reduced dims to use +#' @param knnAssign number of nearest neighbors for assignment of outliers and estimation +#' @param nOutlier number of cells required for a cluster to be called if not then these will be considered an outlier +#' @param seed seed +#' @param ... arguments to provide Seurat::FindClusters or ArchR:::.clustLouvain (knn = 50, jaccard = TRUE) +#' @export +#' +IdentifyClusters <- function( + input, + reducedDims = "IterativeLSI", + name = "Clusters", + sampleCells = NULL, + seed = 1, + method = "seurat", + dimsToUse = NULL, + knnAssign = 10, + nOutlier = 20, + verbose = TRUE, + tstart = NULL, + ... 
+ ){ + + if(is.null(tstart)){ + tstart <- Sys.time() + } + + if(inherits(input, "ArchRProject")){ + if(reducedDims %ni% names(input@reducedDims)){ + stop("Error reducedDims not available!") + } + matDR <- input@reducedDims[[reducedDims]][[1]] + }else if(inherits(input, "matrix")){ + matDR <- input + }else{ + stop("Requires an ArchRProject or Cell by Reduced Dims Matrix!") + } + if(is.null(dimsToUse)){ + dimsToUse <- seq_len(ncol(matDR)) + } + + #Subset Matrix + set.seed(seed) + matDR <- matDR[,dimsToUse] + nr <- nrow(matDR) + + if(!is.null(sampleCells)){ + if(sampleCells < nrow(matDR)){ + .messageDiffTime("Estimating Clusters by Sampling", tstart, verbose = verbose) + estimatingClusters <- 1 + idx <- sample(seq_len(nrow(matDR)), sampleCells) + matDRAll <- matDR + matDR <- matDR[idx,] + }else{ + estimatingClusters <- 0 + } + }else{ + estimatingClusters <- 0 + } + + + ################################################################################# + # Decide on which clustering setup to use + ################################################################################# + if(grepl("seurat",tolower(method))){ + + clustParams <- list(dims = dimsToUse, ...) + clustParams$verbose <- verbose + clustParams$tstart <- tstart + clust <- .clustSeurat(mat = matDR, clustParams = clustParams) + + }else if(grepl("louvainjaccard",tolower(method))){ + + clust <- .clustLouvain(matDR, ...) 
+ + }else{ + + stop("Clustering Method Not Recognized!") + + } + + ################################################################################# + # If estimating clusters we will assign to nearest neighbor cluster + ################################################################################# + if(estimatingClusters == 1){ + .messageDiffTime("Finding Nearest Clusters", tstart, verbose = verbose) + knnAssigni <- FNN::get.knnx(matDR, matDRAll[-idx,], knnAssign)[[1]] + clustUnique <- unique(clust) + clustMatch <- match(clust, clustUnique) + knnAssigni <- apply(knnAssigni, 2, function(x) clustMatch[x]) + + .messageDiffTime("Assigning Nearest Clusters", tstart, verbose = verbose) + clustAssign <- lapply(seq_along(clustUnique), function(x){ + rowSums(knnAssigni == x) + }) %>% do.call("cbind", .) %>% apply(., 1, which.max) #do.call keeps a matrix even when only one cluster was found + clustOld <- clust + clust <- rep(NA, nr) + clust[idx] <- clustOld + clust[-idx] <- clustUnique[clustAssign] + matDR <- matDRAll + remove(matDRAll) + gc() + } + + ################################################################################# + # Test if clusters are outliers identified as cells with fewer than nOutlier + ################################################################################# + .messageDiffTime("Testing Outlier Clusters", tstart, verbose = verbose) + tabClust <- table(clust) + clustAssign <- which(tabClust < nOutlier) + if(length(clustAssign) > 0){ + .messageDiffTime(sprintf("Assigning Outlier Clusters (n = %s, nOutlier < %s cells) to NN", length(clustAssign), nOutlier), tstart, verbose = verbose) + for(i in seq_along(clustAssign)){ + clusti <- names(clustAssign[i]) + idxi <- which(clust==clusti) + knni <- FNN::get.knnx(matDR[-idxi,,drop=FALSE], matDR[idxi,,drop=FALSE], knnAssign)[[1]] #drop=FALSE keeps single-cell clusters as 1-row matrices + clustf <- unlist(lapply(seq_len(nrow(knni)), function(x) names(sort(table(clust[-idxi][knni[x,]]),decreasing=TRUE)[1]))) + clust[idxi] <- clustf + } + } + + ################################################################################# + # Renaming 
Clusters based on Proximity in Reduced Dimensions + ################################################################################# + .reLabel <- function(labels, oldLabels, newLabels){ + labels <- paste0(labels) + oldLabels <- paste0(oldLabels) + newLabels <- paste0(newLabels) + labelsNew <- labels + for(i in seq_along(oldLabels)){ + labelsNew[labels == oldLabels[i]] <- newLabels[i] + } + paste0(labelsNew) + } + .messageDiffTime(sprintf("Assigning Cluster Names to %s Clusters", length(unique(clust))), tstart, verbose = verbose) + meanSVD <- t(.groupMeans(t(matDR), clust)) + meanKNN <- FNN::get.knnx(meanSVD, meanSVD, nrow(meanSVD))[[1]] + idx <- sample(seq_len(nrow(meanSVD)), 1) + clustOld <- c() + clustNew <- c() + for(i in seq_len(nrow(meanSVD))){ + clustOld[i] <- rownames(meanSVD)[idx] + clustNew[i] <- paste0("Cluster", i) + if(i != nrow(meanSVD)){ + idx <- meanKNN[idx, ][which(rownames(meanSVD)[meanKNN[idx, ]] %ni% clustOld)][1] + } + } + out <- .reLabel(clust, oldLabels = clustOld, newLabels = clustNew) + + if(inherits(input, "ArchRProject")){ + input <- addCellColData( + input, + data = out, + name = name, + cells = rownames(matDR), + force = TRUE + ) + }else if(!inherits(input, "ArchRProject")){ + return(out) + } + +} + +#Simply a wrapper on Seurats FindClusters +.clustSeurat <- function(mat, clustParams){ + + suppressPackageStartupMessages(require(Seurat)) + .messageDiffTime("Running Seurats FindClusters (Stuart et al. 
Cell 2019)", clustParams$tstart, verbose=clustParams$verbose) + set.seed(1) + + #Arxiv Seurat 2.3.4 method + tmp <- matrix(rnorm(nrow(mat) * 3, 10), ncol = nrow(mat), nrow = 3) + colnames(tmp) <- rownames(mat) + rownames(tmp) <- paste0("t",seq_len(nrow(tmp))) + + obj <- Seurat::CreateSeuratObject(tmp, project='scATAC', min.cells=0, min.features=0) + obj[['pca']] = Seurat::CreateDimReducObject(embeddings=mat, key='PC_', assay='RNA') + clustParams$object <- obj + clustParams$reduction <- "pca" + + obj <- suppressWarnings(do.call(Seurat::FindNeighbors, clustParams)) + clustParams$object <- obj + obj <- suppressWarnings(do.call(Seurat::FindClusters, clustParams)) + + #Get Output + clust <- obj@meta.data[,ncol(obj@meta.data)] + clust <- paste0("Cluster",match(clust, unique(clust))) + +} + +#Need to work on making this work +.clustLouvain <- function(matDR, knn = 50, jaccard = TRUE){ + + getEdges <- function(X, knn, jaccard) { + nearest <- RANN::nn2(X, X, k = knn + 1, treetype = "bd", searchtype = "priority") + nearest$nn.idx <- nearest$nn.idx[, -1] + nearest$nn.dists <- nearest$nn.dists[, -1] + nearest$nn.sim <- 1 * (nearest$nn.dists >= 0) + edges <- reshape2::melt(t(nearest$nn.idx)) + colnames(edges) = c("B", "A", "C") + edges = edges[, c("A", "B", "C")] + edges$B <- edges$C + edges$C <- 1 + edges <- unique(transform(edges, A = pmin(A, B), B = pmax(A, B))) + if (jaccard) { + message("Calculating Jaccard Distance...") + a <- Matrix::tcrossprod(nearest$nn.idx[edges[,1],], nearest$nn.idx[edges[,2],]) + bi <- Matrix::rowSums(nearest$nn.idx[edges[,1],]) + bj <- Matrix::rowSums(nearest$nn.idx[edges[,2],]) + jaccardDist <- a / (rep(ncol(a), bi) + t(rep(nrow(a), bj)) - a) + pb <- txtProgressBar(min=0,max=100,initial=0,style=3) + #RCPPP? 
+ # jaccardDist <- unlist(lapply(seq_len(nrow(edges)), function(x){ + # setTxtProgressBar(pb,round(x*100/nrow(edges),0)) + # jInt <- intersect(nearest$nn.idx[edges[x,1],], nearest$nn.idx[edges[x,2],]) + # jUnion <- union(nearest$nn.idx[edges[x,1],], nearest$nn.idx[edges[x,2],]) + # length(jInt) / length(jUnion) + # })) + edges$C <- jaccardDist + edges <- subset(edges, C != 0) + edges$C <- edges$C/max(edges$C) + } + edges <- Matrix::sparseMatrix(i = edges$A, j = edges$B, x = edges$C, dims = c(nrow(X),nrow(X)), symmetric = TRUE) + return(edges) + } + + assignClusters <- function(edges, jaccard) { + if (jaccard) { + weights <- TRUE + }else { + weights <- NULL + } + g <- igraph::graph.adjacency(edges, mode = "undirected", weighted = weights) + graphOut <- igraph::cluster_louvain(g) + clustAssign <- factor(graphOut$membership, levels = sort(unique(graphOut$membership))) + names(clustAssign) <- graphOut$names + k = order(table(clustAssign), decreasing = TRUE) + newLevels <- rep(1, length(unique(graphOut$membership))) + newLevels[k] <- seq_len(length(unique(graphOut$membership))) + levels(clustAssign) <- newLevels + clustAssign <- factor(clustAssign, levels = seq_len(length(unique(graphOut$membership)))) + return(paste0("Cluster", clustAssign)) + } + + require(RANN) + require(cluster) + require(igraph) + require(Matrix) + message("Running Louvian Jaccard Graph Clustering...") + message("Adapted from Comprehensive Classification of Retinal Bipolar Neurons by Single-Cell Transcriptomics. 
Cell 2016.") + + message("Calculating Edges...") + edges <- getEdges(X = matDR, knn = knn, jaccard = jaccard) + message("\nAssigning Clusters...") + clustAssign <- assignClusters(edges = edges, jaccard = jaccard) + + return(clustAssign) + +} + +#' Compute k-nearest neighbors: returns the matrix of neighbor indices in data for each query row, using nabor, RANN or FNN +#' @export +computeKNN <- function(data, query = NULL, k = 50, method = NULL, includeSelf = FALSE, ...){ + + if(is.null(query)){ + query <- data + searchSelf <- TRUE + }else{ + searchSelf <- FALSE + } + + if(is.null(method)){ + if(requireNamespace("nabor", quietly = TRUE)){ + method <- "nabor" + }else if(requireNamespace("RANN", quietly = TRUE)){ + method <- "RANN" + }else if(requireNamespace("FNN", quietly = TRUE)){ + method <- "FNN" + }else{ + stop("Computing KNN requires package nabor, RANN or FNN") + } + } + + if(tolower(method)=="nabor"){ + + .requirePackage("nabor") + if(searchSelf & !includeSelf){ + knnIdx <- nabor::knn(data = data, query = query, k = k + 1, ...)$nn.idx + knnIdx <- knnIdx[,-1] + }else{ + knnIdx <- nabor::knn(data = data, query = query, k = k, ...)$nn.idx + } + + }else if(tolower(method)=="rann"){ + + .requirePackage("RANN") + if(searchSelf & !includeSelf){ + knnIdx <- RANN::nn2(data = data, query = query, k = k + 1, ...)$nn.idx + knnIdx <- knnIdx[,-1] + }else{ + knnIdx <- RANN::nn2(data = data, query = query, k = k, ...)$nn.idx + } + + }else if(tolower(method)=="fnn"){ + + .requirePackage("FNN") + if(searchSelf & !includeSelf){ + knnIdx <- FNN::get.knnx(data = data, query = query, k = k + 1, ...)$nn.index + knnIdx <- knnIdx[,-1] + }else{ + knnIdx <- FNN::get.knnx(data = data, query = query, k = k, ...)$nn.index + } + + }else{ + + stop(sprintf("KNN Method %s not Recognized!", method)) + + } + + knnIdx + +} + + + + + + diff --git a/R/LatentSemanticIndexing.R b/R/LatentSemanticIndexing.R new file mode 100644 index 00000000..94818c90 --- /dev/null +++ b/R/LatentSemanticIndexing.R @@ -0,0 +1,527 @@ +#' Compute Iterative LSI +#' +#' This function will compute an iterative LSI dimensionality 
reduction +#' on an ArchRProject. +#' +#' @param ArchRProj ArchRProject +#' @param useMatrix use matrix for LSI clustering from Arrow +#' @param reducedDimsOut name under which the dimensionality reduction is stored +#' @param iterations number of LSI iterations to perform +#' @param dimsToUse number of dimensions to compute and use from LSI (TFIDF-SVD) for clustering +#' @param binarize binarize matrix prior to LSI +#' @param sampleCells number of cells to sample for LSI estimation +#' @param varFeatures number of variable features to use for LSI +#' @param selectionMethod selection method for variable features (var or vmr) +#' @param scaleTo scaleTo for Cluster Averages for variance calculation +#' @param totalFeatures number of features to consider (ranked by total number of counts) to use for LSI +#' @param filterQuantile filter features for initial LSI that are above this quantile +#' @param saveIterations save LSI iterations as rds in the outDir +#' @param outDir output directory for saving LSI iterations +#' @param clusterParams additional params to pass to IdentifyClusters +#' @param runHarmony run harmony batch correction through the iterations +#' @param harmonyParams additional params to pass to harmony +#' @param threads number of threads for parallel execution +#' @param seed seed for analysis +#' @param verboseHeader verbose sections +#' @param verboseAll verbose sections and subsections +#' @param force overwrite reducedDimsOut if it already exists in the ArchRProject (otherwise an error is raised) +#' @param ... 
additional args +#' @export +IterativeLSI <- function( + ArchRProj = NULL, + useMatrix = "TileMatrix", + reducedDimsOut = "IterativeLSI", + iterations = 3, + dimsToUse = 1:25, + binarize = TRUE, + sampleCells = 5000, + varFeatures = 50000, + selectionMethod = "var", + scaleTo = 10000, + totalFeatures = 500000, + filterQuantile = 0.99, + saveIterations = TRUE, + outDir = getOutputDirectory(ArchRProj), + clusterParams = list(), + runHarmony = FALSE, + harmonyParams = list(), + threads = 1, + seed = 1, + verboseHeader = TRUE, + verboseAll = FALSE, + force = FALSE, + ...){ + + .requirePackage("Matrix") + tstart <- Sys.time() + + if(!is.null(ArchRProj@reducedDims[[reducedDimsOut]])){ + if(!force){ + stop("Error ReducedDimsOut Already Exists! Set force = TRUE or pick a different name!") + } + } + + #What Parameters To Pass + defaultClustParams <- list( + method = "Seurat", + resolution = c(0.4, 0.6), + n.start = c(10, 10), + verbose = TRUE + ) + clusterParams <- .mergeParams(clusterParams, defaultClustParams) + + #Set Seed + set.seed(seed) + outDir <- file.path(outDir, reducedDimsOut) + dir.create(outDir, showWarnings = FALSE, recursive = TRUE) + + #All the Cell Names + cellNames <- rownames(getCellColData(ArchRProj)) + if(!is.null(sampleCells)){ + if(length(cellNames) < sampleCells){ + sampleCells <- NULL + } + } + + #Check if Matrix is supported + stopifnot(any(tolower(useMatrix) %in% c("tilematrix","peakmatrix"))) + if(tolower(useMatrix) == "tilematrix"){ + useMatrix <- "TileMatrix" + } + if(tolower(useMatrix) == "peakmatrix"){ + useMatrix <- "PeakMatrix" + } + + tstart <- Sys.time() + .messageDiffTime(paste0("Computing IterativeLSI on ", useMatrix), tstart, addHeader = TRUE, verbose = verboseHeader) + + #MatrixFiles + ArrowFiles <- getSampleColData(ArchRProj)[,"ArrowFiles"] + chrToRun <- .availableSeqnames(ArrowFiles, subGroup = useMatrix) + + #Compute Row Sums Across All Samples + .messageDiffTime("Computing Total Accessibility Across All Features", tstart, 
addHeader = verboseAll, verbose = verboseHeader) + totalAcc <- .getRowSums(ArrowFiles = ArrowFiles, useMatrix = useMatrix, seqnames = chrToRun) + gc() + + #Identify the top features, skipping the rmTop most accessible ones (logical mask so rmTop = 0 keeps all of them) + .messageDiffTime("Computing Top Features", tstart, addHeader = verboseAll, verbose = verboseHeader) + nFeature <- varFeatures[1] + rmTop <- floor((1-filterQuantile) * totalFeatures) + topIdx <- head(order(totalAcc$value, decreasing=TRUE)[seq_along(totalAcc$value) > rmTop], nFeature) + topFeatures <- totalAcc[sort(topIdx),] + + #Compute Partial Matrix LSI + outLSI <- .LSIPartialMatrix( + ArrowFiles = ArrowFiles, + featureDF = topFeatures, + cellNames = cellNames, + sampleNames = getCellColData(ArchRProj)$Sample, + dimsToUse = dimsToUse, + binarize = binarize, + sampleCells = sampleCells, + threads = threads, + useIndex = FALSE, + tstart = tstart + ) + outLSI$LSIFeatures <- topFeatures + gc() + + if(runHarmony){ + .messageDiffTime("Harmonizing LSI output on the Top Features", tstart, addHeader = verboseAll, verbose = verboseHeader) + .requirePackage("harmony") + harmonyParams$data_mat <- outLSI$matSVD + harmonyParams$meta_data <- data.frame(row.names = rownames(outLSI$matSVD), Group = stringr::str_split(rownames(outLSI$matSVD), pattern = "#", simplify=TRUE)[,1]) + harmonyParams$do_pca <- FALSE + harmonyParams$vars_use <- "Group" + harmonyParams$plot_convergence <- FALSE + harmonyParams$verbose <- verboseAll + #Harmonize the LSI Results + outLSI$matSVD <- do.call(HarmonyMatrix, harmonyParams) + } + + #Time to compute clusters + .messageDiffTime("Identifying Clusters", tstart, addHeader = verboseAll, verbose = verboseHeader) + parClust <- lapply(clusterParams, function(x) x[[1]]) + parClust$input <- outLSI$matSVD + parClust$sampleCells <- sampleCells + parClust$verbose <- verboseAll + clusters <- do.call(IdentifyClusters, parClust) + + #Save Output + if(saveIterations){ + .messageDiffTime("Saving LSI Iteration", tstart, addHeader = verboseAll, verbose = verboseHeader) + outj <- 
SimpleList(LSI = outLSI, clusters = clusters, params = parClust[-length(parClust)]) + saveRDS(outj, file.path(outDir, paste0("Save-LSI-Iteration-1.rds"))) + } + + j <- 1 + while(j < iterations){ + + #Jth iteration + j <- j + 1 + + .messageDiffTime(sprintf("Running LSI %s of %s on Variable Features", j, iterations), tstart, addHeader = TRUE, verbose = verboseHeader) + + #Create Group Matrix + .messageDiffTime("Creating Cluster Matrix on the total Group Features", tstart, addHeader = verboseAll, verbose = verboseHeader) + groupList <- SimpleList(split(rownames(outLSI$matSVD), clusters)) + groupFeatures <- totalAcc[sort(head(order(totalAcc$value, decreasing = TRUE), totalFeatures)),] + groupMat <- .getGroupMatrix( + ArrowFiles = ArrowFiles, + featureDF = groupFeatures, + threads = threads, + groupList = SimpleList(split(rownames(outLSI$matSVD), clusters)), + useIndex = FALSE, + verbose = verboseAll + ) + + if(length(varFeatures) > 1){ + nFeature <- varFeatures[j] + }else{ + nFeature <- varFeatures + } + + if(tolower(selectionMethod) == "var"){ + + #Log-Normalize + groupMat <- log2(t(t(groupMat) / colSums(groupMat)) * scaleTo + 1) + var <- matrixStats::rowVars(groupMat) + idx <- sort(head(order(var,decreasing=TRUE), nFeature)) + variableFeatures <- groupFeatures[idx,] + + }else if(tolower(selectionMethod) == "vmr"){ + + #Variance-to-Mean Ratio + vmr <- matrixStats::rowVars(groupMat) / rowMeans(groupMat) + idx <- sort(head(order(vmr, decreasing=TRUE), nFeature)) + variableFeatures <- groupFeatures[idx,] + + }else{ + + stop("Error Selection Method is not Valid requires var or vmr") + + } + + #Compute Partial Matrix LSI + outLSI <- .LSIPartialMatrix( + ArrowFiles = ArrowFiles, + featureDF = variableFeatures, + cellNames = cellNames, + sampleNames = getCellColData(ArchRProj)$Sample, + dimsToUse = dimsToUse, + binarize = binarize, + sampleCells = sampleCells, + threads = threads, + useIndex = FALSE, + tstart = tstart + ) + outLSI$LSIFeatures <- variableFeatures + + 
if(runHarmony){ + .messageDiffTime("Harmonizing LSI output on the Variable Features", tstart, addHeader = verboseAll, verbose = verboseHeader) + harmonyParams$data_mat <- outLSI$matSVD + harmonyParams$meta_data <- data.frame(row.names = rownames(outLSI$matSVD), Group = stringr::str_split(rownames(outLSI$matSVD), pattern = "#", simplify=TRUE)[,1]) + harmonyParams$do_pca <- FALSE + harmonyParams$vars_use <- "Group" + harmonyParams$plot_convergence <- FALSE + harmonyParams$verbose <- verboseAll + #Harmonize the LSI Results + outLSI$matSVD <- do.call(HarmonyMatrix, harmonyParams) + } + + if(j != iterations){ + + #Time to compute clusters + .messageDiffTime("Identifying Clusters", tstart, addHeader = verboseAll, verbose = verboseHeader) + parClust <- lapply(clusterParams, function(x){ + if(length(x) > 1){ + return(x[[j]]) + }else{ + return(x[[1]]) + } + }) + parClust$input <- outLSI$matSVD + parClust$sampleCells <- sampleCells + parClust$verbose <- verboseAll + clusters <- do.call(IdentifyClusters, parClust) + + #Save Output + if(saveIterations){ + .messageDiffTime("Saving LSI Iteration", tstart, addHeader = verboseAll, verbose = verboseHeader) + outj <- SimpleList(LSI = outLSI, clusters = clusters, params = parClust[-length(parClust)]) + saveRDS(outj, file.path(outDir, paste0("Save-LSI-Iteration-",j,".rds"))) + } + + } + + } + + #Organize Output + .messageDiffTime("Finished Running IterativeLSI", tstart, addHeader = verboseAll, verbose = verboseHeader) + ArchRProj@reducedDims[[reducedDimsOut]] <- outLSI + + return(ArchRProj) + +} + +.LSIPartialMatrix <- function( + ArrowFiles, + featureDF, + cellNames, + sampleNames, + dimsToUse, + binarize = TRUE, + sampleCells = 5000, + threads = 1, + useIndex = FALSE, + tstart = NULL, + verboseHeader = TRUE, + verboseAll = FALSE, + ... 
+ ){ + + if(is.null(tstart)){ + tstart <- Sys.time() + } + + if(is.null(sampleCells)){ + + #Construct Matrix + .messageDiffTime("Creating Partial Matrix of Top Features", tstart, addHeader = verboseAll, verbose = verboseHeader) + + mat <- .getPartialMatrix( + ArrowFiles = ArrowFiles, + featureDF = featureDF, + cellNames = cellNames, + doSampleCells = FALSE, + threads = threads, + verbose = verboseAll + ) + + #Compute LSI + .messageDiffTime("Running LSI on the Top Features", tstart, addHeader = verboseAll, verbose = verboseHeader) + outLSI <- computeLSI(mat, nDimensions = max(dimsToUse), binarize = binarize, verbose = verboseAll, tstart = tstart) + + }else{ + + set.seed(1) + .messageDiffTime("Sampling Cells for Estimated LSI", tstart, addHeader = verboseAll, verbose = verboseHeader) + sampleN <- floor(sampleCells * table(sampleNames) / length(sampleNames)) + splitCells <- split(cellNames, sampleNames) + sampledCellNames <- lapply(seq_along(splitCells), function(x){ + sample(splitCells[[x]], sampleN[names(splitCells)[x]]) + }) %>% unlist %>% sort + + #Construct Sampled Matrix + .messageDiffTime("Creating Sampled Partial Matrix of Top Features", tstart, addHeader = verboseAll, verbose = verboseHeader) + tmpPath <- tempfile() + o <- h5closeAll() + out <- .getPartialMatrix( + ArrowFiles = ArrowFiles, + featureDF = featureDF, + cellNames = cellNames, + doSampleCells = TRUE, + sampledCellNames = sampledCellNames, + tmpPath = tmpPath, + useIndex = useIndex, + threads = threads, + verbose = verboseAll + ) + gc() + + #Perform LSI on Partial Sampled Matrix + .messageDiffTime("Running Sampled LSI on the Top Features", tstart, addHeader = verboseAll, verbose = verboseHeader) + outLSI <- computeLSI(out$mat, nDimensions = max(dimsToUse), binarize = binarize, verbose = verboseAll, tstart = tstart) + tmpMatFiles <- out[[2]] + rm(out) + gc() + + #Read In Matrices and Project into Manifold + .messageDiffTime("Projecting Matrices with the Top Features", tstart, addHeader = verboseAll, 
verbose = verboseHeader) + pLSI <- lapply(seq_along(tmpMatFiles), function(x){ + projectLSI(mat = readRDS(tmpMatFiles[x]), LSI = outLSI, verbose = FALSE, tstart = tstart) + }) %>% Reduce("rbind", .) + + #Remove Temporary Matrices + rmf <- file.remove(tmpMatFiles) + + #Set To LSI the SVD Matrices + outLSI$exlcude <- cellNames[which(cellNames %ni% rownames(pLSI))] + outLSI$matSVD <- as.matrix(pLSI[cellNames[which(cellNames %in% rownames(pLSI))],]) + + } + + return(outLSI) + +} + +#' Compute LSI +#' +#' This function will compute a LSI transform (TF-IDF followed by SVD) +#' +#' @param mat sparseMatrix (dgcMatrix) for LSI +#' @param nDimensions number of LSI dimensions to compute +#' @param binarize binarize matrix prior to LSI +#' @param seed seed for analysis +#' @param verbose verbose +#' @param tstart time stamp to pass +#' @param ... additional args +#' @export +computeLSI <- function(mat, nDimensions = 50, binarize = TRUE, seed = 1, verbose = TRUE, tstart = NULL, ...){ + + set.seed(seed) + + if(is.null(tstart)){ + tstart <- Sys.time() + } + + .messageDiffTime(sprintf("Running LSI, Input Matrix = %s GB", round(object.size(mat)/10^9, 3)), tstart, addHeader = verbose, verbose = verbose) + + #TF IDF LSI adapted from flyATAC + if(binarize){ + .messageDiffTime("Binarizing Matrix", tstart, addHeader = FALSE, verbose = verbose) + mat@x[mat@x > 0] <- 1 + } + + #Clean up zero rows + .messageDiffTime("Removing 0 Sum Rows", tstart, addHeader = FALSE, verbose = verbose) + rowSm <- Matrix::rowSums(mat) + idx <- which(rowSm > 0) + mat <- mat[idx,] + rowSm <- rowSm[idx] + + #TF + .messageDiffTime("Computing Term Frequency", tstart, addHeader = FALSE, verbose = verbose) + colSm <- Matrix::colSums(mat) + if(any(colSm == 0)){ + exclude <- which(colSm==0) + mat <- mat[,-exclude] + colSm <- colSm[-exclude] + }else{ + exclude <- c() + } + mat@x <- mat@x / rep.int(colSm, Matrix::diff(mat@p)) + + #IDF + .messageDiffTime("Computing Inverse Document Frequency", tstart, addHeader = FALSE, 
verbose = verbose) + idf <- as(log(1 + ncol(mat) / rowSm), "sparseVector") + + #TF-IDF + .messageDiffTime("Computing TF-IDF Matrix", tstart, addHeader = FALSE, verbose = verbose) + mat <- as(Matrix::Diagonal(x=as.vector(idf)), "sparseMatrix") %*% mat + gc() + + #Calc SVD then LSI + .messageDiffTime("Computing SVD using irlba", tstart, addHeader = FALSE, verbose = verbose) + svd <- irlba::irlba(mat, nDimensions, nDimensions) + svdDiag <- matrix(0, nrow=nDimensions, ncol=nDimensions) + diag(svdDiag) <- svd$d + matSVD <- t(svdDiag %*% t(svd$v)) + rownames(matSVD) <- colnames(mat) + colnames(matSVD) <- paste0("PC",seq_len(ncol(matSVD))) + + #Return Object + .messageDiffTime("Finished LSI (TF-IDF SVD) using irlba", tstart, addHeader = FALSE, verbose = verbose) + out <- SimpleList( + matSVD = matSVD, + rowSm = rowSm, + colSm = colSm, + exclude = exclude, + idx = idx, + svd = svd, + binarize = binarize, + nDimensions = nDimensions, + date = Sys.Date(), + seed = seed + ) + + rm(mat) + gc() + + out +} + +#' Project LSI +#' +#' This function will compute a LSI Projection (TF-IDF followed by SVD projection) +#' +#' @param mat sparseMatrix (dgcMatrix) for LSI +#' @param LSI previous LSI transform to project into +#' @param returnModel return projection information +#' @param verbose verbose +#' @param tstart time stamp to pass +#' @param ... 
additional args
#' @export
projectLSI <- function(mat, LSI, returnModel = FALSE, verbose = TRUE, tstart = NULL, ...){

  # Projects a new feature-by-cell sparse matrix into an existing LSI space.
  #
  # Fields of `LSI` read below: seed, idx, binarize, colSm, rowSm, svd$u, svd$d
  # and nDimensions. Returns the projected coordinates (cells x dimensions,
  # columns named PC1..PCn); when returnModel = TRUE returns
  # list(matSVD, V, X) where X is the matrix re-built from the stored SVD.
  #
  # Cells (columns) whose column sum is zero after feature subsetting are
  # removed, so the result can have fewer rows than ncol(mat).

  require(Matrix)
  set.seed(LSI$seed)

  if(is.null(tstart)){
    tstart <- Sys.time()
  }

  # Local shorthand so every progress message shares tstart / verbose.
  say <- function(msg, header = FALSE){
    .messageDiffTime(msg, tstart, addHeader = header, verbose = verbose)
  }

  say(sprintf("Projecting LSI, Input Matrix = %s GB", round(object.size(mat)/10^9, 3)), header = verbose)

  #Get Same Features
  say("Subsetting by Non-Zero features in inital Matrix")
  mat <- mat[LSI$idx, , drop = FALSE]

  #Binarize Matrix
  if(LSI$binarize){
    say("Binarizing Matrix")
    mat@x[mat@x > 0] <- 1
  }

  #TF
  say("Computing Term Frequency")
  colSm <- Matrix::colSums(mat)
  emptyCols <- which(colSm == 0)
  if(length(emptyCols) > 0){
    # drop = FALSE keeps a sparse matrix even if a single column survives;
    # without it the @x / @p accesses below would fail on a plain vector.
    mat <- mat[, -emptyCols, drop = FALSE]
    colSm <- colSm[-emptyCols]
  }
  # diff(mat@p) is the number of stored entries per column, so this divides
  # every stored value by the sum of its own column.
  mat@x <- mat@x / rep.int(colSm, Matrix::diff(mat@p))

  #IDF (taken from the original model, not from the new matrix)
  say("Computing Inverse Document Frequency of initial Matrix")
  idf <- as(log(1 + length(LSI$colSm) / LSI$rowSm), "sparseVector")

  #TF-IDF
  say("Computing TF-IDF Transform")
  mat <- as(Matrix::Diagonal(x = as.vector(idf)), "sparseMatrix") %*% mat

  #Clean Up Matrix
  idxNA <- Matrix::which(is.na(mat), arr.ind = TRUE)
  # arr.ind = TRUE yields one (row, col) pair per NA, so the NA count is the
  # number of rows; length() would report twice that.
  nNA <- nrow(idxNA)
  if(nNA > 0){
    say(sprintf("Zeroing %s NA elements", nNA))
    mat[idxNA] <- 0
  }

  #Calc V
  say("Calculating V Matrix")
  # nrow is given explicitly: diag() on a length-one vector does not build a
  # 1 x 1 diagonal matrix.
  dInv <- diag(x = 1 / LSI$svd$d, nrow = length(LSI$svd$d))
  V <- Matrix::t(mat) %*% LSI$svd$u %*% dInv

  #LSI Diagonal
  say("Computing Projected Coordinates")
  svdDiag <- matrix(0, nrow = LSI$nDimensions, ncol = LSI$nDimensions)
  diag(svdDiag) <- LSI$svd$d
  matSVD <- Matrix::t(svdDiag %*% Matrix::t(V))
  matSVD <- as.matrix(matSVD)
  rownames(matSVD) <- colnames(mat)
  colnames(matSVD) <- paste0("PC", seq_len(ncol(matSVD)))

  if(returnModel){
    say("Calculating Re-Projected Matrix")
    # svdDiag is the diagonal matrix of singular values built above.
    X <- LSI$svd$u %*% svdDiag %*% Matrix::t(V)
    out <- list(matSVD = matSVD, V = V, X = X)
  }else{
    out <- matSVD
  }

  return(out)
}



diff --git a/R/MarkerFeatures.R b/R/MarkerFeatures.R new file mode 100644 index 00000000..17320c7b --- /dev/null +++ b/R/MarkerFeatures.R @@ -0,0 +1,527 @@ +#' Identify Marker Features for each Group +#' +#' This function will identify a null set of cells that match biases per cell +#' while maintaining the input group proportions. Then it will compute a pairwise +#' test of the group vs the null set. +#' +#' @param ArchRProj ArchR Project +#' @param groupBy group cells by this column in cellColData +#' @param useGroups use subset of groups in group column in cellColData +#' @param useMatrix matrix name in Arrow Files that will be used for identifying features +#' @param bias biases to account for in selecting null group using info from cellColData +#' @param normBy normalize by column in cellColData prior to test +#' @param testMethod pairwise test method group vs null +#' @param minCells minimum cells per group for testing +#' @param maxCells maximum cells per group for testing +#' @param k knn for matching cell biases +#' @param bufferRatio buffering ratio for matching cell biases +#' @param binarize binarize prior to testing +#' @param method marker identification method +#' @param useSeqnames specific seqnames to use only +#' @param verboseHeader verbose sections +#' @param verboseAll verbose sections and subsections +#' @param ... 
additional args
#' @export
markerFeatures <- function(
  ArchRProj = NULL,
  groupBy = "Clusters",
  useGroups = NULL,
  useMatrix = "GeneScoreMatrix",
  bias = c("TSSEnrichment", "log10(nFrags)"),
  normBy = NULL,
  testMethod = "wilcoxon",
  minCells = 50,
  maxCells = 500,
  threads = 1,
  k = 100,
  bufferRatio = 0.8,
  binarize = FALSE,
  useSeqnames = NULL,
  method = "ArchR",
  verboseHeader = TRUE,
  verboseAll = FALSE,
  ...
  ){

  # Entry point: collects its own arguments, dispatches on `method`, and stores
  # the arguments (minus the project) in metadata(out)$Params.

  # Snapshot of this call's formals by name. The list must start from the
  # formals alone: `args` is not defined beforehand, so appending to it would
  # splice base::args into the call and into the stored Params.
  args <- mget(names(formals()), sys.frame(sys.nframe()))

  if(tolower(method) == "archr"){

    out <- do.call(.MarkersSC, args)

  }else if(tolower(method) == "venice"){

    .requirePackage("signac", installInfo = 'devtools::install_github("bioturing/signac")')
    # No result is computed for this method; fail here with a clear message
    # rather than later on an undefined `out`.
    stop("Method 'venice' is not implemented yet!")

  }else{

    stop("Input Method Not Available!")

  }

  args$ArchRProj <- NULL
  metadata(out)$Params <- args

  return(out)

}

##################################################################################################
# Single Cell Implementation!
#
# Builds, for every group, a bias-matched background set of cells
# (.matchBiasCellGroups), runs the per-feature test for each group against its
# background (.testMarkerSC) and packs the per-group statistics into a
# SummarizedExperiment (features x groups).
#
# NOTE(review): minCells, markerParams and verboseHeader are accepted but not
# used in this body.
##################################################################################################
.MarkersSC <- function(
  ArchRProj = NULL,
  groupBy = "Clusters",
  useGroups = NULL,
  normBy = NULL,
  minCells = 50,
  maxCells = 500,
  bufferRatio = 0.8,
  useSeqnames = NULL,
  bias = NULL,
  k = 100,
  threads = 8,
  binarize = FALSE,
  testMethod = "wilcoxon",
  useMatrix = "GeneScoreMatrix",
  markerParams = list(),
  verboseHeader = TRUE,
  verboseAll = FALSE,
  ...
  ){

  tstart <- Sys.time()

  #####################################################
  # Feature Info
  #####################################################
  ArrowFiles <- getArrowFiles(ArchRProj)
  featureDF <- .getFeatureDF(ArrowFiles, useMatrix)
  if(!is.null(useSeqnames)){
    featureDF <- featureDF[BiocGenerics::which(featureDF$seqnames %bcin% useSeqnames),]
  }
  if(all(c("deviations","z") %in% unique(paste0(featureDF$seqnames)))){
    message("Detected using deviations matrix without using deviations or z!\nDefaulting to using z values!\nTo use deviations set useSeqnames='deviations'")
    featureDF <- featureDF[BiocGenerics::which(featureDF$seqnames %bcin% "z"),]
  }

  #####################################################
  # Match Bias Groups
  #####################################################
  .messageDiffTime("Matching Known Biases", tstart, addHeader = verboseAll)
  cellInfo <- getCellColData(ArchRProj)
  groups <- getCellColData(ArchRProj, groupBy, drop = TRUE)
  if(!is.null(useGroups)){
    if(any(useGroups %ni% groups)){
      stop("Not all useGroups in Group names!")
    }
    # Subset the cell table together with the labels: .matchBiasCellGroups
    # indexes rows of `input` with positions computed from `groups`, so the
    # two must stay the same length and order.
    keepCells <- groups %in% useGroups
    groups <- groups[keepCells]
    cellInfo <- cellInfo[keepCells, , drop = FALSE]
  }

  matchObj <- .matchBiasCellGroups(
    input = cellInfo,
    groups = groups,
    bias = bias,
    k = k,
    n = maxCells,
    bufferRatio = bufferRatio
  )

  #####################################################
  # Pairwise Test Per Seqnames
  #####################################################
  .messageDiffTime("Computing Pairwise Tests", tstart, addHeader = verboseAll)
  # NOTE(review): a user-supplied normBy currently yields no norm factors;
  # factors are only derived (from ReadsInTSS) for tile/peak matrices when
  # normBy is NULL. Confirm whether that is intended.
  if(is.null(normBy)){
    if(tolower(useMatrix) %in% c("tilematrix","peakmatrix")){
      normBy <- "ReadsInTSS"
      normFactors <- getCellColData(ArchRProj, normBy, drop=FALSE)
      normFactors[,1] <- median(normFactors[,1]) / normFactors[,1]
    }else{
      normFactors <- NULL
    }
  }else{
    normFactors <- NULL
  }

  groupNames <- names(matchObj[[1]])
  diffList <- .safelapply(seq_along(groupNames), function(x){
    .testMarkerSC(
      ArrowFiles = ArrowFiles,
      matchObj = matchObj,
      group = groupNames[x],
      testMethod = testMethod,
      threads = 1,
      useMatrix = useMatrix,
      featureDF = featureDF,
      normFactors = normFactors,
      binarize = binarize
    )
  }, threads = threads)

  #####################################################
  # Summarize Output
  #####################################################
  # One column per group for a given statistic; do.call(cbind, ...) returns a
  # matrix even when there is only one group.
  bindStat <- function(stat){
    do.call(cbind, lapply(diffList, function(d) d[[stat]]))
  }

  if(tolower(testMethod) == "wilcoxon"){
    pse <- SummarizedExperiment::SummarizedExperiment(
      assays =
        SimpleList(
          Log2FC = bindStat("log2FC"),
          Mean = bindStat("mean1"),
          FDR = bindStat("fdr"),
          AUC = bindStat("auc"),
          MeanBDG = bindStat("mean2")
        ),
      rowData = featureDF
    )
  }else if(tolower(testMethod) == "ttest"){
    # No AUC assay here: .sparseMatTTest does not return an `auc` column.
    pse <- SummarizedExperiment::SummarizedExperiment(
      assays =
        SimpleList(
          Log2FC = bindStat("log2FC"),
          Mean = bindStat("mean1"),
          Variance = bindStat("var1"),
          FDR = bindStat("fdr"),
          MeanBDG = bindStat("mean2"),
          VarianceBDG = bindStat("var2")
        ),
      rowData = featureDF
    )
  }else{
    stop("Error Unrecognized Method!")
  }
  colnames(pse) <- groupNames

  .messageDiffTime("Completed Pairwise Tests", tstart, addHeader = TRUE)

  return(pse)

}

# For every group, pick background cells from the other groups that are near
# neighbours (computeKNN on the quantile-normalised bias columns) of that
# group's cells, drawing from the other groups in proportion to their size.
#
# input       : cell metadata; coerced to data.frame; each entry of `bias` is
#               parsed and evaluated against its columns
# groups      : group label per row of `input` (same length and order)
# bias        : character vector of column names / expressions, e.g. "log10(nFrags)"
# k           : neighbours requested per foreground cell
# n           : cap on matched pairs per group
# bufferRatio : a group may use at most this fraction of its own cells
#
# Returns SimpleList(matchBdg = per-group match results, info = inputs used).
# The RNG is re-seeded with set.seed(1) on every call.
#
# NOTE(review): with a single `bias` entry Reduce("cbind", .) returns a plain
# vector and the ncol()/colnames() calls below would not work - confirm callers
# always pass at least two.
.matchBiasCellGroups <- function(input, groups, bias, k = 100, n = 500, bufferRatio = 0.8){

  #Summary Function: median / mean / sd / quartiles per column of m
  .summarizeColStats <- function(m, name = NULL){
    med <- apply(m, 2, median)
    mean <- colMeans(m)
    sd <- apply(m, 2, sd)
    loQ <- apply(m, 2, function(x) quantile(x, 0.25))
    hiQ <- apply(m, 2, function(x) quantile(x, 0.75))
    summaryDF <- t(data.frame(
      median = med,
      mean = mean,
      sd = sd,
      lowerQuartile = loQ,
      upperQuartile = hiQ
    )) %>% data.frame
    colnames(summaryDF) <- colnames(m)
    if(!is.null(name)){
      summaryDF$name <- name
    }
    summaryDF
  }

  #Set Seed
  set.seed(1)

  #Make sure input is dataframe
  input <- data.frame(input)

  #Norm using input string ie log10(nfrags)
  inputNorm <- lapply(seq_along(bias), function(x){
    plyr::mutate(input, o=eval(parse(text=bias[x])))$o
  }) %>% Reduce("cbind", .)

  #Quantile Normalization
  inputNormQ <- lapply(seq_len(ncol(inputNorm)), function(x){
    .getQuantiles(inputNorm[,x])
  }) %>% Reduce("cbind", .)

  #Add Colnames
  colnames(inputNorm) <- bias
  colnames(inputNormQ) <- bias

  #Get proportion of each group
  prob <- table(groups) / length(groups)

  pb <- txtProgressBar(min=0,max=100,initial=0,style=3)
  matchList <- lapply(seq_along(prob), function(x){

    setTxtProgressBar(pb,round(x*100/length(prob),0))

    #############
    # Organize
    #############
    # probx: share of each *other* group among the non-foreground cells
    probx <- prob[-x]/sum(prob[-x])
    id <- which(groups==names(prob)[x])
    # rows of knnx = foreground cells, entries index into the non-foreground rows
    knnx <- computeKNN(inputNormQ[-id,],inputNormQ[id,], k = k)
    # visit foreground cells in random order
    sx <- sample(seq_len(nrow(knnx)), nrow(knnx))
    minTotal <- min(n, length(sx) * bufferRatio)
    # nx: remaining quota per background group, smallest first
    nx <- sort(floor(minTotal * probx))

    ###############
    # ID Matching
    ###############
    idX <- c()
    idY <- c()
    it <- 0

    # An exhausted (or zero) quota is marked Inf so it sorts last
    if(any(nx <= 0)){
      nx[which(nx <= 0)] <- Inf
      nx <- sort(nx)
    }

    while(it < length(sx) & length(idX) < minTotal){

      it <- it + 1
      knnit <- knnx[sx[it],]
      groupit <- match(groups[-id][knnit],names(nx))
      selectUnique <- FALSE
      selectit <- 0
      oit <- order(groupit)

      # Try candidates one at a time until one is from a group with quota left
      # and has not been used as background yet; give up at the k-th attempt.
      while(!selectUnique){
        selectit <- selectit + 1
        itx <- which(oit==selectit)
        cellx <- knnit[itx]
        groupitx <- groupit[itx]
        if(is.infinite(nx[groupitx])){
          if(selectit == k){
            itx <- NA
            cellx <- NA
            selectUnique <- TRUE
          }
        }else{
          if(cellx %ni% idY){
            selectUnique <- TRUE
          }
          if(selectit == k){
            itx <- NA
            cellx <- NA
            selectUnique <- TRUE
          }
        }
      }

      if(!is.na(itx)){
        idX <- c(idX, sx[it])
        idY <- c(idY, cellx)
        nx[groupitx] <- nx[groupitx] - 1
        if(any(nx <= 0)){
          nx[which(nx <= 0)] <- Inf
          nx <- sort(nx)
        }
      }

      # All quotas used up: force the outer loop to stop
      if(all(is.infinite(nx))){
        it <- length(sx)
      }

    }

    #####################
    # Convert Back to Normal Indexing
    #####################
    idX <- seq_len(nrow(inputNormQ))[id][idX]
    idY <- seq_len(nrow(inputNormQ))[-id][idY]

    #####################
    # Matching Stats Groups
    #####################
    estBdg <- sort(floor(minTotal * probx))
    obsBdg <- rep(0, length(estBdg))
    names(obsBdg) <- names(estBdg)
    tabGroups <- table(groups[idY])
    obsBdg[names(tabGroups)] <- tabGroups
    estBdgP <- round(100 * estBdg / sum(estBdg),3)
    obsBdgP <- round(100 * obsBdg / sum(obsBdg),3)

    #####################
    # Matching Stats Bias Norm Values
    #####################
    forBias <- .summarizeColStats(inputNorm[idX,], name = "foreground")
    bdgBias <- .summarizeColStats(inputNorm[idY,], name = "background")

    out <- list(
      cells = idX,
      bdg = idY,
      summaryCells = forBias,
      summaryBdg = bdgBias,
      bdgGroups = rbind(estBdg, obsBdg),
      bdgGroupsProbs = rbind(estBdgP, obsBdgP),
      corBdgGroups = cor(estBdgP, obsBdgP),
      n = length(sx),
      p = it / length(sx),
      group = names(prob)[x]
    )

    return(out)

  }) %>% SimpleList
  names(matchList) <- names(prob)

  message("\n")

  outList <- SimpleList(
    matchBdg = matchList,
    info = SimpleList(
      cells = rownames(input),
      groups = groups,
      biasNorm = inputNorm,
      biasNormQ = inputNormQ
    )
  )

  return(outList)

}

# Runs the chosen test for one group against its matched background, one
# seqname at a time, and returns a data.frame with one row per feature (in
# featureDF order). `fdr` is recomputed across all seqnames at the end, using
# only features whose mean1 + mean2 is non-zero; other rows get NA.
.testMarkerSC <- function(ArrowFiles, matchObj, group = NULL, testMethod = "ttest", useMatrix,
  threads = 1, featureDF, binarize = FALSE, normFactors = NULL){

  matchx <- matchObj[[1]][[group]]
  cellsx <- matchObj[[2]]$cells[matchx$cells]
  bdgx <- matchObj[[2]]$cells[matchx$bdg]

  # NOTE(review): the per-cell norm factors are looked up but never applied to
  # the matrices below - confirm the intended scaling before wiring them in.
  if(!is.null(normFactors)){
    cellNF <- normFactors[cellsx,1]
    bdgNF <- normFactors[bdgx,1]
  }

  #Add RowNames for Check at the end
  rownames(featureDF) <- paste0("f", seq_len(nrow(featureDF)))
  seqnames <- unique(featureDF$seqnames)

  pairwiseDF <- lapply(seq_along(seqnames), function(y){

    featureDFy <- featureDF[BiocGenerics::which(featureDF$seqnames %bcin% seqnames[y]), ]

    scMaty <- suppressMessages(.getPartialMatrix(
      ArrowFiles,
      featureDF = featureDFy,
      threads = threads,
      useMatrix = useMatrix,
      cellNames = c(cellsx, bdgx),
      progress = FALSE
    ))
    rownames(scMaty) <- rownames(featureDFy)

    if(binarize){
      scMaty@x[scMaty@x > 0] <- 1
    }

    args <- list()
    args$mat1 <- scMaty[, cellsx, drop=FALSE]
    args$mat2 <- scMaty[, bdgx, drop=FALSE]

    if(tolower(testMethod) == "wilcoxon"){

      .suppressAll(do.call(.sparseMatWilcoxon, args))

    }else if(tolower(testMethod) == "ttest"){

      .suppressAll(do.call(.sparseMatTTest, args))

    }else{

      stop("Error Unrecognized Method!")

    }

  }) %>% Reduce("rbind", .)

  idxFilter <- rowSums(pairwiseDF[,c("mean1","mean2")]) != 0
  pairwiseDF$fdr <- NA
  pairwiseDF$fdr[idxFilter] <- p.adjust(pairwiseDF$pval[idxFilter], method = "fdr")
  # Re-order by the "f<i>" row names assigned above so rows follow featureDF
  pairwiseDF <- pairwiseDF[rownames(featureDF), , drop = FALSE]
  pairwiseDF

}

#Wilcoxon Row-wise two matrices
# mat1 = foreground, mat2 = background (features x cells, sparse).
# Calls wilcoxauc() on the column-bound matrix with labels "Top"/"Bot" and
# keeps the "Top" rows. `offset` is a small pseudo-count (0.1% of the 99th
# percentile of the stored values) added before taking log2.
.sparseMatWilcoxon <- function(mat1, mat2){
  offset <- quantile(c(mat1@x,mat2@x), 0.99) * 10^-3
  .requirePackage("presto", installInfo = 'devtools::install_github("immunogenomics/presto")')
  df <- wilcoxauc(cbind(mat1,mat2), c(rep("Top", ncol(mat1)),rep("Bot", ncol(mat2))))
  df <- df[which(df$group=="Top"),]
  out <- data.frame(
    log2Mean = log2(df$avgExpr + offset),
    log2FC = df$logFC,
    fdr = df$padj,
    pval = df$pval,
    mean1 = Matrix::rowMeans(mat1, na.rm=TRUE),
    mean2 = Matrix::rowMeans(mat2, na.rm=TRUE),
    n = ncol(mat1),
    auc = df$auc
  )
  return(out)
}

#T-Test Row-wise two matrices
# Pooled-variance two-sample t statistic per row; m0 is the difference in
# means under the null. Returns means and variances of both matrices,
# log2 fold change, two-sided p-value and its FDR adjustment.
.sparseMatTTest <- function(mat1, mat2, m0 = 0){
  offset <- quantile(c(mat1@x,mat2@x), 0.99) * 10^-3
  #Get Population Values
  n1 <- ncol(mat1)
  n2 <- ncol(mat2)
  n <- n1 + n2
  #Sparse Row Means
  m1 <- Matrix::rowMeans(mat1, na.rm=TRUE)
  m2 <- Matrix::rowMeans(mat2, na.rm=TRUE)
  #Sparse Row Variances
  v1 <- ArchR:::computeSparseRowVariances(mat1@i + 1, mat1@x, m1, n1)
  v2 <- ArchR:::computeSparseRowVariances(mat2@i + 1, mat2@x, m2, n2)
  #Calculate T Statistic
  se <- sqrt( (1/n1 + 1/n2) * ((n1-1)*v1 + (n2-1)*v2)/(n1+n2-2) )
  tstat <- (m1-m2-m0)/se
  pvalue <- 2*pt(-abs(tstat), n - 2)
  fdr <- p.adjust(pvalue, method = "fdr")
  out <- data.frame(
    log2Mean = log2(((m1+offset) + (m2+offset)) / 2),
    log2FC = log2((m1+offset)/(m2+offset)),
    fdr = fdr,
    pval = pvalue,
    mean1 = m1,
    mean2 = m2,
    var1 = v1, # variance of mat1 (foreground)
    var2 = v2, # variance of mat2 (background)
    n = n1
  )
  return(out)
}

#Binomial Test Row-wise two matrices
# Per row: binom.test of the mat1 row sum out of n1 trials against the mat2
# row mean as the hypothesised probability (two-sided).
# NOTE(review): this only makes sense when the row sum cannot exceed n1 and
# the mat2 row mean lies in [0, 1], i.e. presumably binarized input - confirm.
.sparseMatBinomTest <- function(mat1, mat2){
  offset <- quantile(c(mat1@x,mat2@x), 0.99) * 10^-3
  #Get Population Values
  n1 <- ncol(mat1)
  n2 <- ncol(mat2)
  n <- n1 + n2
  #Sparse Row Stats
  s1 <- Matrix::rowSums(mat1, na.rm=TRUE)
  m1 <- s1 / n1
  m2 <- Matrix::rowMeans(mat2, na.rm=TRUE)
  #Combute Binom.test
  pb <- txtProgressBar(min=0,max=100,initial=0,style=3)
  pval <- sapply(seq_along(s1), function(x){
    setTxtProgressBar(pb,round(x*100/length(s1),0))
    binom.test(s1[x], n1, m2[x], alternative="two.sided")$p.value
  })
  fdr <- p.adjust(pval, method = "fdr", length(pval))
  out <- data.frame(
    log2Mean = log2(((m1+offset) + (m2+offset)) / 2),
    log2FC = log2((m1+offset) / (m2+offset)),
    fdr = fdr,
    pval = pval,
    mean1 = m1,
    mean2 = m2,
    n = n1
  )
  return(out)
}





diff --git a/R/MarkerHeatmap.R b/R/MarkerHeatmap.R new file mode 100644 index 00000000..b5ec2d99 --- /dev/null +++ b/R/MarkerHeatmap.R @@ -0,0 +1,445 @@ +#' Plot a Heatmap of Identified Marker Features +#' +#' This function will plot a heatmap of the results from markerFeatures +#' +#' @param seMarker Summarized Experiment result from markerFeatures +#' @param FDR False-Discovery Rate Cutoff to Be called a Marker +#' @param log2FC Log2 Fold Change Cutoff to Be called a Marker +#' @param log2Norm log2 Normalization prior to plotting set true for counting assays (not DeviationsMatrix!) 
+#' @param scaleTo scale to prior to log2 Normalization, if log2Norm is FALSE this does nothing +#' @param scaleRows compute row z-scores on matrix +#' @param limits heatmap color limits +#' @param grepExclude remove features by grep +#' @param pal palette for heatmap, default will use solar_extra +#' @param binaryClusterRows fast clustering implementation for row clustering by binary sorting +#' @param labelMarkers label specific markers by name on heatmap (matches rownames of seMarker) +#' @param labelTop label the top features for each column in seMarker +#' @param labelRows label all rows +#' @param returnMat return final matrix that is used for plotting heatmap +#' @param ... additional args +#' @export +markerHeatmap <- function( + seMarker, + FDR = 0.001, + log2FC = 0.1, + log2Norm = TRUE, + scaleTo = 10^4, + scaleRows = TRUE, + limits = c(-2,2), + grepExclude = NULL, + pal = NULL, + binaryClusterRows = TRUE, + labelMarkers = NULL, + labelTop = NULL, + labelRows = FALSE, + returnMat = FALSE, + ... 
+ ){ + + passMat <- SummarizedExperiment::assays(seMarker)[["Log2FC"]] >= log2FC & SummarizedExperiment::assays(seMarker)[["FDR"]] <= FDR + mat <- SummarizedExperiment::assays(seMarker)[["Mean"]] + idx <- which(rowSums(passMat, na.rm = TRUE) > 0 & matrixStats::rowVars(mat) != 0) + if(log2Norm){ + mat <- log2(t(t(mat)/colSums(mat)) * scaleTo + 1) + } + mat <- mat[idx,] + passMat <- passMat[idx,] + + if(scaleRows){ + mat <- sweep(mat - rowMeans(mat), 1, matrixStats::rowSds(mat), `/`) + mat[mat > max(limits)] <- max(limits) + mat[mat < min(limits)] <- min(limits) + } + + if(nrow(mat) == 0){ + stop("No Makers Found!") + } + + #add rownames + rd <- SummarizedExperiment::rowData(seMarker)[idx,] + if(is.null(rd$name)){ + rn <- paste0(rd$seqnames,":",rd$start,"-",rd$end) + }else{ + if(sum(duplicated(rd$name)) > 0){ + rn <- paste0(rd$seqnames,":",rd$name) + }else{ + rn <- rd$name + } + } + rownames(mat) <- rn + rownames(passMat) <- rn + + #identify to remove + if(!is.null(grepExclude) & !is.null(rownames(mat))){ + idx2 <- which(!grepl(grepExclude, rownames(mat))) + mat <- mat[idx2,] + } + + if(nrow(mat)==0){ + stop("No Makers Found!") + } + + if(!is.null(labelTop)){ + spmat <- passMat / rowSums(passMat) + idx2 <- lapply(seq_len(ncol(spmat)), function(x){ + head(order(spmat[,x], decreasing = TRUE), labelTop) + }) %>% unlist %>% unique %>% sort + mat <- mat[idx2,] + labelRows <- TRUE + } + + if(binaryClusterRows){ + bS <- .binarySort(mat, lmat = passMat[rownames(mat), colnames(mat)]) + mat <- bS[[1]][,colnames(mat)] + clusterRows <- FALSE + clusterCols <- bS[[2]] + }else{ + clusterRows <- TRUE + clusterCols <- TRUE + } + + if(!is.null(labelMarkers)){ + mn <- match(tolower(labelMarkers), tolower(rownames(mat)), nomatch = 0) + mn <- mn[mn > 0] + }else{ + mn <- NULL + } + + if(nrow(mat) == 0){ + stop("No Makers Found!") + } + + message(sprintf("Identified %s markers!", nrow(mat))) + + if(is.null(pal)){ + if(is.null(metadata(seMarker)$Params$useMatrix)){ + pal <- 
paletteContinuous(set = "solar_extra", n = 100) + }else if(tolower(metadata(seMarker)$Params$useMatrix)=="genescorematrix"){ + pal <- paletteContinuous(set = "viridis", n = 100) + }else{ + pal <- paletteContinuous(set = "solar_extra", n = 100) + } + } + + ht <- .ArchRHeatmap( + mat = mat, + scale = FALSE, + limits = c(min(mat), max(mat)), + color = pal, + clusterCols = clusterCols, + clusterRows = clusterRows, + labelRows = labelRows, + labelCols = TRUE, + customRowLabel = mn, + showColDendrogram = TRUE, + ... + ) + + if(returnMat){ + return(mat) + }else{ + return(0) + } + +} + +######################################################################################################## +# Helpers for Nice Heatmap with Bioconductors ComplexHeamtap +######################################################################################################## + +.ArchRHeatmap <- function( + mat, + scale = FALSE, + limits = c(min(mat), max(mat)), + colData = NULL, + color = paletteContinuous(set = "solar_extra", n = 100), + clusterCols = TRUE, + clusterRows = FALSE, + labelCols = FALSE, + labelRows = FALSE, + colorMap = NULL, + useRaster = TRUE, + rasterQuality = 5, + split = NULL, + fontsize = 6, + colAnnoPerRow = 4, + showRowDendrogram = FALSE, + showColDendrogram = FALSE, + customRowLabel = NULL, + customRowLabelIDs = NULL, + customColLabel = NULL, + customColLabelIDs = NULL, + customLabelWidth = 0.75, + rasterDevice = "png", + padding = 45, + borderColor = NA, + draw = TRUE, + name = ""){ + + #Packages + .requirePackage("ComplexHeatmap") + .requirePackage("circlize") + + #Z-score + if (scale) { + message("Scaling Matrix...") + mat <- .rowZscores(mat, limit = FALSE) + name <- paste0(name," Z-Scores") + } + + #Get A Color map if null + if (is.null(colorMap)) { + colorMap <- .colorMapAnno(colData) + } + + #Prepare ColorMap format for Complex Heatmap + if (!is.null(colData)){ + colData = data.frame(colData) + colorMap <- .colorMapForCH(colorMap, colData) #change + showLegend <- 
.checkShowLegend(colorMap[match(names(colorMap), colnames(colData))]) #change + }else { + colorMap <- NULL + showLegend <- NULL + } + + #Prepare Limits if needed + breaks <- NULL + if (!is.null(limits)) { + mat[mat > max(limits)] <- max(limits) + mat[mat < min(limits)] <- min(limits) + breaks <- seq(min(limits), max(limits), length.out = length(color)) + color <- circlize::colorRamp2(breaks, color) + } + + if(exists('anno_mark', where='package:ComplexHeatmap', mode='function')){ + anno_check_version_rows <- ComplexHeatmap::anno_mark + anno_check_version_cols <- ComplexHeatmap::anno_mark + }else{ + anno_check_version_rows <- ComplexHeatmap::row_anno_link + anno_check_version_cols <- ComplexHeatmap::column_anno_link + } + + #Annotation Heatmap + if(!is.null(colData) & !is.null(customColLabel)){ + message("Adding Annotations...") + if(is.null(customColLabelIDs)){ + customColLabelIDs <- colnames(mat)[customRowLabel] + } + ht1Anno <- HeatmapAnnotation( + df = colData, + col = colorMap, + show_legend = showLegend, + show_annotation_name = TRUE, + gp = gpar(col = "NA"), + annotation_legend_param = + list( + nrow = min(colAnnoPerRow, max(round(nrow(colData)/colAnnoPerRow), 1)) + ), + link = anno_check_version_cols( + at = customColLabel, labels = customColLabelIDs), + width = unit(customLabelWidth, "cm") + max_text_width(customColLabelIDs) + + ) + }else if(!is.null(colData)){ + message("Adding Annotations...") + ht1Anno <- HeatmapAnnotation( + df = colData, + col = colorMap, + show_legend = showLegend, + show_annotation_name = TRUE, + gp = gpar(col = "NA"), + annotation_legend_param = + list( + nrow = min(colAnnoPerRow, max(round(nrow(colData)/colAnnoPerRow), 1)) + ) + ) + }else if(is.null(colData) & !is.null(customColLabel)){ + if(is.null(customColLabelIDs)){ + customColLabelIDs <- colnames(mat)[customRowLabel] + } + message("Adding Annotations...") + ht1Anno <- HeatmapAnnotation( + link = anno_check_version_cols( + at = customColLabel, labels = customColLabelIDs), + 
width = unit(customLabelWidth, "cm") + max_text_width(customColLabelIDs) + ) + }else{ + ht1Anno <- NULL + } + + message("Preparing Main Heatmap...") + ht1 <- Heatmap( + + #Main Stuff + matrix = mat, + name = name, + col = color, + + #Heatmap Legend + heatmap_legend_param = + list(color_bar = "continuous", + legend_direction = "horizontal", + legend_width = unit(5, "cm") + ), + rect_gp = gpar(col = borderColor), + + #Column Options + show_column_names = labelCols, + cluster_columns = clusterCols, + show_column_dend = showColDendrogram, + clustering_method_columns = "ward.D2", + column_names_gp = gpar(fontsize = fontsize), + column_names_max_height = unit(100, "mm"), + + #Row Options + show_row_names = labelRows, + row_names_gp = gpar(fontsize = fontsize), + cluster_rows = clusterRows, + show_row_dend = showRowDendrogram, + clustering_method_rows = "ward.D2", + split = split, + + #Annotation + top_annotation = ht1Anno, + + #Raster Info + use_raster = useRaster, + raster_device = rasterDevice, + raster_quality = rasterQuality + ) + + if(!is.null(customRowLabel)){ + if(is.null(customRowLabelIDs)){ + customRowLabelIDs <- rownames(mat)[customRowLabel] + } + ht1 <- ht1 + rowAnnotation(link = + anno_check_version_rows(at = customRowLabel, labels = customRowLabelIDs), + width = unit(customLabelWidth, "cm") + max_text_width(customRowLabelIDs)) + } + + if(draw){ + draw(ht1, + padding = unit(c(padding, padding, padding, padding), "mm"), + heatmap_legend_side = "bot", + annotation_legend_side = "bot") + }else{ + ht1 + } + +} + +.colorMapForCH <- function(colorMap, colData){ + colorMap <- colorMap[which(names(colorMap) %in% colnames(colData))] + colorMapCH <- lapply(seq_along(colorMap), function(x){ + if(attr(colorMap[[x]],"discrete")){ + colorx <- colorMap[[x]] + }else{ + vals <- colData[[names(colorMap)[x]]][!is.na(colData[[names(colorMap)[x]]])] + s <- seq(min(vals), max(vals), length.out = length(colorMap[[x]])) + colorx <- circlize::colorRamp2(s, colorMap[[x]]) + } + 
if(any(is.na(names(colorx)))){ + names(colorx)[is.na(names(colorx))] <- paste0("NA",seq_along(names(colorx)[is.na(names(colorx))])) + } + return(colorx) + }) + names(colorMapCH) <- names(colorMap) + return(colorMapCH) +} + +.checkShowLegend <- function(colorMap, max_discrete = 30){ + show <- lapply(seq_along(colorMap), function(x){ + if(attr(colorMap[[x]],"discrete") && length(unique(colorMap[[x]])) > max_discrete){ + sl <- FALSE + }else{ + sl <- TRUE + } + return(sl) + }) %>% unlist + names(show) <- names(colorMap) + return(show) +} + +.colorMapAnno <- function(colData, customAnno = NULL, discreteSet = "stallion", continuousSet = "solar_extra"){ + discreteCols <- sapply(colData,function(x) !is.numeric(x)) + if(!is.null(customAnno)){ + colorMap <- lapply(seq_along(discreteCols),function(x){ + if(discreteCols[x]){ + colors <- paletteDiscrete(values = colData[[names(discreteCols[x])]], set = discreteSet) + names(colors) <- unique(colData[[names(discreteCols[x])]]) + attr(colors, "discrete") <- TRUE + }else{ + colors <- paletteContinuous(set = continuousSet) + attr(colors, "discrete") <- FALSE + } + if(length(which(customAnno[,1] %in% names(discreteCols[x]))) > 0){ + if(length(which(customAnno[,2] %in% names(colors))) > 0){ + customAnnox <- customAnno[which(customAnno[,2] %in% names(colors)),] + colors[which(names(colors) %in% customAnnox[,2])] <- paste0(customAnnox[match(names(colors),customAnnox[,2]),3]) + } + } + return(colors) + }) + names(colorMap) <- colnames(colData) + return(colorMap) + }else{ + colorMap <- lapply(seq_along(discreteCols), function(x){ + if(discreteCols[x]){ + colors <- paletteDiscrete(values = colData[[names(discreteCols[x])]], set = discreteSet) + names(colors) <- unique(colData[[names(discreteCols[x])]]) + attr(colors, "discrete") <- TRUE + }else{ + colors <- paletteContinuous(set = continuousSet) + attr(colors, "discrete") <- FALSE + } + return(colors) + }) + names(colorMap) <- colnames(colData) + return(colorMap) + } + +} + +.binarySort <- 
function(m, scale = FALSE, cutOff = 1, lmat = NULL){ + + if(is.null(lmat)){ + #Compute Row-Zscores + if(scale){ + lmat <- sweep(m - rowMeans(m), 1, matrixStats::rowSds(m), `/`) + }else{ + lmat <- m + } + lmat <- lmat >= cutOff + } + + #Transpose + m <- t(m) + lmat <- t(lmat) + + #Identify Column Ordering + hc <- hclust(dist(m)) + colIdx <- hc$order + m <- t(m[colIdx,]) + lmat <- t(lmat[colIdx,]) + + #Identify Row Ordering + rowIdx <- do.call("order", c(as.data.frame(lmat)[seq_len(ncol(lmat))], list(decreasing = TRUE))) + m <- t(m[rowIdx,]) + lmat <- t(lmat[rowIdx,]) + + #Transpose + m <- t(m) + lmat <- t(lmat) + + return(list(mat = m, hclust = hc)) + +} + + + + + + + + + diff --git a/R/MatrixDeviations.R b/R/MatrixDeviations.R new file mode 100644 index 00000000..d59da120 --- /dev/null +++ b/R/MatrixDeviations.R @@ -0,0 +1,508 @@ +#' Add DeviationsMatrix to Arrow Files in ArchRProject +#' +#' This function for each sample will independently compute counts for each tile +#' per cell and then infer gene activity scores. +#' +#' @param ArchRProj ArchRProject +#' @param annotations annotaions name stored in ArchRProject +#' @param matrixName matrixName to be stored as in Arrow Files +#' @param out save ouptut matrices deviations and/or z +#' @param binarize binarize peaks prior to computing deviations +#' @param threads number of threads for parallel execution +#' @param parallelParam parallel parameters for batch style execution +#' @param force force overwriting previous TileMatrix in ArrowFile +#' @export +addDeviationsMatrix <- function( + ArchRProj, + annotations = NULL, + matrixName = NULL, + out = c("z", "deviations"), + binarize = FALSE, + threads = 1, + parallelParam = NULL, + force = FALSE, + ... 
+ ){ + + .requirePackage("SummarizedExperiment") + + set.seed(1) + tstart <- Sys.time() + if(!inherits(ArchRProj, "ArchRProject")){ + stop("Error Needs to be ArchR Project for Input!") + } + ArrowFiles <- getSampleColData(ArchRProj)$ArrowFiles + threads <- min(length(ArrowFiles), threads) + allCells <- rownames(getCellColData(ArchRProj)) + outDir <- getOutputDirectory(ArchRProj) + if(!all(file.exists(ArrowFiles))){ + stop("Error Input Arrow Files do not all exist!") + } + + ############################################################## + #Annotations Matrix! + ############################################################## + anno <- getAnnotation(ArchRProj, annotations) + annotationsMatrix <- SummarizedExperiment::assay(readRDS(anno$Matches)) + if(is.null(matrixName)){ + matrixName <- paste0(anno$Name, "Matrix") + } + annotationsMatrix <- as(annotationsMatrix, "dgCMatrix") + rownames(annotationsMatrix) <- NULL + gc() + + ############################################################## + #Get Row Sums for Expectation! + ############################################################## + .messageDiffTime("Computing Expectations!", tstart, addHeader = TRUE) + useMatrix <- "PeakMatrix" + availableChr <- .availableSeqnames(ArrowFiles, useMatrix) + rS <- .getRowSums( + ArrowFiles = ArrowFiles, + seqnames = availableChr, + useMatrix = useMatrix, + filter0 = FALSE + ) + rS$start <- start(ArchRProj@peakSet) + rS$end <- end(ArchRProj@peakSet) + rS$GC <- ArchRProj@peakSet$GC + + if(!is.null(metadata(getPeakSet(ArchRProj))$backgroundPeaks)){ + + if(file.exists(metadata(getPeakSet(ArchRProj))$backgroundPeaks)){ + .messageDiffTime("Using Previous Background Peaks!", tstart, addHeader = TRUE) + bdgPeaks <- readRDS(metadata(getPeakSet(ArchRProj))$backgroundPeaks) + } + .messageDiffTime("Previous Background Peaks file does not exists! 
Identifying Background Peaks!", tstart, addHeader = TRUE) + bdgPeaks <- .getBackgroundPeaks(rS$value, rS$GC) + + }else{ + + .messageDiffTime("Identifying Background Peaks!", tstart, addHeader = TRUE) + bdgPeaks <- .getBackgroundPeaks(rS$value, rS$GC) + + } + if(length(getPeakSet(ArchRProj)) != nrow(bdgPeaks)){ + stop("Number of rows in background peaks does not match peakSet!") + } + + #Save Background Peaks + outFile <- file.path(getOutputDirectory(ArchRProj), "Background-Peaks.rds") + metadata(ArchRProj@peakSet)$backgroundPeaks <- outFile + saveRDS(bdgPeaks, outFile, compress = FALSE) + + #Create args list + args <- mget(names(formals()),sys.frame(sys.nframe()))#as.list(match.call()) + + #Add args to list + args$annotations <- NULL + rm(annotations) + args$annotationsMatrix <- annotationsMatrix + args$bdgPeaks <- bdgPeaks + args$featureDF <- rS + args$useMatrix <- useMatrix + args$ArrowFiles <- ArrowFiles + args$allCells <- allCells + args$matrixName <- matrixName + args$X <- seq_along(ArrowFiles) + args$FUN <- .addDeviationsMatrix + args$registryDir <- file.path(getOutputDirectory(ArchRProj), paste0(matrixName,"DeviationsRegistry")) + + #Run With Parallel or lapply + outList <- .batchlapply(args) + .messageDiffTime("Completed Computing Deviations!", tstart, addHeader = TRUE) + gc() + + return(ArchRProj) + +} + +.addDeviationsMatrix <- function( + i, + ArrowFiles, + annotationsMatrix, + out = c("z", "deviations"), + cellNames = NULL, + allCells = NULL, + featureDF = NULL, + bdgPeaks = NULL, + binarize = FALSE, + useMatrix = "PeakMatrix", + matrixName = "Motif", + force = FALSE, + profileMemory = TRUE, + debug = FALSE, + tstart = NULL, + ... + ){ + + gc() + + if(is.null(tstart)){ + tstart <- Sys.time() + } + + ArrowFile <- ArrowFiles[i] + cellNames <- .availableCells(ArrowFile, subGroup=useMatrix) + + if(!is.null(allCells)){ + cellNames <- cellNames[cellNames %in% allCells] + } + + #Get Matrix and Run ChromVAR! 
+ .messageDiffTime(sprintf("Computing chromVAR-based deviations %s of %s (see Schep et. al (2017)!", i, length(ArrowFiles)), tstart, addHeader = TRUE) + dev <- .getMatFromArrow( + ArrowFile, + featureDF = featureDF, + binarize = binarize, + useMatrix = useMatrix, + cellNames = cellNames + ) %>% {.customDeviations( + countsMatrix = ., + annotationsMatrix = annotationsMatrix, + backgroudPeaks = bdgPeaks, + expectation = featureDF$value/sum(featureDF$value), + out = out + )} + gc() + + ####################################### + # Initialize Matrix Group + ####################################### + if(length(out)==1){ + featureDF <- data.frame(seqnames = out, idx = seq_len(nrow(dev)), name = rownames(dev), stringsAsFactors = FALSE) + }else if(length(out)==2){ + featureDF <- rbind( + data.frame(seqnames = out[1], idx = seq_len(nrow(dev)), name = rownames(dev), stringsAsFactors = FALSE), + data.frame(seqnames = out[2], idx = seq_len(nrow(dev)), name = rownames(dev), stringsAsFactors = FALSE) + ) + }else{ + stop("out can only be up to 2 items deviations,z") + } + + featureDF <- featureDF[order(featureDF[,1]),] + o <- .initializeMat( + ArrowFile = ArrowFile, + Group = matrixName, + Class = "Double", + cellNames = colnames(dev), + params = "chromVAR", + featureDF = featureDF, + force=TRUE + ) + + ####################################### + # Write Matrices To Arrow + ####################################### + if("z" %in% tolower(out)){ + o <- .addMatToArrow( + mat = as(SummarizedExperiment::assays(dev)[["z"]], "dgCMatrix"), + ArrowFile = ArrowFile, + Group = paste0(matrixName,"/z"), + binarize = FALSE, + addRowSums = FALSE, + addColSums = FALSE, + addRowVars = TRUE, + addRowMeans = TRUE + ) + } + + if("deviations" %in% tolower(out)){ + o <- .addMatToArrow( + mat = as(SummarizedExperiment::assays(dev)[["deviations"]], "dgCMatrix"), + ArrowFile = ArrowFile, + Group = paste0(matrixName,"/deviations"), + binarize = FALSE, + addRowSums = FALSE, + addColSums = FALSE, + addRowVars = 
TRUE,
+      addRowMeans = TRUE
+    )
+  }
+
+  .messageDiffTime("Finished Computing Deviations!", tstart)
+  return(0)
+
+}
+
+############################################################################
+# Adapted from chromVAR
+############################################################################
+.customDeviations <- function(
+  countsMatrix, #count matrix; rows must line up with backgroudPeaks rows, columns are the samples
+  annotationsMatrix, #one column per annotation; each column is scored independently
+  backgroudPeaks, #background peak matrix with one row per row of countsMatrix
+  expectation, #per-row expectation, length nrow(countsMatrix); re-normalized to sum to 1 below
+  out = c("deviations", "z") #which assays to keep in the returned SummarizedExperiment
+  ){
+
+  tstart <- Sys.time()
+  #lets not do this check because we are running on partial matrix
+  #if (min(getFragmentsPerPeak(countsMatrix)) <= 0)
+  #  stop("All peaks must have at least one fragment in one sample")
+  stopifnot(nrow(countsMatrix) == nrow(backgroudPeaks))
+  stopifnot(length(expectation) == nrow(countsMatrix))
+  colData <- DataFrame(seq_len(ncol(countsMatrix)), row.names = colnames(countsMatrix))[,FALSE]
+  norm_expectation <- expectation / sum(expectation) #Double check this sums to 1!
+  countsPerSample <- Matrix::colSums(countsMatrix)
+
+  results <- lapply(seq_len(ncol(annotationsMatrix)), function(x){
+    if(x %% max(floor(ncol(annotationsMatrix)/20), 1) == 0){ #max(., 1): with fewer than 20 annotations floor() is 0, x %% 0 is NaN and if() would abort
+      .messageDiffTime(sprintf("Computing Deviations for Annotation %s of %s", x, ncol(annotationsMatrix)), tstart)
+    }
+    if(x %% max(floor(ncol(annotationsMatrix)/20), 10) == 0){
+      gc()
+    }
+    .customDeviationsSingle(
+      annotationsVector = annotationsMatrix[, x, drop=FALSE],
+      countsMatrix = countsMatrix,
+      backgroudPeaks = backgroudPeaks,
+      countsPerSample = countsPerSample,
+      expectation = norm_expectation,
+      out = out
+    )
+  })
+  cn <- colnames(countsMatrix)
+  rm(countsMatrix)
+  gc()
+
+  if("z" %in% tolower(out)){
+    z <- t(vapply(results, function(x) x[["z"]], rep(0, length(cn))))
+  }else{
+    z <- matrix(0, nrow = ncol(annotationsMatrix), ncol = length(cn)) #placeholder; dropped again when the assays are subset to out
+  }
+  if("deviations" %in% tolower(out)){
+    dev <- t(vapply(results, function(x) x[["dev"]], rep(0, length(cn))))
+  }else{
+    dev <- matrix(0, nrow = ncol(annotationsMatrix), ncol = length(cn)) #placeholder; dropped again when the assays are subset to out
+  }
+  colnames(z) <- cn
+
colnames(dev) <- cn + + #Check First + nullOverlap <- is.null(results[[1]]$overlap) + rowData <- lapply(seq_along(results), function(x){ + resx <- results[[x]] + if(nullOverlap){ + data.frame(fractionMatches = resx$matches) + }else{ + data.frame(fractionMatches = resx$matches, fractionBackgroundOverlap = resx$overlap) + } + }) %>% Reduce("rbind",.) + rownames(rowData) <- colnames(annotationsMatrix) + + se <- SummarizedExperiment::SummarizedExperiment( + assays = list( + deviations = dev, + z = z + ), + colData = colData, + rowData = rowData + ) + SummarizedExperiment::assays(se) <- SummarizedExperiment::assays(se)[tolower(out)] + + return(se) + +} + +.customDeviationsSingle <- function( + annotationsVector, + countsMatrix, + countsPerSample, + backgroudPeaks, + out = c("deviations", "z"), + expectation = NULL, + intermediate_results = FALSE, + threshold = 1 + ){ + + binarizeMat <- function(mat){ + mat@x[mat@x > 0] <- 1 + mat + } + + if (length(annotationsVector@x) == 0) { + out <- list( + z = rep(NA, ncol(countsMatrix)), + dev = rep(NA, ncol(countsMatrix)), + expFG = NA, + expBG = NA, + matches = 0, + overlap = NA + ) + return(out) + } + + ################################ + # Fore Ground Deviations + ################################ + .requirePackage("Matrix") + observed <- as.vector(Matrix::t(annotationsVector) %*% countsMatrix) + expected <- as.vector(Matrix::t(annotationsVector) %*% expectation %*% countsPerSample) + observed_deviation <- (observed - expected)/expected + + #Filter those with no matches at all + fail_filter <- which(expected == 0) + + ################################ + # Back Ground Deviations + ################################ + if("z" %in% tolower(out)){ + + #Compute Background Null Per Iteration + niterations <- ncol(backgroudPeaks) + sampleMat <- Matrix::sparseMatrix( + j = as.vector(backgroudPeaks[annotationsVector@i + 1, seq_len(niterations)]), + i = rep(seq_len(niterations), each = length(annotationsVector@x)), + x = 
rep(annotationsVector@x, niterations), + dims = c(niterations, nrow(countsMatrix)) + ) + sampled <- as.matrix(sampleMat %*% countsMatrix) + sampledExpected <- sampleMat %*% expectation %*% countsPerSample + sampledDeviation <- (sampled - sampledExpected)/sampledExpected + bgOverlap <- Matrix::mean(binarizeMat(sampleMat) %*% binarizeMat(annotationsVector)) / length(annotationsVector@x) + + #Summary + meanSampledDeviation <- Matrix::colMeans(sampledDeviation) + sdSampledDeviation <- apply(as.matrix(sampledDeviation), 2, sd) + + #Norm Deviation + normdev <- (observed_deviation - meanSampledDeviation) + z <- normdev/sdSampledDeviation + if (length(fail_filter) > 0) { + z[fail_filter] <- NA + normdev[fail_filter] <- NA + } + + }else{ + + #Compute Background Null Per Iteration + niterations <- ncol(backgroudPeaks) + sampleMat2 <- Matrix::sparseMatrix( + j = as.vector(backgroudPeaks[annotationsVector@i + 1, seq_len(niterations)]), + i = rep(1, niterations * length(annotationsVector@x)), + x = rep(annotationsVector@x, niterations), + dims = c(1, nrow(countsMatrix)) + ) + sampled2 <- (sampleMat2 %*% countsMatrix)[1,] + sampledExpected2 <- (sampleMat2 %*% expectation %*% countsPerSample)[1,] + ###################### + # Equivalent to above + # colMeans(sampled) - colMeans(sampledExpected))/colMeans(sampledExpected) + ###################### + sampledDeviation2 <- (sampled2 - sampledExpected2)/sampledExpected2 + bgOverlap <- NA + + #Norm Deviation + normdev <- (observed_deviation - sampledDeviation2) + z <- NULL + if (length(fail_filter) > 0) { + normdev[fail_filter] <- NA + } + + } + + outList <- list( + z = z, + dev = normdev, + matches = length(annotationsVector@x) / nrow(countsMatrix), + overlap = bgOverlap + ) + + return(outList) + +} + +#' @export +addBackgroundPeaks <- function( + ArchRProj, + bias = "GC", + niterations = 50, + w = 0.1, + binSize = 50, + seed = 1, + outFile = file.path(getOutputDirectory(ArchRProj), "Background-Peaks.rds"), + binarize = FALSE, + force = 
FALSE,
+  ...
+  ){
+
+  set.seed(seed) #honor the seed argument; it was hard-coded to 1 (the default), so a user-supplied seed had no effect
+  tstart <- Sys.time()
+  if(!inherits(ArchRProj, "ArchRProject")){
+    stop("Error Needs to be ArchR Project for Input!")
+  }
+  ArrowFiles <- getSampleColData(ArchRProj)$ArrowFiles
+  allCells <- rownames(getCellColData(ArchRProj))
+  outDir <- getOutputDirectory(ArchRProj)
+  if(!all(file.exists(ArrowFiles))){
+    stop("Error Input Arrow Files do not all exist!")
+  }
+  ##############################################################
+  #Get Row Sums for Expectation!
+  ##############################################################
+  .messageDiffTime("Computing Expectations!", tstart, addHeader = TRUE)
+  useMatrix <- "PeakMatrix"
+  availableChr <- .availableSeqnames(ArrowFiles, useMatrix)
+  rS <- .getRowSums(
+    ArrowFiles = ArrowFiles,
+    seqnames = availableChr,
+    useMatrix = useMatrix,
+    filter0 = FALSE
+  )
+  rS$start <- start(ArchRProj@peakSet)
+  rS$end <- end(ArchRProj@peakSet)
+  rS$bias <- mcols(ArchRProj@peakSet)[,bias] #bias names the peakSet metadata column used for matching (default "GC")
+
+  .messageDiffTime("Identifying Background Peaks!", tstart, addHeader = TRUE)
+  bdgPeaks <- .getBackgroundPeaks(
+    values = rS$value,
+    bias = rS$bias,
+    niterations = niterations,
+    w = w,
+    binSize = binSize,
+    seed = seed
+  )
+  metadata(ArchRProj@peakSet)$backgroundPeaks <- outFile #record where the background peaks were written
+  saveRDS(bdgPeaks, outFile, compress = FALSE)
+
+  return(ArchRProj)
+
+}
+
+.getBackgroundPeaks <- function(values, bias, niterations = 50, w = 0.1, binSize = 50, seed = 1){ #NOTE: seed is not referenced in this body; the RNG is seeded by the callers
+
+  .requirePackage("chromVAR")
+
+  #minimal chromVAR change
+  #chromVAR requires a matrix/se of ncol > 1 and with a log10(values) transform removing peaks with 0 reads
+  #to disable this we create a column of 1's forcing chromVAR to perform log10(values + 1)
+
+  se <- SummarizedExperiment::SummarizedExperiment(
+    assays = SimpleList(counts = as.matrix(data.frame(values, 1))),
+    rowData = DataFrame(bias = bias)
+  )
+
+  bdgPeaks <- chromVAR::getBackgroundPeaks(
+    object = se,
+    bias = rowData(se)$bias,
+    niterations = niterations,
+    w = w,
+    bs = binSize
+  )
+
+
return(bdgPeaks) + +} + + + + + + + + + + + + diff --git a/R/MatrixFeatures.R b/R/MatrixFeatures.R new file mode 100644 index 00000000..12f02a79 --- /dev/null +++ b/R/MatrixFeatures.R @@ -0,0 +1,279 @@ +#' Add FeatureMatrix to Arrows/ArchRProject +#' +#' This function for each sample will independently compute counts for each feature +#' per cell in the Arrow File +#' +#' @param input ArchRProject or ArrowFiles +#' @param features GRanges to count for each cell +#' @param matrixName matrix output name in ArrowFiles cannot be a protected matrix name +#' @param ceiling ceiling for the number of counts per feature +#' @param binarize binarize matrix +#' @param threads number of threads +#' @param parallelParam parallel parameters for batch style execution +#' @param force force overwriting previous TileMatrix in ArrowFile +#' @export +addFeatureMatrix <- function( + input, + features = NULL, + matrixName = "FeatureMatrix", + ceiling = Inf, + binarize = FALSE, + threads = 1, + parallelParam = NULL, + force = FALSE, + ... 
+){ + + matrixName <- .isProtectedArray(matrixName) + + if(inherits(input, "ArchRProject")){ + ArrowFiles <- getArrowFiles(input) + allCells <- rownames(getCellColData(input)) + outDir <- getOutputDirectory(input) + }else if(inherits(input, "character")){ + outDir <- "" + ArrowFiles <- input + allCells <- NULL + }else{ + stop("Error Unrecognized Input!") + } + if(!all(file.exists(ArrowFiles))){ + stop("Error Input Arrow Files do not all exist!") + } + + #Add args to list + args <- mget(names(formals()),sys.frame(sys.nframe())) + args$ArrowFiles <- ArrowFiles + args$allCells <- allCells + args$X <- seq_along(ArrowFiles) + args$FUN <- .addFeatureMatrix + args$registryDir <- file.path(outDir, "CountFeaturesRegistry") + + #Run With Parallel or lapply + outList <- .batchlapply(args) + + if(inherits(input, "ArchRProject")){ + return(input) + }else{ + return(unlist(outList)) + } + +} + +#' Add PeakMatrix to Arrows in ArchRProject +#' +#' This function for each sample will independently compute counts for each peak +#' per cell in the Arrow File +#' +#' @param ArchRProj ArchRProject +#' @param ceiling ceiling for the number of counts per feature +#' @param binarize binarize matrix +#' @param threads number of threads +#' @param parallelParam parallel parameters for batch style execution +#' @param force force overwriting previous TileMatrix in ArrowFile +#' @export +addPeakMatrix <- function( + ArchRProj, + ceiling = 4, + binarize = FALSE, + parallelParam = NULL, + threads = 1, + force = FALSE, + ... 
+){ + + if(!inherits(ArchRProj, "ArchRProject")){ + stop("Adding a PeakMatrix is only for ArchRProject!") + } + + ArrowFiles <- getArrowFiles(ArchRProj) + allCells <- rownames(getCellColData(ArchRProj)) + outDir <- getOutputDirectory(ArchRProj) + + if(!all(file.exists(ArrowFiles))){ + stop("Error Input Arrow Files do not all exist!") + } + + #Add args to list + args <- mget(names(formals()),sys.frame(sys.nframe()))#as.list(match.call()) + args$ArrowFiles <- ArrowFiles + args$allCells <- allCells + args$matrixName = "PeakMatrix" + args$features <- ArchRProj@peakSet + args$X <- seq_along(ArrowFiles) + args$FUN <- .addFeatureMatrix + args$registryDir <- file.path(outDir, "CountPeaksRegistry") + + #Run With Parallel or lapply + outList <- .batchlapply(args) + + readsInPeaks <- lapply(outList, function(x) x$RIP) %>% unlist + FRIP <- lapply(outList, function(x) x$FRIP) %>% unlist + ArchRProj <- addCellColData(ArchRProj, data = readsInPeaks, name = "ReadsInPeaks", names(readsInPeaks)) + ArchRProj <- addCellColData(ArchRProj, data = FRIP, name = "FRIP", names(readsInPeaks)) + return(ArchRProj) + +} + +.addFeatureMatrix <- function( + i, + ArrowFiles, + features, + cellNames = NULL, + allCells = NULL, + matrixName = "PeakMatrix", + ceiling = 4, + binarize = FALSE, + force = FALSE, + ... 
+ ){ + + ArrowFile <- ArrowFiles[i] + + o <- h5closeAll() + + #Check + if(!suppressMessages(h5createGroup(file = ArrowFile, matrixName))){ + if(force){ + o <- h5delete(file = ArrowFile, name = matrixName) + o <- h5createGroup(ArrowFile, matrixName) + }else{ + stop(sprintf("%s Already Exists!, set force = TRUE to override!", matrixName)) + } + } + + tstart <- Sys.time() + + #Get all cell ids before constructing matrix + if(is.null(cellNames)){ + cellNames <- .availableCells(ArrowFile) + } + + if(!is.null(allCells)){ + cellNames <- cellNames[cellNames %in% allCells] + } + + dfParams <- data.frame( + ceiling = ceiling, + binarize = binarize * 1, + stringsAsFactors = FALSE) + + if("name" %in% colnames(mcols(features))){ + featureDF <- data.frame( + seqnames = paste0(seqnames(features)), + idx = mcols(features)$idx, + start = start(features), + end = end(features), + name = mcols(features)$name, + stringsAsFactors = FALSE) + }else{ + featureDF <- data.frame( + seqnames = paste0(seqnames(features)), + idx = mcols(features)$idx, + start = start(features), + end = end(features), + stringsAsFactors = FALSE) + } + + ###################################### + # Initialize SP Mat Group + ###################################### + if(binarize){ + Class <- "binary" + }else{ + Class <- "integer" + } + o <- .initializeMat( + ArrowFile = ArrowFile, + Group = matrixName, + Class = Class, + cellNames = cellNames, + params = dfParams, + featureDF = featureDF, + force = force + ) + + ###################################### + # Add To SP Mat Group + ###################################### + uniqueChr <- as.character(unique(seqnames(features)@values)) + insertionsInPeaks <- rep(0, length(cellNames)) + names(insertionsInPeaks) <- cellNames + totalInsertions <- insertionsInPeaks + + for(z in seq_along(uniqueChr)){ + + o <- h5closeAll() + chr <- uniqueChr[z] + featurez <- features[BiocGenerics::which(seqnames(features)==chr)] + .messageDiffTime(sprintf("Adding %s for Chromosome %s of %s to Arrow 
File!", matrixName, z, length(uniqueChr)), tstart) + + #Read in Fragments + fragments <- .getFragsFromArrow(ArrowFile, chr = chr, out = "IRanges", cellNames = cellNames) + tabFrags <- table(mcols(fragments)$RG) + + #Count Left Insertion + temp <- IRanges(start = start(fragments), width = 1) + stopifnot(length(temp) == length(fragments)) + oleft <- findOverlaps(ranges(featurez), temp) + oleft <- DataFrame(queryHits=Rle(queryHits(oleft)), subjectHits = subjectHits(oleft)) + + #Count Right Insertion + temp <- IRanges(start = end(fragments), width = 1) + stopifnot(length(temp) == length(fragments)) + oright <- findOverlaps(ranges(featurez), temp) + oright <- DataFrame(queryHits=Rle(queryHits(oright)), subjectHits = subjectHits(oright)) + remove(temp) + + #Feature Idx + oleft$queryHits@values <- mcols(featurez)$idx[oleft$queryHits@values] + oright$queryHits@values <- mcols(featurez)$idx[oright$queryHits@values] + + #Correct to RG ID + oleft$subjectHits <- as.integer(BiocGenerics::match(mcols(fragments)$RG[oleft$subjectHits], cellNames)) + oright$subjectHits <- as.integer(BiocGenerics::match(mcols(fragments)$RG[oright$subjectHits], cellNames)) + remove(fragments) + + #Create Sparse Matrix + mat <- Matrix::sparseMatrix( + i = c( oleft$queryHits, oright$queryHits ), + j = c( oleft$subjectHits, oright$subjectHits ), + x = rep(1, nrow(oleft) + nrow(oright)), + dims = c(max(mcols(featurez)$idx), length(cellNames)) + ) + colnames(mat) <- cellNames + + #Compute total reads in Peak + totalInsertions[names(tabFrags)] <- totalInsertions[names(tabFrags)] + 2 * tabFrags + insertionsInPeaks <- insertionsInPeaks + Matrix::colSums(mat) + + #Ceiling + if(!is.null(ceiling)){ + mat@x[mat@x > ceiling] <- ceiling + } + if(binarize){ + mat@x[mat@x > 0] <- 1 + } + + #Write sparseMatrix to Arrow File! 
+ o <- .addMatToArrow( + mat = mat, + ArrowFile = ArrowFile, + Group = paste0(matrixName,"/", chr), + binarize = binarize, + addColSums = TRUE, + addRowSums = TRUE + ) + gc() + + } + + out <- list(ArrowFile = ArrowFile, RIP = insertionsInPeaks, FRIP = insertionsInPeaks / totalInsertions) + + return(out) + +} + + + + diff --git a/R/MatrixGeneScores.R b/R/MatrixGeneScores.R new file mode 100644 index 00000000..6a6ec1b0 --- /dev/null +++ b/R/MatrixGeneScores.R @@ -0,0 +1,315 @@ +#' Add GeneScoreMatrix to Arrows/ArchRProject +#' +#' This function for each sample will independently compute counts for each tile +#' per cell and then infer gene activity scores. +#' +#' @param input ArchRProject or ArrowFiles +#' @param genes genes as a GRanges object +#' @param geneModel gene model as a string for weighting peaks for gene score calculation (function of x) +#' @param upstream upstream the Gene Start to consider for calculation +#' @param downstream downstream the Gene Start to consider for calculation +#' @param tileSize tileSize for binning counts prior to gene score calculation +#' @param ceiling ceiling of read counts per tile (prevent huge biases) +#' @param scaleTo scale gene scores to +#' @param excludeChr exclude chromosomes from this analysis +#' @param blacklist blacklist GRanges used to remove tiles prior to calculation +#' @param threads number of threads +#' @param parallelParam parallel parameters for batch style execution +#' @param force force overwriting previous TileMatrix in ArrowFile +#' @export +addGeneScoreMatrix <- function( + input = NULL, + genes = NULL, + geneModel = "exp(-abs(x)/10000)", + upstream = 100000, + downstream = 100000, + tileSize = 500, + ceiling = 4, + scaleTo = 10000, + excludeChr = c("chrY","chrM"), + blacklist = NULL, + threads = 1, + parallelParam = NULL, + force = FALSE, + ... 
+  ){
+
+  if(inherits(input, "ArchRProject")){
+    ArrowFiles <- getArrowFiles(input)
+    allCells <- rownames(getCellColData(input)) #restrict scoring to the cells kept in the project
+    outDir <- getOutputDirectory(input)
+  }else if(inherits(input, "character")){
+    outDir <- ""
+    ArrowFiles <- input
+    allCells <- NULL #NULL means the worker applies no project-level cell filter
+  }else{
+    stop("Error Unrecognized Input!")
+  }
+  if(!all(file.exists(ArrowFiles))){
+    stop("Error Input Arrow Files do not all exist!")
+  }
+
+  #Valid GRanges
+  genes <- .validGRanges(genes)
+
+  #Add args to list
+  args <- mget(names(formals()),sys.frame(sys.nframe()))#as.list(match.call())
+  args$ArrowFiles <- ArrowFiles
+  args$allCells <- allCells
+  args$X <- seq_along(ArrowFiles) #one batch element per Arrow file
+  args$FUN <- .addGeneScoreMat #per-file worker; was the undefined name batchFUN
+  args$registryDir <- file.path(outDir, "GeneScoresRegistry")
+
+  #Run With Parallel or lapply
+  outList <- .batchlapply(args)
+
+  if(inherits(input, "ArchRProject")){
+
+    return(input)
+
+  }else{
+
+    return(unlist(outList))
+
+  }
+
+}
+
+.addGeneScoreMat <- function(
+  i,
+  ArrowFiles,
+  genes,
+  cellNames = NULL,
+  allCells = NULL,
+  upstream = 100000,
+  downstream = 100000,
+  scaleTo = 10000,
+  tileSize = 200,
+  ceiling = 4,
+  blacklist = NULL,
+  geneModel = "exp(-abs(x)/10000)",
+  excludeChr = c("chrY","chrM"),
+  force = FALSE,
+  tmpFile = tempfile(),
+  ...
+ ){ + + ArrowFile <- ArrowFiles[i] + + #Check + if(!suppressMessages(h5createGroup(file = ArrowFile, "GeneScoreMatrix"))){ + if(force){ + o <- h5delete(file = ArrowFile, name = "GeneScoreMatrix") + o <- h5createGroup(ArrowFile, "GeneScoreMatrix") + }else{ + stop("GeneScoreMatrix Already Exists!, set force = TRUE to override!") + } + } + + o <- h5closeAll() + + #Add Gene Index + geneStart <- genes[BiocGenerics::which(seqnames(genes) %bcni% excludeChr)] + geneStart <- sort(sortSeqlevels(geneStart)) + seqlevels(geneStart) <- as.character(unique(seqnames(geneStart))) + geneStart <- geneStart[!is.na(mcols(geneStart)$symbol)] + geneStart <- resize(geneStart, 1, "start") + geneStart <- split(geneStart, seqnames(geneStart)) + geneStart <- lapply(geneStart, function(x){ + mcols(x)$idx <- seq_along(x) + return(x) + }) + + #Blacklist Split + if(!is.null(blacklist)){ + blacklist <- split(blacklist, seqnames(blacklist)) + } + + #Get all cell ids before constructing matrix + if(is.null(cellNames)){ + cellNames <- .availableCells(ArrowFile) + } + if(!is.null(allCells)){ + cellNames <- cellNames[cellNames %in% allCells] + } + + tstart <- Sys.time() + + totalGS <- rep(0, length(cellNames)) + names(totalGS) <- cellNames + + ######################################################################################################### + #First we will write gene scores to a temporary path! rhdf5 delete doesnt actually delete the memory! 
+ ######################################################################################################### + for(z in seq_along(geneStart)){ + + #Get Gene Starts + geneStarti <- geneStart[[z]] + mcols(geneStarti)$idx <- seq_along(geneStarti) + chri <- paste0(unique(seqnames(geneStarti))) + .messageDiffTime(sprintf("Creating Temporary Gene Score Matrix for Chromosome %s of %s!", z, length(geneStart)), tstart) + + #Read in Fragments + frag <- .getFragsFromArrow(ArrowFile, chr = chri, out = "IRanges", cellNames = cellNames) + fragSt <- trunc(start(frag)/tileSize) * tileSize + fragEd <- trunc(end(frag)/tileSize) * tileSize + fragBC <- rep(S4Vectors::match(mcols(frag)$RG, cellNames), 2) + rm(frag) + gc() + + #Unique Inserts + uniqIns <- sort(unique(c(fragSt,fragEd))) + + #Construct tile by cell mat! + matGS <- Matrix::sparseMatrix( + i = match(c(fragSt, fragEd), uniqIns), + j = as.vector(fragBC), + x = rep(1, 2*length(fragSt)), + dims = c(length(uniqIns), length(cellNames)) + ) + + if(!is.null(ceiling)){ + matGS@x[matGS@x > ceiling] <- ceiling + } + + #Unique Tiles + uniqueTiles <- IRanges(start = uniqIns, width = tileSize) + + #Clean Memory + rm(uniqIns, fragSt, fragEd, fragBC) + gc() + + #Time to Overlap Gene Windows + extenedGeneStart <- ranges(suppressWarnings(extendGRanges(geneStarti, upstream = upstream, downstream = downstream))) #Warning if beyond chromosome this doesnt matter for this analysis + tmp <- suppressWarnings(findOverlaps(extenedGeneStart, uniqueTiles)) + x <- distance(ranges(geneStarti)[queryHits(tmp)], uniqueTiles[subjectHits(tmp)]) + + #Determine Sign for Distance relative to strand + isMinus <- BiocGenerics::which(strand(geneStarti) == "-") + signDist <- sign(start(uniqueTiles)[subjectHits(tmp)] - start(ranges(geneStarti))[queryHits(tmp)]) + signDist[isMinus] <- signDist[isMinus] * -1 + + #Correct the orientation for the distance! + x <- x * signDist + + #Evaluate Input Model + x <- eval(parse(text=geneModel)) + + #Remove Blacklisted Tiles! 
+    if(!is.null(blacklist)){
+      blacklisti <- blacklist[[chri]]
+      if(!is.null(blacklisti) && length(blacklisti) > 0){ #only filter when this chromosome has blacklist ranges; previously a NULL entry passed the test and was handed to ranges()
+        tilesBlacklist <- 1 * (!overlapsAny(uniqueTiles, ranges(blacklisti))) #1 for tiles to keep, 0 for blacklisted tiles
+        if(length(tilesBlacklist) > 0){
+          x <- x * tilesBlacklist[subjectHits(tmp)] #Multiply Such That All Blacklisted Tiles weight is now 0!
+        }
+      }
+    }
+
+    #Clean Memory
+    rm(isMinus, signDist, extenedGeneStart, uniqueTiles)
+    gc()
+
+    #Creating Sparse Matrix
+    tmp <- Matrix::sparseMatrix(
+      i = queryHits(tmp),
+      j = subjectHits(tmp),
+      x = x,
+      dims = c(length(geneStarti), nrow(matGS))) #genes x tiles weight matrix
+
+    #Calculate Gene Scores
+    matGS <- tmp %*% matGS
+    colnames(matGS) <- cellNames
+    totalGS <- totalGS + Matrix::colSums(matGS) #running per-cell total across chromosomes, used for normalization later
+
+    #Save tmp file
+    saveRDS(matGS, file = paste0(tmpFile, "-", chri, ".rds"), compress = FALSE)
+
+    #Clean Memory
+    rm(matGS, tmp)
+    gc()
+
+  }
+
+
+  #########################################################################################################
+  #Organize info for ArchR Arrow
+  #########################################################################################################
+  featureDF <- Reduce("c",geneStart) %>%
+    {data.frame(
+      row.names=NULL,
+      seqnames=as.character(seqnames(.)),
+      start=start(.),
+      name=mcols(.)$symbol,
+      idx=mcols(.)$idx,
+      stringsAsFactors=FALSE)}
+
+  dfParams <- data.frame(
+    upstream = upstream,
+    downstream = downstream,
+    scaleTo = scaleTo,
+    tileSize = tileSize,
+    ceiling = ceiling,
+    geneModel = geneModel,
+    stringsAsFactors=FALSE
+  )
+
+  ######################################
+  # Initialize SP Mat Group
+  ######################################
+  o <- .initializeMat(
+    ArrowFile = ArrowFile,
+    Group = "GeneScoreMatrix",
+    Class = "double",
+    cellNames = cellNames,
+    params = dfParams,
+    featureDF = featureDF,
+    force = force
+  )
+
+  #Clean Memory
+  rm(dfParams, featureDF, genes)
+  gc()
+
+  #Normalize and add to Arrow File!
+ for(z in seq_along(geneStart)){ + + #Get Chromosome + chri <- paste0(unique(seqnames(geneStart[[z]]))) + + .messageDiffTime(sprintf("Adding Normalized Gene Score Matrix for Chromosome %s of %s to Arrow File!", z, length(geneStart)), tstart) + + #Re-Create Matrix for that chromosome! + matGS <- readRDS(paste0(tmpFile, "-", chri, ".rds")) + file.remove(paste0(tmpFile, "-", chri, ".rds")) + + #Normalize + matGS@x <- as.numeric(scaleTo * matGS@x/rep.int(totalGS, Matrix::diff(matGS@p))) + + #Round to Reduce Digits After Final Normalization + matGS@x <- round(matGS@x, 2) + matGS <- Matrix::drop0(matGS) + + #Write sparseMatrix to Arrow File! + o <- .addMatToArrow( + mat = matGS, + ArrowFile = ArrowFile, + Group = paste0("GeneScoreMatrix/", chri), + binarize = FALSE, + addColSums = TRUE, + addRowSums = TRUE + ) + gc() + + #Clean Memory + rm(matGS) + gc() + + } + + return(ArrowFile) + +} + + + diff --git a/R/MatrixTiles.R b/R/MatrixTiles.R new file mode 100644 index 00000000..cb0434be --- /dev/null +++ b/R/MatrixTiles.R @@ -0,0 +1,207 @@ +#' Add TileMatrix to Arrows/ArchRProject +#' +#' This function for each sample will independently compute counts for each tile +#' per cell in the Arrow File +#' +#' @param input ArchRProject or ArrowFiles +#' @param chromSizes chromomosome sizes used for identifying number of tiles to count +#' @param tileSize size for each tile to break up each chromosome +#' @param binarize save as a Sparse.Binary.Matrix or Sparse.Integer.Matrix +#' @param excludeChr exclude chromosomes from this analysis +#' @param threads number of threads +#' @param parallelParam parallel parameters for batch style execution +#' @param force force overwriting previous TileMatrix in ArrowFile +#' @export +addTileMatrix <- function( + input, + chromSizes = getChromSizes(input), + tileSize = 500, + binarize = TRUE, + excludeChr = c("chrM","chrY"), + threads = 1, + parallelParam = NULL, + force = FALSE, + ... 
+ ){ + + if(inherits(input, "ArchRProject")){ + ArrowFiles <- getArrowFiles(input) + allCells <- rownames(getCellColData(input)) + outDir <- getOutputDirectory(input) + }else if(inherits(input, "character")){ + outDir <- "" + ArrowFiles <- input + allCells <- NULL + }else{ + stop("Error Unrecognized Input!") + } + if(!all(file.exists(ArrowFiles))){ + stop("Error Input Arrow Files do not all exist!") + } + + #Add args to list + args <- mget(names(formals()),sys.frame(sys.nframe()))#as.list(match.call()) + args$ArrowFiles <- ArrowFiles + args$allCells <- allCells + args$X <- seq_along(ArrowFiles) + args$FUN <- .addTileMat + args$chromLengths <- end(chromSizes) + names(args$chromLengths) <- paste0(seqnames(chromSizes)) + args$registryDir <- file.path(outDir, "CountTilesRegistry") + + #Run With Parallel or lapply + outList <- .batchlapply(args) + + if(inherits(input, "ArchRProject")){ + return(input) + }else{ + return(unlist(outList)) + } + +} + +.addTileMat <- function( + i, + ArrowFiles, + cellNames = NULL, + allCells = NULL, + tileSize = 500, + binarize = TRUE, + excludeChr = "chrY", + blacklist = NULL, + chromLengths = NULL, + force = FALSE, + ...){ + + ArrowFile <- ArrowFiles[i] + + o <- h5closeAll() + + #Check + if(!suppressMessages(h5createGroup(file = ArrowFile, "TileMatrix"))){ + if(force){ + o <- h5delete(file = ArrowFile, name = "TileMatrix") + o <- h5createGroup(ArrowFile, "TileMatrix") + }else{ + stop("TileMatrix Already Exists!, set force = TRUE to override!") + } + } + + + tstart <- Sys.time() + if(!is.null(blacklist)){ + blacklist <- split(blacklist, seqnames(blacklist)) + } + + #Get all cell ids before constructing matrix + if(is.null(cellNames)){ + cellNames <- .availableCells(ArrowFile) + } + + if(!is.null(allCells)){ + cellNames <- cellNames[cellNames %in% allCells] + } + + chromLengths <- chromLengths[names(chromLengths) %ni% excludeChr] + if(length(chromLengths)==0){ + stop("Error removed all chromLengths with exclude chr!") + } + + dfParams <- 
data.frame( + seqnames = names(chromLengths), + length = as.vector(chromLengths), + tileSize = tileSize, + binarize = binarize, + stringsAsFactors=FALSE) + + featureDF <- lapply(seq_along(chromLengths), function(x){ + DataFrame(seqnames = names(chromLengths)[x], idx = seq_len(trunc(chromLengths[x])/tileSize + 1)) + }) %>% Reduce("rbind", .) + featureDF$start <- (featureDF$idx - 1) * tileSize + + ###################################### + # Initialize SP Mat Group + ###################################### + if(binarize){ + Class <- "binary" + }else{ + Class <- "integer" + } + o <- .initializeMat( + ArrowFile = ArrowFile, + Group = "TileMatrix", + Class = Class, + cellNames = cellNames, + params = dfParams, + featureDF = featureDF, + force = force + ) + + ###################################### + # Add To SP Mat Group + ###################################### + for(z in seq_along(chromLengths)){ + + o <- h5closeAll() + chr <- names(chromLengths)[z] + .messageDiffTime(sprintf("Adding Tile Matrix for Chromosome %s of %s to Arrow File!", z, length(chromLengths)), tstart) + + #Read in Fragments + fragments <- .getFragsFromArrow(ArrowFile, chr = chr, out = "IRanges", cellNames = cellNames) + + #N Tiles + nTiles <- trunc(chromLengths[z] / tileSize) + 1 + + #Create Sparse Matrix + matchID <- S4Vectors::match(mcols(fragments)$RG, cellNames) + mat <- Matrix::sparseMatrix( + i = c(trunc(start(fragments) / tileSize), trunc(end(fragments) / tileSize)) + 1, + j = c(matchID, matchID), + x = rep(1, 2*length(fragments)), + dims = c(nTiles, length(cellNames)) + ) + colnames(mat) <- cellNames + rm(fragments, matchID) + gc() + + #Binarize + if(binarize){ + mat@x[mat@x > 0] <- 1 + } + + #Remove Blacklisted Tiles! 
+ if(!is.null(blacklist)){ + blacklistz <- blacklist[[chr]] + if(length(blacklistz) > 0){ + tile2 <- floor(tileSize/2) + blacklistIdx <- unique(trunc(start(unlist(GenomicRanges::slidingWindows(blacklistz,tile2,tile2)))/tileSize) + 1) + blacklistIdx <- sort(blacklistIdx) + idxToZero <- which((mat@i + 1) %bcin% blacklistIdx) + if(length(idxToZero) > 0){ + mat@x[idxToZero] <- 0 + mat <- Matrix::drop0(mat) + } + } + } + + #Write sparseMatrix to Arrow File! + o <- .addMatToArrow( + mat = mat, + ArrowFile = ArrowFile, + Group = paste0("TileMatrix/", chr), + binarize = binarize, + addColSums = TRUE, + addRowSums = TRUE + ) + + gc() + + } + + return(ArrowFile) + +} + + + + diff --git a/R/RcppExports.R b/R/RcppExports.R new file mode 100644 index 00000000..62e1cca6 --- /dev/null +++ b/R/RcppExports.R @@ -0,0 +1,35 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +rleSumsStrandedChr <- function(rle, x, strand, width) { + .Call('_ArchR_rleSumsStrandedChr', PACKAGE = 'ArchR', rle, x, strand, width) +} + +rleSumsStranded <- function(rleList, grList, width, as_integer) { + .Call('_ArchR_rleSumsStranded', PACKAGE = 'ArchR', rleList, grList, width, as_integer) +} + +tabulate2dCpp <- function(x, xmin, xmax, y, ymin, ymax) { + .Call('_ArchR_tabulate2dCpp', PACKAGE = 'ArchR', x, xmin, xmax, y, ymin, ymax) +} + +computeSparseRowVariances <- function(j, val, rm, n) { + .Call('_ArchR_computeSparseRowVariances', PACKAGE = 'ArchR', j, val, rm, n) +} + +determineOverlapCpp <- function(m, overlapCut) { + .Call('_ArchR_determineOverlapCpp', PACKAGE = 'ArchR', m, overlapCut) +} + +kmerIdxCpp <- function(str, window, n, kmer) { + .Call('_ArchR_kmerIdxCpp', PACKAGE = 'ArchR', str, window, n, kmer) +} + +kmerPositionFrequencyCpp <- function(string_vector, strand_vector, window, w, kmer) { + .Call('_ArchR_kmerPositionFrequencyCpp', PACKAGE = 'ArchR', string_vector, strand_vector, window, w, kmer) +} + 
+kmerIDFrequencyCpp <- function(string_vector, id_vector, n_id, window, w, kmer) { + .Call('_ArchR_kmerIDFrequencyCpp', PACKAGE = 'ArchR', string_vector, id_vector, n_id, window, w, kmer) +} + diff --git a/R/ReproduciblePeakSet.R b/R/ReproduciblePeakSet.R new file mode 100644 index 00000000..9e3e3224 --- /dev/null +++ b/R/ReproduciblePeakSet.R @@ -0,0 +1,403 @@ +#' Add Reproducible Peak Set to ArchR Project +#' +#' This function will get insertions from coverage files call peaks +#' and merge to get a Union Reproducible Peak Set +#' +#' @param ArchRProj ArchRProject +#' @param groupBy use groupings for peak calling matching group coverage files +#' @param reproducibility reproducibility for peak calling (string that is a function of n) +#' @param peaksPerCell number of peaks that can be identified per cell on average +#' @param excludeChr exclude chromosomes from peak calling +#' @param pathToMacs2 path to macs2 executable (see Macs2) +#' @param genomeSize genome size for peak calling (see Macs2) +#' @param shift shift of Tn5 insertions (<- | ) (see Macs2) +#' @param extsize extension of Tn5 insertions (|<- ->|) (see Macs2) +#' @param method significance method for Macs2 (see Macs2) +#' @param cutOff significance cutoff for Macs2 (see Macs2) +#' @param extendSummits extend peak summits for final fixed-width peaks +#' @param promoterDist promoter distance from TSS for annotating peaks +#' @param genomeAnno genome annotation for ArchRProject +#' @param geneAnno gene annotation for ArchRProject +#' @param additionalParams additional parameters to pass to Macs2 (see Macs2) +#' @param threads number of threads for parallel execution +#' @param parallelParam parallel parameters for batch style execution +#' @param force force creating peakSet if existed +#' @param verboseHeader verbose sections +#' @param verboseAll verbose sections and subsections +#' @param ... 
additional args +#' @export +addReproduciblePeakSet <- function( + ArchRProj = NULL, + groupBy = "Clusters", + reproducibility = "2", + peaksPerCell = 500, + maxPeaks = 150000, + excludeChr = c("chrM","chrY"), + pathToMacs2 = "macs2", + genomeSize = NULL, + shift = -75, + extsize = 150, + method = "q", + cutOff = 0.05, + extendSummits = 250, + promoterDist = 500, + genomeAnno = getGenomeAnnotation(ArchRProj), + geneAnno = getGeneAnnotation(ArchRProj), + additionalParams = "--nomodel --nolambda", + threads = 1, + parallelParam = "mclapply", + force = FALSE, + verboseHeader = TRUE, + verboseAll = FALSE, + ... + ){ + + tstart <- Sys.time() + utility <- ArchR:::.checkPath(pathToMacs2) + + coverageMetadata <- .getCoverageMetadata(ArchRProj = ArchRProj, groupBy = groupBy) + coverageParams <- .getCoverageParams(ArchRProj = ArchRProj, groupBy = groupBy) + + ##################################################### + # Peak Calling Summary + ##################################################### + tableGroups <- table(getCellColData(ArchRProj, groupBy, drop = TRUE)) + groupSummary <- lapply(seq_along(coverageParams$cellGroups), function(y){ + x <- coverageParams$cellGroups[[y]] + uniq <- unique(unlist(x)) + n <- lapply(x, length) %>% unlist %>% sum + nmin <- lapply(x, length) %>% unlist %>% min + nmax <- lapply(x, length) %>% unlist %>% max + data.frame( + Group=names(coverageParams$cellGroups)[y], + nCells=tableGroups[y], + nCellsUsed=length(uniq), + nReplicates=length(x), + nMin=nmin, + nMax=nmax, + maxPeaks = min(maxPeaks, nmin * peaksPerCell) + ) + }) %>% Reduce("rbind",.) 
+ + .messageDiffTime("Peak Calling Parameters!", tstart) + printSummary <- groupSummary + rownames(printSummary) <- NULL + print(printSummary) + + ##################################################### + # Create Output Directory + ##################################################### + outDir <- file.path(getOutputDirectory(ArchRProj), "PeakCalls") + outSubDir <- file.path(getOutputDirectory(ArchRProj), "PeakCalls", "ReplicateCalls") + outBedDir <- file.path(getOutputDirectory(ArchRProj), "PeakCalls", "InsertionBeds") + dir.create(outDir, showWarnings = FALSE) + dir.create(outSubDir, showWarnings = FALSE) + dir.create(outBedDir, showWarnings = FALSE) + + ##################################################### + # Genome Size Presets + ##################################################### + if(is.null(genomeSize)){ + if(grepl("hg19|hg38", getGenome(ArchRProj), ignore.case = TRUE)){ + genomeSize <- 2.7e9 + }else if(grepl("mm9|mm10", getGenome(ArchRProj), ignore.case = TRUE)){ + genomeSize <- 1.87e9 + } + } + + ##################################################### + # Arguments for Peak Calling + ##################################################### + coverageFiles <- coverageMetadata$File + names(coverageFiles) <- coverageMetadata$Name + args <- list() + args$X <- seq_len(nrow(coverageMetadata)) + args$FUN <- .callSummitsOnCoverages + args$coverageFiles <- coverageFiles + args$outFiles <- file.path(outSubDir, paste0(coverageMetadata$Name,"-summits.rds")) + args$bedDir <- outBedDir + args$excludeChr <- excludeChr + args$peakParams <- list( + pathToMacs2 = pathToMacs2, + genomeSize = genomeSize, + shift = shift, + extsize = extsize, + cutOff = cutOff, + method = method, + additionalParams = additionalParams + ) + args$parallelParam <- parallelParam + args$threads <- threads + args$registryDir <- file.path(outDir, "batchRegistry") + + ##################################################### + # Batch Call Peaks + ##################################################### + 
.messageDiffTime("Batching Peak Calls!", tstart) + + #back lapply + outSummmits <- unlist(.batchlapply(args)) + + #Summarize Output + outSummitList <- split(outSummmits, coverageMetadata$Group) + summitNamesList <- split(coverageMetadata$Name, coverageMetadata$Group) + + ##################################################### + # BSgenome for Add Nucleotide Frequencies! + ##################################################### + .requirePackage(genomeAnno$genome) + .requirePackage("Biostrings") + BSgenome <- eval(parse(text = genomeAnno$genome)) + BSgenome <- .validBSgenome(BSgenome) + + ##################################################### + # Identify Reproducible Peaks! + ##################################################### + .messageDiffTime("Identifying Reproducible Peaks!", tstart) + groupPeaks <- .safelapply(seq_along(outSummitList), function(i){ + .messageDiffTime(sprintf("Creating Reproducible Peaks for Group %s of %s", i, length(outSummitList)), tstart) + peaks <- suppressMessages(.identifyReproduciblePeaks( + summitFiles = outSummitList[[i]], + summitNames = summitNamesList[[i]], + reproducibility = reproducibility, + extendSummits = extendSummits, + blacklist = genomeAnno$blacklist + )) + peaks <- sort(sortSeqlevels(peaks)) + peaks <- subsetByOverlaps(peaks, genomeAnno$chromSizes, type = "within") + peaks <- .fastAnnoPeaks(peaks, BSgenome = BSgenome, geneAnno = geneAnno, promoterDist = promoterDist) + peaks <- peaks[which(mcols(peaks)$N < 0.001)] #Remove N Containing Peaks + peaks <- peaks[order(peaks$groupScoreQuantile, decreasing = TRUE)] + peaks <- head(peaks, groupSummary[names(outSummitList)[i],"maxPeaks"]) + mcols(peaks)$N <- NULL #Remove N Column + saveRDS(peaks, file.path(outDir, paste0(names(outSummitList)[i], "-reproduciblePeaks.gr.rds"))) + return(peaks) + }, threads = threads) + names(groupPeaks) <- names(outSummitList) + + #Construct Union Peak Set + .messageDiffTime("Creating Union Peak Set!", tstart) + unionPeaks <- Reduce("c",groupPeaks) + 
unionPeaks <- nonOverlappingGRanges(unionPeaks, by = "groupScoreQuantile", decreasing = TRUE) + + #Summarize Output + peakDF <- lapply(seq_along(groupPeaks), function(x){ + data.frame(Group = names(groupPeaks)[x], table(groupPeaks[[x]]$peakType)) + }) %>% Reduce("rbind", .) + peakDF$Group <- paste0(peakDF$Group, "(n = ", tableGroups[peakDF$Group],")") + peakDF <- rbind(data.frame(Group = "UnionPeaks", table(unionPeaks$peakType)), peakDF) + peakDF$Freq <- peakDF$Freq / 1000 + metadata(unionPeaks)$PeakCallSummary <- peakDF + + #Add Peak Set + ArchRProj <- addPeakSet(ArchRProj, unionPeaks, force = TRUE) + + pdf(file.path(outDir, "PeakCallSummary.pdf"), width = 6, height = 4, onefile=FALSE) + print(plotPeakCallSummary(ArchRProj)) + dev.off() + + .messageDiffTime("Finished Creating Union Peak Set!", tstart) + + return(ArchRProj) + +} + +#' @export +plotPeakCallSummary <- function(ArchRProj, pal = NULL){ + + peakDF <- metadata(ArchRProj@peakSet)$PeakCallSummary + if(is.null(peakDF)){ + stop("Error no Peak Call Summary available are you sure these peaks were called with CreateReproduciblePeakSet?") + } + if(is.null(pal)){ + pal <- paletteDiscrete(values=peakDF$Var1) + } + + lengthMax <- split(peakDF$Freq, peakDF$Group) %>% lapply(sum) %>% unlist %>% max + + p <- ggplot(peakDF, aes(x=Group, y=Freq, fill=Var1)) + + geom_bar(stat = "identity") + + theme_ArchR(rotate_x_axis_text_90 = TRUE) + + ylab("Number of Peaks (x10^3)") + + xlab("") + + scale_fill_manual(values=pal) + + scale_y_continuous( + breaks = seq(0, lengthMax * 2,50), + limits = c(0, lengthMax * 1.1), + expand = c(0,0) + ) + + return(p) + +} + +##################### +# Utility Functions +##################### + +.fastAnnoPeaks <- function(peaks, BSgenome, geneAnno, promoterDist = 1000){ + + #Validate + peaks <- .validGRanges(peaks) + peakSummits <- resize(peaks,1,"center") + geneAnno$genes <- .validGRanges(geneAnno$genes) + geneAnno$exons <- .validGRanges(geneAnno$exons) + geneAnno$TSS <- 
.validGRanges(geneAnno$TSS) + BSgenome <- .validBSgenome(BSgenome) + + #First Lets Get Distance to Nearest Gene Start + distPeaks <- distanceToNearest(peakSummits, resize(geneAnno$genes, 1, "start"), ignore.strand = TRUE) + mcols(peaks)$distToGeneStart <- mcols(distPeaks)$distance + mcols(peaks)$nearestGene <- mcols(geneAnno$genes)$symbol[subjectHits(distPeaks)] + og <- overlapsAny(peakSummits, geneAnno$genes, ignore.strand = TRUE) + oe <- overlapsAny(peakSummits, geneAnno$exons, ignore.strand = TRUE) + type <- rep("Distal", length(peaks)) + type[which(og & oe)] <- "Exon" + type[which(og & !oe)] <- "Intron" + type[mcols(peaks)$distToGeneStart < promoterDist] <- "Promoter" + mcols(peaks)$peakType <- type + + #Now Lets Get Distance to Nearest TSS (nearestTSS is indexed by the distTSS hits, not the gene-start hits) + distTSS <- distanceToNearest(peakSummits, resize(geneAnno$TSS, 1, "start"), ignore.strand = TRUE) + mcols(peaks)$distToTSS <- mcols(distTSS)$distance + if("symbol" %in% colnames(mcols(geneAnno$TSS))){ + mcols(peaks)$nearestTSS <- mcols(geneAnno$TSS)$symbol[subjectHits(distTSS)] + }else if("tx_name" %in% colnames(mcols(geneAnno$TSS))){ + mcols(peaks)$nearestTSS <- mcols(geneAnno$TSS)$tx_name[subjectHits(distTSS)] + } + + #Get Nucleotide Content (GC fraction and N fraction per peak) + nucFreq <- BSgenome::alphabetFrequency(getSeq(BSgenome, peaks)) + mcols(peaks)$GC <- round(rowSums(nucFreq[,c("G","C")]) / rowSums(nucFreq),4) + mcols(peaks)$N <- round(nucFreq[,c("N")] / rowSums(nucFreq),4) + peaks + +} + +.identifyReproduciblePeaks <- function( + summitFiles = NULL, + summitNames = NULL, + reproducibility = 0.51, + extendSummits = 250, + blacklist + ){ + + start <- Sys.time() + summits <- lapply(seq_along(summitFiles), function(x){ + grx <- readRDS(summitFiles[x]) + grx <- subsetByOverlaps(grx, blacklist, invert = TRUE) #Not Overlapping Blacklist! + grx$GroupReplicate <- paste0(summitNames[x]) + grx + }) %>% Reduce("c", .) 
+ + extendedSummits <- resize(summits, extendSummits * 2 + 1, "center") + extendedSummits <- lapply(split(extendedSummits, extendedSummits$GroupReplicate), function(x){ + nonES <- nonOverlappingGRanges(x, by = "score", decreasing = TRUE) + nonES$replicateScoreQuantile <- round(.getQuantiles(nonES$score),3) + nonES + }) %>% Reduce("c", .) + nonOverlapES <- nonOverlappingGRanges(extendedSummits, by = "replicateScoreQuantile", decreasing = TRUE) + + overlapMat <- lapply(split(extendedSummits, extendedSummits$GroupReplicate), function(x){ + overlapsAny(nonOverlapES, x) + }) %>% Reduce("cbind", .) + + if(length(summitFiles) > 1){ + nonOverlapES$Reproducibility <- rowSums(overlapMat) + nonOverlapES$ReproducibilityPercent <- round(rowSums(overlapMat) / ncol(overlapMat) , 3) + n <- length(summitFiles) + minRep <- eval(parse(text=reproducibility)) + if(!is.numeric(minRep)){ + stop("Error reproducibility not numeric when evaluated!") + } + idxPass <- which(nonOverlapES$Reproducibility >= minRep) + nonOverlapPassES <- nonOverlapES[idxPass] + }else{ + nonOverlapES$Reproducibility <- rep(NA, length(nonOverlapES)) + nonOverlapPassES <- nonOverlapES + } + + nonOverlapPassES$groupScoreQuantile <- round(.getQuantiles(nonOverlapPassES$replicateScoreQuantile),3) + mcols(nonOverlapPassES) <- mcols(nonOverlapPassES)[,c("score","replicateScoreQuantile", "groupScoreQuantile", "Reproducibility", "GroupReplicate")] + + return(nonOverlapPassES) + +} + +.callSummitsOnCoverages <- function(i, coverageFiles, outFiles, peakParams, bedDir, excludeChr, tstart, ...){ + + .messageDiffTime(sprintf("Group %s of %s, Calling Peaks with MACS2!", i, length(coverageFiles)), tstart) + + ################ + # Create Bed File from Coverage File + ################ + bedFile <- file.path(bedDir, paste0(names(coverageFiles)[i], ".insertions.bed")) + o <- .writeCoverageToBed(coverageFiles[i], bedFile, excludeChr = excludeChr) + peakParams$bedFile <- bedFile + + ################ + # MACS2 Peak-Calling Leave Room 
For Other Options? + ################ + summits <- do.call(.callSummitsMACS2, peakParams) + rmf <- file.remove(bedFile) + + ################ + # Save output + ################ + saveRDS(summits, outFiles[i]) + + .messageDiffTime(sprintf("Group %s of %s, Finished Calling Peaks with MACS2!", i, length(coverageFiles)), tstart) + + outFiles[i] + +} + +.callSummitsMACS2 <- function( + bedFile = NULL, + pathToMacs2 = "macs2", + genomeSize = 2.7e9, + shift = -75, + extsize = 150, + cutOff = 0.05, + method = "q", + additionalParams = "--nomodel --nolambda", + ... + ){ + + stopifnot(tolower(method) %in% c("p","q")) + stopifnot(!is.null(genomeSize)) + utility <- ArchR:::.checkPath(pathToMacs2) + + #Output Files + bedName <- gsub("\\.insertions.bed", "", bedFile) + summitsFile <- paste0(bedName, "_summits.bed") + narrowPeaksFile <- paste0(bedName, "_peaks.narrowPeak") + xlsFile <- paste0(bedName, "_peaks.xls") + + #Create MACS2 Command + cmd <- sprintf("callpeak -g %s --name %s --treatment %s --outdir %s --format BED --call-summits --keep-dup all %s", + genomeSize, basename(bedName), bedFile, dirname(bedName), additionalParams) + + if(!is.null(shift) & !is.null(extsize)){ + cmd <- sprintf("%s --shift %s --extsize %s", cmd , shift, extsize) + } + + if(tolower(method) == "p"){ + cmd <- sprintf("%s -p %s", cmd , cutOff) + }else{ + cmd <- sprintf("%s -q %s", cmd , cutOff) + } + + run <- system2(pathToMacs2, cmd, wait=TRUE, stdout=NULL, stderr=NULL) + + #Read Summits! 
+ out <- fread(summitsFile, select = c(1,2,3,5)) + out <- GRanges(out$V1, IRanges(out$V2 + 1, out$V3), score = out$V5) + + #Remove Files + r2 <- suppressWarnings(file.remove(summitsFile, narrowPeaksFile, xlsFile)) + + return(out) + +} + diff --git a/R/VisualizeData.R b/R/VisualizeData.R new file mode 100644 index 00000000..3a3450df --- /dev/null +++ b/R/VisualizeData.R @@ -0,0 +1,383 @@ +#' Visualize Embedding from ArchR Project +#' +#' This function will plot an embedding that was created from +#' computeEmbedding +#' +#' @param ArchRProj ArchRProject +#' @param embedding embedding to visualize (see computeEmbedding) +#' @param colorBy colorBy cellColData or Arrays in Arrows (ie GeneScoreMatrix) +#' @param name name of column in cellColData or Feature in Array in Arrows +#' @param log2Norm log2 Normalize features if they are continuous +#' @param pal custom palette to use for plotting +#' @param size size of points in plot +#' @param rastr rastr points in plot +#' @param quantCut quantile cut of continuous features +#' @param quantHex quantile evaluation for each hex in geom_hex +#' @param discreteSet discrete palette for visualizing embedding +#' @param continuousSet continuous palette for visualizing embedding +#' @param randomize randomize points prior to plotting +#' @param keepAxis keep x and y axis for plot +#' @param baseSize base size for text in plot +#' @param plotContinuous how to plot continuous features (points and hex) +#' @param plotParams additional params to pass to ggPoint/ggHex +#' @param plotWidth plot width used for creating a consistent plot independent of legend size +#' @param plotHeight plot height used for creating a consistent plot independent of legend size +#' @param ... 
additional args +#' @export +VisualizeEmbedding <- function( + ArchRProj = NULL, + embedding = "UMAP", + colorBy = "colData", + name = "Sample", + log2Norm = NULL, + pal = NULL, + size = 0.5, + rastr = TRUE, + quantCut = c(0.05, 0.95), + quantHex = 0.5, + discreteSet = NULL, + continuousSet = NULL, + randomize = TRUE, + keepAxis = FALSE, + baseSize = 6, + plotContinuous = NULL, + plotParams = list(), + plotWidth = 5, + plotHeight = 5, + ... + ){ + + .requirePackage("ggplot2") + + ############################## + # Plot Helpers + ############################## + .quantileCut <- function (x, lo = 0, hi = 0.975, rm0 = TRUE){ + q <- quantile(x, probs = c(lo, hi)) + x[x < q[1]] <- q[1] + x[x > q[2]] <- q[2] + return(x) + } + + .summarizeHex <- function(x){ + quantile(x, quantHex) + } + + ############################## + # Get Embedding + ############################## + df <- getEmbedding(ArchRProj, embedding = embedding, return = "df") + + #Parameters + plotParams$x <- df[,1] + plotParams$y <- df[,2] + plotParams$title <- paste0(embedding, " of ", stringr::str_split(colnames(df)[1],pattern="#",simplify=TRUE)[,1]) + plotParams$baseSize <- baseSize + + if(tolower(colorBy) == "coldata" | tolower(colorBy) == "cellcoldata"){ + + plotParams$color <- as.vector(getCellColData(ArchRProj)[,name]) + plotParams$discrete <- .isDiscrete(plotParams$color) + plotParams$continuousSet <- "solar_extra" + plotParams$discreteSet <- "stallion" + plotParams$title <- paste(plotParams$title, " colored by\ncolData : ", name) + if(is.null(plotContinuous)){ + plotContinuous <- "hexplot" + } + + }else{ + if (tolower(colorBy) == "genescorematrix"){ + if(is.null(log2Norm)){ + log2Norm <- TRUE + } + plotParams$continuousSet <- "white_blue_purple" + }else{ + plotParams$continuousSet <- "solar_extra" + } + plotParams$color <- .getMatrixValues(ArchRProj, name = name, matrixName = colorBy, log2Norm = log2Norm) + plotParams$discrete <- FALSE + plotParams$title <- sprintf("%s colored by\n%s : %s", 
plotParams$title, colorBy, name) + if(is.null(plotContinuous)){ + plotContinuous <- "hexplot" + } + if(plotContinuous=="hexplot"){ + plotParams$fun <- .summarizeHex + } + + } + + #Additional Params! (user-supplied palette sets override the defaults chosen above) + plotParams$xlabel <- gsub("_", " ",stringr::str_split(colnames(df)[1],pattern="#",simplify=TRUE)[,2]) + plotParams$ylabel <- gsub("_", " ",stringr::str_split(colnames(df)[2],pattern="#",simplify=TRUE)[,2]) + + if(!is.null(continuousSet)){ + plotParams$continuousSet <- continuousSet + } + if(!is.null(discreteSet)){ + plotParams$discreteSet <- discreteSet + } + plotParams$rastr <- rastr + plotParams$size <- size + plotParams$randomize <- randomize + + if(plotParams$discrete){ + plotParams$color <- paste0(plotParams$color) + } + + if(!plotParams$discrete){ + plotParams$color <- .quantileCut(plotParams$color, min(quantCut), max(quantCut)) + plotParams$pal <- paletteContinuous(set = plotParams$continuousSet) + if(tolower(plotContinuous) == "hex" | tolower(plotContinuous) == "hexplot"){ + out <- do.call(ggHex, plotParams) + }else{ + out <- do.call(ggPoint, plotParams) + } + }else{ + out <- do.call(ggPoint, plotParams) + } + + if(!keepAxis){ + out <- out + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) + } + + .fixPlotSize(out, plotWidth = unit(plotWidth, "in"), plotHeight = unit(plotHeight,"in"), margin = 0.1) + +} + + +#' Visualize Groups from ArchR Project +#' +#' This function will plot a violin plot (ggViolin) of per-cell values +#' summarized by the groups given in groupBy +#' +#' @param ArchRProj ArchRProject +#' @param groupBy use groupings in cellColData for summarizing and plotting +#' @param colorBy colorBy cellColData or Arrays in Arrows (ie GeneScoreMatrix) +#' @param name name of column in cellColData or Feature in Array in Arrows +#' @param pal custom palette to use for plotting +#' @param ylim limits for features in plot +#' @param size size of points in ggplot +#' @param baseSize base size for text in plot +#' @param 
ratioYX ratio of Y axis to X axis +#' @param points add points to plot using quasirandom +#' @param plotWidth plot width used for creating a consistent plot independent of legend size +#' @param plotHeight plot height used for creating a consistent plot independent of legend size +#' @param ... additional args +#' @export +VisualizeGroups <- function( + ArchRProj, + groupBy = "Sample", + colorBy = "colData", + name = "TSSEnrichment", + log2Norm = NULL, + pal = NULL, + ylim = NULL, + size = 0.5, + baseSize = 6, + ratioYX = NULL, + points = FALSE, + plotWidth = 6, + plotHeight = 4, + ... + ){ + + .requirePackage("ggplot2") + + groupNames <- getCellColData(ArchRProj, groupBy, drop = TRUE) + + if(tolower(colorBy) == "coldata" | tolower(colorBy) == "cellcoldata"){ + values <- getCellColData(ArchRProj, name, drop = TRUE) + }else{ + if (tolower(colorBy) == "genescorematrix"){ + if(is.null(log2Norm)){ + log2Norm <- TRUE + } + } + values <- .getMatrixValues(ArchRProj, name = name, matrixName = colorBy, log2Norm = log2Norm) + } + + if(is.null(ylim)){ + ylim <- range(values) %>% extendrange(f = 0.05) + } + + if(is.null(ratioYX)){ + ratioYX <- sqrt(length(unique(groupNames)) / 2) + } + + p <- ggViolin( + x = groupNames, + y = values, + xlabel = groupBy, + ylabel = name, + baseSize = baseSize, + ratioYX = ratioYX * length(unique(groupNames)) / diff(ylim), + size = size, + points = points + ) + + .fixPlotSize(p, height = 1 / ratioYX, margin = 0.1, plotWidth = plotWidth, plotHeight = plotHeight) + +} + +#' @export +.getMatrixValues <- function(ArchRProj, name, matrixName, log2Norm = TRUE){ + + o <- h5closeAll() + featureDF <- .getFeatureDF(getArrowFiles(ArchRProj), matrixName) + if(grepl(":",name)){ + sname <- stringr::str_split(name,pattern=":",simplify=TRUE)[1,1] + name <- stringr::str_split(name,pattern=":",simplify=TRUE)[1,2] + idx <- intersect(which(tolower(name) == tolower(featureDF$name)), BiocGenerics::which(tolower(sname) == tolower(featureDF$seqnames))) + }else{ + idx 
<- which(tolower(name) == tolower(featureDF$name))[1] + } + if(length(idx)==0 || anyNA(idx)){ + stop(sprintf("FeatureName does not exist for %s! See availableFeatures", name)) + } + featureDF <- featureDF[idx, ,drop=FALSE] + + #Get Values for FeatureName (read per sample Arrow file, then column-bind the cells) + cellNamesList <- split(rownames(getCellColData(ArchRProj)), getCellColData(ArchRProj)$Sample) + values <- lapply(seq_along(cellNamesList), function(x){ + o <- h5closeAll() + ArrowFile <- getSampleColData(ArchRProj)[names(cellNamesList)[x],"ArrowFiles"] + valuesx <- .getMatFromArrow( + ArrowFile = ArrowFile, + featureDF = featureDF, + binarize = FALSE, + useMatrix = matrixName, + cellNames = cellNamesList[[x]] + ) + colnames(valuesx) <- cellNamesList[[x]] + valuesx + }) %>% Reduce("cbind", .) + gc() + + #Values Summary (only the first matched feature row is returned) + values <- values[1,] + if(!is.null(log2Norm)){ + if(log2Norm){ + values <- log2(values + 1) + } + } + + return(values) + +} + +#' @export +.fixPlotSize <- function( + p = NULL, + plotWidth = unit(6, "in"), + plotHeight = unit(6, "in"), + margin = 0.25, + height = 1, + it = 0.05, + ... 
+ ){ + + .requirePackage("grid") + .requirePackage("gridExtra") + + if(!inherits(plotWidth, "unit")){ + plotWidth <- unit(plotWidth, "in") + } + + if(!inherits(plotHeight, "unit")){ + plotHeight <- unit(plotHeight, "in") + } + + #adapted from https://github.com/jwdink/egg/blob/master/R/set_panel_size.r + g <- ggplotGrob(p) + + legend <- grep("guide-box", g$layout$name) + if(length(legend)!=0){ + gl <- g$grobs[[legend]] + g <- ggplotGrob(p + theme(legend.position = "none")) + }else{ + g <- ggplotGrob(p) + } + + panels <- grep("panel", g$layout$name) + panel_index_w <- unique(g$layout$l[panels]) + panel_index_h <- unique(g$layout$t[panels]) + + nw <- length(panel_index_w) + nh <- length(panel_index_h) + + pw <- convertWidth(plotWidth, unitTo = "in", valueOnly = TRUE) + ph <- convertWidth(plotHeight, unitTo = "in", valueOnly = TRUE) + + x <- 0 + width <- 1 + sm <- FALSE + + while(!sm){ + + x <- x + it + + w <- unit(x * width, "in") + h <- unit(x * height / width, "in") + m <- unit(x * margin / width, "in") + + g$widths[panel_index_w] <- rep(w, nw) + g$heights[panel_index_h] <- rep(h, nh) + + sw <- convertWidth( + x = sum(g$widths) + m, + unitTo = "in", + valueOnly = TRUE + ) + + sh <- convertHeight( + x = sum(g$heights) + m, + unitTo = "in", + valueOnly = TRUE + ) + + sm <- sw > pw | sh > ph + + } + + if(length(legend)!=0){ + + sgh <- convertHeight( + x = sum(g$heights), + unitTo = "in", + valueOnly = TRUE + ) + + slh <- convertHeight( + x = sum(gl$heights), + unitTo = "in", + valueOnly = TRUE + ) + + p <- grid.arrange(g, gl, ncol=1, nrow=2, heights = unit.c(unit(sgh,"in"), unit(slh, "in"))) + + }else{ + + p <- grid.arrange(g) + + } + + + invisible(p) + +} + +.isDiscrete <- function(x){ + is.factor(x) || is.character(x) || is.logical(x) +} + + + + + + + + + + + + diff --git a/R/ZZZ-CoAccessibility.R b/R/ZZZ-CoAccessibility.R new file mode 100644 index 00000000..74e2366f --- /dev/null +++ b/R/ZZZ-CoAccessibility.R @@ -0,0 +1,45 @@ +#' Extend Filter then Normalize 
Scores for Summits +#' @export +computeCoAccessibility <- function( + ArchRProject, + k = 100, + knnIteration = 5000, + overlapCutoff = 0.8, + seed = 1, + knnMethod = "nabor", + maxDist = 250000, + ...){ + + set.seed(seed) + + #Get Matrix List + matrixDF <- validInputMatrix(proj, useMatrix = "PeakMatrix") + + #Check for existence + stopifnot(all(apply(matrixDF, 2, function(x) all(file.exists(paste0(x)))))) + + #Subsample + idx <- sample(seq_len(nrow(mat)), knnIteration) + + #KNN Matrix + knnMat <- computeKnn(data = mat, query = mat[idx,], method = knnMethod) + + #Determin Overlap + keepKnn <- determineOverlapCpp(k1, floor(cutOffOverlap*k)) + + #Keep Above Cutoff + knnMat <- knnMat[keepKnn==0,] + message("Identified ", nrow(knnMat), " Groupings!") + + # #Time to compute partial group matrix for coAccessibility! + # featureDF <- DataFrame() + # groupMat <- constructGroupMatrix( + # inputFilesMatrix = matrixDF, + # featureDF = , + # groupList = groupList + # ) + + #Work in progress! + keepKnn + +} \ No newline at end of file diff --git a/R/ZZZ-FeatureEnrichment.R b/R/ZZZ-FeatureEnrichment.R new file mode 100644 index 00000000..306dcbc1 --- /dev/null +++ b/R/ZZZ-FeatureEnrichment.R @@ -0,0 +1,54 @@ +#------------------------------------------------------ +# Hypergeometric Testing +#------------------------------------------------------ + +#' Feature Matches Over Representation +#' +#' This function takes in genes matches combined with a vector of peaks in target and peaks in bdg and returns hypergeometric minus log 10 pvalues +#' @param genes motif similarity matrix used for labeling family info default is null +#' @param compare vector of compare peaks idx which will be used for hypergeometric +#' @param background vector of background peaks idx which will be used for hypergeometric +#' @export +#' +featureEnrichment <- function(featureMatches, compare, background){ + + suppressPackageStartupMessages(require(Matrix)) + + #Prep + stopifnot(length(grep("Matches", 
assayNames(featureMatches))) == 1) + matches <- getAssay(featureMatches, grep("Matches", assayNames(featureMatches), value = TRUE)) + matchCompare <- matches[compare, ,drop=FALSE] + matchBackground <- matches[background, ,drop=FALSE] + matchCompareTotal <- Matrix::colSums(matchCompare) + matchBackgroundTotal <- Matrix::colSums(matchBackground) + + pOut <- data.frame(feature = colnames(matches), + CompareFrequency = matchCompareTotal, + nCompare = nrow(matchCompare), + CompareProportion = matchCompareTotal/nrow(matchCompare), + BackgroundFrequency = matchBackgroundTotal, + nBackground = nrow(matchBackground), + BackgroundProporition = matchBackgroundTotal/nrow(matchBackground)) + + #Enrichment + pOut$Enrichment <- pOut$CompareProportion / pOut$BackgroundProporition + + #mlog10phyper + pOut$mlog10phyper <- lapply(seq_len(nrow(pOut)), function(x){ + p <- -phyper(pOut$CompareFrequency[x] - 1, # Number of Successes the -1 is due to cdf integration + pOut$BackgroundFrequency[x], # Number of all successes in background + pOut$nBackground[x] - pOut$BackgroundFrequency[x], # Number of non successes in background + pOut$nCompare[x], # Number that were drawn + lower.tail = FALSE, log.p = TRUE)# P[X > x] Returns LN must convert to log10 + return(p/log(10)) + }) %>% unlist %>% round(3) + pOut$FDR <- p.adjust(10^-matrixStats::rowMins(as.matrix(data.frame(pOut$mlog10phyper, 250))), method = "BH") + pOut$mlog10FDR <- -log10(pOut$FDR) + pOut <- pOut[order(pOut$mlog10phyper, decreasing = TRUE), c(1, 10, 2:8, 9, 11)] + + return(pOut) +} + + + + diff --git a/R/ZZZ-Trajectory.R b/R/ZZZ-Trajectory.R new file mode 100644 index 00000000..13e795f7 --- /dev/null +++ b/R/ZZZ-Trajectory.R @@ -0,0 +1,106 @@ +#' Get Optimal Plotting Windows +#' +#' @param rle rle +#' @export +alignCellsToTrajectory <- function(mat, groups, trajectory, filterInitial = 0.1, finterFinal = 0.25, dof = 250, spar = 1){ + + #Filter by Distance + matFilter <- lapply(seq_along(trajectory), function(x){ + #Subset + matx 
<- mat[groups==trajectory[x],] + groupsx <- groups[groups==trajectory[x]] + #Filter Distance + matMeanx <- colMeans(matx) + diffx <- sqrt(colSums((t(matx) - matMeanx)^2)) + idxKeep <- which(diffx <= quantile(diffx, 1 - filterInitial)) + #Filter + matx <- matx[idxKeep,] + matx$Groups <- groupsx[idxKeep] + matx + }) %>% Reduce("rbind",.) + + #Now Initial Alignment + initialTime <- lapply(seq_along(trajectory), function(x){ + #Subset + matx <- matFilter[matFilter$Groups==trajectory[x], which(colnames(matFilter) %ni% "Groups")] + #Get Differences + if(x!=length(trajectory)){ + matMeanxp1 <- colMeans(matFilter[matFilter$Groups==trajectory[x+1], which(colnames(matFilter) %ni% "Groups")]) + diffx1 <- sqrt(colSums((t(matx) - matMeanxp1)^2)) + timex <- (1 - getQuantiles(diffx1)) + x + }else{ + matMeanxm1 <- colMeans(matFilter[matFilter$Groups==trajectory[x-1], which(colnames(matFilter) %ni% "Groups")]) + diffx1 <- sqrt(colSums((t(matx) - matMeanxm1)^2)) + timex <- getQuantiles(diffx1) + x + } + timex + }) %>% unlist + + #Fit Splines + dfSplineFit <- lapply(seq_len(ncol(matFilter[, which(colnames(matFilter) %ni% "Groups")])), function(x){ + smooth.spline(initialTime, matFilter[, which(colnames(matFilter) %ni% "Groups")][,x], df = dof, spar = spar)[[2]] + }) %>% Reduce("cbind",.) 
%>% data.frame() + + #Trajectories + dfTrajectory <- mat[groups %in% trajectory,] + dfTrajectory$Groups <- groups[groups %in% trajectory] + + #Nearest Neighbors + knnMat <- get.knnx( + data = dfSplineFit[,which(colnames(matFilter) %ni% "Groups")], + query = dfTrajectory[,which(colnames(matFilter) %ni% "Groups")], + k = 3) + + #Lets Create Pseudotime + knn_index <- knnMat[[1]] + knn_dist <- knnMat[[2]] + knn_diff <- ifelse(knn_index[,2] > knn_index[,3], 1, -1) + knn_distq <- getQuantiles(knn_dist[,1]) + + #Filter + idxKeep <- which(knn_dist[,1] < quantile(knn_dist[,1], 1 - filterFinal)) + finalTrajectory <- data.frame( + Cells = rownames(dfTrajectory)[idxKeep], + Distance = knn_dist[idxKeep, 1], + DistanceIdx = knn_index[idxKeep, 1] + knn_distq[idxKeep] + ) + + #Lets Align the non Clusters + dfTrajectory <- mat[groups %ni% trajectory,] + dfTrajectory$Groups <- groups[groups %ni% trajectory] + + #Nearest Neighbors + knnMat <- get.knnx( + data = dfSplineFit[,which(colnames(matFilter) %ni% "Groups")], + query = dfTrajectory[,which(colnames(matFilter) %ni% "Groups")], + k = 3) + + #Lets Create Pseudotime + knn_index <- knnMat[[1]] + knn_dist <- knnMat[[2]] + knn_diff <- ifelse(knn_index[,2] > knn_index[,3], 1, -1) + knn_distq <- getQuantiles(knn_dist[,1]) + idxKeep <- which(knn_dist[,1] < max(finalTrajectory[,2])) + finalTrajectoryAdditional <- data.frame( + Cells = rownames(dfTrajectory)[idxKeep], + Distance = knn_dist[idxKeep, 1], + DistanceIdx = knn_index[idxKeep, 1] + knn_distq[idxKeep] + ) + + #Final Matrix + finalTrajectory <- rbind(finalTrajectory, finalTrajectoryAdditional) + finalTrajectory$PseudoTime <- 100*getQuantiles(finalTrajectory[,3]) + finalTrajectory +} + + + + + + + + + + + + diff --git a/README.md b/README.md index 90767b12..d2ad3718 100644 --- a/README.md +++ b/README.md @@ -1 +1,40 @@ -# ArchR \ No newline at end of file +# Installation of ArchR + +```r + +if (!requireNamespace("devtools", quietly = TRUE)) + install.packages("devtools") + +if 
(!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + +devtools::install_github("jgranja24/ArchR", + auth_token = token, #please email me for a token + repos = BiocManager::repositories() +) + +``` + +# Additional Packages that are used from github + + +```r + +# ggrastr is a package for plotting ggplots with rastr'd points which +# is super helpful for large UMAPs etc +# +# You need to have Cairo for ggrastr +# +# On Mac OSx you need to have XQuartz (https://www.xquartz.org/) +# +devtools::install_github('VPetukhov/ggrastr') + +# harmony is a package that can correct batch effects +devtools::install_github("immunogenomics/harmony") + +# presto is a package that has efficient tools for wilcoxon tests on sparseMatrices +devtools::install_github('immunogenomics/presto') + + +``` + diff --git a/data/geneAnnoHg19.rda b/data/geneAnnoHg19.rda new file mode 100644 index 00000000..8f65f971 Binary files /dev/null and b/data/geneAnnoHg19.rda differ diff --git a/data/geneAnnoHg38.rda b/data/geneAnnoHg38.rda new file mode 100644 index 00000000..4de33834 Binary files /dev/null and b/data/geneAnnoHg38.rda differ diff --git a/data/geneAnnoMm10.rda b/data/geneAnnoMm10.rda new file mode 100644 index 00000000..0bdc1b1c Binary files /dev/null and b/data/geneAnnoMm10.rda differ diff --git a/data/geneAnnoMm9.rda b/data/geneAnnoMm9.rda new file mode 100644 index 00000000..719e9561 Binary files /dev/null and b/data/geneAnnoMm9.rda differ diff --git a/data/genomeAnnoHg19.rda b/data/genomeAnnoHg19.rda new file mode 100644 index 00000000..eae4892f Binary files /dev/null and b/data/genomeAnnoHg19.rda differ diff --git a/data/genomeAnnoHg38.rda b/data/genomeAnnoHg38.rda new file mode 100644 index 00000000..c1e73bb3 Binary files /dev/null and b/data/genomeAnnoHg38.rda differ diff --git a/data/genomeAnnoMm10.rda b/data/genomeAnnoMm10.rda new file mode 100644 index 00000000..d0af754f Binary files /dev/null and b/data/genomeAnnoMm10.rda differ diff --git 
a/data/genomeAnnoMm9.rda b/data/genomeAnnoMm9.rda new file mode 100644 index 00000000..861b9c87 Binary files /dev/null and b/data/genomeAnnoMm9.rda differ diff --git a/man/.DS_Store b/man/.DS_Store new file mode 100644 index 00000000..5008ddfc Binary files /dev/null and b/man/.DS_Store differ diff --git a/man/.Rapp.history b/man/.Rapp.history new file mode 100644 index 00000000..e69de29b diff --git a/man/ArchRRegionTrack.Rd b/man/ArchRRegionTrack.Rd new file mode 100644 index 00000000..fba2be3d --- /dev/null +++ b/man/ArchRRegionTrack.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRBrowser.R +\name{ArchRRegionTrack} +\alias{ArchRRegionTrack} +\title{Plot ArchR Region Track} +\usage{ +ArchRRegionTrack(ArchRProj, region = NULL, groupBy = "Clusters", + useGroups = NULL, useCoverages = FALSE, + plotSummary = c("bulkTrack", "featureTrack", "geneTrack"), + sizes = c(10, 0.5, 4), features = NULL, geneSymbol = NULL, + upstream = 50000, downstream = 50000, tileSize = 100, + normMethod = "ReadsInTSS", threads = 1, ylim = NULL, + baseSize = 7, borderWidth = 0.4, tickWidth = 0.4, + facetbaseSize = 7, geneAnno = getGeneAnnotation(ArchRProj), + title = "", ...) 
+} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{region}{GRanges region that will be plotted (if more than one, the first will be selected)} + +\item{groupBy}{use groupings for bulk/scTrack} + +\item{useGroups}{select a subset of groups for plotting} + +\item{useCoverages}{use group coverages for track plotting} + +\item{plotSummary}{summary of region track to be plotted} + +\item{sizes}{sizes corresponding to plotSummary} + +\item{features}{GRanges features to be plotted (ie getPeakSet(ArchRProj))} + +\item{geneSymbol}{if region is null plotting can be centered at gene start site corresponding to the gene symbol} + +\item{upstream}{bp upstream of geneStart to extend} + +\item{downstream}{bp downstream of geneStart to extend} + +\item{tileSize}{width of tiles to plot bulk/scTrack} + +\item{normMethod}{normMethod normalization column in cellColData to normalize bulkTrack} + +\item{threads}{number of threads for parallel execution} + +\item{ylim}{y-limits for bulkTrack} + +\item{baseSize}{size of font in plot} + +\item{borderWidth}{border width in plot} + +\item{tickWidth}{axis tick width in plot} + +\item{geneAnno}{geneAnnotation for geneTrack} + +\item{title}{title of the plot} + +\item{...}{additional args} +} +\description{ +This function will plot the coverage at an input region +} diff --git a/man/ArchR_palettes.Rd b/man/ArchR_palettes.Rd new file mode 100644 index 00000000..bde26028 --- /dev/null +++ b/man/ArchR_palettes.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ColorPalettes.R +\docType{data} +\name{ArchR_palettes} +\alias{ArchR_palettes} +\title{List of palettes to be used in plots} +\format{An object of class \code{list} of length 25.} +\usage{ +ArchR_palettes +} +\description{ +List of palettes to be used in plots +} +\keyword{datasets} diff --git a/man/ComputeEmbedding.Rd b/man/ComputeEmbedding.Rd new file mode 100644 index 00000000..e76035fe --- /dev/null +++ b/man/ComputeEmbedding.Rd @@ -0,0
+1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ComputeEmbedding.R +\name{ComputeEmbedding} +\alias{ComputeEmbedding} +\title{Compute Embedding from Reduced Dimensions in ArchR Project} +\usage{ +ComputeEmbedding(ArchRProj = NULL, reducedDims = "IterativeLSI", + embedding = "UMAP", embeddingOut = NULL, saveModel = TRUE, + seed = 1, force = FALSE, embeddingParams = list(), ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{reducedDims}{reduced dimensions to use} + +\item{embedding}{embedding type (umap, tumap, rtsne, fftrtsne)} + +\item{...}{additional args} + +\item{colorBy}{colorBy cellColData or Arrays in Arrows (ie GeneScoreMatrix)} + +\item{name}{name of column in cellColData or Feature in Array in Arrows} + +\item{log2Norm}{log2 Normalize features if they are continuous} + +\item{pal}{custom palette to use for plotting} + +\item{size}{size of points in plot} + +\item{rastr}{rastr points in plot} + +\item{quantCut}{quantile cut of continuous features} + +\item{quantHex}{quantile evaluation for each hex in geom_hex} + +\item{discreteSet}{discrete palette for visualizing embedding} + +\item{continuousSet}{continuous palette for visualizing embedding} + +\item{randomize}{randomize points prior to plotting} + +\item{keepAxis}{keep x and y axis for plot} + +\item{baseSize}{base size for text in plot} + +\item{plotContinuous}{how to plot continuous features (points and hex)} + +\item{plotParams}{additional params to pass to ggPoint/ggHex} + +\item{plotWidth}{plot width used for creating a consistent plot independent of legend size} + +\item{plotHeight}{plot height used for creating a consistent plot independent of legend size} +} +\description{ +This function will plot an embedding that was created from +computeEmbedding +} diff --git a/man/FilterCells.Rd b/man/FilterCells.Rd new file mode 100644 index 00000000..eb814d24 --- /dev/null +++ b/man/FilterCells.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by 
hand +% Please edit documentation in R/FilterCells.R +\name{FilterCells} +\alias{FilterCells} +\title{Extend Filter then Normalize Scores for Summits} +\usage{ +FilterCells(ArchRProj, filterList) +} +\arguments{ +\item{df}{dataframe where first column is sample names 2nd column is group information and 3rd column is MACS2 summit files} + +\item{genome}{mm9, hg19 character or BSgenome object} + +\item{blacklist}{regions to blacklist} + +\item{extend}{how to extend summits (summit +- extend)} + +\item{scorePerMillion}{normalized Score-per-million minimum to keep} + +\item{selectionRules}{string with a formula containing n (majority = (n+1)/2, multiple samples = 2)} +} +\description{ +Extend Filter then Normalize Scores for Summits +} diff --git a/man/IdentifyClusters.Rd b/man/IdentifyClusters.Rd new file mode 100644 index 00000000..a16f5946 --- /dev/null +++ b/man/IdentifyClusters.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/IdentifyClusters.R +\name{IdentifyClusters} +\alias{IdentifyClusters} +\title{Identify Clusters for Single Cell Data} +\usage{ +IdentifyClusters(input, reducedDims = "IterativeLSI", + name = "Clusters", sampleCells = NULL, seed = 1, + method = "seurat", dimsToUse = NULL, knnAssign = 10, + nOutlier = 20, verbose = TRUE, tstart = NULL, ...) 
+} +\arguments{ +\item{input}{ArchRProject or matrix for cluster identification} + +\item{reducedDims}{reducedDims of ArchRProject if provided} + +\item{name}{name of cluster column if input is ArchRProject} + +\item{seed}{seed} + +\item{method}{supported methods are Seurat and LouvainJaccard} + +\item{dimsToUse}{reduced dims to use} + +\item{knnAssign}{number of nearest neighbors for assignment of outliers and estimation} + +\item{nOutlier}{number of cells required for a cluster to be called; if not met, these cells will be considered outliers} + +\item{...}{arguments to provide Seurat::FindClusters or ArchR:::.clustLouvain (knn = 50, jaccard = TRUE)} +} +\description{ +This function will identify clusters for single cell reduced dimensions supplied or from an ArchRProject +} diff --git a/man/IterativeLSI.Rd b/man/IterativeLSI.Rd new file mode 100644 index 00000000..a411c5e3 --- /dev/null +++ b/man/IterativeLSI.Rd @@ -0,0 +1,67 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/LatentSemanticIndexing.R +\name{IterativeLSI} +\alias{IterativeLSI} +\title{Compute Iterative LSI} +\usage{ +IterativeLSI(ArchRProj = NULL, useMatrix = "TileMatrix", + reducedDimsOut = "IterativeLSI", iterations = 3, dimsToUse = 1:25, + binarize = TRUE, sampleCells = 5000, varFeatures = 50000, + selectionMethod = "var", scaleTo = 10000, totalFeatures = 5e+05, + filterQuantile = 0.99, saveIterations = TRUE, + outDir = getOutputDirectory(ArchRProj), clusterParams = list(), + runHarmony = FALSE, harmonyParams = list(), threads = 1, + seed = 1, verboseHeader = TRUE, verboseAll = FALSE, + force = FALSE, ...)
+} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{useMatrix}{use matrix for LSI clustering from Arrow} + +\item{reducedDimsOut}{name of dimensionality reduction to be stored as} + +\item{iterations}{number of LSI iterations to perform} + +\item{dimsToUse}{number of dimensions to compute and use from LSI (TFIDF-SVD) for clustering} + +\item{binarize}{binarize matrix prior to LSI} + +\item{sampleCells}{number of cells to sample for LSI estimation} + +\item{varFeatures}{number of variable features to use for LSI} + +\item{selectionMethod}{selection method for variable features (var or vmr)} + +\item{scaleTo}{scaleTo for Cluster Averages for variance calculation} + +\item{totalFeatures}{number of features to consider (ranked by total number of counts) use for LSI} + +\item{filterQuantile}{filter features for initial LSI that are above this quantile} + +\item{saveIterations}{save LSI iterations as rds in the outDir} + +\item{outDir}{output directory for saving LSI iterations} + +\item{clusterParams}{additional params to pass to IdentifyClusters} + +\item{runHarmony}{run harmony batch correction through the iterations} + +\item{harmonyParams}{additional params to pass to harmony} + +\item{threads}{number of threads for parallel execution} + +\item{seed}{seed for analysis} + +\item{verboseHeader}{verbose sections} + +\item{verboseAll}{verbose sections and subsections} + +\item{force}{verbose sections and subsections} + +\item{...}{additional args} +} +\description{ +This function will compute an iterative LSI dimensionality reduction +on an ArchRProject. 
+} diff --git a/man/VisualizeEmbedding.Rd b/man/VisualizeEmbedding.Rd new file mode 100644 index 00000000..e30ed0ca --- /dev/null +++ b/man/VisualizeEmbedding.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/VisualizeData.R +\name{VisualizeEmbedding} +\alias{VisualizeEmbedding} +\title{Visualize Embedding from ArchR Project} +\usage{ +VisualizeEmbedding(ArchRProj = NULL, embedding = "UMAP", + colorBy = "colData", name = "Sample", log2Norm = NULL, + pal = NULL, size = 0.5, rastr = TRUE, quantCut = c(0.05, 0.95), + quantHex = 0.5, discreteSet = NULL, continuousSet = NULL, + randomize = TRUE, keepAxis = FALSE, baseSize = 6, + plotContinuous = NULL, plotParams = list(), plotWidth = 5, + plotHeight = 5, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{embedding}{embedding to visualize (see computeEmbedding)} + +\item{colorBy}{colorBy cellColData or Arrays in Arrows (ie GeneScoreMatrix)} + +\item{name}{name of column in cellColData or Feature in Array in Arrows} + +\item{log2Norm}{log2 Normalize features if they are continuous} + +\item{pal}{custom palette to use for plotting} + +\item{size}{size of points in plot} + +\item{rastr}{rastr points in plot} + +\item{quantCut}{quantile cut of continuous features} + +\item{quantHex}{quantile evaluation for each hex in geom_hex} + +\item{discreteSet}{discrete palette for visualizing embedding} + +\item{continuousSet}{continuous palette for visualizing embedding} + +\item{randomize}{randomize points prior to plotting} + +\item{keepAxis}{keep x and y axis for plot} + +\item{baseSize}{base size for text in plot} + +\item{plotContinuous}{how to plot continuous features (points and hex)} + +\item{plotParams}{additional params to pass to ggPoint/ggHex} + +\item{plotWidth}{plot width used for creating a consistent plot independent of legend size} + +\item{plotHeight}{plot height used for creating a consistent plot independent of legend size} + +\item{...}{additional args} 
+} +\description{ +This function will plot an embedding that was created from +computeEmbedding +} diff --git a/man/VisualizeGroups.Rd b/man/VisualizeGroups.Rd new file mode 100644 index 00000000..19d67f31 --- /dev/null +++ b/man/VisualizeGroups.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/VisualizeData.R +\name{VisualizeGroups} +\alias{VisualizeGroups} +\title{Visualize Groups from ArchR Project} +\usage{ +VisualizeGroups(ArchRProj, groupBy = "Sample", colorBy = "colData", + name = "TSSEnrichment", log2Norm = NULL, pal = NULL, ylim = NULL, + size = 0.5, baseSize = 6, ratioYX = NULL, points = FALSE, + plotWidth = 6, plotHeight = 4, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{groupBy}{use groupings in cellColData for summarizing and plotting} + +\item{colorBy}{colorBy cellColData or Arrays in Arrows (ie GeneScoreMatrix)} + +\item{name}{name of column in cellColData or Feature in Array in Arrows} + +\item{pal}{custom palette to use for plotting} + +\item{ylim}{limits for features in plot} + +\item{size}{size of points in ggplot} + +\item{baseSize}{rastr points in ggplot} + +\item{ratioYX}{ratio of Y axis to X axis} + +\item{points}{add points to plot using quasirandom} + +\item{plotWidth}{plot width used for creating a consistent plot independent of legend size} + +\item{plotHeight}{plot height used for creating a consistent plot independent of legend size} + +\item{...}{additional args} +} +\description{ +This function will plot an embedding that was created from +computeEmbedding +} diff --git a/man/addCellColData.Rd b/man/addCellColData.Rd new file mode 100644 index 00000000..2eafaf1d --- /dev/null +++ b/man/addCellColData.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{addCellColData} +\alias{addCellColData} +\title{Add information to cellColData in ArchRProject} +\usage{ +addCellColData(ArchRProj, data = NULL, 
name = NULL, + cells = getCellNames(ArchRProj), force = FALSE, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{data}{data to add to cellColData} + +\item{name}{new column name in cellColData if already exists set force = TRUE to override} + +\item{cells}{names of cells corresponding to data} + +\item{force}{if name already exists in cellColData set force = TRUE to override} + +\item{...}{additional args} +} +\description{ +This function adds new data to cellColData in ArchRProject +} diff --git a/man/addDemuxletResults.Rd b/man/addDemuxletResults.Rd new file mode 100644 index 00000000..80d47d06 --- /dev/null +++ b/man/addDemuxletResults.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DoubletsScores.R +\name{addDemuxletResults} +\alias{addDemuxletResults} +\title{Add Demuxlet Results to ArchR Project} +\usage{ +addDemuxletResults(ArchRProj, bestFiles, sampleNames) +} +\arguments{ +\item{ArchRProj}{ArchR Project} + +\item{bestFiles}{paths to best files} + +\item{sampleNames}{sampleNames corresponding to best files (match those in ArchRProj)} +} +\description{ +This function will read in .best file output from demuxlet and add the +classifications into the cellColData for the ArchR Project +} diff --git a/man/addDeviationsMatrix.Rd b/man/addDeviationsMatrix.Rd new file mode 100644 index 00000000..5c54ca8d --- /dev/null +++ b/man/addDeviationsMatrix.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MatrixDeviations.R +\name{addDeviationsMatrix} +\alias{addDeviationsMatrix} +\title{Add DeviationsMatrix to Arrow Files in ArchRProject} +\usage{ +addDeviationsMatrix(ArchRProj, annotations = NULL, matrixName = NULL, + out = c("z", "deviations"), binarize = FALSE, threads = 1, + parallelParam = NULL, force = FALSE, ...) 
+} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{annotations}{annotations name stored in ArchRProject} + +\item{matrixName}{matrixName to be stored as in Arrow Files} + +\item{out}{save output matrices deviations and/or z} + +\item{binarize}{binarize peaks prior to computing deviations} + +\item{threads}{number of threads for parallel execution} + +\item{parallelParam}{parallel parameters for batch style execution} + +\item{force}{force overwriting previous TileMatrix in ArrowFile} +} +\description{ +This function for each sample will independently compute counts for each tile +per cell and then infer gene activity scores. +} diff --git a/man/addDoubletScores.Rd b/man/addDoubletScores.Rd new file mode 100644 index 00000000..20196c0e --- /dev/null +++ b/man/addDoubletScores.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DoubletsScores.R +\name{addDoubletScores} +\alias{addDoubletScores} +\title{Add Doublet Scores to Arrows/ArchRProject} +\usage{ +addDoubletScores(input, useMatrix = "TileMatrix", k = 200, + nTrials = 100, knnMethod = "UMAP", UMAPParams = list(), + LSIParams = list(), useClusters = FALSE, outDir = "QualityControl", + threads = 1, parallelParam = NULL, verboseHeader = TRUE, + verboseAll = FALSE, ...)
+} +\arguments{ +\item{input}{ArchRProject or ArrowFiles} + +\item{useMatrix}{matrix name for performing analyses} + +\item{k}{number of cells nearby a simulated doublet to consider} + +\item{nTrials}{number of trials to simulate doublets in thousands} + +\item{knnMethod}{dimension reduction to use for KNN (UMAP or SVD)} + +\item{UMAPParams}{list of parameters to pass to uwot::umap} + +\item{LSIParams}{list of parameters to pass to IterativeLSI} + +\item{outDir}{directory to plot and store results from analyses} + +\item{threads}{number of threads} + +\item{parallelParam}{parallel parameters for batch style execution} + +\item{verboseHeader}{verbose sections} + +\item{verboseAll}{verbose sections and subsections} + +\item{...}{additional args} + +\item{threads}{number of threads for parallel execution} + +\item{parallelParam}{parallel parameters for batch style execution} +} +\description{ +This function for each sample will independently assign inferred doublet information +to each cell for removing strong heterotypic doublet based clusters downstream. +} diff --git a/man/addFeatureMatrix.Rd b/man/addFeatureMatrix.Rd new file mode 100644 index 00000000..4fc82a20 --- /dev/null +++ b/man/addFeatureMatrix.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MatrixFeatures.R +\name{addFeatureMatrix} +\alias{addFeatureMatrix} +\title{Add FeatureMatrix to Arrows/ArchRProject} +\usage{ +addFeatureMatrix(input, features = NULL, matrixName = "FeatureMatrix", + ceiling = Inf, binarize = FALSE, threads = 1, + parallelParam = NULL, force = FALSE, ...)
+} +\arguments{ +\item{input}{ArchRProject or ArrowFiles} + +\item{features}{GRanges to count for each cell} + +\item{matrixName}{matrix output name in ArrowFiles cannot be a protected matrix name} + +\item{ceiling}{ceiling for the number of counts per feature} + +\item{binarize}{binarize matrix} + +\item{threads}{number of threads} + +\item{parallelParam}{parallel parameters for batch style execution} + +\item{force}{force overwriting previous TileMatrix in ArrowFile} +} +\description{ +This function for each sample will independently compute counts for each feature +per cell in the Arrow File +} diff --git a/man/addGeneScoreMatrix.Rd b/man/addGeneScoreMatrix.Rd new file mode 100644 index 00000000..751e7782 --- /dev/null +++ b/man/addGeneScoreMatrix.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MatrixGeneScores.R +\name{addGeneScoreMatrix} +\alias{addGeneScoreMatrix} +\title{Add GeneScoreMatrix to Arrows/ArchRProject} +\usage{ +addGeneScoreMatrix(input = NULL, genes = NULL, + geneModel = "exp(-abs(x)/10000)", upstream = 1e+05, + downstream = 1e+05, tileSize = 500, ceiling = 4, scaleTo = 10000, + excludeChr = c("chrY", "chrM"), blacklist = NULL, threads = 1, + parallelParam = NULL, force = FALSE, ...) 
+} +\arguments{ +\item{input}{ArchRProject or ArrowFiles} + +\item{genes}{genes as a GRanges object} + +\item{geneModel}{gene model as a string for weighting peaks for gene score calculation (function of x)} + +\item{upstream}{upstream the Gene Start to consider for calculation} + +\item{downstream}{downstream the Gene Start to consider for calculation} + +\item{tileSize}{tileSize for binning counts prior to gene score calculation} + +\item{ceiling}{ceiling of read counts per tile (prevent huge biases)} + +\item{scaleTo}{scale gene scores to} + +\item{excludeChr}{exclude chromosomes from this analysis} + +\item{blacklist}{blacklist GRanges used to remove tiles prior to calculation} + +\item{threads}{number of threads} + +\item{parallelParam}{parallel parameters for batch style execution} + +\item{force}{force overwriting previous TileMatrix in ArrowFile} +} +\description{ +This function for each sample will independently compute counts for each tile +per cell and then infer gene activity scores. +} diff --git a/man/addGroupCoverages.Rd b/man/addGroupCoverages.Rd new file mode 100644 index 00000000..ee2786d9 --- /dev/null +++ b/man/addGroupCoverages.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GroupCoverages.R +\name{addGroupCoverages} +\alias{addGroupCoverages} +\title{Add Group Coverages to ArchR Project} +\usage{ +addGroupCoverages(ArchRProj, groupBy = "Clusters", useLabels = TRUE, + minCells = 40, maxCells = 500, maxFragments = 25 * 10^6, + minReplicates = 2, maxReplicates = 5, sampleRatio = 0.8, + kmerLength = 6, threads = 16, parallelParam = "mclapply", + force = FALSE, verboseHeader = TRUE, verboseAll = FALSE, ...) 
+} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{groupBy}{group cells by this column in cellColData} + +\item{useLabels}{use sample labels to create sample guided subgroupings as pseudo replicates} + +\item{minCells}{minimum cells per group for coverage files} + +\item{maxCells}{maximum cells per group for coverage files} + +\item{maxFragments}{maximum fragments per group for coverage files (this prevents large files created for optimizing memory)} + +\item{minReplicates}{minimum replicates for group for coverage files} + +\item{maxReplicates}{maximum replicates for group for coverage files} + +\item{sampleRatio}{sampling ratio for pseudo replicates when needed} + +\item{kmerLength}{kmer length for adding Tn5 bias estimation} + +\item{threads}{number of threads} + +\item{parallelParam}{parallel parameters for batch style execution} + +\item{force}{force creating coverage files if existed} + +\item{verboseHeader}{verbose sections} + +\item{verboseAll}{verbose sections and subsections} + +\item{...}{additional args} +} +\description{ +This function will merge cells within each group into an insertion +coverage file +} diff --git a/man/addMotifAnnotations.Rd b/man/addMotifAnnotations.Rd new file mode 100644 index 00000000..26806dc3 --- /dev/null +++ b/man/addMotifAnnotations.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{addMotifAnnotations} +\alias{addMotifAnnotations} +\title{Add Motif Annotations to ArchRProject} +\usage{ +addMotifAnnotations(ArchRProj = NULL, motifSet = "JASPAR2018", + name = "Motif", species = NULL, collection = "CORE", + cutOff = 5e-05, w = 7, ...) 
+} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{motifSet}{motifSet JASPAR : JASPAR2016, JASPAR2018; chromVARmotifs : human, mouse, encode, homer} + +\item{name}{name of annotations to store as in ArchRProject} + +\item{species}{species relevant to dataset (default will guess based on getGenome)} + +\item{collection}{JASPAR collection (default = CORE)} + +\item{cutOff}{pvalue cutoff for motif search (see motifmatchr)} + +\item{w}{width to consider for motif (see motifmatchr)} + +\item{...}{additional args} +} +\description{ +This function adds motif positions and matches to an ArchRProject +} diff --git a/man/addPeakMatrix.Rd b/man/addPeakMatrix.Rd new file mode 100644 index 00000000..af0e700b --- /dev/null +++ b/man/addPeakMatrix.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MatrixFeatures.R +\name{addPeakMatrix} +\alias{addPeakMatrix} +\title{Add PeakMatrix to Arrows in ArchRProject} +\usage{ +addPeakMatrix(ArchRProj, ceiling = 4, binarize = FALSE, + parallelParam = NULL, threads = 1, force = FALSE, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{ceiling}{ceiling for the number of counts per feature} + +\item{binarize}{binarize matrix} + +\item{parallelParam}{parallel parameters for batch style execution} + +\item{threads}{number of threads} + +\item{force}{force overwriting previous TileMatrix in ArrowFile} +} +\description{ +This function for each sample will independently compute counts for each peak +per cell in the Arrow File +} diff --git a/man/addPeakSet.Rd b/man/addPeakSet.Rd new file mode 100644 index 00000000..756571b7 --- /dev/null +++ b/man/addPeakSet.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{addPeakSet} +\alias{addPeakSet} +\title{Add PeakSet to ArchRProject} +\usage{ +addPeakSet(ArchRProj, peakSet, force = FALSE, ...)
+} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{peakSet}{peakSet as a GRanges} + +\item{force}{force overriding peakSet in ArchRProject} + +\item{...}{additional args} +} +\description{ +This function adds a peakSet to an ArchRProject +} diff --git a/man/addReproduciblePeakSet.Rd b/man/addReproduciblePeakSet.Rd new file mode 100644 index 00000000..12c00014 --- /dev/null +++ b/man/addReproduciblePeakSet.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ReproduciblePeakSet.R +\name{addReproduciblePeakSet} +\alias{addReproduciblePeakSet} +\title{Add Reproducible Peak Set to ArchR Project} +\usage{ +addReproduciblePeakSet(ArchRProj = NULL, groupBy = "Clusters", + reproducibility = "2", peaksPerCell = 500, maxPeaks = 150000, + excludeChr = c("chrM", "chrY"), pathToMacs2 = "macs2", + genomeSize = NULL, shift = -75, extsize = 150, method = "q", + cutOff = 0.05, extendSummits = 250, promoterDist = 500, + genomeAnno = getGenomeAnnotation(ArchRProj), + geneAnno = getGeneAnnotation(ArchRProj), + additionalParams = "--nomodel --nolambda", threads = 1, + parallelParam = "mclapply", force = FALSE, verboseHeader = TRUE, + verboseAll = FALSE, ...) 
+} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{groupBy}{use groupings for peak calling matching group coverage files} + +\item{reproducibility}{reproducibility for peak calling (string that is a function of n)} + +\item{peaksPerCell}{number of peaks that can be identified per cell on average} + +\item{excludeChr}{exclude chromosomes from peak calling} + +\item{pathToMacs2}{path to macs2 executable (see Macs2)} + +\item{genomeSize}{genome size for peak calling (see Macs2)} + +\item{shift}{shift of Tn5 insertions (<- | ) (see Macs2)} + +\item{extsize}{extension of Tn5 insertions (|<- ->|) (see Macs2)} + +\item{method}{significance method for Macs2 (see Macs2)} + +\item{cutOff}{significance cutoff for Macs2 (see Macs2)} + +\item{extendSummits}{extend peak summits for final fixed-width peaks} + +\item{promoterDist}{promoter distance from TSS for annotating peaks} + +\item{genomeAnno}{genome annotation for ArchRProject} + +\item{geneAnno}{gene annotation for ArchRProject} + +\item{additionalParams}{additional parameters to pass to Macs2 (see Macs2)} + +\item{threads}{number of threads for parallel execution} + +\item{parallelParam}{parallel parameters for batch style execution} + +\item{force}{force creating peakSet if existed} + +\item{verboseHeader}{verbose sections} + +\item{verboseAll}{verbose sections and subsections} + +\item{...}{additional args} +} +\description{ +This function will get insertions from coverage files call peaks +and merge to get a Union Reproducible Peak Set +} diff --git a/man/addSampleColData.Rd b/man/addSampleColData.Rd new file mode 100644 index 00000000..3e136379 --- /dev/null +++ b/man/addSampleColData.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{addSampleColData} +\alias{addSampleColData} +\title{Add information to sampleColData in ArchRProject} +\usage{ +addSampleColData(ArchRProj, data = NULL, name = NULL, + samples = 
rownames(sampleColData(ArchRProj)), force = FALSE) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{data}{data to add to sampleColData} + +\item{name}{new column name in sampleColData if already exists set force = TRUE to override} + +\item{force}{if name already exists in sampleColData set force = TRUE to override} + +\item{cells}{names of samples corresponding to data} + +\item{...}{additional args} +} +\description{ +This function adds new data to sampleColData in ArchRProject +} diff --git a/man/addSeqLengths.Rd b/man/addSeqLengths.Rd new file mode 100644 index 00000000..ddcb0a24 --- /dev/null +++ b/man/addSeqLengths.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GRangesUtils.R +\name{addSeqLengths} +\alias{addSeqLengths} +\title{Add Seqlengths to genomic ranges} +\usage{ +addSeqLengths(gr, genome) +} +\arguments{ +\item{gr}{see validGRanges} + +\item{genome}{see validBSgenome} +} +\description{ +Add Seqlengths to genomic ranges +} diff --git a/man/addTileMatrix.Rd b/man/addTileMatrix.Rd new file mode 100644 index 00000000..ed68600c --- /dev/null +++ b/man/addTileMatrix.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MatrixTiles.R +\name{addTileMatrix} +\alias{addTileMatrix} +\title{Add TileMatrix to Arrows/ArchRProject} +\usage{ +addTileMatrix(input, chromSizes = getChromSizes(input), tileSize = 500, + binarize = TRUE, excludeChr = c("chrM", "chrY"), threads = 1, + parallelParam = NULL, force = FALSE, ...) 
+} +\arguments{ +\item{input}{ArchRProject or ArrowFiles} + +\item{chromSizes}{chromosome sizes used for identifying number of tiles to count} + +\item{tileSize}{size for each tile to break up each chromosome} + +\item{binarize}{save as a Sparse.Binary.Matrix or Sparse.Integer.Matrix} + +\item{excludeChr}{exclude chromosomes from this analysis} + +\item{threads}{number of threads} + +\item{parallelParam}{parallel parameters for batch style execution} + +\item{force}{force overwriting previous TileMatrix in ArrowFile} +} +\description{ +This function for each sample will independently compute counts for each tile +per cell in the Arrow File +} diff --git a/man/alignCellsToTrajectory.Rd b/man/alignCellsToTrajectory.Rd new file mode 100644 index 00000000..bb91eef6 --- /dev/null +++ b/man/alignCellsToTrajectory.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ZZZ-Trajectory.R +\name{alignCellsToTrajectory} +\alias{alignCellsToTrajectory} +\title{Get Optimal Plotting Windows} +\usage{ +alignCellsToTrajectory(mat, groups, trajectory, filterInitial = 0.1, + finterFinal = 0.25, dof = 250, spar = 1) +} +\arguments{ +\item{rle}{rle} +} +\description{ +Get Optimal Plotting Windows +} diff --git a/man/availableFeatures.Rd b/man/availableFeatures.Rd new file mode 100644 index 00000000..9fd377f0 --- /dev/null +++ b/man/availableFeatures.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{availableFeatures} +\alias{availableFeatures} +\title{Return Available Features for a given Matrix in ArrowFiles within an ArchRProject} +\usage{ +availableFeatures(ArchRProj, useMatrix = "GeneScoreMatrix", + select = NULL, ignore.case = TRUE, ...)
+} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{useMatrix}{Matrix Name as in Arrow Files (ie TileMatrix, GeneScoreMatrix, ...)} + +\item{select}{select a specific name with grep} + +\item{ignore.case}{ignore case when searching with select} + +\item{...}{additional args} +} +\description{ +This function will identify available features for a matrix and return them for downstream +plotting utils. +} diff --git a/man/columnOverlaps.Rd b/man/columnOverlaps.Rd new file mode 100644 index 00000000..d376f144 --- /dev/null +++ b/man/columnOverlaps.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GRangesUtils.R +\name{columnOverlaps} +\alias{columnOverlaps} +\title{Instead of counting overlaps get columns like max score or etc in query} +\usage{ +columnOverlaps(query, subject, colname = "score", ignore.strand = TRUE, + decreasing = TRUE) +} +\arguments{ +\item{query}{granges query} + +\item{subject}{granges subject} + +\item{colname}{mcols(gr)[[colname]] cannot be null} + +\item{decreasing}{for order} +} +\description{ +Instead of counting overlaps get columns like max score or etc in query +} diff --git a/man/computeCoAccessibility.Rd b/man/computeCoAccessibility.Rd new file mode 100644 index 00000000..e924c25d --- /dev/null +++ b/man/computeCoAccessibility.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ZZZ-CoAccessibility.R +\name{computeCoAccessibility} +\alias{computeCoAccessibility} +\title{Extend Filter then Normalize Scores for Summits} +\usage{ +computeCoAccessibility(ArchRProject, k = 100, knnIteration = 5000, + overlapCutoff = 0.8, seed = 1, knnMethod = "nabor", + maxDist = 250000, ...) 
+} +\description{ +Extend Filter then Normalize Scores for Summits +} diff --git a/man/computeKNN.Rd b/man/computeKNN.Rd new file mode 100644 index 00000000..4a081807 --- /dev/null +++ b/man/computeKNN.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/IdentifyClusters.R +\name{computeKNN} +\alias{computeKNN} +\title{Group Means} +\usage{ +computeKNN(data, query = NULL, k = 50, method = NULL, + includeSelf = FALSE, ...) +} +\description{ +Group Means +} diff --git a/man/computeLSI.Rd b/man/computeLSI.Rd new file mode 100644 index 00000000..b0534cc6 --- /dev/null +++ b/man/computeLSI.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/LatentSemanticIndexing.R +\name{computeLSI} +\alias{computeLSI} +\title{Compute LSI} +\usage{ +computeLSI(mat, nDimensions = 50, binarize = TRUE, seed = 1, + verbose = TRUE, tstart = NULL, ...) +} +\arguments{ +\item{mat}{sparseMatrix (dgcMatrix) for LSI} + +\item{nDimensions}{number of LSI dimensions to compute} + +\item{binarize}{binarize matrix prior to LSI} + +\item{seed}{seed for analysis} + +\item{verbose}{verbose} + +\item{tstart}{time stamp to pass} + +\item{...}{additional args} +} +\description{ +This function will compute a LSI transform (TF-IDF followed by SVD) +} diff --git a/man/constructGRanges.Rd b/man/constructGRanges.Rd new file mode 100644 index 00000000..66f97bff --- /dev/null +++ b/man/constructGRanges.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GRangesUtils.R +\name{constructGRanges} +\alias{constructGRanges} +\title{Construct GRanges seqnames start end accounting for ends before starts (adding strandedness)} +\usage{ +constructGRanges(seqnames, start, end, ignore.strand = TRUE) +} +\arguments{ +\item{seqnames}{seqnames of GRanges} + +\item{start}{start of GRanges} + +\item{end}{end of GRanges} + +\item{ignore.strand}{ignore strandedness for overlaps} +} 
+\description{ +Construct GRanges seqnames start end accounting for ends before starts (adding strandedness) +} diff --git a/man/createArrowFiles.Rd b/man/createArrowFiles.Rd new file mode 100644 index 00000000..91e3b6a6 --- /dev/null +++ b/man/createArrowFiles.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CreateArrow.R +\name{createArrowFiles} +\alias{createArrowFiles} +\title{Create Arrow Files} +\usage{ +createArrowFiles(inputFiles = NULL, sampleNames = NULL, + outputNames = paste0("./", sampleNames), geneAnno = NULL, + genomeAnno = NULL, filterFrags = 1000, filterTSS = 4, + removeFilteredCells = TRUE, minFrags = 500, + outDir = "QualityControl", nucLength = 147, TSSParams = list(), + excludeChr = c("chrM", "chrY"), nChunk = 5, bcTag = "qname", + gsubExpression = NULL, bamFlag = NULL, offsetPlus = 4, + offsetMinus = -5, addTileMat = TRUE, TileMatParams = list(), + addGeneScoreMat = TRUE, GeneScoreMatParams = list(), force = FALSE, + threads = 1, parallelParam = NULL, verboseHeader = TRUE, + verboseAll = FALSE, ...) 
+} +\arguments{ +\item{inputFiles}{input files (tabixFile, bamFile or textFile)} + +\item{sampleNames}{sample names corresponding to input files} + +\item{outputNames}{output names prefix (ie PBMC -> PBMC.arrow)} + +\item{geneAnno}{geneAnnotation input for TSS Scores etc.} + +\item{genomeAnno}{genomeAnnotation input for ChromSizes Nucleotide Information etc.} + +\item{filterFrags}{min fragments per cell to be filtered for analyses such as tileMat etc.} + +\item{filterTSS}{min TSS Score per cell to be filtered for analyses such as tileMat etc.} + +\item{removeFilteredCells}{remove fragments corresponding to cells pass filterFrags and filterTSS} + +\item{minFrags}{min fragments per cell to be immediately filtered} + +\item{outDir}{out directory for QC information from sample to be plotted / saved} + +\item{nucLength}{nucleosome length for id'ing fragments as sub-, mono-, or multi-nucleosome spanning} + +\item{TSSParams}{TSS parameters for computing TSS scores} + +\item{excludeChr}{exclude these chromosomes from analysis downstream (does not apply to fragments)} + +\item{nChunk}{number of chunks per chromosome when reading in input files} + +\item{bcTag}{barcode tag location in bam file (see ScanBam in Rsamtools)} + +\item{bamFlag}{list of bam flags for reading in fragments from input files (see ScanBam in Rsamtools)} + +\item{offsetPlus}{Tn5 offset of "+" stranded insertion (see Buenrostro 2013)} + +\item{offsetMinus}{Tn5 offset of "-" stranded insertion (see Buenrostro 2013)} + +\item{addTileMat}{addTileMatrix to ArrowFiles} + +\item{TileMatParams}{additional parameters to pass to addTileMatrix (see addTileMatrix)} + +\item{addGeneScoreMat}{addGeneScoreMatrix to ArrowFiles} + +\item{GeneScoreMatParams}{additional parameters to pass to addGeneScoreMatrix (see addGeneScoreMatrix)} + +\item{force}{force creation of arrow files if already exist} + +\item{threads}{number threads for parallel execution} + +\item{parallelParam}{parallel parameters for batch style 
execution} + +\item{...}{additional args} +} +\description{ +This function will create Arrow Files from input files +for downstream analysis +} diff --git a/man/dot-ArchRLogo.Rd b/man/dot-ArchRLogo.Rd new file mode 100644 index 00000000..98fef2e2 --- /dev/null +++ b/man/dot-ArchRLogo.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HelperUtils.R +\name{.ArchRLogo} +\alias{.ArchRLogo} +\title{This function returns ascii archr LOGO or arrow etc.} +\usage{ +.ArchRLogo(ascii = "Logo") +} +\arguments{ +\item{ascii}{logo, arrow, target} +} +\description{ +This function returns ascii archr LOGO or arrow etc. +} diff --git a/man/dot-checkPath.Rd b/man/dot-checkPath.Rd new file mode 100644 index 00000000..abd34013 --- /dev/null +++ b/man/dot-checkPath.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HelperUtils.R +\name{.checkPath} +\alias{.checkPath} +\title{Check path for utility} +\usage{ +.checkPath(u = NULL, path = NULL, error = TRUE) +} +\arguments{ +\item{u}{utility that you want to check is in path} + +\item{path}{check on top of path a custom path} + +\item{error}{cause error if not in path} +} +\description{ +Check path for utility +} diff --git a/man/dot-fileExtension.Rd b/man/dot-fileExtension.Rd new file mode 100644 index 00000000..d3f8ffb9 --- /dev/null +++ b/man/dot-fileExtension.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HelperUtils.R +\name{.fileExtension} +\alias{.fileExtension} +\title{Get File Extension} +\usage{ +.fileExtension(x) +} +\arguments{ +\item{x}{character string referring to a file you want to get the extension from} +} +\description{ +Get File Extension +} diff --git a/man/extendGRanges.Rd b/man/extendGRanges.Rd new file mode 100644 index 00000000..570c619f --- /dev/null +++ b/man/extendGRanges.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit 
documentation in R/GRangesUtils.R +\name{extendGRanges} +\alias{extendGRanges} +\title{Merge Genomic Ranges} +\usage{ +extendGRanges(x, upstream, downstream) +} +\arguments{ +\item{query}{see validGRanges} + +\item{subject}{see validGRanges} + +\item{ignore.strand}{ignore strandedness for overlaps} +} +\description{ +Merge Genomic Ranges +} diff --git a/man/featureEnrichment.Rd b/man/featureEnrichment.Rd new file mode 100644 index 00000000..ea4d2618 --- /dev/null +++ b/man/featureEnrichment.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ZZZ-FeatureEnrichment.R +\name{featureEnrichment} +\alias{featureEnrichment} +\title{Feature Matches Over Representation} +\usage{ +featureEnrichment(featureMatches, compare, background) +} +\arguments{ +\item{compare}{vector of compare peaks idx which will be used for hypergeometric} + +\item{background}{vector of background peaks idx which will be used for hypergeometric} + +\item{genes}{motif similarity matrix used for labeling family info default is null} +} +\description{ +This function takes in genes matches combined with a vector of peaks in target and peaks in bdg and returns hypergeometric minus log 10 pvalues +} diff --git a/man/getAnnotation.Rd b/man/getAnnotation.Rd new file mode 100644 index 00000000..129f73c0 --- /dev/null +++ b/man/getAnnotation.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getAnnotation} +\alias{getAnnotation} +\title{Get Embedding from ArchRProject} +\usage{ +getAnnotation(ArchRProj, name = NULL, ...) 
+} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{name}{name of annotations} + +\item{...}{additional args} +} +\description{ +This function gets an embedding from an ArchRProject +} diff --git a/man/getArrowFiles.Rd b/man/getArrowFiles.Rd new file mode 100644 index 00000000..2c46497a --- /dev/null +++ b/man/getArrowFiles.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getArrowFiles} +\alias{getArrowFiles} +\title{Get ArrowFiles in ArchRProject} +\usage{ +getArrowFiles(ArchRProj, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{...}{additional args} +} +\description{ +This function gets ArrowFiles in ArchRProject +} diff --git a/man/getBlacklist.Rd b/man/getBlacklist.Rd new file mode 100644 index 00000000..4d34c5dd --- /dev/null +++ b/man/getBlacklist.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getBlacklist} +\alias{getBlacklist} +\title{Get blacklist from ArchRProject} +\usage{ +getBlacklist(ArchRProj, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{...}{additional args} +} +\description{ +This function gets the blacklist as a GRanges from genomeAnnotation in ArchRProject +} diff --git a/man/getCellColData.Rd b/man/getCellColData.Rd new file mode 100644 index 00000000..5146baf4 --- /dev/null +++ b/man/getCellColData.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getCellColData} +\alias{getCellColData} +\title{Get cellColData in ArchRProject} +\usage{ +getCellColData(ArchRProj, select = NULL, drop = FALSE, ...) 
+} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{select}{select a subset of column names from cellColData can put in a string function} + +\item{drop}{drop if selecting only one column name} + +\item{...}{additional args} +} +\description{ +This function gets cellColData in ArchRProject +} diff --git a/man/getCellNames.Rd b/man/getCellNames.Rd new file mode 100644 index 00000000..9c0e0de9 --- /dev/null +++ b/man/getCellNames.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getCellNames} +\alias{getCellNames} +\title{Get cellNames in ArchRProject} +\usage{ +getCellNames(ArchRProj, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{...}{additional args} +} +\description{ +This function gets cellNames in ArchRProject +} diff --git a/man/getChromLengths.Rd b/man/getChromLengths.Rd new file mode 100644 index 00000000..018c5c28 --- /dev/null +++ b/man/getChromLengths.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getChromLengths} +\alias{getChromLengths} +\title{Get chromLengths from ArchRProject} +\usage{ +getChromLengths(ArchRProj, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{...}{additional args} +} +\description{ +This function gets chromosome lengths as a vector from genomeAnnotation in ArchRProject +} diff --git a/man/getChromSizes.Rd b/man/getChromSizes.Rd new file mode 100644 index 00000000..7442fbe9 --- /dev/null +++ b/man/getChromSizes.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getChromSizes} +\alias{getChromSizes} +\title{Get chromSizes from ArchRProject} +\usage{ +getChromSizes(ArchRProj, ...) 
+} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{...}{additional args} +} +\description{ +This function gets chromosome lengths as GRanges from genomeAnnotation in ArchRProject +} diff --git a/man/getEmbedding.Rd b/man/getEmbedding.Rd new file mode 100644 index 00000000..0bee39f8 --- /dev/null +++ b/man/getEmbedding.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getEmbedding} +\alias{getEmbedding} +\title{Get Embedding from ArchRProject} +\usage{ +getEmbedding(ArchRProj, embedding = "IterativeLSI", return = "df", ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{embedding}{embedding name in ArchRProject} + +\item{return}{return embedding as df or all info} + +\item{...}{additional args} +} +\description{ +This function gets an embedding from an ArchRProject +} diff --git a/man/getExons.Rd b/man/getExons.Rd new file mode 100644 index 00000000..f1bb527d --- /dev/null +++ b/man/getExons.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getExons} +\alias{getExons} +\title{Get Exons from ArchRProject} +\usage{ +getExons(ArchRProj, symbols = NULL, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{...}{additional args} +} +\description{ +This function gets exons from geneAnnotation in ArchRProject +} diff --git a/man/getFragmentsFromArrow.Rd b/man/getFragmentsFromArrow.Rd new file mode 100644 index 00000000..b7e2427f --- /dev/null +++ b/man/getFragmentsFromArrow.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArrowRead.R +\name{getFragmentsFromArrow} +\alias{getFragmentsFromArrow} +\title{Read Fragments from Arrow} +\usage{ +getFragmentsFromArrow(ArrowFile, chr = NULL, cellNames = NULL, + method = "fast", verbose = TRUE, ...) 
+} +\arguments{ +\item{ArrowFile}{ArchRProject or ArrowFiles} + +\item{chr}{GRanges to count for each cell} + +\item{cellNames}{matrix output name in ArrowFiles cannot be a protected matrix name} + +\item{method}{ceiling for the number of counts per feature} + +\item{verbose}{binarize matrix} + +\item{...}{additional params} +} +\description{ +This function for each sample will independently compute counts for each feature +per cell in the Arrow File +} diff --git a/man/getGeneAnnotation.Rd b/man/getGeneAnnotation.Rd new file mode 100644 index 00000000..178d3d58 --- /dev/null +++ b/man/getGeneAnnotation.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getGeneAnnotation} +\alias{getGeneAnnotation} +\title{Get geneAnnotation from ArchRProject} +\usage{ +getGeneAnnotation(ArchRProj, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{...}{additional args} +} +\description{ +This function gets geneAnnotation in ArchRProject +} diff --git a/man/getGenes.Rd b/man/getGenes.Rd new file mode 100644 index 00000000..456ad7e7 --- /dev/null +++ b/man/getGenes.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getGenes} +\alias{getGenes} +\title{Get Genes from ArchRProject} +\usage{ +getGenes(ArchRProj, symbols = NULL, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{symbols}{gene symbols to subset} + +\item{...}{additional args} +} +\description{ +This function gets genes from geneAnnotation in ArchRProject +} diff --git a/man/getGenome.Rd b/man/getGenome.Rd new file mode 100644 index 00000000..26be504f --- /dev/null +++ b/man/getGenome.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getGenome} +\alias{getGenome} +\title{Get genome from ArchRProject} +\usage{ +getGenome(ArchRProj, ...) 
+} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{...}{additional args} +} +\description{ +This function gets the genome from genomeAnnotation in ArchRProject +} diff --git a/man/getGenomeAnnotation.Rd b/man/getGenomeAnnotation.Rd new file mode 100644 index 00000000..f2988ae0 --- /dev/null +++ b/man/getGenomeAnnotation.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getGenomeAnnotation} +\alias{getGenomeAnnotation} +\title{Get genomeAnnotation from ArchRProject} +\usage{ +getGenomeAnnotation(ArchRProj, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{...}{additional args} +} +\description{ +This function gets genomeAnnotation in ArchRProject +} diff --git a/man/getMatches.Rd b/man/getMatches.Rd new file mode 100644 index 00000000..e98ac399 --- /dev/null +++ b/man/getMatches.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getMatches} +\alias{getMatches} +\title{Get Annotation Matches from ArchRProject} +\usage{ +getMatches(ArchRProj, name = NULL, annoName = NULL, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{name}{name of annotations} + +\item{annoName}{name to subset with annotations} + +\item{...}{additional args} +} +\description{ +This function gets annotation matches from an ArchRProject +} diff --git a/man/getMatrixFromArrow.Rd b/man/getMatrixFromArrow.Rd new file mode 100644 index 00000000..f4135581 --- /dev/null +++ b/man/getMatrixFromArrow.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArrowRead.R +\name{getMatrixFromArrow} +\alias{getMatrixFromArrow} +\title{Read Fragments from Arrow} +\usage{ +getMatrixFromArrow(ArrowFile, useMatrix = "GeneScoreMatrix", + useSeqnames = NULL, cellNames = NULL, verbose = TRUE, ...) 
+} +\arguments{ +\item{ArrowFile}{ArchRProject or ArrowFiles} + +\item{useMatrix}{matrix name to get from Arrow} + +\item{useSeqnames}{use a subset of seqnames for matrix} + +\item{cellNames}{ceiling for the number of counts per feature} + +\item{verbose}{binarize matrix} + +\item{...}{additional params} +} +\description{ +This function for each sample will independently compute counts for each feature +per cell in the Arrow File +} diff --git a/man/getOutputDirectory.Rd b/man/getOutputDirectory.Rd new file mode 100644 index 00000000..e7e70e2e --- /dev/null +++ b/man/getOutputDirectory.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getOutputDirectory} +\alias{getOutputDirectory} +\title{Get outputDirectory in ArchRProject} +\usage{ +getOutputDirectory(ArchRProj, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{...}{additional args} +} +\description{ +This function gets outputDirectory from ArchRProject +} diff --git a/man/getPeakSet.Rd b/man/getPeakSet.Rd new file mode 100644 index 00000000..abfa14dd --- /dev/null +++ b/man/getPeakSet.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getPeakSet} +\alias{getPeakSet} +\title{Get PeakSet from ArchRProject} +\usage{ +getPeakSet(ArchRProj, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{...}{additional args} +} +\description{ +This function gets peakSet from an ArchRProject +} diff --git a/man/getPositions.Rd b/man/getPositions.Rd new file mode 100644 index 00000000..4e086826 --- /dev/null +++ b/man/getPositions.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getPositions} +\alias{getPositions} +\title{Get Annotation Positions from ArchRProject} +\usage{ +getPositions(ArchRProj, name = NULL, annoName = NULL, ...) 
+} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{name}{name of annotations} + +\item{annoName}{name to subset with annotations} + +\item{...}{additional args} +} +\description{ +This function gets annotation positions from an ArchRProject +} diff --git a/man/getReducedDims.Rd b/man/getReducedDims.Rd new file mode 100644 index 00000000..d6d52e78 --- /dev/null +++ b/man/getReducedDims.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getReducedDims} +\alias{getReducedDims} +\title{Get Reduced Dimensions from ArchRProject} +\usage{ +getReducedDims(ArchRProj, reducedDims = "TileLSI", return = "matrix", + ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{reducedDims}{reduced dimensions name in ArchRProject} + +\item{return}{return reduced dimensions as matrix or all info} + +\item{...}{additional args} +} +\description{ +This function gets an embedding from an ArchRProject +} diff --git a/man/getSampleColData.Rd b/man/getSampleColData.Rd new file mode 100644 index 00000000..7edc1839 --- /dev/null +++ b/man/getSampleColData.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getSampleColData} +\alias{getSampleColData} +\title{Get sampleColData in ArchRProject} +\usage{ +getSampleColData(ArchRProj, select = NULL, drop = FALSE, ...) 
+} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{select}{select a subset of column names from sampleColData} + +\item{drop}{drop if selecting only one column name} + +\item{...}{additional args} +} +\description{ +This function gets sampleColData in ArchRProject +} diff --git a/man/getSampleNames.Rd b/man/getSampleNames.Rd new file mode 100644 index 00000000..e7528b1c --- /dev/null +++ b/man/getSampleNames.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getSampleNames} +\alias{getSampleNames} +\title{Get sampleNames in ArchRProject} +\usage{ +getSampleNames(ArchRProj, ...) +} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{...}{additional args} +} +\description{ +This function gets sampleNames in ArchRProject +} diff --git a/man/getTSS.Rd b/man/getTSS.Rd new file mode 100644 index 00000000..1163188a --- /dev/null +++ b/man/getTSS.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{getTSS} +\alias{getTSS} +\title{Get TSS from ArchRProject} +\usage{ +getTSS(ArchRProj, ...) 
+} +\arguments{ +\item{ArchRProj}{ArchRProject} + +\item{...}{additional args} +} +\description{ +This function gets TSS from geneAnnotation in ArchRProject +} diff --git a/man/ggAlignPlots.Rd b/man/ggAlignPlots.Rd new file mode 100644 index 00000000..244411b9 --- /dev/null +++ b/man/ggAlignPlots.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GgplotHelper.R +\name{ggAlignPlots} +\alias{ggAlignPlots} +\title{Align GG Plots} +\usage{ +ggAlignPlots(..., sizes, type = "v", plotList = NULL, + grobList = NULL) +} +\arguments{ +\item{...}{ggplots} + +\item{sizes}{sizes are a vector or list of values for each ggplot ie c(1,1) for two plots} + +\item{type}{v,vertical or h,horizontal} +} +\description{ +Align GG Plots +} diff --git a/man/ggHex.Rd b/man/ggHex.Rd new file mode 100644 index 00000000..c45c74f9 --- /dev/null +++ b/man/ggHex.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GgplotHelper.R +\name{ggHex} +\alias{ggHex} +\title{GG Violin Plot} +\usage{ +ggHex(x, y, color, extend = 0.05, ratioYX = 1, xlim = NULL, + ylim = NULL, bins = 150, pal = paletteContinuous(set = + "solar_extra"), title = "", baseSize = 12, xlabel = "x", + ylabel = "y", fun = "median", ...) 
+} +\arguments{ +\item{x}{categorical values to each y value} + +\item{y}{numeric values} + +\item{pal}{color palette see paletteDiscrete for examples} + +\item{xlabel}{xlabel} + +\item{ylabel}{ylabel} + +\item{base_size}{base_size of theme} + +\item{size}{size of barplot lines} +} +\description{ +GG Violin Plot +} diff --git a/man/ggLine.Rd b/man/ggLine.Rd new file mode 100644 index 00000000..96efc95d --- /dev/null +++ b/man/ggLine.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GgplotHelper.R +\name{ggLine} +\alias{ggLine} +\title{GG Plot One to One Heatscatter} +\usage{ +ggLine(x, y, color = NULL, discrete = TRUE, discreteSet = "stallion", + continuousSet = "solar_extra", pal = NULL, size = 1, xlim = NULL, + ylim = NULL, extend = 0.05, xlabel = "x", ylabel = "y", + title = "", alpha = 1, baseSize = 6, ratioYX = 1, + nullColor = "lightGrey") +} +\arguments{ +\item{x}{x} + +\item{y}{y} + +\item{pal}{continuous color palette to use} + +\item{size}{geom_point size} + +\item{xlabel}{xlabel} + +\item{ylabel}{ylabel} + +\item{title}{ggtitle} + +\item{alpha}{geom_point alpha} + +\item{baseSize}{base_font size} + +\item{min}{xmin quantile [0,1]} + +\item{max}{xmax quantile [0,1]} + +\item{plot_n}{number of points to plot} + +\item{kernel_n}{n for MASS::kde2d default = 100} + +\item{plot_n}{number of points to plot} +} +\description{ +GG Plot One to One Heatscatter +} diff --git a/man/ggOneToOne.Rd b/man/ggOneToOne.Rd new file mode 100644 index 00000000..6f0d5902 --- /dev/null +++ b/man/ggOneToOne.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GgplotHelper.R +\name{ggOneToOne} +\alias{ggOneToOne} +\title{GG Plot One to One Heatscatter} +\usage{ +ggOneToOne(x, y, nPlot = 100 * 10^3, nKernel = 100, size = 2, + xlabel = "x", ylabel = "y", title = "Sample Correlation", + min = 0.1, max = 0.9999, densityMax = 0.95, extend = 0.05, + alpha = 1, baseSize = 12, pal = 
paletteContinuous(set = "viridis")) +} +\arguments{ +\item{x}{x} + +\item{y}{y} + +\item{nPlot}{number of points to plot} + +\item{nKernel}{n for MASS::kde2d default = 100} + +\item{size}{geom_point size} + +\item{xlabel}{xlabel} + +\item{ylabel}{ylabel} + +\item{title}{ggtitle} + +\item{min}{xmin quantile [0,1]} + +\item{max}{xmax quantile [0,1]} + +\item{alpha}{geom_point alpha} + +\item{baseSize}{base_font size default is 12} + +\item{pal}{continuous color palette to use} +} +\description{ +GG Plot One to One Heatscatter +} diff --git a/man/ggPoint.Rd b/man/ggPoint.Rd new file mode 100644 index 00000000..ed63532b --- /dev/null +++ b/man/ggPoint.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GgplotHelper.R +\name{ggPoint} +\alias{ggPoint} +\title{GG Plot One to One Heatscatter} +\usage{ +ggPoint(x, y, color = NULL, discrete = TRUE, + discreteSet = "stallion", labelMeans = FALSE, + continuousSet = "solar_extra", pal = NULL, colorDensity = FALSE, + size = 1, xlim = NULL, ylim = NULL, extend = 0.05, + xlabel = "x", randomize = FALSE, seed = 1, ylabel = "y", + title = "", alpha = 1, baseSize = 6, ratioYX = 1, + labelType = "ggrepel", bgColor = "white", fgColor = NULL, + labelSize = 1.5, addFit = NULL, nullColor = "lightGrey", + rastr = FALSE, dpi = 300) +} +\arguments{ +\item{x}{x} + +\item{y}{y} + +\item{pal}{continuous color palette to use} + +\item{size}{geom_point size} + +\item{xlabel}{xlabel} + +\item{ylabel}{ylabel} + +\item{title}{ggtitle} + +\item{alpha}{geom_point alpha} + +\item{baseSize}{base_font size} + +\item{min}{xmin quantile [0,1]} + +\item{max}{xmax quantile [0,1]} + +\item{plot_n}{number of points to plot} + +\item{kernel_n}{n for MASS::kde2d default = 100} + +\item{plot_n}{number of points to plot} +} +\description{ +GG Plot One to One Heatscatter +} diff --git a/man/ggViolin.Rd b/man/ggViolin.Rd new file mode 100644 index 00000000..bb02e4d1 --- /dev/null +++ b/man/ggViolin.Rd @@ -0,0 +1,29 @@ 
+% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GgplotHelper.R +\name{ggViolin} +\alias{ggViolin} +\title{GG Violin Plot} +\usage{ +ggViolin(x, y, base_size = 12, xlabel = NULL, ylabel = NULL, + points = FALSE, baseSize = 6, ratioYX = 1, sampleRatio = 0.1, + size = 1, title = "", pal = paletteDiscrete(values = x, set = + "stallion")) +} +\arguments{ +\item{x}{categorical values to each y value} + +\item{y}{numeric values} + +\item{base_size}{base_size of theme} + +\item{xlabel}{xlabel} + +\item{ylabel}{ylabel} + +\item{size}{size of barplot lines} + +\item{pal}{color palette see paletteDiscrete for examples} +} +\description{ +GG Violin Plot +} diff --git a/man/grapes-ni-grapes.Rd b/man/grapes-ni-grapes.Rd new file mode 100644 index 00000000..aa8b81b8 --- /dev/null +++ b/man/grapes-ni-grapes.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HelperUtils.R +\name{\%ni\%} +\alias{\%ni\%} +\title{Negated Value Matching} +\usage{ +x \%ni\% table +} +\arguments{ +\item{x}{x search within table} + +\item{table}{to search x in} +} +\description{ +This function is the reciprocal of \%in\% +See match function in base R +x \%ni\% table +} diff --git a/man/keepFilteredChromosomes.Rd b/man/keepFilteredChromosomes.Rd new file mode 100644 index 00000000..24d86e7d --- /dev/null +++ b/man/keepFilteredChromosomes.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GRangesUtils.R +\name{keepFilteredChromosomes} +\alias{keepFilteredChromosomes} +\title{Filters unwanted chr mainly underscores} +\usage{ +keepFilteredChromosomes(x, remove = c("chrM"), underscore = TRUE, + standard = TRUE, pruning.mode = "coarse") +} +\arguments{ +\item{x}{GRanges or something with seqlevels} + +\item{remove}{remove vector} + +\item{underscore}{remove all underscores?} + +\item{standard}{keep standard chromosomes} +} +\description{ +Filters unwanted chr mainly underscores +} 
diff 
--git a/man/markerFeatures.Rd b/man/markerFeatures.Rd new file mode 100644 index 00000000..d96af373 --- /dev/null +++ b/man/markerFeatures.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MarkerFeatures.R +\name{markerFeatures} +\alias{markerFeatures} +\title{Identify Marker Features for each Group} +\usage{ +markerFeatures(ArchRProj = NULL, groupBy = "Clusters", + useGroups = NULL, useMatrix = "GeneScoreMatrix", + bias = c("TSSEnrichment", "log10(nFrags)"), normBy = NULL, + testMethod = "wilcoxon", minCells = 50, maxCells = 500, + threads = 1, k = 100, bufferRatio = 0.8, binarize = FALSE, + useSeqnames = NULL, method = "ArchR", verboseHeader = TRUE, + verboseAll = FALSE, ...) +} +\arguments{ +\item{ArchRProj}{ArchR Project} + +\item{groupBy}{group cells by this column in cellColData} + +\item{useGroups}{use subset of groups in group column in cellColData} + +\item{useMatrix}{matrix name in Arrow Files that will be used for identifying features} + +\item{bias}{biases to account for in selecting null group using info from cellColData} + +\item{normBy}{normalize by column in cellColData prior to test} + +\item{testMethod}{pairwise test method group vs null} + +\item{minCells}{minimum cells per group for testing} + +\item{maxCells}{maximum cells per group for testing} + +\item{k}{knn for matching cell biases} + +\item{bufferRatio}{buffering ratio for matching cell biases} + +\item{binarize}{binarize prior to testing} + +\item{useSeqnames}{specific seqnames to use only} + +\item{method}{marker identification method} + +\item{verboseHeader}{verbose sections} + +\item{verboseAll}{verbose sections and subsections} + +\item{...}{additional args} +} +\description{ +This function will identify a null set of cells that match biases per cell +while maintaining the input group proportions. Then it will compute a pairwise +test of the group vs the null set. 
+} diff --git a/man/markerHeatmap.Rd b/man/markerHeatmap.Rd new file mode 100644 index 00000000..3eea7bb5 --- /dev/null +++ b/man/markerHeatmap.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MarkerHeatmap.R +\name{markerHeatmap} +\alias{markerHeatmap} +\title{Plot a Heatmap of Identified Marker Features} +\usage{ +markerHeatmap(seMarker, FDR = 0.001, log2FC = 0.1, log2Norm = TRUE, + scaleTo = 10^4, scaleRows = TRUE, limits = c(-2, 2), + grepExclude = NULL, pal = NULL, binaryClusterRows = TRUE, + labelMarkers = NULL, labelTop = NULL, labelRows = FALSE, + returnMat = FALSE, ...) +} +\arguments{ +\item{seMarker}{Summarized Experiment result from markerFeatures} + +\item{FDR}{False-Discovery Rate Cutoff to Be called a Marker} + +\item{log2FC}{Log2 Fold Change Cutoff to Be called a Marker} + +\item{log2Norm}{log2 Normalization prior to plotting set true for counting assays (not DeviationsMatrix!)} + +\item{scaleTo}{scale to prior to log2 Normalization, if log2Norm is FALSE this does nothing} + +\item{scaleRows}{compute row z-scores on matrix} + +\item{limits}{heatmap color limits} + +\item{grepExclude}{remove features by grep} + +\item{pal}{palette for heatmap, default will use solar_extra} + +\item{binaryClusterRows}{fast clustering implementation for row clustering by binary sorting} + +\item{labelMarkers}{label specific markers by name on heatmap (matches rownames of seMarker)} + +\item{labelTop}{label the top features for each column in seMarker} + +\item{labelRows}{label all rows} + +\item{returnMat}{return final matrix that is used for plotting heatmap} + +\item{...}{additional args} +} +\description{ +This function will plot a heatmap of the results from markerFeatures +} diff --git a/man/mergeGRanges.Rd b/man/mergeGRanges.Rd new file mode 100644 index 00000000..911e40ce --- /dev/null +++ b/man/mergeGRanges.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in 
R/GRangesUtils.R +\name{mergeGRanges} +\alias{mergeGRanges} +\title{Merge Genomic Ranges} +\usage{ +mergeGRanges(gr, ignore.strand = TRUE) +} +\arguments{ +\item{gr}{see validGRanges} + +\item{ignore.strand}{ignore strandedness for merging} +} +\description{ +Merge Genomic Ranges +} diff --git a/man/nonOverlappingGRanges.Rd b/man/nonOverlappingGRanges.Rd new file mode 100644 index 00000000..77f008c4 --- /dev/null +++ b/man/nonOverlappingGRanges.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GRangesUtils.R +\name{nonOverlappingGRanges} +\alias{nonOverlappingGRanges} +\title{Instead of counting overlaps get columns like max score or etc in query} +\usage{ +nonOverlappingGRanges(gr, by = "score", decreasing = TRUE, + verbose = FALSE) +} +\arguments{ +\item{decreasing}{for order} + +\item{query}{granges query} + +\item{subject}{granges subject} + +\item{colname}{mcols(gr)[[colname]] cannot be null} +} +\description{ +Instead of counting overlaps get columns like max score or etc in query +} diff --git a/man/overlappingBP.Rd b/man/overlappingBP.Rd new file mode 100644 index 00000000..ffc90a8a --- /dev/null +++ b/man/overlappingBP.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GRangesUtils.R +\name{overlappingBP} +\alias{overlappingBP} +\title{Merge Genomic Ranges} +\usage{ +overlappingBP(query, subject, ignore.strand = TRUE) +} +\arguments{ +\item{query}{see validGRanges} + +\item{subject}{see validGRanges} + +\item{ignore.strand}{ignore strandedness for overlaps} +} +\description{ +Merge Genomic Ranges +} diff --git a/man/overlapsMany.Rd b/man/overlapsMany.Rd new file mode 100644 index 00000000..31890354 --- /dev/null +++ b/man/overlapsMany.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GRangesUtils.R +\name{overlapsMany} +\alias{overlapsMany} +\title{Overlaps Many includes information from mcols(gr)} +\usage{ 
+overlapsMany(query, subject, by, ignore.strand = TRUE) +} +\arguments{ +\item{query}{see validGRanges} + +\item{subject}{see validGRanges} + +\item{by}{column in subject to split overlaps by} + +\item{ignore.strand}{ignore strandedness for overlaps} +} +\description{ +Overlaps Many includes information from mcols(gr) +} diff --git a/man/paletteContinuous.Rd b/man/paletteContinuous.Rd new file mode 100644 index 00000000..011e0196 --- /dev/null +++ b/man/paletteContinuous.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ColorPalettes.R +\name{paletteContinuous} +\alias{paletteContinuous} +\title{Continuous Color Palette} +\usage{ +paletteContinuous(set = "solar_extra", n = 256, reverse = FALSE, + returnStructure = FALSE) +} +\arguments{ +\item{set}{continuous palette name or number} + +\item{n}{number for gradient} + +\item{reverse}{return reversed values} + +\item{returnStructure}{return structure palette} +} +\description{ +Continuous Color Palette +} diff --git a/man/paletteDiscrete.Rd b/man/paletteDiscrete.Rd new file mode 100644 index 00000000..13860473 --- /dev/null +++ b/man/paletteDiscrete.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ColorPalettes.R +\name{paletteDiscrete} +\alias{paletteDiscrete} +\title{Optimized discrete color palette} +\usage{ +paletteDiscrete(set = "stallion", values, reverse = FALSE, + returnStructure = FALSE, ...) 
+} +\arguments{ +\item{set}{continuous palette name or number} + +\item{values}{is a vector containing the sample names used in the plot which will be given a color} + +\item{reverse}{return reversed values} + +\item{returnStructure}{return structure palette} +} +\description{ +This function assesses the number of inputs and returns a tailored color palette for aesthetics +} diff --git a/man/plotFootprint.Rd b/man/plotFootprint.Rd new file mode 100644 index 00000000..3b475b7c --- /dev/null +++ b/man/plotFootprint.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Footprinting.R +\name{plotFootprint} +\alias{plotFootprint} +\title{Plot Group Footprints} +\usage{ +plotFootprint(input = NULL, positions = NULL, groupBy = "Clusters", + useGroups = NULL, pal = NULL, flank = 250, flankNorm = 50, + smoothWindow = 10, nTop = NULL, normMethod = "none", + threads = 16, verboseHeader = TRUE, verboseAll = FALSE, ...) +} +\arguments{ +\item{input}{ArchRProject or previous Footprint Summarized Experiment} + +\item{positions}{sample names corresponding to input files} + +\item{groupBy}{output names prefix (ie PBMC -> PBMC.arrow)} + +\item{useGroups}{geneAnnotation input for TSS Scores etc.} + +\item{pal}{genomeAnnotation input for ChromSizes Nucleotide Information etc.} + +\item{flank}{min fragments per cell to be filtered for analyses such as tileMat etc.} + +\item{flankNorm}{min TSS Score per cell to be filtered for analyses such as tileMat etc.} + +\item{smoothWindow}{remove fragments corresponding to cells pass filterFrags and filterTSS} + +\item{nTop}{min fragments per cell to be immediately filtered} + +\item{normMethod}{normalization method for footprint plot relative to bias} + +\item{threads}{number of threads for parallel execution} + +\item{verboseHeader}{verbose sections} + +\item{verboseAll}{verbose sections and subsections} + +\item{...}{additional args} +} +\description{ +This function will create an Arrow Files from
input files +for downstream analysis +} diff --git a/man/plotPDF.Rd b/man/plotPDF.Rd new file mode 100644 index 00000000..11917522 --- /dev/null +++ b/man/plotPDF.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArchRProjectMethods.R +\name{plotPDF} +\alias{plotPDF} +\title{Plot PDF in outputDirectory of ArchRProject} +\usage{ +plotPDF(name, width = 8, height = 8, ArchRProj = NULL, + addDOC = TRUE, useDingbats = FALSE, ...) +} +\arguments{ +\item{name}{name of PDF file} + +\item{width}{width of PDF in inches} + +\item{height}{height of PDF in inches} + +\item{ArchRProj}{ArchRProject} + +\item{addDOC}{add date of creation to end of plot file name} + +\item{useDingbats}{use dingbats characters for plotting} + +\item{...}{additional args to pdf} +} +\description{ +This function will plot PDF in output directory of an ArchRProject +} diff --git a/man/projectLSI.Rd b/man/projectLSI.Rd new file mode 100644 index 00000000..0cb32c62 --- /dev/null +++ b/man/projectLSI.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/LatentSemanticIndexing.R +\name{projectLSI} +\alias{projectLSI} +\title{Project LSI} +\usage{ +projectLSI(mat, LSI, returnModel = FALSE, verbose = TRUE, + tstart = NULL, ...) 
+} +\arguments{ +\item{mat}{sparseMatrix (dgcMatrix) for LSI} + +\item{LSI}{previous LSI transform to project into} + +\item{returnModel}{return projection information} + +\item{verbose}{verbose} + +\item{tstart}{time stamp to pass} + +\item{...}{additional args} +} +\description{ +This function will compute a LSI Projection (TF-IDF followed by SVD projection) +} diff --git a/man/shuffleGRanges.Rd b/man/shuffleGRanges.Rd new file mode 100644 index 00000000..4ccad454 --- /dev/null +++ b/man/shuffleGRanges.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GRangesUtils.R +\name{shuffleGRanges} +\alias{shuffleGRanges} +\title{Shuffle Genomic Ranges} +\usage{ +shuffleGRanges(subject, genome, n, shuffleChr = TRUE) +} +\arguments{ +\item{subject}{see validGRanges} + +\item{genome}{see validBSgenome} + +\item{n}{nPermutations} + +\item{shuffleChr}{shuffle across chromosomes randomly vs using previous knowledge of chromosome distribution} +} +\description{ +Shuffle Genomic Ranges +} diff --git a/man/subsetSeqnames.Rd b/man/subsetSeqnames.Rd new file mode 100644 index 00000000..49ea8bc7 --- /dev/null +++ b/man/subsetSeqnames.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GRangesUtils.R +\name{subsetSeqnames} +\alias{subsetSeqnames} +\title{Subset by Seqnames} +\usage{ +subsetSeqnames(gr, seqNames, useNames = FALSE) +} +\arguments{ +\item{gr}{grange} + +\item{seqnames}{seqnames to subset} +} +\description{ +Subset by Seqnames +} diff --git a/man/theme_ArchR.Rd b/man/theme_ArchR.Rd new file mode 100644 index 00000000..60b1bbe0 --- /dev/null +++ b/man/theme_ArchR.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/GgplotHelper.R +\name{theme_ArchR} +\alias{theme_ArchR} +\title{ggplot2 default theme for ArchR} +\usage{ +theme_ArchR(color = "black", baseSize = 6, baseFamily = "", + baseLineSize = 0.5, baseRectSize = 0.5, 
plotMarginCm = 1, + legendPosition = "bottom", legendTextSize = 5, axisTickCm = 0.1, + xText90 = FALSE, yText90 = FALSE, ...) +} +\arguments{ +\item{color}{color of theme} + +\item{base_size}{is the size of the font for the axis text and title} + +\item{base_family}{is family for font} + +\item{base_line_size}{is the size of line} + +\item{base_rect_size}{is the size of rectangle boxes} + +\item{plot_margin_cm}{plot margin in cm} + +\item{legend_position}{where is the legend default bottom} + +\item{legend_text_size}{0.75*base_size} + +\item{axis_tick_length_cm}{axis tick length in cm} + +\item{rotate_x_axis_text_90}{rotate x axis text 90 degrees} + +\item{rotate_y_axis_text_90}{rotate y axis text 90 degrees} +} +\description{ +This function returns a ggplot2 theme that is black bordered with black font. +} diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 00000000..5008ddfc Binary files /dev/null and b/src/.DS_Store differ diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 00000000..22034c46 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,3 @@ +*.o +*.so +*.dll diff --git a/src/Footprinting_utils.cpp b/src/Footprinting_utils.cpp new file mode 100644 index 00000000..8698e16f --- /dev/null +++ b/src/Footprinting_utils.cpp @@ -0,0 +1,130 @@ +#include +using namespace Rcpp; +using namespace std; + +// [[Rcpp::export]] +IntegerVector rleSumsStrandedChr(S4 rle, IntegerVector x, IntegerVector strand, int width){ + + //Adapted from IRanges viewSums + + int ranges_length, *rle_lengths, upper_run, lower_run, lower_bound, upper_bound; + int strand_i; + + // Stuff from RLE + IntegerVector values = rle.slot("values"); + IntegerVector lengths = rle.slot("lengths"); + int rle_length = sum(lengths); + + // Stuff from Ranges + ranges_length = x.size(); + + // Initialize + rle_lengths = INTEGER(lengths); + upper_run = *rle_lengths; + int index = 0; + int position, i, y; + int max_index = lengths.size() - 1; + int start_i; + int stretch; +
IntegerVector ans_pos = IntegerVector(width); + IntegerVector ans_minus = IntegerVector(width); + IntegerMatrix tmp = IntegerMatrix(3,width); + + for (i = 0; i < ranges_length; i++) { + + position = 0; + start_i = x[i]; + strand_i = strand[i]; + + if(start_i > 0 && start_i < rle_length){ + + while (index > 0 && upper_run > start_i) { + upper_run -= *rle_lengths; + rle_lengths--; + index--; + } + + while (upper_run < start_i) { + rle_lengths++; + index++; + upper_run += *rle_lengths; + } + + lower_run = upper_run - *rle_lengths + 1; + upper_bound = start_i + width - 1; + lower_bound = start_i; + + while (lower_run <= upper_bound) { + + stretch = (1 + (upper_bound < upper_run ? upper_bound : upper_run) - + (lower_bound > lower_run ? lower_bound : lower_run)); + + if (INTEGER(values)[index] == NA_INTEGER){ + + for(y = 0; y < stretch; y++){ + position += 1; + } + + }else{ + + for(y = 0; y < stretch; y++){ + tmp(strand_i - 1, position) += INTEGER(values)[index]; + position += 1; + } + + } + + if (index < max_index) { + rle_lengths++; + index++; + lower_run = upper_run + 1; + lower_bound = lower_run; + upper_run += *rle_lengths; + } else { + break; + } + + } + } + } + + // 0 = +, 1 = -, 2 = *. 
+ + IntegerVector out = tmp(0, _ ) + tmp(2, _ ) + rev(tmp(1, _ )); + + return (out); + +} + +// [[Rcpp::export]] +IntegerVector rleSumsStranded(List rleList, List grList, int width, Function as_integer){ + + //This will iterate over a coverage object + + IntegerVector strand, debug, start; + IntegerVector out = IntegerVector(width); + + int n = grList.size(); + int shift = floor(width/2); + + for(int i = 0; i < n; i++){ + //rle + S4 rle = rleList[i]; + //gr + S4 gr = grList[i]; + //strand + S4 gr_strand = gr.slot("strand"); + strand = as_integer(gr_strand); + //start + S4 ranges = gr.slot("ranges"); + start = ranges.slot("start"); + start = start - shift; + out += rleSumsStrandedChr(rle, start, strand, width); + } + + debug = IntegerVector::create(grList.size()); + + return (out); + +} + diff --git a/src/General_Utils.cpp b/src/General_Utils.cpp new file mode 100644 index 00000000..ae8c64fa --- /dev/null +++ b/src/General_Utils.cpp @@ -0,0 +1,91 @@ +#include + +using namespace Rcpp; +using namespace std; + +// // [[Rcpp::export]] +// IntegerMatrix tabulate1dCpp(IntegerVector x1, int xmin, int xmax){ +// IntegerVector x = clone(x1); +// IntegerVector r = seq(xmin,xmax); +// IntegerMatrix out(r.size(),2); +// out(_, 0) = r; +// int n = x.size(); +// int xi; +// for(int i = 0; i < n; i++){ +// xi = (x[i] - xmin); +// if(xi >= 0 && xi <= r.size()){ +// out( xi , 1 ) = out( xi , 1 ) + 1; +// } +// } +// return out; +// } + +// [[Rcpp::export]] +IntegerMatrix tabulate2dCpp(IntegerVector &x, int &xmin, int &xmax, IntegerVector &y, int &ymin, int &ymax){ + if(x.size() != y.size()){ + stop("width must equal size!"); + } + int n = x.size(); + IntegerVector rx = seq(xmin, xmax); + IntegerVector ry = seq(ymin, ymax); + IntegerMatrix mat( ry.size() , rx.size() ); + int rys = ry.size(); + int rxs = rx.size(); + int xi,yi; + for(int i = 0; i < n; i++){ + xi = (x[i] - xmin); + yi = (y[i] - ymin); + if(yi >= 0 && yi < rys){ + if(xi >= 0 && xi < rxs){ + mat( yi , xi ) = mat( yi , xi 
) + 1; + } + } + } + return mat; +} + +// // [[Rcpp::export]] +// IntegerMatrix tabulate2dCpp(IntegerVector x1, int xmin, int xmax, IntegerVector y1, int ymin, int ymax){ +// if(x1.size() != y1.size()){ +// stop("width must equal size!"); +// } +// IntegerVector x = clone(x1); +// IntegerVector y = clone(y1); +// int n = x.size(); +// IntegerVector rx = seq(xmin,xmax); +// IntegerVector ry = seq(ymin,ymax); +// IntegerMatrix mat( ry.size() , rx.size() ); +// int xi,yi; +// for(int i = 0; i < n; i++){ +// xi = (x[i] - xmin); +// yi = (y[i] - ymin); +// if(yi >= 0 && yi < ry.size()){ +// if(xi >= 0 && xi < rx.size()){ +// mat( yi , xi ) = mat( yi , xi ) + 1; +// } +// } +// } +// return mat; +// } + +// [[Rcpp::export]] +Rcpp::NumericVector computeSparseRowVariances(IntegerVector j, NumericVector val, NumericVector rm, int n) { + const int nv = j.size(); + const int nm = rm.size(); + Rcpp::NumericVector rv(nm); + Rcpp::NumericVector rit(nm); + int current; + // Calculate RowVars Initial + for (int i = 0; i < nv; ++i) { + current = j(i) - 1; + rv(current) = rv(current) + (val(i) - rm(current)) * (val(i) - rm(current)); + rit(current) = rit(current) + 1; + } + // Calculate Remainder Variance + for (int i = 0; i < nm; ++i) { + rv(i) = rv(i) + (n - rit(i))*rm(i)*rm(i); + } + rv = rv / (n - 1); + return(rv); +} + diff --git a/src/KNN_Utils.cpp b/src/KNN_Utils.cpp new file mode 100644 index 00000000..fc8799dc --- /dev/null +++ b/src/KNN_Utils.cpp @@ -0,0 +1,46 @@ +#include +using namespace Rcpp; +using namespace std; + +// [[Rcpp::export]] +Rcpp::IntegerVector determineOverlapCpp(IntegerMatrix m, int overlapCut){ + + int k2 = 2 * m.ncol(); + int nr = m.nrow(); + int nUnion; + int maxOverlap; + IntegerVector unionVector; + IntegerVector testVector = IntegerVector(nr); + IntegerVector nOverlap = IntegerVector(nr); + NumericVector maxOverlapVector = NumericVector(nr); + IntegerVector vi; + IntegerVector vj; + + for (int i = 1; i < nr; i++){ + + if (i % 500 == 0) Rcpp::Rcout 
<< "Completed Computing KNN Overlap " << i << " of " << nr << endl; + + for(int j = 0; j < i; j++){ + + if(testVector(j) == 0){ + vi = m(i, _); + vj = m(j, _); + unionVector = union_( vi , vj ); + nUnion = unionVector.size(); + nOverlap(j) = k2 - nUnion; + }else{ + nOverlap(j) = 0; + } + } + + maxOverlap = max( nOverlap ); + maxOverlapVector(i) = maxOverlap; + if(maxOverlap > overlapCut){ + testVector(i) = -1; + } + + } + + return testVector; + +} \ No newline at end of file diff --git a/src/Kmer_Bias.cpp b/src/Kmer_Bias.cpp new file mode 100644 index 00000000..414e82c6 --- /dev/null +++ b/src/Kmer_Bias.cpp @@ -0,0 +1,98 @@ +#include +using namespace Rcpp; +using namespace std; + +// [[Rcpp::export]] +IntegerVector kmerIdxCpp(const std::string& str, const int window, const int n, CharacterVector &kmer){ + CharacterVector result( window ); + for ( int j = 0; j < window; j++ ){ + result[j] = str.substr( j, n ); + } + IntegerVector out = match( result , kmer ); + return out; +} + +// [[Rcpp::export]] +IntegerMatrix kmerPositionFrequencyCpp(StringVector &string_vector, IntegerVector &strand_vector, const int window, const int w, CharacterVector &kmer){ + + // Initialize Matrix + IntegerMatrix out = IntegerMatrix(kmer.size(),window); + rownames(out) = kmer; + + // Get Constants + int n = string_vector.size(); + std::string str_i; + + //Simple Vector for Storing matches + IntegerVector m(window); + + for(int i=0; i do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include + +using namespace Rcpp; + +// rleSumsStrandedChr +IntegerVector rleSumsStrandedChr(S4 rle, IntegerVector x, IntegerVector strand, int width); +RcppExport SEXP _ArchR_rleSumsStrandedChr(SEXP rleSEXP, SEXP xSEXP, SEXP strandSEXP, SEXP widthSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< S4 >::type rle(rleSEXP); + Rcpp::traits::input_parameter< IntegerVector >::type x(xSEXP); + 
Rcpp::traits::input_parameter< IntegerVector >::type strand(strandSEXP); + Rcpp::traits::input_parameter< int >::type width(widthSEXP); + rcpp_result_gen = Rcpp::wrap(rleSumsStrandedChr(rle, x, strand, width)); + return rcpp_result_gen; +END_RCPP +} +// rleSumsStranded +IntegerVector rleSumsStranded(List rleList, List grList, int width, Function as_integer); +RcppExport SEXP _ArchR_rleSumsStranded(SEXP rleListSEXP, SEXP grListSEXP, SEXP widthSEXP, SEXP as_integerSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< List >::type rleList(rleListSEXP); + Rcpp::traits::input_parameter< List >::type grList(grListSEXP); + Rcpp::traits::input_parameter< int >::type width(widthSEXP); + Rcpp::traits::input_parameter< Function >::type as_integer(as_integerSEXP); + rcpp_result_gen = Rcpp::wrap(rleSumsStranded(rleList, grList, width, as_integer)); + return rcpp_result_gen; +END_RCPP +} +// tabulate2dCpp +IntegerMatrix tabulate2dCpp(IntegerVector& x, int& xmin, int& xmax, IntegerVector& y, int& ymin, int& ymax); +RcppExport SEXP _ArchR_tabulate2dCpp(SEXP xSEXP, SEXP xminSEXP, SEXP xmaxSEXP, SEXP ySEXP, SEXP yminSEXP, SEXP ymaxSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< IntegerVector& >::type x(xSEXP); + Rcpp::traits::input_parameter< int& >::type xmin(xminSEXP); + Rcpp::traits::input_parameter< int& >::type xmax(xmaxSEXP); + Rcpp::traits::input_parameter< IntegerVector& >::type y(ySEXP); + Rcpp::traits::input_parameter< int& >::type ymin(yminSEXP); + Rcpp::traits::input_parameter< int& >::type ymax(ymaxSEXP); + rcpp_result_gen = Rcpp::wrap(tabulate2dCpp(x, xmin, xmax, y, ymin, ymax)); + return rcpp_result_gen; +END_RCPP +} +// computeSparseRowVariances +Rcpp::NumericVector computeSparseRowVariances(IntegerVector j, NumericVector val, NumericVector rm, int n); +RcppExport SEXP _ArchR_computeSparseRowVariances(SEXP jSEXP, SEXP valSEXP, 
SEXP rmSEXP, SEXP nSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< IntegerVector >::type j(jSEXP); + Rcpp::traits::input_parameter< NumericVector >::type val(valSEXP); + Rcpp::traits::input_parameter< NumericVector >::type rm(rmSEXP); + Rcpp::traits::input_parameter< int >::type n(nSEXP); + rcpp_result_gen = Rcpp::wrap(computeSparseRowVariances(j, val, rm, n)); + return rcpp_result_gen; +END_RCPP +} +// determineOverlapCpp +Rcpp::IntegerVector determineOverlapCpp(IntegerMatrix m, int overlapCut); +RcppExport SEXP _ArchR_determineOverlapCpp(SEXP mSEXP, SEXP overlapCutSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< IntegerMatrix >::type m(mSEXP); + Rcpp::traits::input_parameter< int >::type overlapCut(overlapCutSEXP); + rcpp_result_gen = Rcpp::wrap(determineOverlapCpp(m, overlapCut)); + return rcpp_result_gen; +END_RCPP +} +// kmerIdxCpp +IntegerVector kmerIdxCpp(const std::string& str, const int window, const int n, CharacterVector& kmer); +RcppExport SEXP _ArchR_kmerIdxCpp(SEXP strSEXP, SEXP windowSEXP, SEXP nSEXP, SEXP kmerSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::string& >::type str(strSEXP); + Rcpp::traits::input_parameter< const int >::type window(windowSEXP); + Rcpp::traits::input_parameter< const int >::type n(nSEXP); + Rcpp::traits::input_parameter< CharacterVector& >::type kmer(kmerSEXP); + rcpp_result_gen = Rcpp::wrap(kmerIdxCpp(str, window, n, kmer)); + return rcpp_result_gen; +END_RCPP +} +// kmerPositionFrequencyCpp +IntegerMatrix kmerPositionFrequencyCpp(StringVector& string_vector, IntegerVector& strand_vector, const int window, const int w, CharacterVector& kmer); +RcppExport SEXP _ArchR_kmerPositionFrequencyCpp(SEXP string_vectorSEXP, SEXP strand_vectorSEXP, SEXP windowSEXP, SEXP wSEXP, SEXP kmerSEXP) { +BEGIN_RCPP 
+ Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< StringVector& >::type string_vector(string_vectorSEXP); + Rcpp::traits::input_parameter< IntegerVector& >::type strand_vector(strand_vectorSEXP); + Rcpp::traits::input_parameter< const int >::type window(windowSEXP); + Rcpp::traits::input_parameter< const int >::type w(wSEXP); + Rcpp::traits::input_parameter< CharacterVector& >::type kmer(kmerSEXP); + rcpp_result_gen = Rcpp::wrap(kmerPositionFrequencyCpp(string_vector, strand_vector, window, w, kmer)); + return rcpp_result_gen; +END_RCPP +} +// kmerIDFrequencyCpp +IntegerMatrix kmerIDFrequencyCpp(StringVector& string_vector, IntegerVector& id_vector, const int n_id, const int window, const int w, CharacterVector& kmer); +RcppExport SEXP _ArchR_kmerIDFrequencyCpp(SEXP string_vectorSEXP, SEXP id_vectorSEXP, SEXP n_idSEXP, SEXP windowSEXP, SEXP wSEXP, SEXP kmerSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< StringVector& >::type string_vector(string_vectorSEXP); + Rcpp::traits::input_parameter< IntegerVector& >::type id_vector(id_vectorSEXP); + Rcpp::traits::input_parameter< const int >::type n_id(n_idSEXP); + Rcpp::traits::input_parameter< const int >::type window(windowSEXP); + Rcpp::traits::input_parameter< const int >::type w(wSEXP); + Rcpp::traits::input_parameter< CharacterVector& >::type kmer(kmerSEXP); + rcpp_result_gen = Rcpp::wrap(kmerIDFrequencyCpp(string_vector, id_vector, n_id, window, w, kmer)); + return rcpp_result_gen; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_ArchR_rleSumsStrandedChr", (DL_FUNC) &_ArchR_rleSumsStrandedChr, 4}, + {"_ArchR_rleSumsStranded", (DL_FUNC) &_ArchR_rleSumsStranded, 4}, + {"_ArchR_tabulate2dCpp", (DL_FUNC) &_ArchR_tabulate2dCpp, 6}, + {"_ArchR_computeSparseRowVariances", (DL_FUNC) &_ArchR_computeSparseRowVariances, 4}, + {"_ArchR_determineOverlapCpp", (DL_FUNC) 
&_ArchR_determineOverlapCpp, 2}, + {"_ArchR_kmerIdxCpp", (DL_FUNC) &_ArchR_kmerIdxCpp, 4}, + {"_ArchR_kmerPositionFrequencyCpp", (DL_FUNC) &_ArchR_kmerPositionFrequencyCpp, 5}, + {"_ArchR_kmerIDFrequencyCpp", (DL_FUNC) &_ArchR_kmerIDFrequencyCpp, 6}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_ArchR(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +}