diff --git a/.DS_Store b/.DS_Store index 2b21082f..a885b2f8 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/DESCRIPTION b/DESCRIPTION index 40311f0c..6953679c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ArchR Type: Package -Date: 2020-11-23 +Date: 2022-04-03 Title: Analyzing single-cell regulatory chromatin in R. -Version: 1.0.1 +Version: 1.0.2 Authors@R: c( person("Jeffrey", "Granja", email = "jgranja.stanford@gmail.com", role = c("aut","cre")), person("Ryan", "Corces", role = "aut")) @@ -11,9 +11,17 @@ Roxygen: list(markdown = TRUE) License: GPL (>= 2) LinkingTo: Rcpp LazyData: TRUE -RoxygenNote: 7.1.1 +RoxygenNote: 7.1.2 Encoding: UTF-8 Imports: + ggplot2, + SummarizedExperiment, + data.table, + Matrix, + rhdf5, + magrittr, + S4Vectors (>= 0.9.25), + BiocGenerics, Rcpp (>= 0.12.16), matrixStats, plyr, @@ -29,17 +37,9 @@ Imports: grid, gridExtra, Biostrings, - ComplexHeatmap -Depends: - ggplot2, - SummarizedExperiment, - data.table, - Matrix, - rhdf5, - magrittr, - S4Vectors (>= 0.9.25), - BiocGenerics, + ComplexHeatmap, GenomicRanges +Depends: Collate: 'AllClasses.R' 'AnnotationGenome.R' diff --git a/NAMESPACE b/NAMESPACE index ce5679a8..5fec629b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,10 +3,10 @@ S3method("$",ArchRProject) S3method("$<-",ArchRProject) S3method("[",ArchRProject) -S3method(.DollarNames,ArchRProject) export("%bcin%") export("%bcni%") export("%ni%") +export(.DollarNames.ArchRProject) export(ArchRBrowser) export(ArchRBrowserTrack) export(ArchRPalettes) @@ -158,5 +158,7 @@ export(subsetCells) export(theme_ArchR) export(trajectoryHeatmap) export(validBSgenome) +import(data.table) +importFrom(GenomicRanges,GRanges) importFrom(Rcpp,sourceCpp) useDynLib(ArchR) diff --git a/R/AllClasses.R b/R/AllClasses.R index f87ff98c..314c4182 100644 --- a/R/AllClasses.R +++ b/R/AllClasses.R @@ -1,5 +1,7 @@ #' @useDynLib ArchR #' @importFrom Rcpp sourceCpp +#' @importFrom GenomicRanges GRanges +#' @import data.table NULL setClassUnion("characterOrNull", c("character", "NULL")) @@ -569,51 +571,29 @@ saveArchRProject <- function( newProj@imputeWeights <- SimpleList() } - #Copy Other Folders 2 layers nested + #Copy Recursively message("Copying Other Files...") for(i in seq_along(oldFiles)){ - - fin <- file.path(outDirOld, oldFiles[i]) - fout <- file.path(outputDirectory, oldFiles[i]) - message(sprintf("Copying Other Files (%s of %s): %s", i, length(oldFiles), basename(fin))) - - if(dir.exists(fin)){ - - dir.create(file.path(outputDirectory, basename(fin)), showWarnings=FALSE) - fin2 <- list.files(fin, full.names = TRUE) - - for(j in seq_along(fin2)){ - - if(dir.exists(fin2[j])){ - - dir.create(file.path(outputDirectory, basename(fin), basename(fin2)[j]), showWarnings=FALSE) - fin3 <- list.files(fin2[j], full.names = TRUE) - - for(k in seq_along(fin3)){ - - cf <- file.copy(fin3[k], file.path(fout, basename(fin3[k])), overwrite = overwrite) - - } - - }else{ - - cf <- file.copy(fin2[j], file.path(fout, basename(fin2[j])), overwrite = overwrite) - - } - - } - - }else{ - - cf <- file.copy(fin, fout, overwrite = overwrite) - - } - + message(sprintf("Copying Other Files (%s of %s): %s", i, length(oldFiles), oldFiles[i])) + oldPath <- file.path(outDirOld, oldFiles[i]) + file.copy(oldPath, outputDirectory, recursive=TRUE, overwrite=overwrite) } + #Set New Info newProj@sampleColData <- newProj@sampleColData[names(ArrowFilesNew), , drop = FALSE] newProj@sampleColData$ArrowFiles <- ArrowFilesNew[rownames(newProj@sampleColData)] + #Check for Group Coverages 
Copied + groupC <- length(newProj@projectMetadata$GroupCoverages) + if(length(groupC) > 0){ + for(z in seq_len(groupC)){ + zdata <- newProj@projectMetadata$GroupCoverages[[z]]$coverageMetadata + zfiles <- gsub(outDirOld, outputDirectory, zdata$File) + newProj@projectMetadata$GroupCoverages[[z]]$coverageMetadata$File <- zfiles + stopifnot(all(file.exists(zfiles))) + } + } + } message("Saving ArchRProject...") diff --git a/R/AnnotationGenome.R b/R/AnnotationGenome.R index 3d75ac3a..e76026de 100644 --- a/R/AnnotationGenome.R +++ b/R/AnnotationGenome.R @@ -6,9 +6,10 @@ #' @param chromSizes A `GRanges` object containing chromosome start and end coordinates. #' @param blacklist A `GRanges` object containing regions that should be excluded from analyses due to unwanted biases. #' @param filter A boolean value indicating whether non-standard chromosome scaffolds should be excluded. -#' These "non-standard" chromosomes are defined by `filterChrGR()`. +#' These "non-standard" chromosomes are defined by `filterChrGR()` and by manual annotation using the `filterChr` parameter. #' @param filterChr A character vector indicating the seqlevels that should be removed if manual removal is desired for certain seqlevels. -#' If no manual removal is desired, `filterChr` should be set to `NULL`. +#' If no manual removal is desired, `filterChr` should be set to `NULL`. If `filter` is set to `TRUE` but `filterChr` is set to `NULL`, +#' non-standard chromosomes will still be removed as defined in `filterChrGR()`. #' @export createGenomeAnnotation <- function( genome = NULL, @@ -24,23 +25,27 @@ createGenomeAnnotation <- function( .validInput(input = filter, name = "filter", valid = c("boolean")) .validInput(input = filterChr, name = "filterChr", valid = c("character", "null")) - if(is.null(genome) | is.null(blacklist) | is.null(chromSizes)){ + ################## + message("Getting genome..") + #validBSgenome works on both character and BSgenome inputs, which are the only allowable inputs to the param + bsg <- validBSgenome(genome) + genome <- bsg@pkgname - ################## - message("Getting genome..") - bsg <- validBSgenome(genome) - genome <- bsg@pkgname - - ################## - message("Getting chromSizes..") + if(is.null(chromSizes)) { + message("Attempting to infer chromSizes..") chromSizes <- GRanges(names(seqlengths(bsg)), IRanges(1, seqlengths(bsg))) if(filter){ - chromSizes <- filterChrGR(chromSizes, remove = filterChr) + chromSizes <- filterChrGR(chromSizes, remove = filterChr) } seqlengths(chromSizes) <- end(chromSizes) + } else { + message("Using provided chromSizes..") + chromSizes <- .validGRanges(chromSizes) + } + if(is.null(blacklist)){ ################## - message("Getting blacklist..") + message("Attempting to infer blacklist..") genomeName <- tryCatch({ bsg@provider_version @@ -50,15 +55,9 @@ createGenomeAnnotation <- function( blacklist <- .getBlacklist(genome = genomeName) - }else{ - - bsg <- validBSgenome(genome) - genome <- bsg@pkgname - - chromSizes <- .validGRanges(chromSizes) - + } else { + message("Using provided blacklist...") blacklist <- .validGRanges(blacklist) - } SimpleList(genome = genome, chromSizes = chromSizes, blacklist = blacklist) @@ -172,7 +171,7 @@ createGeneAnnotation <- function( ########################### message("Getting TSS..") - TSS <- unique(resize(GenomicFeatures::transcripts(TxDb), width = 1, fix = "start")) + TSS <- unique(GenomicRanges::resize(GenomicFeatures::transcripts(TxDb), width = 1, fix = "start")) if(!is.null(inGenes)){ genes <- .validGRanges(inGenes) 
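Editor's note: the reworked createGenomeAnnotation() above now infers chromSizes and blacklist independently whenever either is NULL, rather than only when genome, chromSizes, and blacklist are all missing. A minimal usage sketch of the two branches, assuming BSgenome.Hsapiens.UCSC.hg38 is installed; myBlacklist is a hypothetical user-supplied GRanges and is not part of this diff:

    library(ArchR)
    library(GenomicRanges)

    # Only the genome is given: chromSizes and blacklist are both inferred
    annoInferred <- createGenomeAnnotation(genome = "BSgenome.Hsapiens.UCSC.hg38")

    # A custom blacklist is supplied (hypothetical GRanges): chromSizes is still
    # inferred from the BSgenome, while the provided blacklist is validated and used as-is
    myBlacklist <- GRanges(seqnames = "chr1", ranges = IRanges(start = 1, end = 10000))
    annoCustom <- createGenomeAnnotation(
      genome = "BSgenome.Hsapiens.UCSC.hg38",
      blacklist = myBlacklist,
      filter = TRUE,    # drop non-standard scaffolds via filterChrGR()
      filterChr = NULL  # no additional manual seqlevel removal
    )
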
diff --git a/R/AnnotationPeaks.R b/R/AnnotationPeaks.R index 13df1f22..2e13ff91 100644 --- a/R/AnnotationPeaks.R +++ b/R/AnnotationPeaks.R @@ -59,7 +59,8 @@ getPositions <- function(ArchRProj = NULL, name = NULL, annoName = NULL){ #' Get peak annotation matches from an ArchRProject #' -#' This function gets peak annotation matches from a given ArchRProject. +#' This function gets peak annotation matches from a given ArchRProject. The peaks in the returned object are in the +#' same order as the peaks returned by `getPeakSet()`. #' #' @param ArchRProj An `ArchRProject` object. #' @param name The name of the `peakAnnotation` object (i.e. Motifs) to retrieve from the designated `ArchRProject`. @@ -140,6 +141,10 @@ addPeakAnnotations <- function( names(regions) <- paste0("Region_", seq_along(regions)) } + if(any(duplicated(names(regions)))){ + stop("Found duplicated region names! Please make unique!") + } + regionPositions <- lapply(seq_along(regions), function(x){ .logThis(regions[[x]], paste0("regions[[x]]-", x), logFile = logFile) @@ -191,11 +196,14 @@ addPeakAnnotations <- function( if(is.null(peakSet)){ .logStop("peakSet is NULL. You need a peakset to run addMotifAnnotations! See addReproduciblePeakSet!", logFile = logFile) } - allPositions <- unlist(regionPositions) + allPositions <- unlist(regionPositions, use.names=TRUE) .logDiffTime("Creating Peak Overlap Matrix", t1 = tstart, verbose = TRUE, logFile = logFile) overlapRegions <- findOverlaps(peakSet, allPositions, ignore.strand=TRUE) + if(length(overlapRegions) == 0){ + stop("No Overlaps Found between regions and peak Matrix!") + } .logThis(overlapRegions, "overlapRegions", logFile = logFile) regionMat <- Matrix::sparseMatrix( @@ -210,6 +218,31 @@ addPeakAnnotations <- function( regionMat <- SummarizedExperiment::SummarizedExperiment(assays=SimpleList(matches = regionMat), rowRanges = peakSet) .logThis(regionMat, "regionSE", logFile = logFile) + ############################################################# + # Filter Regions With No Matches + ############################################################# + + #Number of Overlaps + nO <- Matrix::colSums(assay(regionMat)) + rF <- names(which(nO == 0)) + + if(all(nO == 0)){ + stop("No Overlaps Found! Please check your peakSet and genome!") + } + + if(length(rF) > 0){ + .logDiffTime(paste0("Filtering Region Annotations with 0 overlaps :\n\n ", paste(rF, collapse=", "), "\n\n"), t1 = tstart, verbose = TRUE, logFile = logFile) + #Filter + regionPositions <- regionPositions[!(names(regionPositions) %in% rF)] + regionMat <- regionMat[,names(regionPositions),drop=FALSE] + }else{ + .logDiffTime(paste0("All Regions Overlap at least 1 peak!"), t1 = tstart, verbose = TRUE, logFile = logFile) + } + + ############################################################# + # Summarize and Save + ############################################################# + dir.create(file.path(getOutputDirectory(ArchRProj), "Annotations"), showWarnings=FALSE) savePositions <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(name,"-Positions-In-Peaks.rds")) saveMatches <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(name,"-Matches-In-Peaks.rds")) @@ -239,27 +272,30 @@ addPeakAnnotations <- function( #' #' @param ArchRProj An `ArchRProject` object. #' @param motifSet The motif set to be used for annotation. 
Options include: (i) "JASPAR2016", "JASPAR2018", "JASPAR2020" -#' which gives the 2016, 2018 or 2020 version of JASPAR motifs or (ii) one of "cisbp", "encode", or "homer" which gives the -#' corresponding motif sets from the `chromVAR` package. -#' @param name The name of the `peakAnnotation` object to be stored in the provided `ArchRProject` +#' which gives the 2016, 2018 or 2020 version of JASPAR motifs, (ii) one of "cisbp", "encode", or "homer" which gives the +#' corresponding motif sets from the `chromVAR` package, or (iii) "vierstra" which gives the clustered archetype motifs +#' created by Jeff Vierstra (https://github.com/jvierstra/motif-clustering). +#' @param annoName The name of the `peakAnnotation` object to be stored in the provided `ArchRProject` #' @param species The name of the species relevant to the supplied `ArchRProject`. This is used for identifying which motif to be #' used from CisBP/JASPAR. By default, this function will attempt to guess the species based on the value from `getGenome()`. #' @param collection If one of the JASPAR motif sets is used via `motifSet`, this parameter allows you to indicate the JASPAR -#' collection to be used. See `getMatrixSet()` from `TFBSTools` for all options to supply for collection. +#' collection to be used. See `getMatrixSet()` from `TFBSTools` for all options to supply for collection. If `motifSet` is +#' "vierstra", then this must either be "archetype" (for the v2.1 clustered models) or "individual" (for the original v1 individual motif models). +#' NOTE: vierstra archetype motifs are currently in beta and have not been finalized by Jeff Vierstra. #' @param motifPWMs A custom set of motif PWMs as a PWMList for adding motif annotations. #' @param cutOff The p-value cutoff to be used for motif search. The p-value is determined vs a background set of sequences #' (see `MOODS` for more details on this determination). #' @param width The width in basepairs to consider for motif matches. See the `motimatchr` package for more information. #' @param version An integer specifying version 1 or version 2 of chromVARmotifs see github for more info GreenleafLab/chromVARmotifs. -#' @param force A boolean value indicating whether to force the `peakAnnotation` object indicated by `name` to be overwritten if +#' @param force A boolean value indicating whether to force the `peakAnnotation` object indicated by `annoName` to be overwritten if #' it already exists in the given `ArchRProject`. #' @param logFile The path to a file to be used for logging ArchR output. -#' @param ... Additional parameters to be passed to `TFBSTools::getMatrixSet` for getting a PWM object. +#' @param ... Additional parameters to be passed to `TFBSTools::getMatrixSet` for getting a JASPAR PWM object. 
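Editor's note: to illustrate the renamed annoName argument and the new "vierstra" motifSet documented above, a hedged usage sketch; proj stands in for an existing ArchRProject with a peak set added and is an assumption, not part of this diff:

    library(ArchR)

    # cisbp motifs stored under the default annotation name
    proj <- addMotifAnnotations(ArchRProj = proj, motifSet = "cisbp", annoName = "Motif")

    # Vierstra v2.1beta clustered archetype motifs stored under a custom annotation name
    proj <- addMotifAnnotations(
      ArchRProj = proj,
      motifSet = "vierstra",
      collection = "archetype",  # or "individual" for the original v1 motif models
      annoName = "Vierstra",
      force = TRUE               # overwrite an existing peakAnnotation with this name
    )
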
#' @export addMotifAnnotations <- function( ArchRProj = NULL, motifSet = "cisbp", - name = "Motif", + annoName = "Motif", species = NULL, collection = "CORE", motifPWMs = NULL, @@ -273,7 +309,7 @@ addMotifAnnotations <- function( .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj")) .validInput(input = motifSet, name = "motifSet", valid = c("character", "null")) - .validInput(input = name, name = "name", valid = c("character")) + .validInput(input = annoName, name = "annoName", valid = c("character")) .validInput(input = species, name = "species", valid = c("character", "null")) .validInput(input = collection, name = "collection", valid = c("character", "null")) .validInput(input = cutOff, name = "cutOff", valid = c("numeric")) @@ -298,7 +334,7 @@ addMotifAnnotations <- function( .startLogging(logFile = logFile) .logThis(mget(names(formals()),sys.frame(sys.nframe())), "addMotifAnnotations Input-Parameters", logFile = logFile) - if(name %in% names(ArchRProj@peakAnnotation)){ + if(annoName %in% names(ArchRProj@peakAnnotation)){ if(force){ message("peakAnnotation name already exists! Overriding.") }else{ @@ -409,6 +445,36 @@ addMotifAnnotations <- function( motifs <- obj$motifs motifSummary <- obj$motifSummary + }else if(tolower(motifSet)=="vierstra"){ + if(tolower(collection)=="individual"){ + url = "https://jeffgranja.s3.amazonaws.com/ArchR/Annotations/Vierstra_Individual_Motifs.rds" + message("Using Vierstra v1.0 motifs. See https://www.vierstra.org/resources/motif_clustering for more details.") + } else if(tolower(collection == "archetype")){ + url = "https://jeffgranja.s3.amazonaws.com/ArchR/Annotations/Vierstra_Archetype_Motifs_v2.1.rds" + message("Using Vierstra v2.1beta motifs. See https://resources.altius.org/~jvierstra/projects/motif-clustering-v2.1beta/ for more details.") + } else { + stop(paste0("Error! collection ", collection, " not recognized for motifSet ",motifSet, + ". Accepted values are 'individual' and 'archetype'")) + } + + annoPath <- file.path(find.package("ArchR", NULL, quiet = TRUE), "data", "Annotations") + dir.create(annoPath, showWarnings = FALSE) + + #Download + if(!file.exists(file.path(annoPath, basename(url)))){ + message("Motif file ", basename(url)," does not exist! Downloading..") + download.file( + url = url, + destfile = file.path(annoPath, basename(url)), + quiet = FALSE + ) + } + motifFile <- file.path(annoPath, basename(url)) + + motifs <- readRDS(motifFile) + obj <- NULL + motifSummary <- NULL + }else if(tolower(motifSet)=="custom"){ obj <- NULL @@ -429,7 +495,6 @@ addMotifAnnotations <- function( # Get BSgenome Information! ############################################################# genome <- ArchRProj@genomeAnnotation$genome - .requirePackage(genome) BSgenome <- eval(parse(text = genome)) BSgenome <- validBSgenome(BSgenome) @@ -450,6 +515,28 @@ addMotifAnnotations <- function( w = width ) + ############################################################# + # Filter Motifs With No Matches + ############################################################# + + #Number of Overlaps + nO <- lapply(motifPositions, length) %>% unlist + mF <- names(which(nO == 0)) + + if(all(nO == 0)){ + stop("No Overlaps Found! 
Please check your peakSet and genome!") + } + + if(length(mF) > 0){ + .logDiffTime(paste0("Filtering Motif Annotations with 0 overlaps :\n\n ", paste(mF, collapse=", "), "\n\n"), t1 = tstart, verbose = TRUE, logFile = logFile) + #Filter + motifPositions <- motifPositions[nO > 0] + motifSummary <- motifSummary[names(motifPositions),,drop=FALSE] + motifs <- motifs[names(motifPositions)] + }else{ + .logDiffTime(paste0("All Motifs Overlap at least 1 peak!"), t1 = tstart, verbose = TRUE, logFile = logFile) + } + ############################################################# # Motif Overlap Matrix ############################################################# @@ -475,16 +562,16 @@ addMotifAnnotations <- function( ) dir.create(file.path(getOutputDirectory(ArchRProj), "Annotations"), showWarnings=FALSE) - savePositions <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(name,"-Positions-In-Peaks.rds")) - saveMatches <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(name,"-Matches-In-Peaks.rds")) + savePositions <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(annoName,"-Positions-In-Peaks.rds")) + saveMatches <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(annoName,"-Matches-In-Peaks.rds")) - ArchRProj@peakAnnotation[[name]]$Name <- name - ArchRProj@peakAnnotation[[name]]$motifs <- motifs - ArchRProj@peakAnnotation[[name]]$motifSummary <- motifSummary - ArchRProj@peakAnnotation[[name]]$Positions <- savePositions - ArchRProj@peakAnnotation[[name]]$Matches <- saveMatches + ArchRProj@peakAnnotation[[annoName]]$Name <- annoName + ArchRProj@peakAnnotation[[annoName]]$motifs <- motifs + ArchRProj@peakAnnotation[[annoName]]$motifSummary <- motifSummary + ArchRProj@peakAnnotation[[annoName]]$Positions <- savePositions + ArchRProj@peakAnnotation[[annoName]]$Matches <- saveMatches - .safeSaveRDS(out, file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(name,"-In-Peaks-Summary.rds")), compress = FALSE) + .safeSaveRDS(out, file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(annoName,"-In-Peaks-Summary.rds")), compress = FALSE) .safeSaveRDS(out$motifPositions, savePositions, compress = FALSE) .safeSaveRDS(out$motifMatches, saveMatches, compress = FALSE) @@ -664,12 +751,15 @@ addArchRAnnotations <- function( #Download if(!file.exists(file.path(annoPath, basename(url)))){ + oldTimeout <- getOption('timeout') + options(timeout=10000) message("Annotation ", basename(url)," does not exist! Downloading..") download.file( url = url, destfile = file.path(annoPath, basename(url)), quiet = FALSE ) + options(timeout=oldTimeout) } AnnoFile <- file.path(annoPath, basename(url)) @@ -746,6 +836,30 @@ addArchRAnnotations <- function( ) .logThis(regionMat, "regionSE", logFile=logFile) + ############################################################# + # Filter Regions With No Matches + ############################################################# + + #Number of Overlaps + nO <- Matrix::colSums(assay(regionMat)) + rF <- names(which(nO == 0)) + + if(all(nO == 0)){ + stop("No Overlaps Found! 
Please check your peakSet and genome!") + } + + if(length(rF) > 0){ + .logDiffTime(paste0("Filtering Region Annotations with 0 overlaps :\n\n ", paste(rF, collapse=", "), "\n\n"), t1 = tstart, verbose = TRUE, logFile = logFile) + #Filter + regionMat <- regionMat[,nO > 0,drop=FALSE] + }else{ + .logDiffTime(paste0("All Regions Overlap at least 1 peak!"), t1 = tstart, verbose = TRUE, logFile = logFile) + } + + ############################################################# + # Save + ############################################################# + dir.create(file.path(getOutputDirectory(ArchRProj), "Annotations"), showWarnings=FALSE) saveMatches <- file.path(getOutputDirectory(ArchRProj), "Annotations", paste0(name,"-Matches-In-Peaks.rds")) @@ -786,7 +900,7 @@ addArchRAnnotations <- function( } if(chr %ni% .availableSeqnames(AnnoFile, Group)){ - stop("Error Chromosome not in AnnoFile!") + stop(paste("Error! Chromosome ",chr," not in AnnoFile!")) } o <- h5closeAll() diff --git a/R/ArchRBrowser.R b/R/ArchRBrowser.R index c496cc9f..41df3315 100644 --- a/R/ArchRBrowser.R +++ b/R/ArchRBrowser.R @@ -5,7 +5,8 @@ #' Launch ArchR Genome Browser #' #' This function will open an interactive shiny session in style of a browser track. It allows for normalization of the signal which -#' enables direct comparison across samples. +#' enables direct comparison across samples. Note that the genes displayed in this browser are derived from your `geneAnnotation` +#' (i.e. the `BSgenome` object you used) so they may not match other online genome browsers that use different gene annotations. #' #' @param ArchRProj An `ArchRProject` object. #' @param features A `GRanges` object containing the "features" to be plotted via the "featureTrack". This should be thought of as a @@ -291,7 +292,7 @@ ArchRBrowser <- function( region <- region[which(tolower(mcols(region)$symbol) %in% tolower(input$name))] region <- region[order(match(tolower(mcols(region)$symbol), tolower(input$name)))] - region1 <- resize(region, 1, "start") + region1 <- GenomicRanges::resize(region, 1, "start") strand(region1) <- "*" #Extend Region @@ -628,7 +629,8 @@ ArchRBrowserTrack <- function(...){ #' Plot an ArchR Region Track #' #' This function will plot the coverage at an input region in the style of a browser track. It allows for normalization of the signal -#' which enables direct comparison across samples. +#' which enables direct comparison across samples. Note that the genes displayed in these plots are derived from your `geneAnnotation` +#' (i.e. the `BSgenome` object you used) so they may not match other online genome browsers that use different gene annotations. #' #' @param ArchRProj An `ArchRProject` object. #' @param region A `GRanges` region that indicates the region to be plotted. If more than one region exists in the `GRanges` object, @@ -644,8 +646,10 @@ ArchRBrowserTrack <- function(...){ #' Blue-colored genes are on the minus strand and red-colored genes are on the plus strand), and "loopTrack" (links between a peak and a gene). #' @param sizes A numeric vector containing up to 3 values that indicate the sizes of the individual components passed to `plotSummary`. #' The order must be the same as `plotSummary`. -#' @param features A `GRanges` object containing the "features" to be plotted via the "featureTrack". This should be thought of as a -#' bed track. i.e. the set of peaks obtained using `getPeakSet(ArchRProj))`. 
+#' @param features A `GRanges` (for a single feature track) or `GRangesList` (for multiple feature tracks) object containing the "features" to +#' be plotted via the "featureTrack". This should be thought of as a bed track. i.e. the set of peaks obtained using `getPeakSet(ArchRProj))`. +#' If you provide a `GRangesList`, then each element of that object must be named and this name will be used on the plot. +#' For example - `GRangesList("peaks" = peak_gr, "other" = other_gr)`. #' @param loops A `GRanges` object containing the "loops" to be plotted via the "loopTrack". #' This `GRanges` object start represents the center position of one loop anchor and the end represents the center position of another loop anchor. #' A "loopTrack" draws an arc between two genomic regions that show some type of interaction. This type of track can be used @@ -663,7 +667,7 @@ ArchRBrowserTrack <- function(...){ #' @param normMethod The name of the column in `cellColData` by which normalization should be performed. The recommended and default value #' is "ReadsInTSS" which simultaneously normalizes tracks based on sequencing depth and sample data quality. #' @param threads The number of threads to use for parallel execution. -#' @param ylim The numeric quantile y-axis limit to be used for for "bulkTrack" plotting. If not provided, the y-axis limit will be c(0, 0.999). +#' @param ylim The numeric quantile y-axis limit to be used for for "bulkTrack" plotting. This should be expressed as `c(lower limit, upper limit)` such as `c(0,0.99)`. If not provided, the y-axis limit will be c(0, 0.999). #' @param pal A custom palette (see `paletteDiscrete` or `ArchRPalettes`) used to override coloring for groups. #' @param baseSize The numeric font size to be used in the plot. This applies to all plot labels. #' @param scTileSize The width of the tiles in scTracks. Larger numbers may make cells overlap more. Default is 0.5 for about 100 cells. 
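Editor's note: a short sketch of the updated features and ylim arguments described above, assuming an existing ArchRProject named proj that already has a peak set; myRegions is a hypothetical GRanges used only to show a second, named feature track:

    library(ArchR)
    library(GenomicRanges)

    # Hypothetical second feature track
    myRegions <- GRanges(seqnames = "chr1", ranges = IRanges(start = c(1e6, 2e6), width = 500))

    p <- plotBrowserTrack(
      ArchRProj = proj,
      groupBy = "Clusters",
      geneSymbol = "CD14",
      features = GRangesList(      # each element must be named; names label the tracks
        "peaks" = getPeakSet(proj),
        "other" = myRegions
      ),
      ylim = c(0, 0.99),           # quantile limits for the bulkTrack y-axis
      upstream = 50000,
      downstream = 50000
    )
    plotPDF(p, name = "Browser-Track-Example", ArchRProj = proj, addDOC = FALSE)
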
@@ -750,7 +754,7 @@ plotBrowserTrack <- function( region <- region[which(tolower(mcols(region)$symbol) %in% tolower(geneSymbol))] region <- region[order(match(tolower(mcols(region)$symbol), tolower(geneSymbol)))] print(region) - region <- resize(region, 1, "start") + region <- GenomicRanges::resize(region, 1, "start") strand(region) <- "*" region <- extendGR(region, upstream = upstream, downstream = downstream) } @@ -806,7 +810,7 @@ plotBrowserTrack <- function( } ########################################################## - # Bulk Tracks + # Single-cell Tracks ########################################################## if("sctrack" %in% tolower(plotSummary)){ .logDiffTime(sprintf("Adding SC Tracks (%s of %s)",x,length(region)), t1=tstart, verbose=verbose, logFile=logFile) @@ -848,7 +852,7 @@ plotBrowserTrack <- function( } ########################################################## - # Feature Tracks + # Loop Tracks ########################################################## if("looptrack" %in% tolower(plotSummary)){ if(!is.null(loops)){ @@ -1040,7 +1044,7 @@ plotBrowserTrack <- function( margin = margin(0,0.35,0,0.35, "cm")), strip.text.y = element_text(angle = 0), strip.background = element_rect(color="black")) + - guides(fill = FALSE, colour = FALSE) + ggtitle(title) + guides(fill = "none", colour = "none") + ggtitle(title) p @@ -1342,7 +1346,7 @@ plotBrowserTrack <- function( theme(axis.title.x=element_blank(), axis.text.x=element_blank(),axis.ticks.x=element_blank()) + theme(axis.title.y=element_blank(), axis.text.y=element_blank(),axis.ticks.y=element_blank()) + theme(legend.text = element_text(size = baseSize), strip.text.y = element_text(size = facetbaseSize, angle = 0)) + - guides(fill = guide_legend(override.aes = list(colour = NA, shape = "c", size=3)), color = FALSE) + + guides(fill = guide_legend(override.aes = list(colour = NA, shape = "c", size=3)), color = "none") + theme(legend.position="bottom") + theme(legend.title=element_text(size=5), legend.text=element_text(size=7), legend.key.size = unit(0.75,"line"), legend.background = element_rect(color =NA), strip.background = element_blank()) @@ -1350,17 +1354,17 @@ plotBrowserTrack <- function( #Add Labels if There are Genes with this orientation! if(length(which(genesO$strand!="-")) > 0){ p <- p + ggrepel::geom_label_repel(data=genesO[which(genesO$strand!="-"),], - aes(x = start, y = cluster, label = symbol, color = strand, fill = NA), + aes(x = start, y = cluster, label = symbol, color = strand), segment.color = "grey", nudge_x = -0.01*(end(region) - start(region)), nudge_y = -0.25, - size = labelSize, direction = "x") + size = labelSize, direction = "x", inherit.aes=FALSE) } #Add Labels if There are Genes with this orientation! if(length(which(genesO$strand=="-")) > 0){ p <- p + ggrepel::geom_label_repel(data=genesO[which(genesO$strand=="-"),], - aes(x = end, y = cluster, label = symbol, color = strand, fill = NA), + aes(x = end, y = cluster, label = symbol, color = strand), segment.color = "grey", nudge_x = +0.01*(end(region) - start(region)), nudge_y = 0.25, - size = labelSize, direction = "x") + size = labelSize, direction = "x", inherit.aes=FALSE) } p <- p + theme(legend.justification = c(0, 1), @@ -1422,11 +1426,24 @@ plotBrowserTrack <- function( featureList <- features hideY <- FALSE } + + #make sure all elements in featureList have a name for plot display + for(i in seq_along(featureList)){ + if(is.null(names(featureList)[i]) || is.na(names(featureList)[i]) || nchar(names(featureList)[i]) == 0) { + message("Warning! 
Object ",i," in your GRangesList (features) is not named. Generic numbering will be used.") + names(featureList)[i] <- as.character(i) + } + } + featureList <- featureList[rev(seq_along(featureList))] featureO <- lapply(seq_along(featureList), function(x){ featurex <- featureList[[x]] namex <- names(featureList)[x] + if(is.null(namex) || namex == "") { + message("Warning! Object ",x," in your GRangesList (features) is not named. Generic numbering will be used.") + namex <- as.character(x) + } mcols(featurex) <- NULL sub <- subsetByOverlaps(featurex, region, ignore.strand = TRUE) if(length(sub) > 0){ @@ -1458,7 +1475,7 @@ plotBrowserTrack <- function( scale_color_manual(values = pal) + theme(legend.text = element_text(size = baseSize)) + theme_ArchR(baseSize = baseSize, baseLineSize = borderWidth, baseRectSize = borderWidth) + - guides(color = FALSE, fill = FALSE) + theme(strip.text.y = element_text(size = facetbaseSize, angle = 0), strip.background = element_blank()) + guides(color = "none", fill = "none") + theme(strip.text.y = element_text(size = facetbaseSize, angle = 0), strip.background = element_blank()) }else{ @@ -1778,7 +1795,7 @@ plotBrowserTrack <- function( margin = margin(0,0.35,0,0.35, "cm")), strip.text.y = element_text(angle = 0), strip.background = element_rect(color="black")) + - guides(fill = FALSE, colour = FALSE) + ggtitle(title) + guides(fill = "none", colour = "none") + ggtitle(title) p @@ -1867,7 +1884,7 @@ plotBrowserTrack <- function( pal = pal ) + facet_wrap(x~., ncol=1,scales="free_y",strip.position="right") + - guides(fill = FALSE, colour = FALSE) + + guides(fill = "none", colour = "none") + theme_ArchR(baseSize = baseSize, baseRectSize = borderWidth, baseLineSize = tickWidth, diff --git a/R/ArrowRead.R b/R/ArrowRead.R index 20bbdb6c..8fbb09aa 100644 --- a/R/ArrowRead.R +++ b/R/ArrowRead.R @@ -239,13 +239,19 @@ getFragmentsFromArrow <- function( #' Get a data matrix stored in an ArchRProject #' -#' This function gets a given data matrix from an `ArchRProject`. +#' This function gets a given data matrix from an `ArchRProject` and returns it as a `SummarizedExperiment`. +#' This function will return the matrix you ask it for, without altering that matrix unless you tell it to. +#' For example, if you added your `PeakMatrix` using `addPeakMatrix()` with `binarize = TRUE`, then +#' `getMatrixFromProject()` will return a binarized `PeakMatrix`. Alternatively, you could set `binarize = TRUE` +#' in the parameters passed to `getMatrixFromProject()` and the `PeakMatrix` will be binarized as you pull +#' it out. No other normalization is applied to the matrix by this function. #' #' @param ArchRProj An `ArchRProject` object to get data matrix from. #' @param useMatrix The name of the data matrix to retrieve from the given ArrowFile. Options include "TileMatrix", "GeneScoreMatrix", etc. #' @param useSeqnames A character vector of chromosome names to be used to subset the data matrix being obtained. #' @param verbose A boolean value indicating whether to use verbose output during execution of this function. Can be set to FALSE for a cleaner output. -#' @param binarize A boolean value indicating whether the matrix should be binarized before return. This is often desired when working with insertion counts. +#' @param binarize A boolean value indicating whether the matrix should be binarized before return. +#' This is often desired when working with insertion counts. Note that if the matrix has already been binarized previously, this should be set to `TRUE`. 
#' @param logFile The path to a file to be used for logging ArchR output. #' @export getMatrixFromProject <- function( @@ -352,6 +358,7 @@ getMatrixFromProject <- function( .logDiffTime("Constructing SummarizedExperiment", t1 = tstart, verbose = verbose, logFile = logFile) if(!is.null(rR1)){ se <- SummarizedExperiment(assays = asy, colData = cD, rowRanges = rR1) + se <- sort(se) }else{ se <- SummarizedExperiment(assays = asy, colData = cD, rowData = rD1) } @@ -719,7 +726,8 @@ getMatrixFromArrow <- function( tmpPath = .tempfile(pattern = paste0("tmp-partial-mat")), useIndex = FALSE, tstart = NULL, - verbose = TRUE + verbose = TRUE, + logFile = NULL ){ ######################################### @@ -783,7 +791,7 @@ getMatrixFromArrow <- function( matFiles <- lapply(mat, function(x) x[[2]]) %>% Reduce("c", .) mat <- lapply(mat, function(x) x[[1]]) %>% Reduce("cbind", .) - if(!all(cellNames %in% colnames(mat))){ + if(!all(sampledCellNames %in% colnames(mat))){ .logThis(sampledCellNames, "cellNames supplied", logFile = logFile) .logThis(colnames(mat), "cellNames from matrix", logFile = logFile) stop("Error not all cellNames found in partialMatrix") @@ -915,7 +923,7 @@ getMatrixFromArrow <- function( #Check if samples have NAs due to N = 1 sample or some other weird thing. #Set it to min non NA variance dfVars <- lapply(seq_len(nrow(dfVars)), function(x){ - vx <- dfVars[x, ] + vx <- dfVars[x, , drop = FALSE] if(any(is.na(vx))){ vx[is.na(vx)] <- min(vx[!is.na(vx)]) } @@ -1020,7 +1028,7 @@ getMatrixFromArrow <- function( ){ if(tolower(method) == "fast" & is.null(index) & is.null(start) & is.null(block) & is.null(count)){ - fid <- H5Fopen(file) + fid <- H5Fopen(file, "H5F_ACC_RDONLY") dapl <- H5Pcreate("H5P_DATASET_ACCESS") did <- .Call("_H5Dopen", fid@ID, name, dapl@ID, PACKAGE='rhdf5') res <- .Call("_H5Dread", did, NULL, NULL, NULL, TRUE, 0L, FALSE, fid@native, PACKAGE='rhdf5') diff --git a/R/ArrowUtils.R b/R/ArrowUtils.R index f8e4af75..0e8a0d63 100644 --- a/R/ArrowUtils.R +++ b/R/ArrowUtils.R @@ -428,18 +428,17 @@ RGRle <- Rle(paste0(sampleName, "#", RGValues), RGLengths) #Determine Which to Keep - idx <- BiocGenerics::which(RGRle %bcin% cellsKeep) - RGRle <- RGRle[idx] - RGLengths <- RGRle@lengths + idxj <- BiocGenerics::which(RGRle %bcin% cellsKeep) - #print(head(RGRle@values)) - RGValues <- stringr::str_split(RGRle@values, pattern = "#", simplify = TRUE)[,2] + if(length(idxj) == 0){ + idxj <- 1 + } - #Create Data Sets - # o <- .suppressAll(h5createDataset(outArrow, paste0(groupJ, "/Ranges"), storage.mode = "integer", dims = c(length(RGRle), 2), level = level)) - # o <- .suppressAll(h5createDataset(outArrow, paste0(groupJ, "/RGLengths"), storage.mode = "integer", dims = c(length(RGRle), 1), level = level)) - # o <- .suppressAll(h5createDataset(outArrow, paste0(groupJ, "/RGValues"), storage.mode = "character", dims = c(length(RGRle), 1), level = level, - # size = max(nchar(RGValues) + 1))) + #Info + Ranges <- .h5read(inArrow, paste0(groupJ, "/Ranges"))[idxj, ,drop=FALSE] + RGRle <- RGRle[idxj] + RGLengths <- RGRle@lengths + RGValues <- stringr::str_split(RGRle@values, pattern = "#", simplify = TRUE)[,2] #Write Barcodes o <- .suppressAll(h5write(RGLengths, file = outArrow, name = paste0(groupJ, "/RGLengths"), level = level)) @@ -448,7 +447,7 @@ #Write Ranges o <- .suppressAll( h5write( - obj = .h5read(inArrow, paste0(groupJ, "/Ranges"))[idx, ], + obj = Ranges, file = outArrow, name = paste0(groupJ, "/Ranges"), level = level diff --git a/R/BulkProjection.R b/R/BulkProjection.R index 
b02d027e..882dd9a4 100644 --- a/R/BulkProjection.R +++ b/R/BulkProjection.R @@ -3,12 +3,13 @@ #' This function will Project Bulk ATAC-seq data into single cell subspace. #' #' @param ArchRProj An `ArchRProject` object containing the dimensionality reduction matrix passed by `reducedDims`. -#' @param seATAC Bulk ATAC Summarized Experiment. -#' @param reducedDims A string specifying the reducedDims. -#' @param embedding A string specifying embedding. +#' @param seATAC A `SummarizedExperiment` object containing bulk ATAC-seq data. +#' @param reducedDims A string specifying the name of the `reducedDims` object to be used. +#' @param embedding A string specifying the name of the `embedding` object to be used. #' @param n An integer specifying the number of subsampled "pseudo single cells" per bulk sample. #' @param verbose A boolean value indicating whether to use verbose output during execution of this function. Can be set to FALSE for a cleaner output. #' @param threads The number of threads used for parallel execution +#' @param force A boolean value indicating whether to force the projection of bulk ATAC data even if fewer than 25% of the features are present in the bulk ATAC data set. #' @param logFile The path to a file to be used for logging ArchR output. #' @export #' @@ -20,6 +21,7 @@ projectBulkATAC <- function( n = 250, verbose = TRUE, threads = getArchRThreads(), + force = FALSE, logFile = createLogFile("projectBulkATAC") ){ @@ -30,8 +32,9 @@ projectBulkATAC <- function( .validInput(input = n, name = "n", valid = c("integer")) .validInput(input = verbose, name = "verbose", valid = c("boolean")) .validInput(input = threads, name = "threads", valid = c("integer")) + .validInput(input = force, name = "force", valid = c("boolean")) .validInput(input = logFile, name = "logFile", valid = c("character")) - + tstart <- Sys.time() .startLogging(logFile = logFile) @@ -150,7 +153,7 @@ projectBulkATAC <- function( } if(embedding$params$nc != ncol(simRD)){ - .logMessage("Error incosistency found with matching LSI dimensions to those used in addEmbedding", + .logMessage("Error! Inconsistency found with matching LSI dimensions to those used in addUMAP or addTSNE", "\nReturning with simulated reduced dimension coordinates...", verbose = TRUE, logFile = logFile) out <- SimpleList( simulatedReducedDims = simRD diff --git a/R/ColorPalettes.R b/R/ColorPalettes.R index d64b7b1c..81694ee3 100644 --- a/R/ColorPalettes.R +++ b/R/ColorPalettes.R @@ -112,6 +112,7 @@ paletteDiscrete <- function( .validInput(input = values, name = "values", valid = c("character", "factor")) .validInput(input = reverse, name = "reverse", valid = c("boolean")) + values <- unique(values) values <- gtools::mixedsort(values) n <- length(unique(values)) pal <- ArchRPalettes[[set]] diff --git a/R/CreateArrow.R b/R/CreateArrow.R index d793bc69..ac0be626 100644 --- a/R/CreateArrow.R +++ b/R/CreateArrow.R @@ -21,6 +21,8 @@ #' Cells containing greater than or equal to `minFrags` total fragments wll be retained. #' @param maxFrags The maximum number of mapped ATAC-seq fragments required per cell to pass filtering for use in downstream analyses. #' Cells containing greater than or equal to `maxFrags` total fragments wll be retained. +#' @param minFragSize The minimum fragment size to be included into Arrow File. Fragments lower than this number are discarded. Must be less than maxFragSize. +#' @param maxFragSize The maximum fragment size to be included into Arrow File. Fragments above than this number are discarded. 
Must be greater than maxFragSize. #' @param QCDir The relative path to the output directory for QC-level information and plots for each sample/ArrowFile. #' @param nucLength The length in basepairs that wraps around a nucleosome. This number is used for identifying fragments as #' sub-nucleosome-spanning, mono-nucleosome-spanning, or multi-nucleosome-spanning. @@ -43,8 +45,10 @@ #' @param bamFlag A vector of bam flags to be used for reading in fragments from input bam files. Should be in the format of a #' `scanBamFlag` passed to `ScanBam` in Rsamtools. #' @param offsetPlus The numeric offset to apply to a "+" stranded Tn5 insertion to account for the precise Tn5 binding site. +#' This parameter only applies to bam file input and it is assumed that fragment files have already been offset which is the standard from 10x output. #' See Buenrostro et al. Nature Methods 2013. #' @param offsetMinus The numeric offset to apply to a "-" stranded Tn5 insertion to account for the precise Tn5 binding site. +#' This parameter only applies to bam file input and it is assumed that fragment files have already been offset which is the standard from 10x output. #' See Buenrostro et al. Nature Methods 2013. #' @param addTileMat A boolean value indicating whether to add a "Tile Matrix" to each ArrowFile. A Tile Matrix is a counts matrix that, #' instead of using peaks, uses a fixed-width sliding window of bins across the whole genome. This matrix can be used in many downstream ArchR operations. @@ -71,6 +75,8 @@ createArrowFiles <- function( minTSS = 4, minFrags = 1000, maxFrags = 100000, + minFragSize = 10, + maxFragSize = 2000, QCDir = "QualityControl", nucLength = 147, promoterRegion = c(2000, 100), @@ -142,6 +148,9 @@ createArrowFiles <- function( .validInput(input = removeFilteredCells, name = "removeFilteredCells", valid = c("boolean")) .validInput(input = minFrags, name = "minFrags", valid = c("numeric")) .validInput(input = maxFrags, name = "maxFrags", valid = c("numeric")) + .validInput(input = minFragSize, name = "minFragSize", valid = c("numeric")) + .validInput(input = maxFragSize, name = "maxFragSize", valid = c("numeric")) + stopifnot(minFragSize < maxFragSize) .validInput(input = QCDir, name = "QCDir", valid = c("character")) .validInput(input = nucLength, name = "nucLength", valid = c("integer")) .validInput(input = promoterRegion, name = "promoterRegion", valid = c("integer")) @@ -241,8 +250,10 @@ createArrowFiles <- function( offsetMinus = -5, geneAnnotation = NULL, genomeAnnotation = NULL, - minFrags = 500, + minFrags = 1000, maxFrags = 100000, + minFragSize = 10, + maxFragSize = 2000, removeFilteredCells = TRUE, filterFrags = 1000, filterTSS = 4, @@ -414,7 +425,7 @@ createArrowFiles <- function( .tmpToArrow(tmpFile = tmp, outArrow = ArrowFile, genome = genomeAnnotation$genome, minFrags = minFrags, maxFrags = maxFrags, sampleName = sampleName, prefix = prefix, threads = subThreads, - verbose = verbose, tstart = tstart, + verbose = verbose, tstart = tstart, minFragSize = minFragSize, maxFragSize = maxFragSize, chromSizes = genomeAnnotation$chromSizes, removeFilteredCells = removeFilteredCells, logFile = logFile) }, error = function(e){ @@ -569,7 +580,7 @@ createArrowFiles <- function( featureList <- list() featureList$Promoter <- extendGR( - gr = resize(geneAnnotation$genes, 1, "start"), + gr = GenomicRanges::resize(geneAnnotation$genes, 1, "start"), upstream = promoterRegion[1], downstream = promoterRegion[2] ) @@ -841,10 +852,10 @@ createArrowFiles <- function( } #Create Window and Flank 
- TSS <- resize(TSS, 1, fix = "start") + TSS <- GenomicRanges::resize(TSS, 1, fix = "start") strand(TSS) <- "*" TSS <- unique(TSS) - tssWindow <- resize(TSS, window, "center") + tssWindow <- GenomicRanges::resize(TSS, window, "center") tssWindow$type <- "window" tssFlank <- c( #Positive Flank @@ -854,6 +865,9 @@ createArrowFiles <- function( ) tssFlank$type <- "flank" tssFeatures <- c(tssWindow, tssFlank) + + #Trim In Case Extending beyond Chromosomes + tssFeatures <- GenomicRanges::trim(tssFeatures) #.logThis(tssFeatures, paste0(prefix, " tssFeatures"), logFile = logFile) #Counting @@ -1123,6 +1137,7 @@ createArrowFiles <- function( indexTabix(file, format = "bed") TRUE }, error = function(y){ + message("Tabix indexing failed for ", file,". Note that ArchR requires bgzipped fragment files which is different from gzip. See samtools bgzip!") FALSE }) }) @@ -1142,6 +1157,7 @@ createArrowFiles <- function( indexBam(file) TRUE }, error = function(y){ + message("Indexing of BAM file failed for ",file,".") FALSE }) }) @@ -1245,6 +1261,11 @@ createArrowFiles <- function( return(list(tmpChrFile = NULL, errorCheck = errorCheck)) } + #No NAs + dt <- dt[!is.na(dt$V2), , drop=FALSE] + dt <- dt[!is.na(dt$V3), , drop=FALSE] + dt <- dt[!is.na(dt$V4), , drop=FALSE] + #Care for Break Points dt <- dt[dt$V2 >= start(tileChromSizes[x]),] @@ -1619,13 +1640,18 @@ createArrowFiles <- function( .logThis(unique(dt$V4), name = paste0(prefix, " .bamToTmp Barcodes-Chunk-(",x," of ",length(tileChromSizes),")-", tileChromSizes[x]), logFile = logFile) } + #No NAs + dt <- dt[!is.na(dt$RG), , drop=FALSE] + dt <- dt[!is.na(dt$start), , drop=FALSE] + dt <- dt[!is.na(dt$end), , drop=FALSE] + #Care for Break Points - dt <- dt[dt$start >= start(tileChromSizes[x]),] - dt <- dt[dt$end - dt$start >= 10, ] #Minimum Fragment Size + dt <- dt[dt$start >= start(tileChromSizes[x]),, drop=FALSE] + dt <- dt[dt$end - dt$start >= 10, , drop=FALSE] #Minimum Fragment Size #Check for valid barcodes if(!is.null(validBC)){ - dt <- dt[dt$RG %in% validBC, ] + dt <- dt[dt$RG %in% validBC, , drop=FALSE] } if(all(!is.null(dt), nrow(dt) > 0)){ @@ -1787,8 +1813,10 @@ createArrowFiles <- function( outArrow = NULL, genome = NULL, chromSizes = NULL, - minFrags = 500, - maxFrags = 100000, + minFrags = 1000, + maxFrags = 100000, + minFragSize = 10, + maxFragSize = 2000, sampleName = NULL, verbose = TRUE, tstart = NULL, @@ -1856,7 +1884,7 @@ createArrowFiles <- function( bcPass <- BStringSet(dt$values.V1[dt$V1 >= minFrags & dt$V1 <= maxFrags]) if(length(bcPass) < 3){ - .logStop(sprintf("Detected 2 or less cells (%s barcodes have greater than 50 fragments) in file!\n Check inputs such as 'minFrags' or 'maxFrags' to keep cells! 
Exiting!", sum(dt$V1 > 50)), logFile = logFile) + .logStop(sprintf("Detected 2 or less cells (%s barcodes have greater than 50 fragments) in file!\n Check inputs such as 'minFrags' or 'maxFrags' to keep cells!\n Also check that you are using the correct reference genome.\n Exiting!", sum(dt$V1 > 50)), logFile = logFile) } .logThis(data.frame(bc = as.character(bcPass)), name = paste0(prefix, " BarcodesMinMaxFrags"), logFile = logFile) @@ -1919,6 +1947,12 @@ createArrowFiles <- function( #Order RG RLE based on bcPass fragments <- fragments[BiocGenerics::which(mcols(fragments)$RG %bcin% bcPass)] fragments <- fragments[order(S4Vectors::match(mcols(fragments)$RG, bcPass))] + + #Check if Fragments are greater than minFragSize and smaller than maxFragSize + fragments <- fragments[width(fragments) >= minFragSize] + fragments <- fragments[width(fragments) <= maxFragSize] + + #Length of BC lengthRG <- length(mcols(fragments)$RG@lengths) if(x == 1){ @@ -1999,6 +2033,12 @@ createArrowFiles <- function( #Order RG RLE based on bcPass fragments <- fragments[BiocGenerics::which(mcols(fragments)$RG %bcin% bcPass)] fragments <- fragments[order(S4Vectors::match(mcols(fragments)$RG, bcPass))] + + #Check if Fragments are greater than minFragSize and smaller than maxFragSize + fragments <- fragments[width(fragments) >= minFragSize] + fragments <- fragments[width(fragments) <= maxFragSize] + + #Length of BC lengthRG <- length(mcols(fragments)$RG@lengths) if(x == 1){ diff --git a/R/DoubletsScores.R b/R/DoubletsScores.R index c5338195..9275bd5f 100644 --- a/R/DoubletsScores.R +++ b/R/DoubletsScores.R @@ -378,7 +378,7 @@ addDoubletScores <- function( scale_colour_gradientn(colors = pal) + xlab("UMAP Dimension 1") + ylab("UMAP Dimension 2") + labs(color = "Simulated Doublet Density") + - guides(fill = FALSE) + theme_ArchR(baseSize = 10) + + guides(fill = "none") + theme_ArchR(baseSize = 10) + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank()) + coord_equal(ratio = diff(xlim)/diff(ylim), xlim = xlim, ylim = ylim, expand = FALSE) + @@ -395,7 +395,7 @@ addDoubletScores <- function( # geom_point(data = dfDoub, aes(x=x,y=y,colour=color), size = 0.5) + # scale_colour_gradientn(colors = pal) + # xlab("UMAP Dimension 1") + ylab("UMAP Dimension 2") + - # guides(fill = FALSE) + theme_ArchR(baseSize = 10) + + # guides(fill = "none") + theme_ArchR(baseSize = 10) + # labs(color = "Simulated Doublet Density") + # theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), # axis.text.y = element_blank(), axis.ticks.y = element_blank()) + @@ -413,7 +413,7 @@ addDoubletScores <- function( # scale_colour_gradientn(colors = pal) + # xlab("UMAP Dimension 1") + ylab("UMAP Dimension 2") + # labs(color = "Simulated Doublet Density") + - # guides(fill = FALSE) + theme_ArchR(baseSize = 10) + + # guides(fill = "none") + theme_ArchR(baseSize = 10) + # theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), # axis.text.y = element_blank(), axis.ticks.y = element_blank()) + # coord_equal(ratio = diff(xlim)/diff(ylim), xlim = xlim, ylim = ylim, expand = FALSE) + diff --git a/R/Embedding.R b/R/Embedding.R index e0ead19c..c71026f4 100644 --- a/R/Embedding.R +++ b/R/Embedding.R @@ -209,6 +209,48 @@ addUMAP <- function( #New Save UWOT .saveUWOT <- function(model, file){ + + #save_uwot does not work because tarring doesnt work for some reason on Stanford's compute server + #Adapted from save_uwot + #this function is evaluated because it doesnt work 
on newer versions of uwot + #this is kept for legacy R versions + strUWOT <- " + .saveUWOT_Deprecated <- function(model, file){ + file <- file.path(normalizePath(dirname(file)), basename(file)) + wd <- getwd() + mod_dir <- tempfile(pattern = 'dir') + dir.create(mod_dir) + uwot_dir <- file.path(mod_dir, 'uwot') + dir.create(uwot_dir) + model_tmpfname <- file.path(uwot_dir, 'model') + .safeSaveRDS(model, file = model_tmpfname) + metrics <- names(model$metric) + n_metrics <- length(metrics) + for (i in seq_len(n_metrics)) { + nn_tmpfname <- file.path(uwot_dir, paste0('nn', i)) + if (n_metrics == 1) { + model$nn_index$save(nn_tmpfname) + model$nn_index$unload() + model$nn_index$load(nn_tmpfname) + } + else { + model$nn_index[[i]]$save(nn_tmpfname) + model$nn_index[[i]]$unload() + model$nn_index[[i]]$load(nn_tmpfname) + } + } + setwd(mod_dir) + system2('tar', '-cvf uwot.tar uwot', stdout = NULL, stderr = NULL) + o <- .fileRename('uwot.tar', file) + setwd(wd) + if (file.exists(mod_dir)) { + unlink(mod_dir, recursive = TRUE) + } + return(o) + } + " + eval(parse(text=strUWOT)) + tryCatch({ uwot::save_uwot(model = model, file = file, verbose = TRUE) }, error = function(e){ @@ -216,44 +258,61 @@ addUMAP <- function( }) } -#save_uwot does not work because tarring doesnt work for some reason on Stanford's compute server -#Adapted from save_uwot -.saveUWOT_Deprecated <- function(model, file){ - file <- file.path(normalizePath(dirname(file)), basename(file)) - wd <- getwd() - mod_dir <- tempfile(pattern = "dir") - dir.create(mod_dir) - uwot_dir <- file.path(mod_dir, "uwot") - dir.create(uwot_dir) - model_tmpfname <- file.path(uwot_dir, "model") - .safeSaveRDS(model, file = model_tmpfname) - metrics <- names(model$metric) - n_metrics <- length(metrics) - for (i in seq_len(n_metrics)) { - nn_tmpfname <- file.path(uwot_dir, paste0("nn", i)) - if (n_metrics == 1) { - model$nn_index$save(nn_tmpfname) - model$nn_index$unload() - model$nn_index$load(nn_tmpfname) - } - else { - model$nn_index[[i]]$save(nn_tmpfname) - model$nn_index[[i]]$unload() - model$nn_index[[i]]$load(nn_tmpfname) - } - } - setwd(mod_dir) - system2("tar", "-cvf uwot.tar uwot", stdout = NULL, stderr = NULL) - o <- .fileRename("uwot.tar", file) - setwd(wd) - if (file.exists(mod_dir)) { - unlink(mod_dir, recursive = TRUE) - } - return(o) -} - #New Save UWOT -.loadUWOT <- function(file){ +.loadUWOT <- function(file, nDim = NULL){ + + #load_uwot does not work because tarring doesnt work for some reason on Stanford's compute server + #Adapted from load_uwot + #this function is evaluated because it doesnt work on newer versions of uwot + #this is kept for legacy R versions + strUWOT <- " + .loadUWOT_Deprecated <- function(file, nDim = NULL){ + model <- NULL + tryCatch({ + mod_dir <- tempfile(pattern = 'dir') + dir.create(mod_dir) + utils::untar(file, exdir = mod_dir) + model_fname <- file.path(mod_dir, 'uwot/model') + if (!file.exists(model_fname)) { + stop('Cant find model in ', file) + } + model <- readRDS(file = model_fname) + metrics <- names(model$metric) + n_metrics <- length(metrics) + for (i in seq_len(n_metrics)){ + nn_fname <- file.path(mod_dir, paste0('uwot/nn', i)) + if (!file.exists(nn_fname)) { + stop('Cant find nearest neighbor index ', nn_fname, ' in ', file) + } + metric <- metrics[[i]] + if(length(model$metric[[i]]) == 0){ + if(!is.null(nDim)){ + nDim2 <- nDim + }else{ + nDim2 <- length(model$metric[[i]]) + } + } + if(!is.null(nDim)){ + nDim2 <- nDim + } + ann <- uwot:::create_ann(metric, ndim = nDim2) + ann$load(nn_fname) + if 
(n_metrics == 1) { + model$nn_index <- ann + }else{ + model$nn_index[[i]] <- ann + } + } + }, finally = { + if (file.exists(mod_dir)) { + unlink(mod_dir, recursive = TRUE) + } + }) + model + } + " + eval(parse(text=strUWOT)) + tryCatch({ uwot::load_uwot(file = file, verbose = TRUE) }, error = function(e){ @@ -261,52 +320,6 @@ addUMAP <- function( }) } -#Adapted from load_uwot -.loadUWOT_Deprecated <- function(file, nDim = NULL){ - model <- NULL - tryCatch({ - mod_dir <- tempfile(pattern = "dir") - dir.create(mod_dir) - utils::untar(file, exdir = mod_dir) - model_fname <- file.path(mod_dir, "uwot/model") - if (!file.exists(model_fname)) { - stop("Can't find model in ", file) - } - model <- readRDS(file = model_fname) - metrics <- names(model$metric) - n_metrics <- length(metrics) - for (i in seq_len(n_metrics)){ - nn_fname <- file.path(mod_dir, paste0("uwot/nn", i)) - if (!file.exists(nn_fname)) { - stop("Can't find nearest neighbor index ", nn_fname, " in ", file) - } - metric <- metrics[[i]] - if(length(model$metric[[i]]) == 0){ - if(!is.null(nDim)){ - nDim2 <- nDim - }else{ - nDim2 <- length(model$metric[[i]]) - } - } - if(!is.null(nDim)){ - nDim2 <- nDim - } - ann <- uwot:::create_ann(metric, ndim = nDim2) - ann$load(nn_fname) - if (n_metrics == 1) { - model$nn_index <- ann - }else{ - model$nn_index[[i]] <- ann - } - } - }, finally = { - if (file.exists(mod_dir)) { - unlink(mod_dir, recursive = TRUE) - } - }) - model -} - #' Add a TSNE embedding of a reduced dimensions object to an ArchRProject #' #' This function will compute a TSNE embedding and add it to an ArchRProject. diff --git a/R/Footprinting.R b/R/Footprinting.R index baf77703..566c41bf 100644 --- a/R/Footprinting.R +++ b/R/Footprinting.R @@ -68,7 +68,6 @@ getFootprints <- function( } genome <- getGenome(ArchRProj) - .requirePackage(genome) .requirePackage("Biostrings", source = "bioc") BSgenome <- eval(parse(text = genome)) BSgenome <- validBSgenome(BSgenome) @@ -221,7 +220,7 @@ getFootprints <- function( footprintDF <- lapply(seq_along(featureList), function(x){ outx <- tryCatch({ - featurex <- split(resize(featureList[[x]],1,"center"), seqnames(featureList[[x]])) + featurex <- split(GenomicRanges::resize(featureList[[x]],1,"center"), seqnames(featureList[[x]])) intSeq <- intersect(names(featurex), names(cov)) if(length(intSeq)==0){ .logMessage(paste0("No intersecting chromsomes for feature ", names(featureList)[x], "!")) @@ -292,7 +291,7 @@ getFootprints <- function( kmerList <- .safelapply(seq_along(featureList), function(i){ .logDiffTime(sprintf("Computing Kmer Tables for %s of %s features", i, length(featureList)), tstart, verbose=verbose, logFile = logFile) - bsv <- BSgenomeViews(genome , resize(featureList[[i]], window + k, "center")) + bsv <- BSgenomeViews(genome, GenomicRanges::resize(featureList[[i]], window + k, "center")) bsv <- bsv[width(bsv) == window + k] #none that are trimmed! #BSgenome is already stranded #kmerPositionFrequencyCpp is Rcpp export for getting kmer position frequencies from strings @@ -336,7 +335,8 @@ getFootprints <- function( #' @param smoothWindow The size in basepairs of the sliding window to be used for smoothing of the footprint signal. #' @param baseSize A numeric specifying the baseSize of font in the plots. #' @param plot A boolean value indicating whether or not the footprints should be plotted (`TRUE`) or returned as grob objects (`FALSE`). -#' @param ArchRProj An `ArchRProject` object to be used for plotting directory in `getOutputDirectory`. 
+#' @param ArchRProj An `ArchRProject` object to be used for plotting directory in `getOutputDirectory`. If no `ArchRProj` is supplied, +#' then plots will be stored in a directory called "Plots" in the current working directory. #' @param plotName A string indicating the name/prefix of the file to be used for output plots. #' @param height The height in inches to be used for the output PDF file. #' @param width The width in inches to be used for the output PDF file. @@ -374,7 +374,7 @@ plotFootprints <- function( .validInput(input = smoothWindow, name = "smoothWindow", valid = c("integer", "null")) .validInput(input = baseSize, name = "baseSize", valid = c("numeric")) .validInput(input = plot, name = "plot", valid = c("boolean")) - .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj")) + .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj", "null")) .validInput(input = plotName, name = "plotName", valid = c("character")) .validInput(input = height, name = "height", valid = c("numeric")) .validInput(input = width, name = "width", valid = c("numeric")) @@ -501,7 +501,7 @@ plotFootprints <- function( biasMat <- t(t(biasMat) / colMeans(biasMat[idx, ,drop=FALSE])) errorList$footMatNorm <- footMat - errorList$biasMatNorm <- footMat + errorList$biasMatNorm <- biasMat #Norm Foot By Bias if(tolower(normMethod) == "none"){ @@ -579,9 +579,10 @@ plotFootprints <- function( ylim = c(quantile(plotFootDF$mean, 0.0001), 1.15*quantile(smoothFoot, 0.999)), xlim = c(min(plotFootDF$x),max(plotFootDF$x)) ) + theme_ArchR(baseSize = baseSize) + ggtitle(name) + - guides(fill = FALSE) + - guides(color = FALSE) + ylab(paste0(title,"Normalized Insertions")) + - ggrepel::geom_label_repel(data = plotMax, aes(label = group), size = 3, xlim = c(75, NA)) + guides(fill = "none") + + guides(color = "none") + ylab(paste0(title,"Normalized Insertions")) + #removed ggrepel due to incompatibility with coord_cartesian - see https://github.com/GreenleafLab/ArchR/issues/493#issuecomment-870012873 + #ggrepel::geom_label_repel(data = plotMax, aes(label = group), size = 3, xlim = c(75, NA)) ggBias <- ggplot(plotBiasDF, aes(x = x, y = mean, color = group)) + geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd, linetype = NA, fill = group), alpha = 0.4) + diff --git a/R/GgplotUtils.R b/R/GgplotUtils.R index 609caa08..ec808b4b 100644 --- a/R/GgplotUtils.R +++ b/R/GgplotUtils.R @@ -555,6 +555,9 @@ ggHex <- function( .validInput(input = hexCut, name = "quantCut", valid = c("numeric", "null")) .validInput(input = addPoints, name = "addPoints", valid = c("boolean")) + #require hexbin to be installed. 
otherwise, this section wont work properly + .requirePackage(x = "hexbin", source = "CRAN") + df <- data.frame(x = x, y = y) include <- which(is.finite(x) & is.finite(y)) @@ -699,8 +702,8 @@ ggGroup <- function( df$x <- factor(df$x, groupOrder) p <- ggplot(df, aes(x = x, y = y, color = x)) + - scale_color_manual(values = pal, guide = FALSE) + - scale_fill_manual(values = pal, guide = FALSE) + + scale_color_manual(values = pal, guide = "none") + + scale_fill_manual(values = pal, guide = "none") + ggtitle(title) if(tolower(plotAs) == "ridges" | tolower(plotAs) == "ggridges"){ @@ -728,7 +731,7 @@ ggGroup <- function( val <- 1/length(unique(x)) p <- p + geom_density_ridges(data = df, aes(x = y, y = x, color = x, fill = x), scale = ridgeScale, - alpha = alpha, color = "black") + scale_y_discrete(expand = expand_scale(mult = c(0.01, val))) + alpha = alpha, color = "black") + scale_y_discrete(expand = expansion(mult = c(0.01, val))) } }else{ type <- "violin" diff --git a/R/GlobalDefaults.R b/R/GlobalDefaults.R index aae3753b..aab0b8a5 100644 --- a/R/GlobalDefaults.R +++ b/R/GlobalDefaults.R @@ -12,27 +12,72 @@ ArchRDefaults <- list( ArchR.verbose = TRUE ) +ArchRDependency <- c( + "grid", + "gridExtra", + "gtools", + "gtable", + "ggplot2", + "magrittr", + "plyr", + "stringr", + "data.table", + "matrixStats", + "S4Vectors", + "GenomicRanges", + "BiocGenerics", + "Matrix", + "Rcpp", + "SummarizedExperiment", + "rhdf5" +) + .onAttach <- function(libname, pkgname){ - if(!interactive()) return() - v <- packageVersion("ArchR") + + #Logo .ArchRLogo() + + #Package Startup + v <- packageVersion("ArchR") packageStartupMessage("ArchR : Version ", v, "\nFor more information see our website : www.ArchRProject.com\nIf you encounter a bug please report : https://github.com/GreenleafLab/ArchR/issues") + + #Load Packages + packageStartupMessage("Loading Required Packages...") + pkgs <- ArchRDependency + for(i in seq_along(pkgs)){ + packageStartupMessage("\tLoading Package : ", pkgs[i], " v", packageVersion(pkgs[i])) + tryCatch({ + suppressPackageStartupMessages(require(pkgs[i], character.only=TRUE)) + }, error = function(e){ + packageStartupMessage("\tFailed To Load Package : ", pkgs[i], " v", packageVersion(pkgs[i])) + }) + } + + if(!interactive()) return() + + #Set Default Options op <- options() toset <- !(names(ArchRDefaults) %in% names(op)) + if (any(toset)) options(ArchRDefaults[toset]) + if(!.isWholenumber(options()[["ArchR.threads"]])){ addArchRThreads() }else if(options()[["ArchR.threads"]] == 1){ addArchRThreads() } + if(!.checkCairo()){ packageStartupMessage("WARNING : Cairo check shows Cairo is not functional.\n ggplot2 rasterization will not be available without Cario.\n This may cause issues editing plots with many thousands of points from single cells.") } + if(.checkJupyter()){ packageStartupMessage("Detected Jupyer Notebook session. Disabling Log Messages!\n\tIf this is undesired use `addArchRVerbose(TRUE)`") addArchRVerbose(verbose = FALSE) } + invisible() + } #Check Jupyer Status @@ -285,6 +330,9 @@ addArchRThreads <- function(threads = floor(parallel::detectCores()/ 2), force = threads <- parallel::detectCores()-2 } } + if(threads > 1){ + RNGkind("L'Ecuyer-CMRG") + } message("Setting default number of Parallel threads to ", threads, ".") options(ArchR.threads = as.integer(round(threads))) diff --git a/R/GroupCoverages.R b/R/GroupCoverages.R index 63859218..9b3f1170 100644 --- a/R/GroupCoverages.R +++ b/R/GroupCoverages.R @@ -6,6 +6,10 @@ #' @param ArchRProj An `ArchRProject` object. 
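# Illustrative aside on the RNGkind("L'Ecuyer-CMRG") call added to addArchRThreads():
# with that generator, parallel::mclapply() can give each worker its own reproducible
# RNG stream, so multi-threaded runs repeat exactly from the same seed. Values are arbitrary.
RNGkind("L'Ecuyer-CMRG")
set.seed(1)
res1 <- parallel::mclapply(1:4, function(i) rnorm(1), mc.cores = 2, mc.set.seed = TRUE)
set.seed(1)
res2 <- parallel::mclapply(1:4, function(i) rnorm(1), mc.cores = 2, mc.set.seed = TRUE)
identical(res1, res2)  # TRUE -- both runs draw the same per-worker streams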
#' @param groupBy The name of the column in `cellColData` to use for grouping multiple cells together prior to generation of the insertion coverage file. #' @param useLabels A boolean value indicating whether to use sample labels to create sample-aware subgroupings during as pseudo-bulk replicate generation. +#' @param sampleLabels The name of a column in `cellColData` to use to identify samples. In most cases, this parameter should be left as `NULL` and you +#' should only use this parameter if you do not want to use the default sample labels stored in `cellColData$Sample`. However, if your individual Arrow +#' files do not map to individual samples, then you should set this parameter to accurately identify your samples. This is the case in (for example) +#' multiplexing applications where cells from different biological samples are mixed into the same reaction and demultiplexed based on a lipid barcode or genotype. #' @param minCells The minimum number of cells required in a given cell group to permit insertion coverage file generation. #' @param maxCells The maximum number of cells to use during insertion coverage file generation. #' @param maxFragments The maximum number of fragments per cell group to use in insertion coverage file generation. This prevents the generation @@ -28,6 +32,7 @@ addGroupCoverages <- function( ArchRProj = NULL, groupBy = "Clusters", useLabels = TRUE, + sampleLabels = "Sample", minCells = 40, maxCells = 500, maxFragments = 25*10^6, @@ -46,6 +51,7 @@ addGroupCoverages <- function( .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj")) .validInput(input = groupBy, name = "groupBy", valid = c("character")) .validInput(input = useLabels, name = "useLabels", valid = c("boolean")) + .validInput(input = sampleLabels, name = "sampleLabels", valid = c("character")) .validInput(input = minCells, name = "minCells", valid = c("integer")) .validInput(input = maxCells, name = "maxCells", valid = c("integer")) .validInput(input = maxFragments, name = "maxFragments", valid = c("integer")) @@ -64,6 +70,10 @@ addGroupCoverages <- function( stop("minReplicates must be at least 2!") } + if(sampleLabels %ni% colnames(ArchRProj@cellColData)) { + stop("sampleLabels is not a column in cellColData!") + } + tstart <- Sys.time() .startLogging(logFile = logFile) .logThis(mget(names(formals()),sys.frame(sys.nframe())), "addGroupCoverages Input-Parameters", logFile = logFile) @@ -118,8 +128,8 @@ addGroupCoverages <- function( # outListx <- SimpleList(LowCellGroup = cellNamesx) or NULL #} if(useLabels){ - sampleLabelsx <- paste0(subColDat$Sample) - }else{ + sampleLabelsx <- paste0(subColDat[,sampleLabels]) + } else { sampleLabelsx <- NULL } outListx <- .identifyGroupsForPseudoBulk( @@ -585,7 +595,6 @@ addGroupCoverages <- function( .logThis(append(args, mget(names(formals()),sys.frame(sys.nframe()))), "kmerBias-Parameters", logFile = logFile) - .requirePackage(genome) .requirePackage("Biostrings", source = "bioc") BSgenome <- eval(parse(text = genome)) BSgenome <- validBSgenome(BSgenome) @@ -752,7 +761,12 @@ addGroupCoverages <- function( if(x == 1) .logThis(iS, "InsertionSites", logFile = logFile) iS <- data.table(seqnames = allChr[x], start = iS - 1L, end = iS) if(x == 1) .logThis(iS, "InsertionSites-DT", logFile = logFile) - data.table::fwrite(iS, out, sep = "\t", col.names = FALSE, append = TRUE) + if(!any(is.na(iS$start))) { + data.table::fwrite(iS, out, sep = "\t", col.names = FALSE, append = TRUE) + } else { + message(paste0("Warning - No insertions found on 
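# Hypothetical usage sketch for the new sampleLabels argument: a minimal version of the
# guard requiring the chosen column to exist in cellColData, using a stand-in data.frame
# and an assumed "hashID" column produced by demultiplexing.
cellColData <- data.frame(Sample = c("A", "A", "B"), hashID = c("P1", "P2", "P1"))
sampleLabels <- "hashID"
if (!sampleLabels %in% colnames(cellColData)) {
  stop("sampleLabels is not a column in cellColData!")
}
# proj <- addGroupCoverages(ArchRProj = proj, groupBy = "Clusters", sampleLabels = "hashID")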
seqnames ", allChr[x], " for coverageFile ", coverageFile,".")) + .logMessage(paste0("Warning - No insertions found on seqnames ", allChr[x], " for coverageFile ", coverageFile,"."), logFile = logFile) + } }, error = function(e){ errorList <- list( x = x, diff --git a/R/GroupExport.R b/R/GroupExport.R index 87bf3719..fdb15a07 100644 --- a/R/GroupExport.R +++ b/R/GroupExport.R @@ -140,7 +140,8 @@ getGroupSE <- function( #' user-supplied `cellColData` metadata columns (for example, "Clusters"). Cells with the same value annotated in this metadata #' column will be grouped together and the average signal will be plotted. #' @param normMethod The name of the column in `cellColData` by which normalization should be performed. The recommended and default value -#' is "ReadsInTSS" which simultaneously normalizes tracks based on sequencing depth and sample data quality. +#' is "ReadsInTSS" which simultaneously normalizes tracks based on sequencing depth and sample data quality. Accepted values are +#' "None", "ReadsInTSS", "nCells", "ReadsInPromoter", or "nFrags". #' @param tileSize The numeric width of the tile/bin in basepairs for plotting ATAC-seq signal tracks. All insertions in a single bin will be summed. #' @param maxCells Maximum number of cells used for each bigwig. #' @param ceiling Maximum contribution of accessibility per cell in each tile. @@ -327,7 +328,10 @@ getGroupBW <- function( }else{ #N Tiles - nTiles <- trunc(chromLengths[availableChr[k]] / tileSize) + 1 + nTiles <- chromLengths[availableChr[k]] / tileSize + if (nTiles%%1 != 0) { + nTiles <- trunc(nTiles) + 1 + } #Create Sparse Matrix matchID <- S4Vectors::match(mcols(fragik)$RG, cellGroupi) diff --git a/R/Harmony.R b/R/Harmony.R index 5a2f33ac..ca9fbabe 100644 --- a/R/Harmony.R +++ b/R/Harmony.R @@ -13,6 +13,8 @@ #' to sequencing depth that is greater than the `corCutOff`, it will be excluded from analysis. #' @param name The name to store harmony output as a `reducedDims` in the `ArchRProject` object. #' @param groupBy The name of the column in `cellColData` to use for grouping cells together for vars in harmony batch correction. +#' The value of `groupBy` is passed to the `vars_use` parameter in `harmony::HarmonyMatrix()`. When run through ArchR, this parameter +#' defines which variables to correct for during batch correction. See `harmony::HarmonyMatrix()` for more information. #' @param verbose A boolean value indicating whether to use verbose output during execution of this function. Can be set to FALSE for a cleaner output. #' @param force A boolean value that indicates whether or not to overwrite data in a given column when the value passed to `name` already #' exists as a column name in `cellColData`. diff --git a/R/HiddenUtils.R b/R/HiddenUtils.R index 4ef6980e..1e7844dc 100644 --- a/R/HiddenUtils.R +++ b/R/HiddenUtils.R @@ -324,8 +324,19 @@ } .tempfile <- function(pattern = "tmp", tmpdir = "tmp", fileext = "", addDOC = TRUE){ + + #if the directory doesnt already exist and file.exists evaluates to true, then a file exists with that name + if(!dir.exists(tmpdir)){ + if(file.exists(tmpdir)){ + stop(paste0("Attempted to create temporary directory ", tmpdir," but a file already exists with this name. Please remove this file and try again!")) + } + } dir.create(tmpdir, showWarnings = FALSE) + + if(!dir.exists(tmpdir)){ + stop(paste0("Unable to create temporary directory ", tmpdir,". 
Check file permissions!")) + } if(addDOC){ doc <- paste0("-Date-", Sys.Date(), "_Time-", gsub(":","-", stringr::str_split(Sys.time(), pattern=" ",simplify=TRUE)[1,2])) @@ -393,7 +404,7 @@ } if(threads > 1){ - + .requirePackage("parallel", source = "cran") o <- mclapply(..., mc.cores = threads, mc.preschedule = preschedule) errorMsg <- list() diff --git a/R/Imputation.R b/R/Imputation.R index 863e292a..bda3e0da 100644 --- a/R/Imputation.R +++ b/R/Imputation.R @@ -109,10 +109,9 @@ addImputeWeights <- function( }else{ weightFiles <- file.path(getOutputDirectory(ArchRProj), "ImputeWeights", paste0("Impute-Weights-Rep-", seq_len(nRep))) } + o <- suppressWarnings(file.remove(weightFiles)) } - o <- suppressWarnings(file.remove(weightFiles)) - weightList <- .safelapply(seq_len(nRep), function(y){ .logDiffTime(sprintf("Computing Partial Diffusion Matrix with Magic (%s of %s)", y, nRep), t1 = tstart, verbose = FALSE, logFile = logFile) @@ -124,9 +123,8 @@ addImputeWeights <- function( blocks <- list(rownames(matDR)) } - weightFile <- weightFiles[y] - if(useHdf5){ + weightFile <- weightFiles[y] o <- h5createFile(weightFile) } @@ -172,8 +170,8 @@ addImputeWeights <- function( for(i in seq_len(td)){ Wt <- Wt %*% W } - rownames(Wt) <- rownames(matDR)[ix] - colnames(Wt) <- rownames(matDR)[ix] + rownames(Wt) <- ix + colnames(Wt) <- ix rm(knnIdx) rm(knnDist) @@ -200,7 +198,7 @@ addImputeWeights <- function( }, threads = threads) %>% SimpleList names(weightList) <- paste0("w",seq_along(weightList)) - .logDiffTime(sprintf("Completed Getting Magic Weights!", round(object.size(weightList) / 10^9, 3)), + .logDiffTime(sprintf("Completed Getting Magic Weights! Object size - %s.", round(object.size(weightList) / 10^9, 3)), t1 = tstart, verbose = FALSE, logFile = logFile) ArchRProj@imputeWeights <- SimpleList( diff --git a/R/InputData.R b/R/InputData.R index 178572e8..ce42a3a2 100644 --- a/R/InputData.R +++ b/R/InputData.R @@ -11,45 +11,125 @@ getTutorialData <- function( tutorial = "hematopoiesis", threads = getArchRThreads() ){ - + #Validate .validInput(input = tutorial, name = "tutorial", valid = "character") .validInput(input = threads, name = "threads", valid = c("integer")) ######### - + + #Make Sure URL doesnt timeout + oldTimeout <- getOption('timeout') + options(timeout=100000) + if(tolower(tutorial) %in% c("heme","hematopoiesis")){ - if(!dir.exists("HemeFragments")){ - - filesUrl <- c( + pathDownload <- "HemeFragments" + + filesUrl <- data.frame( + fileUrl = c( "https://jeffgranja.s3.amazonaws.com/ArchR/TestData/HemeFragments/scATAC_BMMC_R1.fragments.tsv.gz", "https://jeffgranja.s3.amazonaws.com/ArchR/TestData/HemeFragments/scATAC_CD34_BMMC_R1.fragments.tsv.gz", "https://jeffgranja.s3.amazonaws.com/ArchR/TestData/HemeFragments/scATAC_PBMC_R1.fragments.tsv.gz" - ) - - dir.create("HemeFragments", showWarnings = FALSE) - - downloadFiles <- .safelapply(seq_along(filesUrl), function(x){ - download.file( - url = filesUrl[x], - destfile = file.path("HemeFragments", basename(filesUrl[x])) - ) - }, threads = min(threads, length(filesUrl))) - - } - pathFragments <- "HemeFragments" - - }else{ - + ), + md5sum = c( + "77502e1f195e21d2f7a4e8ac9c96e65e", + "618613b486e4f8c0101f4c05c69723b0", + "a8d5ae747841055ef230ba496bcfe937" + ), + stringsAsFactors = FALSE + ) + + dir.create(pathDownload, showWarnings = FALSE) + + downloadFiles <- .downloadFiles(filesUrl = filesUrl, pathDownload = pathDownload, threads = threads) + + inputFiles <- list.files(pathDownload, pattern = "\\.gz$", full.names = TRUE) + names(inputFiles) <- 
gsub(".fragments.tsv.gz", "", list.files(pathDownload, pattern = "\\.gz$")) + inputFiles <- inputFiles[!grepl(".tbi", inputFiles)] + + }else if(tolower(tutorial) %in% c("multiome")){ + + filesUrl <- data.frame( + fileUrl = c( + "https://jeffgranja.s3.amazonaws.com/ArchR/TestData/Multiome/pbmc_sorted_3k.fragments.tsv.gz", + "https://jeffgranja.s3.amazonaws.com/ArchR/TestData/Multiome/pbmc_sorted_3k.filtered_feature_bc_matrix.h5", + "https://jeffgranja.s3.amazonaws.com/ArchR/TestData/Multiome/pbmc_unsorted_3k.fragments.tsv.gz", + "https://jeffgranja.s3.amazonaws.com/ArchR/TestData/Multiome/pbmc_unsorted_3k.filtered_feature_bc_matrix.h5" + ), + md5sum = c( + "d49f4012ff65d9edfee86281d6afb286", + "e326066b51ec8975197c29a7f911a4fd", + "5737fbfcb85d5ebf4dab234a1592e740", + "bd4cc4ff040987e1438f1737be606a27" + ), + stringsAsFactors = FALSE + ) + + pathDownload <- "Multiome" + + dir.create(pathDownload, showWarnings = FALSE) + + downloadFiles <- .downloadFiles(filesUrl = filesUrl, pathDownload = pathDownload, threads = threads) + + fragFiles <- list.files(pathDownload, pattern = "\\.gz$", full.names = TRUE) + names(fragFiles) <- gsub(".fragments.tsv.gz", "", list.files(pathDownload, pattern = "\\.gz$")) + fragFiles <- fragFiles[!grepl(".tbi", fragFiles)] + geneFiles <- list.files(pathDownload, pattern = "\\.h5$", full.names = TRUE) + names(geneFiles) <- gsub(".fragments.tsv.gz", "", list.files(pathDownload, pattern = "\\.gz$")) + + inputFiles <- c(fragFiles, geneFiles) + + } else{ + stop("There is no tutorial data for : ", tutorial) - + } - - inputFiles <- list.files(pathFragments, pattern = ".gz", full.names = TRUE) - names(inputFiles) <- gsub(".fragments.tsv.gz", "", list.files(pathFragments, pattern = ".gz")) - inputFiles <- inputFiles[!grepl(".tbi", inputFiles)] + + #Set back URL Options + options(timeout=oldTimeout) + inputFiles + +} +#helper for file downloads +.downloadFiles <- function(filesUrl = NULL, pathDownload = NULL, threads = 1){ + if(is.null(filesUrl)) { + stop("No value supplied to filesUrl in .downloadFiles()!") + } + if(is.null(pathDownload)) { + stop("No value supplied to pathDownload in .downloadFiles()!") + } + if(length(which(c("fileUrl","md5sum") %ni% colnames(filesUrl))) != 0) { + cat(colnames(filesUrl)) + stop("File download dataframe does not include columns named 'fileUrl' and 'md5sum' which are required!") + } + message(paste0("Downloading files to ",pathDownload,"...")) + downloadFiles <- .safelapply(seq_along(filesUrl$fileUrl), function(x){ + if(file.exists(file.path(pathDownload, basename(filesUrl$fileUrl[x])))){ + if(tools::md5sum(file.path(pathDownload, basename(filesUrl$fileUrl[x]))) != filesUrl$md5sum[x]) { + message(paste0("File ",basename(filesUrl$fileUrl[x])," exists but has an incorrect md5sum. Removing...")) + file.remove(file.path(pathDownload, basename(filesUrl$fileUrl[x]))) + } + } + if(!file.exists(file.path(pathDownload, basename(filesUrl$fileUrl[x])))){ + message(paste0("Downloading file ", basename(filesUrl$fileUrl[x]),"...")) + download.file( + url = filesUrl$fileUrl[x], + destfile = file.path(pathDownload, basename(filesUrl$fileUrl[x])) + ) + } else { + message(paste0("File exists! Skipping file ", basename(filesUrl$fileUrl[x]),"...")) + } + }, threads = min(threads, length(filesUrl))) + + #check for success of file download + if(!all(unlist(downloadFiles) == 0)) { + stop("Some tutorial files did not download successfully. 
Please try again.") + } + + downloadFiles + } #' Get PBMC Small Test Fragments @@ -58,14 +138,24 @@ getTutorialData <- function( #' #' @export getTestFragments <- function(x){ + + #Make Sure URL doesnt timeout + oldTimeout <- getOption('timeout') + options(timeout=100000) + if(!file.exists("PBMCSmall.tsv.gz")){ download.file( url = "https://jeffgranja.s3.amazonaws.com/ArchR/TestData/PBMCSmall.tsv.gz", destfile = "PBMCSmall.tsv.gz" ) } + #Set back URL Options + options(timeout=oldTimeout) + + #Add Genome Return Name Vector addArchRGenome("hg19test") c("PBMC" = "PBMCSmall.tsv.gz") + } #' Get PBMC Small Test Project @@ -74,6 +164,10 @@ getTestFragments <- function(x){ #' #' @export getTestProject <- function(){ + #Make Sure URL doesnt timeout + oldTimeout <- getOption('timeout') + options(timeout=100000) + #Download if(!dir.exists("PBMCSmall")){ download.file( url = "https://jeffgranja.s3.amazonaws.com/ArchR/TestData/PBMCSmall.zip", @@ -82,6 +176,9 @@ getTestProject <- function(){ unzip("PBMCSmall.zip", exdir = getwd()) file.remove("PBMCSmall.zip") } + #Set back URL Options + options(timeout=oldTimeout) + #Load addArchRGenome("hg19test") loadArchRProject("PBMCSmall") } diff --git a/R/IntegrativeAnalysis.R b/R/IntegrativeAnalysis.R index 543d1eff..94116f75 100644 --- a/R/IntegrativeAnalysis.R +++ b/R/IntegrativeAnalysis.R @@ -475,7 +475,6 @@ correlateTrajectories <- function( #mcols(ranges1) <- featureDF1 names(ranges1) <- rownames(featureDF1) rowRanges(seTrajectory1) <- ranges1 - rm(ranges1) if("strand" %in% colnames(featureDF2)){ ranges2 <- GRanges( @@ -492,11 +491,12 @@ correlateTrajectories <- function( #mcols(ranges2) <- featureDF2 names(ranges2) <- rownames(featureDF2) rowRanges(seTrajectory2) <- ranges2 - rm(ranges2) .logThis(ranges1, "ranges1", logFile = logFile) .logThis(ranges2, "ranges2", logFile = logFile) - + rm(ranges1) + rm(ranges2) + #Find Associations to test isStranded1 <- any(as.integer(strand(seTrajectory1)) == 2) isStranded2 <- any(as.integer(strand(seTrajectory2)) == 2) @@ -760,7 +760,7 @@ addCoAccessibility <- function( #Create Ranges peakSummits <- resize(peakSet, 1, "center") - peakWindows <- resize(peakSummits, maxDist, "center") + peakWindows <- resize(peakSummits, 2*maxDist + 1, "center") #Create Pairwise Things to Test o <- DataFrame(findOverlaps(peakSummits, peakWindows, ignore.strand = TRUE)) @@ -774,6 +774,8 @@ addCoAccessibility <- function( #Peak Matrix ColSums cS <- .getColSums(getArrowFiles(ArchRProj), chri, verbose = FALSE, useMatrix = "PeakMatrix") + cS <- cS[ArchRProj$cellNames] + gS <- unlist(lapply(seq_along(knnObj), function(x) sum(cS[knnObj[[x]]], na.rm=TRUE))) for(x in seq_along(chri)){ @@ -844,7 +846,8 @@ addCoAccessibility <- function( #' #' @param ArchRProj An `ArchRProject` object. #' @param corCutOff A numeric describing the minimum numeric peak-to-peak correlation to return. -#' @param resolution A numeric describing the bp resolution to return loops as. This helps with overplotting of correlated regions. +#' @param resolution A numeric describing the bp resolution to use when returning loops. This helps with overplotting of correlated regions. +#' This only takes affect if `returnLoops = TRUE`. #' @param returnLoops A boolean indicating to return the co-accessibility signal as a `GRanges` "loops" object designed for use with #' the `ArchRBrowser()` or as an `ArchRBrowserTrack()`. #' @export @@ -946,6 +949,7 @@ getCoAccessibility <- function( #' #' @param ArchRProj An `ArchRProject` object. 
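# Sanity check for the co-accessibility window fix: resizing a 1-bp peak summit to a
# width of 2 * maxDist + 1 (rather than maxDist) gives the intended +/- maxDist window
# around the summit. Coordinates below are arbitrary.
library(GenomicRanges)
maxDist <- 250000
summit <- GRanges("chr1", IRanges(start = 1000000, width = 1))
win <- resize(summit, 2 * maxDist + 1, fix = "center")
start(win)  # 750000
end(win)    # 1250000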
#' @param reducedDims The name of the `reducedDims` object (i.e. "IterativeLSI") to retrieve from the designated `ArchRProject`. +#' @param useMatrix The name of the matrix containing gene expression information to be used for determining peak-to-gene links. See `getAvailableMatrices(ArchRProj)` #' @param dimsToUse A vector containing the dimensions from the `reducedDims` object to use in clustering. #' @param scaleDims A boolean value that indicates whether to z-score the reduced dimensions for each cell. This is useful for minimizing #' the contribution of strong biases (dominating early PCs) and lowly abundant populations. However, this may lead to stronger sample-specific @@ -994,6 +998,7 @@ addPeak2GeneLinks <- function( .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj")) .validInput(input = reducedDims, name = "reducedDims", valid = c("character")) + .validInput(input = useMatrix, name = "useMatrix", valid = c("character")) .validInput(input = dimsToUse, name = "dimsToUse", valid = c("numeric", "null")) .validInput(input = scaleDims, name = "scaleDims", valid = c("boolean", "null")) .validInput(input = corCutOff, name = "corCutOff", valid = c("numeric", "null")) @@ -1322,7 +1327,7 @@ getPeak2GeneLinks <- function( geneTiles <- floor(start(geneStarts) / resolution) * resolution + floor(resolution / 2) }else{ summitTiles <- start(peakSummits) - geneTiles <- start(geneTiles) + geneTiles <- start(geneStarts) } loops <- .constructGR( diff --git a/R/IterativeLSI.R b/R/IterativeLSI.R index 57300734..c32f7795 100644 --- a/R/IterativeLSI.R +++ b/R/IterativeLSI.R @@ -11,9 +11,10 @@ #' "TileMatrix" or "PeakMatrix". #' @param name The name to use for storage of the IterativeLSI dimensionality reduction in the `ArchRProject` as a `reducedDims` object. #' @param iterations The number of LSI iterations to perform. -#' @param clusterParams A list of Additional parameters to be passed to `addClusters()` for clustering within each iteration. +#' @param clusterParams A list of additional parameters to be passed to `addClusters()` for clustering within each iteration. #' These params can be constant across each iteration, or specified for each iteration individually. Thus each param must be of -#' length == 1 or the total number of `iterations` - 1. PLEASE NOTE - We have updated these params to `resolution=2` and `maxClusters=6`! To use previous settings use `resolution=0.2` and `maxClusters=NULL`. +#' length == 1 or the total number of `iterations` - 1. If you want to use `scran` for clustering, you would pass this as `method="scran"`. +#` PLEASE NOTE - We have updated these params to `resolution=2` and `maxClusters=6`! To use previous settings use `resolution=0.2` and `maxClusters=NULL`. #' @param firstSelection First iteration selection method for features to use for LSI. Either "Top" for the top accessible/average or "Var" for the top variable features. #' "Top" should be used for all scATAC-seq data (binary) while "Var" should be used for all scRNA/other-seq data types (non-binary). #' @param depthCol A column in the `ArchRProject` that represents the coverage (scATAC = unique fragments, scRNA = unique molecular identifiers) per cell. @@ -24,8 +25,7 @@ #' Possible values are: 1 or "tf-logidf", 2 or "log(tf-idf)", and 3 or "logtf-logidf". #' @param scaleDims A boolean that indicates whether to z-score the reduced dimensions for each cell. This is useful forminimizing the contribution #' of strong biases (dominating early PCs) and lowly abundant populations. 
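# Worked example of the loop resolution binning (and the geneTiles fix, which now reads
# start(geneStarts) instead of referencing geneTiles before it exists): positions are
# snapped to the midpoint of their resolution-sized bin, reducing overplotting of loops.
resolution <- 1000
pos <- c(10499, 10500, 11999)
binned <- floor(pos / resolution) * resolution + floor(resolution / 2)
binned  # 10500 10500 11500 -- everything in [10000, 11000) collapses to 10500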
However, this may lead to stronger sample-specific biases since -#' it is over-weighting latent PCs. If set to `NULL` this will scale the dimensions based on the value of `scaleDims` when the `reducedDims` were -#' originally created during dimensionality reduction. This idea was introduced by Timothy Stuart. +#' it is over-weighting latent PCs. #' @param corCutOff A numeric cutoff for the correlation of each dimension to the sequencing depth. If the dimension has a correlation to #' sequencing depth that is greater than the `corCutOff`, it will be excluded from analysis. #' @param binarize A boolean value indicating whether the matrix should be binarized before running LSI. This is often desired when working with insertion counts. @@ -47,9 +47,9 @@ #' @param totalFeatures The number of features to consider for use in LSI after ranking the features by the total number of insertions. #' These features are the only ones used throught the variance identification and LSI. These are an equivalent when using a `TileMatrix` to a defined peakSet. #' @param filterQuantile A number [0,1] that indicates the quantile above which features should be removed based on insertion counts prior -#' @param excludeChr A string of chromosomes to exclude for iterativeLSI procedure. #' to the first iteration of the iterative LSI paradigm. For example, if `filterQuantile = 0.99`, any features above the 99th percentile in #' insertion counts will be ignored for the first LSI iteration. +#' @param excludeChr A string of chromosomes to exclude for iterativeLSI procedure. #' @param saveIterations A boolean value indicating whether the results of each LSI iterations should be saved as compressed `.rds` files in #' the designated `outDir`. #' @param UMAPParams The list of parameters to pass to the UMAP function if "UMAP" if `saveIterations=TRUE`. See the function `uwot::umap()`. @@ -117,7 +117,7 @@ addIterativeLSI <- function( .validInput(input = varFeatures, name = "varFeatures", valid = c("integer")) .validInput(input = dimsToUse, name = "dimsToUse", valid = c("integer")) .validInput(input = LSIMethod, name = "LSIMethod", valid = c("integer", "character")) - .validInput(input = scaleDims, name = "scaleDims", valid = c("boolean", "null")) + .validInput(input = scaleDims, name = "scaleDims", valid = c("boolean")) .validInput(input = corCutOff, name = "corCutOff", valid = c("numeric")) .validInput(input = binarize, name = "binarize", valid = c("boolean")) .validInput(input = outlierQuantiles, name = "outlierQuantiles", valid = c("numeric", "null")) @@ -211,7 +211,10 @@ addIterativeLSI <- function( if(tolower(firstSelection) == "top"){ if(!binarize){ - stop("Please binarize data if using top selection for first iteration! Set binarize = TRUE!") + matClass <- h5read(ArrowFiles[1], paste0(useMatrix,"/Info/Class")) + if(matClass != "Sparse.Binary.Matrix"){ + stop("Input matrix is not binarized and binarize != TRUE. Please use binarized data if using top selection for first iteration! 
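# Hedged sketch of the relaxed binarize check for firstSelection = "Top": instead of
# unconditionally requiring binarize = TRUE, the class stored in the Arrow (HDF5) file is
# inspected and only genuinely non-binarized matrices error out. The file name below is a
# hypothetical Arrow file.
library(rhdf5)
arrow <- "Sample1.arrow"
useMatrix <- "TileMatrix"
matClass <- h5read(arrow, paste0(useMatrix, "/Info/Class"))
if (matClass != "Sparse.Binary.Matrix") {
  stop("Input matrix is not binarized; set binarize = TRUE for firstSelection = 'Top'.")
}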
Set binarize = TRUE!") + } } #Compute Row Sums Across All Samples @@ -265,7 +268,7 @@ addIterativeLSI <- function( .logDiffTime("Computing Variable Features", tstart, addHeader = FALSE, verbose = verbose, logFile = logFile) nFeature <- varFeatures[1] if(nFeature > 0.5 * nrow(totalAcc)){ - stop("nFeature for variable selection must be at leat 1/2 the total features!") + stop("nFeature for variable selection must be less than 1/2 the total features!") } topIdx <- head(order(totalAcc$combinedVars, decreasing=TRUE), nFeature) topFeatures <- totalAcc[sort(topIdx),] @@ -285,7 +288,7 @@ addIterativeLSI <- function( v }, error = function(e){ tryCatch({ - .getColSums(ArrowFiles = ArrowFiles, useMatrix = useMatrix, seqnames = chrToRun) + .getColSums(ArrowFiles = ArrowFiles, useMatrix = useMatrix, seqnames = chrToRun)[ArchRProj$cellNames] }, error = function(y){ stop("Could not determine depth from depthCol or colSums!") }) @@ -544,7 +547,8 @@ addIterativeLSI <- function( cellNames = cellNames, doSampleCells = FALSE, threads = threads, - verbose = FALSE + verbose = FALSE, + logFile = logFile ) #Compute LSI @@ -591,7 +595,8 @@ addIterativeLSI <- function( cellNames = sampledCellNames, doSampleCells = FALSE, threads = threads, - verbose = FALSE + verbose = FALSE, + logFile = logFile ) #Compute LSI @@ -628,7 +633,8 @@ addIterativeLSI <- function( tmpPath = tmpPath, useIndex = useIndex, threads = threads, - verbose = FALSE + verbose = FALSE, + logFile = logFile ) gc() @@ -1096,7 +1102,7 @@ addIterativeLSI <- function( #.safeSaveRDS(mat, "temp.rds", compress = FALSE) matO <- mat[, idxOutlier, drop = FALSE] mat <- mat[, -idxOutlier, drop = FALSE] - mat2 <- mat[, head(seq_len(ncol(mat)), 10), drop = FALSE] # A 2nd Matrix to Check Projection is Working + mat2 <- mat[, head(seq_len(ncol(mat)), 50), drop = FALSE] # A 2nd Matrix to Check Projection is Working colSm <- colSm[-idxOutlier] filterOutliers <- 1 } @@ -1196,7 +1202,10 @@ addIterativeLSI <- function( cor(pCheck[,x], pCheck2[,x]) }) %>% unlist if(min(pCheck3) < 0.95){ - stop("Error with LSI-projection! Cor less than 0.95 of re-projection. Please report bug to github!") + .logThis(pCheck, "pCheck", logFile=logFile) + .logThis(pCheck2, "pCheck2", logFile=logFile) + .logThis(pCheck3, "pCheck3", logFile=logFile) + warning("Warning with LSI-projection! Cor less than 0.95 of re-projection. 
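# Toy illustration of the softened LSI re-projection check: per-dimension correlations
# between the original and re-projected coordinates are computed, and values below 0.95
# now log the matrices and warn instead of stopping the run. Data below are random.
set.seed(1)
pCheck  <- matrix(rnorm(200), ncol = 4)
pCheck2 <- pCheck + rnorm(200, sd = 0.01)   # stand-in for the re-projected values
pCheck3 <- sapply(seq_len(ncol(pCheck)), function(x) cor(pCheck[, x], pCheck2[, x]))
if (min(pCheck3) < 0.95) {
  warning("Cor less than 0.95 of re-projection! Please report this to github with logFile!")
}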
Please report this to github with logFile!") } #Project LSI Outliers out$outliers <- colnames(matO) diff --git a/R/MarkerFeatures.R b/R/MarkerFeatures.R index 0ed03a5b..b6fdbab9 100644 --- a/R/MarkerFeatures.R +++ b/R/MarkerFeatures.R @@ -129,7 +129,7 @@ getMarkerFeatures <- function( .logThis(range(as.vector(table(paste0(featureDF$seqnames)))), "FeaturesPerSeqnames", logFile = logFile) isDeviations <- FALSE - if(all(unique(paste0(featureDF$seqnames)) %in% c("z", "dev"))){ + if(all(unique(paste0(featureDF$seqnames)) %in% c("z", "deviations"))){ isDeviations <- TRUE } @@ -191,11 +191,14 @@ getMarkerFeatures <- function( ##################################################### # Pairwise Test Per Seqnames ##################################################### + #ColSums mColSums <- tryCatch({ - suppressMessages(.getColSums(ArrowFiles, seqnames = featureDF$seqnames@values, useMatrix = useMatrix, threads = threads)) + suppressMessages(tmpColSum <- .getColSums(ArrowFiles, seqnames = featureDF$seqnames@values, useMatrix = useMatrix, threads = threads)) + tmpColSum[ArchRProj$cellNames] }, error = function(x){ rep(1, nCells(ArchRProj)) }) + if(all(mColSums==1) & is.null(normBy)){ normBy <- "none" } @@ -216,7 +219,12 @@ getMarkerFeatures <- function( }else{ if(tolower(normBy) == "none"){ normFactors <- NULL + }else if(normBy %in% colnames(ArchRProj@cellColData)) { + normFactors <- getCellColData(ArchRProj, normBy, drop=FALSE) + normFactors[,1] <- median(normFactors[,1]) / normFactors[,1] }else{ + .logMessage("Warning! Parameter 'normBy' was set to ", normBy," but no matching column was found in cellColData.\n", + "Continuing with normalization based on column sums of matrix!", verbose = verbose, logFile = logFile) normFactors <- scaleTo / mColSums normFactors <- DataFrame(normFactors) } @@ -416,7 +424,16 @@ getMarkerFeatures <- function( }) %>% Reduce("rbind", .) - idxFilter <- rowSums(pairwiseDF[,c("mean1","mean2")]) != 0 + #Check for Mean being 0 for both Mean1 and Mean2 + idxFilter1 <- rowSums(pairwiseDF[,c("mean1","mean2")]) != 0 + + #Check For NA in Either Mean1 Mean2 + idxFilter2 <- rowSums(is.na(pairwiseDF[,c("mean1","mean2")])) == 0 + + #Combo Check + idxFilter <- idxFilter1 & idxFilter2 + + #FDR pairwiseDF$fdr <- NA pairwiseDF$fdr[idxFilter] <- p.adjust(pairwiseDF$pval[idxFilter], method = "fdr") pairwiseDF <- pairwiseDF[rownames(featureDF), , drop = FALSE] @@ -806,9 +823,13 @@ markerHeatmap <- function(...){ #' @param pal A custom continuous palette from `ArchRPalettes` (see `paletteContinuous()`) used to override the default continuous palette for the heatmap. #' @param binaryClusterRows A boolean value that indicates whether a binary sorting algorithm should be used for fast clustering of heatmap rows. #' @param clusterCols A boolean value that indicates whether the columns of the marker heatmap should be clustered. +#' @param subsetMarkers A vector of rownames from seMarker to use for subsetting of seMarker to only plot specific features on the heatmap. +#' Note that these rownames are expected to be integers that come from `rownames(rowData(seMarker))`. If this parameter is used for +#' subsetting, then the values provided to `cutOff` are effectively ignored. #' @param labelMarkers A character vector listing the `rownames` of `seMarker` that should be labeled on the side of the heatmap. -#' @param nLabel An integer value that indicates whether the top `n` features for each column in `seMarker` should be labeled on the side of the heatmap. 
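# Toy numbers for the new normBy branch in getMarkerFeatures(): when normBy names a
# cellColData column (e.g. "ReadsInTSS"), each cell is scaled by median(column) / value,
# so a cell at the median gets a factor of 1 and deeper cells are scaled down.
ReadsInTSS <- c(5000, 10000, 20000)   # hypothetical per-cell values
normFactors <- median(ReadsInTSS) / ReadsInTSS
normFactors  # 2.0 1.0 0.5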
-#' @param nPrint If provided `seMarker` is from "GeneScoreMatrix" print the top n genes for each group based on how uniquely up-regulated the gene is. +#' @param nLabel An integer value that indicates how many of the top `n` features for each column in `seMarker` should be labeled on the side of the heatmap. +#' To remove all feature labels, set `nLabel = 0`. +#' @param nPrint If provided `seMarker` is from "GeneScoreMatrix" print the top `n` genes for each group based on how uniquely up-regulated the gene is. #' @param labelRows A boolean value that indicates whether all rows should be labeled on the side of the heatmap. #' @param returnMatrix A boolean value that indicates whether the final heatmap matrix should be returned in lieu of plotting the actual heatmap. #' @param transpose A boolean value that indicates whether the heatmap should be transposed prior to plotting or returning. @@ -830,6 +851,7 @@ plotMarkerHeatmap <- function( pal = NULL, binaryClusterRows = TRUE, clusterCols = TRUE, + subsetMarkers = NULL, labelMarkers = NULL, nLabel = 15, nPrint = 15, @@ -851,9 +873,10 @@ plotMarkerHeatmap <- function( .validInput(input = pal, name = "pal", valid = c("character", "null")) .validInput(input = binaryClusterRows, name = "binaryClusterRows", valid = c("boolean")) .validInput(input = clusterCols, name = "clusterCols", valid = c("boolean")) + .validInput(input = subsetMarkers, name = "subsetMarkers", valid = c("integer", "null")) .validInput(input = labelMarkers, name = "labelMarkers", valid = c("character", "null")) - .validInput(input = nLabel, name = "nLabel", valid = c("integer", "null")) - .validInput(input = nPrint, name = "nPrint", valid = c("integer", "null")) + .validInput(input = nLabel, name = "nLabel", valid = c("integer")) + .validInput(input = nPrint, name = "nPrint", valid = c("integer")) .validInput(input = labelRows, name = "labelRows", valid = c("boolean")) .validInput(input = returnMatrix, name = "returnMatrix", valid = c("boolean")) .validInput(input = transpose, name = "transpose", valid = c("boolean")) @@ -902,6 +925,16 @@ plotMarkerHeatmap <- function( }else{ idx <- which(rowSums(passMat, na.rm = TRUE) > 0 & matrixStats::rowVars(mat) != 0 & !is.na(matrixStats::rowVars(mat))) } + + if(!is.null(subsetMarkers)) { + if(length(which(subsetMarkers %ni% 1:nrow(mat))) == 0){ + idx <- subsetMarkers + } else { + stop("Rownames / indices provided to the subsetMarker parameter are outside of the boundaries of seMarker.") + } + + } + mat <- mat[idx,,drop=FALSE] passMat <- passMat[idx,,drop=FALSE] @@ -934,15 +967,19 @@ plotMarkerHeatmap <- function( } spmat <- passMat / rowSums(passMat) - if(metadata(seMarker)$Params$useMatrix == "GeneScoreMatrix"){ - message("Printing Top Marker Genes:") - for(x in seq_len(ncol(spmat))){ - genes <- head(order(spmat[,x], decreasing = TRUE), nPrint) - message(colnames(spmat)[x], ":") - message("\t", paste(as.vector(rownames(mat)[genes]), collapse = ", ")) + #only print out identified marker genes if subsetMarkers is NULL + if(is.null(subsetMarkers)) { + if(metadata(seMarker)$Params$useMatrix == "GeneScoreMatrix"){ + message("Printing Top Marker Genes:") + for(x in seq_len(ncol(spmat))){ + genes <- head(order(spmat[,x], decreasing = TRUE), nPrint) + message(colnames(spmat)[x], ":") + message("\t", paste(as.vector(rownames(mat)[genes]), collapse = ", ")) + } } } + if(is.null(labelMarkers)){ labelMarkers <- lapply(seq_len(ncol(spmat)), function(x){ as.vector(rownames(mat)[head(order(spmat[,x], decreasing = TRUE), nLabel)]) @@ -962,7 +999,9 @@ 
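# Hypothetical usage sketch for the new subsetMarkers argument: integer indices into
# rownames(rowData(seMarker)) pick exactly which features are drawn, effectively bypassing
# the cutOff filter. A minimal version of the bounds check, with placeholder values:
subsetMarkers <- c(12L, 57L, 103L)
nFeatures <- 500                       # stand-in for nrow(seMarker)
if (all(subsetMarkers %in% seq_len(nFeatures))) {
  idx <- subsetMarkers
} else {
  stop("Indices provided to subsetMarkers are outside the boundaries of seMarker.")
}
# heatmapGS <- plotMarkerHeatmap(seMarker = markersGS, subsetMarkers = subsetMarkers)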
plotMarkerHeatmap <- function( mat <- bS[[1]][,colnames(mat),drop=FALSE] } clusterRows <- FALSE - clusterCols <- bS[[2]] + if (clusterCols) { + clusterCols <- bS[[2]] + } }else{ clusterRows <- TRUE clusterCols <- TRUE @@ -1214,13 +1253,16 @@ markerPlot <- function(...){ #' @param cutOff A valid-syntax logical statement that defines which marker features from `seMarker` will be plotted. #' `cutoff` can contain any of the `assayNames` from `seMarker`. #' @param plotAs A string indicating whether to plot a volcano plot ("Volcano") or an MA plot ("MA"). +#' @param rastr A boolean value that indicates whether the plot should be rasterized using `ggrastr`. This does not rasterize +#' lines and labels, just the internal portions of the plot. #' @export plotMarkers <- function( seMarker = NULL, name = NULL, cutOff = "FDR <= 0.01 & abs(Log2FC) >= 0.5", plotAs = "Volcano", - scaleTo = 10^4 + scaleTo = 10^4, + rastr = TRUE ){ .validInput(input = seMarker, name = "seMarker", valid = c("SummarizedExperiment")) @@ -1228,6 +1270,7 @@ plotMarkers <- function( .validInput(input = cutOff, name = "cutOff", valid = c("character")) .validInput(input = plotAs, name = "plotAs", valid = c("character")) .validInput(input = scaleTo, name = "scaleTo", valid = c("numeric")) + .validInput(input = rastr, name = "rastr", valid = c("boolean")) #Evaluate AssayNames assayNames <- names(SummarizedExperiment::assays(seMarker)) @@ -1282,7 +1325,7 @@ plotMarkers <- function( ylim = c(-qLFC, qLFC), size = 1, extend = 0, - rastr = TRUE, + rastr = rastr, labelMeans = FALSE, labelAsFactors = FALSE, pal = pal, @@ -1299,7 +1342,7 @@ plotMarkers <- function( xlim = c(-qLFC, qLFC), extend = 0, size = 1, - rastr = TRUE, + rastr = rastr, labelMeans = FALSE, labelAsFactors = FALSE, pal = pal, @@ -1316,7 +1359,7 @@ plotMarkers <- function( xlim = c(-qDiff, qDiff), extend = 0, size = 1, - rastr = TRUE, + rastr = rastr, labelMeans = FALSE, labelAsFactors = FALSE, pal = pal, diff --git a/R/MatrixDeviations.R b/R/MatrixDeviations.R index fd9de175..904c049f 100644 --- a/R/MatrixDeviations.R +++ b/R/MatrixDeviations.R @@ -626,6 +626,11 @@ addBgdPeaks <- function( .validInput(input = outFile, name = "outFile", valid = c("character")) .validInput(input = force, name = "force", valid = c("boolean")) + if ("PeakMatrix" %ni% getAvailableMatrices(ArchRProj)) { + .logMessage(paste0("PeakMatrix does not exist in the provided ArchRProject. Add a peak matrix using addPeakMatrix(). See available matrix names from getAvailableMatrices()!"), logFile = logFile) + stop("PeakMatrix does not exist in the provided ArchRProject. Add a peak matrix using addPeakMatrix(). See available matrix names from getAvailableMatrices()!") + } + if(!is.null(metadata(getPeakSet(ArchRProj))$bgdPeaks) & !force){ if(file.exists(metadata(getPeakSet(ArchRProj))$bgdPeaks)){ @@ -755,6 +760,22 @@ getBgdPeaks <- function( useMatrix = useMatrix, filter0 = FALSE )) + + all1 <- all( + paste0(rS$seqnames, ":", rS$idx) %in% + paste0(seqnames(ArchRProj@peakSet), ":", ArchRProj@peakSet$idx) + ) + + all2 <- all( + paste0(seqnames(ArchRProj@peakSet), ":", ArchRProj@peakSet$idx) %in% + paste0(rS$seqnames, ":", rS$idx) + ) + + if(!(all1 & all2)){ + stop("PeakSet in Arrows does not match PeakSet in ArchRProject! 
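# Toy version of the getBgdPeaks() consistency check: peaks are keyed as "seqnames:idx"
# and the row sums read from the Arrow files must match the project's peakSet in both
# directions; otherwise re-running addPeakMatrix(..., force = TRUE) is suggested.
# Keys below are made up.
arrowKeys   <- c("chr1:1", "chr1:2", "chr2:1")
projectKeys <- c("chr1:1", "chr1:2", "chr2:1", "chr2:2")
if (!(all(arrowKeys %in% projectKeys) && all(projectKeys %in% arrowKeys))) {
  message("PeakSet in Arrows does not match PeakSet in ArchRProject!")
}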
+ To try to solve this, try re-running addPeakMatrix(ArchRProj, force=TRUE)") + } + rS$start <- start(ArchRProj@peakSet) rS$end <- end(ArchRProj@peakSet) rS$GC <- ArchRProj@peakSet$GC diff --git a/R/MatrixFeatures.R b/R/MatrixFeatures.R index a13730ae..15e50137 100644 --- a/R/MatrixFeatures.R +++ b/R/MatrixFeatures.R @@ -265,6 +265,8 @@ addPeakMatrix <- function( for(z in seq_along(uniqueChr)){ + prefix <- sprintf("Chr %s (%s of %s)!", uniqueChr[z], z, length(uniqueChr)) + o <- tryCatch({ o <- h5closeAll() diff --git a/R/MatrixGeneExpression.R b/R/MatrixGeneExpression.R index 1b432e45..b54f409f 100644 --- a/R/MatrixGeneExpression.R +++ b/R/MatrixGeneExpression.R @@ -17,6 +17,9 @@ #' @param verbose A boolean describing whether to print to console messages of progress. #' @param threads The number of threads to be used for parallel computing. #' @param parallelParam A list of parameters to be passed for biocparallel/batchtools parallel computing. +#' @param strictMatch A boolean value indicating whether every cell in `input` must be represented in `seRNA`. If set to `FALSE`, +#' and this `GeneExpressionMatrix` is used for certain downstream analyses such as `addIterativeLSI()`, then errors may occur +#' because not all cells will have relevant information. #' @param force A boolean value indicating whether to force the matrix indicated by `matrixName` to be overwritten if it already exist in the given `input`. #' @param logFile The path to a file to be used for logging ArchR output. #' @export @@ -29,10 +32,24 @@ addGeneExpressionMatrix <- function( verbose = TRUE, threads = getArchRThreads(), parallelParam = NULL, + strictMatch = FALSE, force = TRUE, logFile = createLogFile("addGeneExpressionMatrix") ){ + .validInput(input = input, name = "input", valid = c("ArchRProj", "character")) + .validInput(input = seRNA, name = "seRNA", valid = c("SummarizedExperiment")) + .validInput(input = chromSizes, name = "chromSizes", valid = c("granges")) + .validInput(input = excludeChr, name = "excludeChr", valid = c("character", "null")) + .validInput(input = scaleTo, name = "scaleTo", valid = c("numeric")) + .validInput(input = verbose, name = "verbose", valid = c("boolean")) + .validInput(input = threads, name = "threads", valid = c("integer")) + .validInput(input = parallelParam, name = "parallelParam", valid = c("parallelparam", "null")) + .validInput(input = strictMatch, name = "strictMatch", valid = c("boolean")) + .validInput(input = force, name = "force", valid = c("boolean")) + .validInput(input = logFile, name = "logFile", valid = c("character")) + + if(inherits(input, "ArchRProject")){ ArrowFiles <- getArrowFiles(input) allCells <- rownames(getCellColData(input)) @@ -61,11 +78,18 @@ addGeneExpressionMatrix <- function( if(!is.null(allCells)){ cellsInArrows <- allCells } + overlap <- sum(cellsInArrows %in% colnames(seRNA)) / length(cellsInArrows) .logMessage("Overlap w/ scATAC = ", round(overlap,3), logFile = logFile, verbose = TRUE) if(overlap == 0){ - stop("No overlap found with scATAC!") + stop("No overlapping cell names found between ArrowFiles and seRNA object! Cell names in ArrowFiles must match colnames in seRNA!") + } else if(overlap != 1) { + if(strictMatch){ + stop("Error! 'strictMatch = TRUE' and not all cells in input are represented in the provided gene expression seRNA. To proceed, please subset your ArchRProject using the subsetArchRProject() function to contain only cells present in seRNA or set 'strictMatch = FALSE'.") + } else { + .logMessage("Warning! 
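# Stand-alone illustration of the strictMatch logic in addGeneExpressionMatrix(): the
# fraction of ATAC cell names present in the RNA object decides between proceeding,
# warning, or (with strictMatch = TRUE) stopping. Cell names below are made up.
cellsInArrows <- c("S1#AAAC", "S1#AAAG", "S1#AAAT")
rnaCells      <- c("S1#AAAC", "S1#AAAG")
strictMatch   <- TRUE
overlap <- sum(cellsInArrows %in% rnaCells) / length(cellsInArrows)
if (overlap == 0) {
  stop("No overlapping cell names found between ArrowFiles and seRNA!")
} else if (overlap != 1 && strictMatch) {
  message("strictMatch = TRUE: subset the ArchRProject to the cells present in seRNA.")  # the patch stop()s here
}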
Not all cells in input exist in seRNA! This may cause downstream issues with functions that require information from all cells. For example, addIterativeLSI() will not work on this GeneExpressionMatrix! To remove these mis-matched cells, subset your ArchRProject using the subsetArchRProject() function to contain only cells present in seRNA and set 'strictMatch = TRUE'", logFile = logFile, verbose = TRUE) + } } splitCells <- split(cellsInArrows, stringr::str_split(cellsInArrows, pattern = "#", simplify=TRUE)[,1]) @@ -123,6 +147,7 @@ addGeneExpressionMatrix <- function( #Remove Input from args args$input <- NULL args$chromSizes <- NULL + args$strictMatch <- NULL #Run With Parallel or lapply outList <- .batchlapply(args) diff --git a/R/MatrixGeneScores.R b/R/MatrixGeneScores.R index ef34822d..1d1d10b3 100644 --- a/R/MatrixGeneScores.R +++ b/R/MatrixGeneScores.R @@ -103,6 +103,9 @@ addGeneScoreMatrix <- function( if(inherits(mcols(genes)$symbol, "list") | inherits(mcols(genes)$symbol, "SimpleList")){ stop("Found a list in genes symbol! This is an incorrect format. Please correct your genes!") } + if(!any(colnames(mcols(genes)) == "symbol")) { + stop("No symbol column in genes! A column named symbol is exected in the GRanges object passed to the genes parameter!") + } .startLogging(logFile = logFile) .logThis(mget(names(formals()),sys.frame(sys.nframe())), "addGeneScoreMatrix Input-Parameters", logFile = logFile) @@ -122,7 +125,7 @@ addGeneScoreMatrix <- function( if(subThreading){ h5disableFileLocking() }else{ - args$threads <- length(inputFiles) + args$threads <- length(ArrowFiles) } #Remove Input from args @@ -218,9 +221,9 @@ addGeneScoreMatrix <- function( if(useTSS){ .logMessage(paste0(sampleName, " .addGeneScoreMat useTSS = TRUE")) distMethod <- "GenePromoter" - geneRegions$geneStart <- start(resize(geneRegions, 1, "start")) - geneRegions$geneEnd <- start(resize(geneRegions, 1, "end")) - geneRegions <- resize(geneRegions, 1, "start") + geneRegions$geneStart <- start(GenomicRanges::resize(geneRegions, 1, "start")) + geneRegions$geneEnd <- start(GenomicRanges::resize(geneRegions, 1, "end")) + geneRegions <- GenomicRanges::resize(geneRegions, 1, "start") if(extendTSS){ geneRegions <- extendGR(gr = geneRegions, upstream = geneUpstream, downstream = geneDownstream) } @@ -228,8 +231,8 @@ addGeneScoreMatrix <- function( }else{ .logMessage(paste0(sampleName, " .addGeneScoreMat useTSS = FALSE")) distMethod <- "GeneBody" - geneRegions$geneStart <- start(resize(geneRegions, 1, "start")) - geneRegions$geneEnd <- start(resize(geneRegions, 1, "end")) + geneRegions$geneStart <- start(GenomicRanges::resize(geneRegions, 1, "start")) + geneRegions$geneEnd <- start(GenomicRanges::resize(geneRegions, 1, "end")) geneRegions <- extendGR(gr = geneRegions, upstream = geneUpstream, downstream = geneDownstream) m <- 1 / width(geneRegions) geneRegions$geneWeight <- 1 + m * (geneScaleFactor - 1) / (max(m) - min(m)) @@ -314,8 +317,8 @@ addGeneScoreMatrix <- function( #Time to Overlap Gene Windows if(useGeneBoundaries){ - geneStartz <- start(resize(geneRegionz, 1, "start")) - geneEndz <- start(resize(geneRegionz, 1, "end")) + geneStartz <- start(GenomicRanges::resize(geneRegionz, 1, "start")) + geneEndz <- start(GenomicRanges::resize(geneRegionz, 1, "end")) pminGene <- pmin(geneStartz, geneEndz) pmaxGene <- pmax(geneStartz, geneEndz) @@ -377,7 +380,7 @@ addGeneScoreMatrix <- function( #Determine Sign for Distance relative to strand (Directionality determined based on dist from gene start) isMinus <- 
BiocGenerics::which(strand(geneRegionz) == "-") - signDist <- sign(start(uniqueTiles)[subjectHits(tmp)] - start(resize(geneRegionz,1,"start"))[queryHits(tmp)]) + signDist <- sign(start(uniqueTiles)[subjectHits(tmp)] - start(GenomicRanges::resize(geneRegionz,1,"start"))[queryHits(tmp)]) signDist[isMinus] <- signDist[isMinus] * -1 #Correct the orientation for the distance! @@ -393,7 +396,7 @@ addGeneScoreMatrix <- function( if(!is.null(blacklist)){ if(length(blacklist) > 0){ blacklistz <- blacklist[[chrz]] - if(is.null(blacklistz) | length(blacklistz) > 0){ + if(!is.null(blacklistz) | length(blacklistz) > 0){ tilesBlacklist <- 1 * (!overlapsAny(uniqueTiles, ranges(blacklistz))) if(sum(tilesBlacklist == 0) > 0){ x <- x * tilesBlacklist[subjectHits(tmp)] #Multiply Such That All Blacklisted Tiles weight is now 0! diff --git a/R/MatrixTiles.R b/R/MatrixTiles.R index fa22896f..4b420c50 100644 --- a/R/MatrixTiles.R +++ b/R/MatrixTiles.R @@ -5,7 +5,6 @@ #' Add TileMatrix to ArrowFiles or an ArchRProject #' #' This function, for each sample, will independently compute counts for each tile -#' per cell in the ArrowFile #' #' @param input An `ArchRProject` object or character vector of ArrowFiles. #' @param chromSizes A named numeric vector containing the chromsome names and lengths. The default behavior is to retrieve @@ -204,6 +203,29 @@ addTileMatrix <- function( .logThis(min(matchID), paste0("MinCell_TileMatrix_",z,"_",chr), logFile = logFile) .logThis(max(matchID), paste0("MaxCell_TileMatrix_",z,"_",chr), logFile = logFile) + #Check Fragments for validity in case + nf1 <- length(fragments) + + #Check 1 + fragmentsBad1 <- fragments[!(start(fragments) >= 1)] + fragments <- fragments[start(fragments) >= 1] + + #Check 2 + fragmentsBad2 <- fragments[!(end(fragments) <= chromLengths[z])] + fragments <- fragments[end(fragments) <= chromLengths[z]] + + #Check N + nf2 <- length(fragments) + if(nf2 < nf1){ + warning("Skipping over fragments not within chromosome range on Chr:", chr) + .logThis(fragmentsBad1, "fragmentsBad1", logFile = logFile) + print("Bad1 (Start not greater than 0): ") + print(fragmentsBad1) + print("Bad2 (End greater than chromsome length): ") + .logThis(fragmentsBad2, "fragmentsBad2", logFile = logFile) + print(fragmentsBad2) + } + #Create Sparse Matrix mat <- Matrix::sparseMatrix( i = c(trunc(start(fragments) / tileSize), trunc(end(fragments) / tileSize)) + 1, @@ -244,7 +266,8 @@ addTileMatrix <- function( Group = paste0("TileMatrix/", chr), binarize = binarize, addColSums = TRUE, - addRowSums = TRUE + addRowSums = TRUE, + addRowVarsLog2 = TRUE ) gc() diff --git a/R/ModuleScore.R b/R/ModuleScore.R index 99af34de..725276ab 100644 --- a/R/ModuleScore.R +++ b/R/ModuleScore.R @@ -1,12 +1,20 @@ #' Add Module Scores to an ArchRProject #' -#' This function computes imputations weights that describe each cell as a linear combination of many cells based on a MAGIC diffusion matrix. -#' -#' RRR +#' This function calculates a module score from a set of features across all cells. This allows for +#' grouping of multiple features together into a single quantitative measurement. Currently, this +#' function only works for modules derived from the `GeneScoreMatrix`. Each module is added as a +#' new column in `cellColData` #' #' @param ArchRProj An `ArchRProject` object. -#' @param seed A number to be used as the seed for random number generation. It is recommended to keep track of the seed used so that you can -#' reproduce results downstream. 
+#' @param useMatrix The name of the matrix to be used for calculation of the module score. See `getAvailableMatrices()` to view available options. +#' @param name The name to be given to the designated module. If `features` is a list, this name will be prepended to the feature set names given in the list as shown below. +#' @param features A list of feature names to be grouped into modules. For example, `list(BScore = c("MS4A1", "CD79A", "CD74"), TScore = c("CD3D", "CD8A", "GZMB", "CCR7", "LEF1"))`. +#' Each named element in this list will be stored as a separate module. The examples given in these parameters would yield two modules called `Module.Bscore` and `Module.Tscore`. +#' If the elements of this list are not named, they will be numbered in order, i.e. `Module1`, `Module2`. +#' @param nBin The number of bins to use to divide all features for identification of signal-matched features for background calculation +#' @param nBgd The number of background features to use for signal normalization. +#' @param seed A number to be used as the seed for random number generation required when sampling cells for the background set. It is recommended +#' to keep track of the seed used so that you can reproduce results downstream. #' @param threads The number of threads to be used for parallel computing. #' @param logFile The path to a file to be used for logging ArchR output. #' @export @@ -22,6 +30,20 @@ addModuleScore <- function( logFile = createLogFile("addModuleScore") ){ + .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj")) + .validInput(input = useMatrix, name = "useMatrix", valid = c("character")) + .validInput(input = name, name = "name", valid = c("character")) + .validInput(input = features, name = "features", valid = c("list")) + .validInput(input = nBin, name = "nBin", valid = c("integer")) + .validInput(input = nBgd, name = "nBgd", valid = c("integer")) + .validInput(input = seed, name = "seed", valid = c("integer","null")) + .validInput(input = threads, name = "threads", valid = c("integer")) + .validInput(input = logFile, name = "logFile", valid = c("character", "null")) + + if(useMatrix %ni% getAvailableMatrices(ArchRProj)){ + stop("useMatrix not in available matrices! See getAvailableMatrices!") + } + if(!is.null(seed)) set.seed(seed) #Get Feature DF @@ -29,10 +51,6 @@ addModuleScore <- function( rownames(featureDF) <- paste0(featureDF$seqnames, ":", featureDF$idx) featureDF$Match <- seq_len(nrow(featureDF)) - if(useMatrix %ni% getAvailableMatrices(ArchRProj)){ - stop("useMatrix not in available matrices! See getAvailableMatrices!") - } - matrixClass <- h5read(getArrowFiles(ArchRProj)[1], paste0(useMatrix, "/Info/Class")) if(matrixClass == "Sparse.Assays.Matrix"){ @@ -42,6 +60,7 @@ addModuleScore <- function( } } + #Figure out the index numbers of the selected features within the given matrix if(grepl(":",unlist(features)[1])){ sname <- stringr::str_split(unlist(features),pattern=":",simplify=TRUE)[,1] @@ -50,7 +69,7 @@ addModuleScore <- function( idx <- lapply(seq_along(name), function(x){ ix <- intersect(which(tolower(name[x]) == tolower(featureDF$name)), BiocGenerics::which(tolower(sname[x]) == tolower(featureDF$seqnames))) if(length(ix)==0){ - .logStop(sprintf("FeatureName (%s) does not exist! See getFeatures", name[x]), logFile = logFile) + .logStop(sprintf("FeatureName (%s) does not exist! 
See available features using getFeatures()", name[x]), logFile = logFile) } ix }) %>% unlist @@ -59,8 +78,8 @@ addModuleScore <- function( idx <- lapply(seq_along(unlist(features)), function(x){ ix <- which(tolower(unlist(features)[x]) == tolower(featureDF$name))[1] - if(length(ix)==0){ - .logStop(sprintf("FeatureName (%s) does not exist! See getFeatures", unlist(features)[x]), logFile = logFile) + if(is.na(ix)){ + .logStop(sprintf("FeatureName (%s) does not exist! See available features using getFeatures()", unlist(features)[x]), logFile = logFile) } ix }) %>% unlist @@ -76,24 +95,28 @@ addModuleScore <- function( featuresUse <- featureDF[idx,] featuresUse$Module <- Rle(stack(features)[,2]) - #Get Averages + #Get average values for all features and then order the features based on their average values + #so that the features can be binned into nBins rS <- ArchR:::.getRowSums(ArrowFiles = getArrowFiles(ArchRProj), useMatrix = useMatrix) rS <- rS[order(rS[,3]), ] rS$Bins <- Rle(ggplot2::cut_number(x = rS[,3] + rnorm(length(rS[,3]))/1e30, n = nBin, labels = FALSE, right = FALSE)) rS$Match <- match(paste0(rS$seqnames, ":", rS$idx), rownames(featureDF)) + #check that the number of selected background features isnt bigger than the size of each bin if(nBgd > min(rS$Bins@lengths)){ stop("nBgd must be lower than ", min(rS$Bins@lengths), "!") } + #Match the indicies across the different vectors idxMatch <- match(paste0(featuresUse$seqnames, ":", featuresUse$idx), paste0(rS$seqnames, ":", rS$idx)) featuresUse$Bins <- as.vector(rS$Bins[idxMatch]) - #MakeLists - featureList <- split(featuresUse$Match, featuresUse$Module) - moduleList <- split(featuresUse$Bins, featuresUse$Module) - binList <- split(rS$Match, rS$Bins) + #Make lists + featureList <- split(featuresUse$Match, featuresUse$Module) #feature indicies per module + moduleList <- split(featuresUse$Bins, featuresUse$Module) #bins for each feature per module + binList <- split(rS$Match, rS$Bins) #list of all indicies for each bin + #calculate the module score by normalizing to a background set of features dfM <- lapply(seq_along(featureList), function(x){ message("Computing Module ",x, " of ", length(featureList)) binx <- binList[moduleList[[x]]] @@ -109,8 +132,15 @@ addModuleScore <- function( doSampleCells = FALSE ) Matrix::colMeans(m[seq_along(idxFgd), ]) - Matrix::colMeans(m[-seq_along(idxFgd), ]) - }) %>% Reduce("cbind", .) - + }) + + if (length(features) > 1) { + dfM <- Reduce("cbind", dfM) + } else { + dfM <- as.data.frame(dfM[[1]], row.names = names(dfM), drop = FALSE) + } + + #add the module scores as new columns in cellColData for(x in seq_len(ncol(dfM))){ ArchRProj <- addCellColData(ArchRProj, data = dfM[,x], name=names(featureList)[x], cells=rownames(dfM), force = TRUE) } diff --git a/R/MultiModal.R b/R/MultiModal.R index ac20df74..6733d78c 100644 --- a/R/MultiModal.R +++ b/R/MultiModal.R @@ -8,35 +8,120 @@ #' #' @param input A character of paths to 10x feature hdf5 file(s). These will traditionally have a suffix similar to "filtered_feature_bc_matrix.h5". #' @param names A character of sample names associated with each input file. +#' @param strictMatch Only relevant when multiple input files are used. A boolean that indictes whether rows (genes) that do not match perfectly in the matrices +#' should be removed (`strictMatch = TRUE`) or coerced (`strictMatch = FALSE`). CellRanger seems to occassionally use different ensembl ids for the same gene across +#' different samples. 
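# Numeric toy example of how a module score is assembled: the per-cell mean of the
# module's features minus the per-cell mean of an equally sized, expression-bin-matched
# background set. The matrix and feature picks below are random stand-ins for real
# GeneScoreMatrix rows.
set.seed(1)
m <- matrix(rpois(20 * 10, lambda = 2), nrow = 20,
            dimnames = list(paste0("f", 1:20), paste0("cell", 1:10)))
idxFgd <- 1:5                  # features belonging to the module
idxBgd <- sample(6:20, 5)      # bin-matched background (random here)
moduleScore <- colMeans(m[idxFgd, ]) - colMeans(m[idxBgd, ])
round(moduleScore, 2)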
If you are comfortable tolerating such mismatches, you can coerce all matrices to fit together, in which case the gene metadata present in +#' the first listed sample will be applied to all matrices for that particular gene entry. Regardless of what value is used for `strictMatch`, this function +#' cannot tolerate mismatched gene names, only mismatched metadata for the same gene. +#' @param verbose Only relevant when multiple input files are used. A boolean that indicates whether messaging about mismatches should be verbose (`TRUE`) or minimal (`FALSE`) #' @param featureType The name of the feature to extract from the 10x feature file. #' See https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/h5_matrices for more information. #' @export import10xFeatureMatrix <- function( input = NULL, - names = NULL, + names = NULL, + strictMatch = TRUE, + verbose = TRUE, featureType = "Gene Expression" - ){ - - if(!all(file.exists(input))){ +){ + + .validInput(input = input, name = "input", valid = c("character")) + .validInput(input = names, name = "names", valid = c("character")) + .validInput(input = strictMatch, name = "strictMatch", valid = c("boolean")) + .validInput(input = verbose, name = "verbose", valid = c("boolean")) + .validInput(input = featureType, name = "featureType", valid = c("character")) + + if (!all(file.exists(input))) { stop("Not all input file paths exist!") } - - featureMats <- lapply(seq_along(input), function(y){ + featureMats <- lapply(seq_along(input), function(y) { message("Importing Feature Matrix ", y, " of ", length(input)) - .importFM(featureMatrix = input[y], featureType = featureType, name = names[y]) - }) - - featureMats <- tryCatch({ - Reduce("cbind", featureMats) - }, error = function(e){ - message("Error in combining individual feature matrices! Returning as a list of individual feature matrices!") - featureMats + .importFM(featureMatrix = input[y], featureType = featureType, + name = names[y]) }) + + message("Re-ordering RNA matricies for consistency.") + for(j in 1:length(featureMats)) { + featureMats[[j]] <- sort.GenomicRanges(sortSeqlevels(featureMats[[j]]), ignore.strand = TRUE) + } + + #if more than one filtered feature barcode matrix is supplied, then merge the RSE objects + if (length(featureMats) > 1) { + message("Merging individual RNA objects...") + #make the first matrix the base matrix and merge all others into it + rse_final <- featureMats[[1]] + + rowsToRemove <- c() #rows that have previously been removed from rse_final + + #for each additional feature matrix (starting with the second), look for mismatches with rse_final and merge accordingly + for (i in 2:length(featureMats)) { + mismatchWarning <- TRUE #a boolean to prevent output of the warning message many times and only output it once + + message(sprintf("\nMerging %s", names[i])) + + if (!identical(rownames(rse_final), rownames(featureMats[[i]]))) { + stop("Error - rownames (genes) of individual RNA objects are not equivalent.") + } + if (!identical(colnames(rowData(rse_final)), colnames(rowData(featureMats[[i]])))) { + stop("Error - rowData (gene metadata) of individual RNA objects have different columns. This is highly unusual and merging has been aborted.") + } + if (!identical(names(assays(rse_final)), names(assays(featureMats[[i]])))) { + stop("Error - available assays of individual RNA objects are not equivalent. 
Each object is expected to only have one assay named 'counts'.") + } + + #check each column in rowData to check for mismatches that should be thrown as warnings + #occasionally, it seems like 10x is annotating different ensembl IDs to the same gene which seems like a bad way to go + #this is a bit heavy-handed but it seems like the safest thing to do is report any mismatch rather than merge blindly + + for (x in 1:ncol(rowData(rse_final))) { + if (!identical(rowData(rse_final)[,x], rowData(featureMats[[i]])[,x])) { + if(mismatchWarning) { + message(sprintf("Warning! Some values within column \"%s\" of the rowData (gene metadata) of your objects do not precisely match!", colnames(rowData(rse_final))[x])) + message("This is often caused by slight variations in Ensembl IDs and gene locations used by cellranger across different samples. ArchR will ignore these mismatches and allow merging to proceed but you should check to make sure that these are ok for your data.\n") + mismatchWarning <- FALSE + } + + #detect all of the mismatches betwenn rse_final and the current featureMat + mismatch <- which(rowData(rse_final)[,x] != rowData(featureMats[[i]])[,x]) + #for each detected mismatch, handle the mismatch according to the value of strictMatch + for (y in 1:length(mismatch)) { + if (verbose) { + message(sprintf("Mismatch in column \"%s\" row %s for %s: %s does not exactly match %s!", colnames(rowData(rse_final))[x], mismatch[y], names[i], rowData(rse_final)[mismatch[y],x], rowData(featureMats[[i]])[mismatch[y],x])) + } + if (strictMatch) { + if (verbose) { + message("strictMatch = TRUE so the corresponding gene entry with mismatching information will be removed.") + } + rowsToRemove <- unique(c(rowsToRemove, mismatch[y])) + #temporarily force the data to match so that merging can occur easily. 
Mismatched rows will be removed later + rowData(featureMats[[i]])[mismatch[y],] <- rowData(rse_final)[mismatch[y],] + rowRanges(featureMats[[i]])[mismatch[y]] <- rowRanges(rse_final)[mismatch[y]] + } else { + if (verbose) { + message("strictMatch = FALSE so mismatching information will be coerced to match the first sample provided.") + } + rowData(featureMats[[i]])[mismatch[y],] <- rowData(rse_final)[mismatch[y],] + rowRanges(featureMats[[i]])[mismatch[y]] <- rowRanges(rse_final)[mismatch[y]] + } + } + } + } - featureMats - + rse_final <- SummarizedExperiment::cbind(rse_final, featureMats[[i]]) + } + if (strictMatch) { + if(length(rowsToRemove) > 0) { + rse_final <- rse_final[-rowsToRemove,] + } + } + return(rse_final) + } + else { + return(featureMats[[1]]) + } } + .importFM <- function(featureMatrix = NULL, featureType = NULL, name = NULL){ o <- h5closeAll() diff --git a/R/ProjectMethods.R b/R/ProjectMethods.R index 1cb18b7d..0ac8258c 100644 --- a/R/ProjectMethods.R +++ b/R/ProjectMethods.R @@ -390,7 +390,6 @@ addPeakSet <- function( #Get NucleoTide Content peakSet <- tryCatch({ - .requirePackage(genomeAnnotation$genome) .requirePackage("Biostrings",source="bioc") BSgenome <- eval(parse(text = genomeAnnotation$genome)) BSgenome <- validBSgenome(BSgenome) diff --git a/R/QualityControl.R b/R/QualityControl.R index 20c83f30..39672ab5 100644 --- a/R/QualityControl.R +++ b/R/QualityControl.R @@ -47,8 +47,10 @@ plotTSSEnrichment <- function( chr <- paste0(seqnames(chromSizes)) chr <- gtools::mixedsort(intersect(chr, paste0(seqnames(TSS)))) + .logThis(chr, paste0("chr"), logFile = logFile) TSS <- sort(sortSeqlevels(TSS)) - splitTSS <- split(resize(TSS,1,"start"), seqnames(TSS))[chr] + splitTSS <- split(GenomicRanges::resize(TSS,1,"start"), seqnames(TSS))[chr] + .logThis(splitTSS, paste0("splitTSS"), logFile = logFile) window <- 2 * flank + 1 groups <- getCellColData(ArchRProj = ArchRProj, select = groupBy, drop = FALSE) uniqGroups <- gtools::mixedsort(unique(groups[,1])) @@ -57,50 +59,68 @@ plotTSSEnrichment <- function( h5disableFileLocking() } - dfTSS <- .safelapply(seq_along(uniqGroups), function(x){ + dfTSS <- .safelapply(seq_along(uniqGroups), function(z){ - .logDiffTime(paste0(uniqGroups[x], " Computing TSS (",x," of ",length(uniqGroups),")!"), t1 = tstart, logFile = logFile) + .logDiffTime(paste0(uniqGroups[z], " Computing TSS (",z," of ",length(uniqGroups),")!"), t1 = tstart, logFile = logFile) - cellx <- rownames(groups)[which(paste0(groups[,1]) == uniqGroups[x])] + cellx <- rownames(groups)[which(paste0(groups[,1]) == uniqGroups[z])] - for(i in seq_along(chr)){ + for(k in seq_along(chr)){ - TSSi <- splitTSS[[chr[i]]] + #TSS for Chr + TSSi <- splitTSS[[chr[k]]] - covi <- unlist(suppressMessages(getFragmentsFromProject( + #Set TSS To be a dummy chr1 + TSSi <- GRanges(seqnames=rep("chr1",length(TSSi)), ranges = ranges(TSSi), strand = strand(TSSi)) + .logThis(TSSi, paste0(uniqGroups[z], " : TSSi : ", chr[k]), logFile = logFile) + + #Extract Fragments + covi <- suppressMessages(getFragmentsFromProject( ArchRProj = ArchRProj, - subsetBy = chromSizes[paste0(seqnames(chromSizes)) %in% chr[i]], + subsetBy = chromSizes[paste0(seqnames(chromSizes)) %in% chr[k]], cellNames = cellx, logFile = logFile - )), use.names=FALSE) %>% - sort %>% - {coverage(IRanges(c(start(.), end(.)), width = 1))} - - .logThis(covi, paste0(uniqGroups[x], " : Cov : ", chr[i]), logFile = logFile) - - if(i == 1){ - sumTSS <- rleSumsStranded(list(chr1=covi), list(chr1=TSSi), window, as.integer) + ) %>% unlist(use.names = 
FALSE)) + .logThis(covi, paste0(uniqGroups[z], " : Fragments : ", chr[k]), logFile = logFile) + + #Get Insertions + covi <- sort(c(start(covi), end(covi))) + .logThis(covi, paste0(uniqGroups[z], " : Insertions : ", chr[k]), logFile = logFile) + + #IRanges + covi <- IRanges(start = covi, width = 1) + .logThis(covi, paste0(uniqGroups[z], " : Insertions2 : ", chr[k]), logFile = logFile) + + #Coverage + covi <- IRanges::coverage(covi) + .logThis(covi, paste0(uniqGroups[z], " : Cov : ", chr[k]), logFile = logFile) + + #Compute Sum + sumTSSi <- rleSumsStranded(list(chr1=covi), list(chr1=TSSi), window, as.integer) + .logThis(sumTSSi, paste0(uniqGroups[z], " : SumTSS 1 : ", chr[k]), logFile = logFile) + + if(k == 1){ + sumTSS <- sumTSSi }else{ - sumTSS <- sumTSS + rleSumsStranded(list(chr1=covi), list(chr1=TSSi), window, as.integer) + sumTSS <- sumTSS + sumTSSi } - - .logThis(sumTSS, paste0(uniqGroups[x], " : SumTSS : ", chr[i]), logFile = logFile) + .logThis(sumTSS, paste0(uniqGroups[z], " : SumTSS : ", chr[k]), logFile = logFile) } normBy <- mean(sumTSS[c(1:norm,(flank*2-norm+1):(flank*2+1))]) df <- DataFrame( - group = uniqGroups[x], + group = uniqGroups[z], x = seq_along(sumTSS) - flank - 1, value = sumTSS, normValue = sumTSS / normBy, smoothValue = .centerRollMean(sumTSS/normBy, 11) ) - .logThis(df, paste0(uniqGroups[x], " : TSSDf"), logFile = logFile) + .logThis(df, paste0(uniqGroups[z], " : TSSDf"), logFile = logFile) - .logDiffTime(paste0(uniqGroups[x], " Finished Computing TSS (",x," of ",length(uniqGroups),")!"), t1 = tstart, logFile = logFile) + .logDiffTime(paste0(uniqGroups[z], " Finished Computing TSS (",z," of ",length(uniqGroups),")!"), t1 = tstart, logFile = logFile) df diff --git a/R/RNAIntegration.R b/R/RNAIntegration.R index 8e11b27e..d1137e79 100644 --- a/R/RNAIntegration.R +++ b/R/RNAIntegration.R @@ -143,6 +143,11 @@ addGeneIntegrationMatrix <- function( ######################################################################################### .logDiffTime("Checking ATAC Input", tstart, verbose = verbose, logFile = logFile) + if (useMatrix %ni% getAvailableMatrices(ArchRProj)) { + .logMessage(paste0("Matrix ", useMatrix, " does not exist in the provided ArchRProject. See available matrix names from getAvailableMatrices()!"), logFile = logFile) + stop("Matrix name provided to useMatrix does not exist in ArchRProject!") + } + if(!is.null(groupATAC)){ dfATAC <- getCellColData(ArchRProj = ArchRProj, select = groupATAC, drop = FALSE) } @@ -203,6 +208,12 @@ addGeneIntegrationMatrix <- function( seuratRNA$Group <- paste0(seRNA@meta.data[,groupRNA]) rm(seRNA) } + + if("RNA" %in% names(seuratRNA@assays)){ + DefaultAssay(seuratRNA) <- "RNA" + }else{ + stop("'RNA' is not present in Seurat Object's Assays! Please make sure that this assay is present!") + } gc() if(!is.null(groupRNA)){ diff --git a/R/ReproduciblePeakSet.R b/R/ReproduciblePeakSet.R index 8699e748..a5e47a53 100644 --- a/R/ReproduciblePeakSet.R +++ b/R/ReproduciblePeakSet.R @@ -20,7 +20,7 @@ #' This is important to allow for exclusion of pseudo-bulk replicates derived from very low cell numbers. #' @param excludeChr A character vector containing the `seqnames` of the chromosomes that should be excluded from peak calling. #' @param pathToMacs2 The full path to the MACS2 executable. -#' @param genomeSize The genome size to be used for MACS2 peak calling (see MACS2 documentation). +#' @param genomeSize The genome size to be used for MACS2 peak calling (see MACS2 documentation). 
This is required if genome is not hg19, hg38, mm9, or mm10.
 #' @param shift The number of basepairs to shift each Tn5 insertion. When combined with `extsize` this allows you to create proper fragments,
 #' centered at the Tn5 insertion site, for use with MACS2 (see MACS2 documentation).
 #' @param extsize The number of basepairs to extend the MACS2 fragment after `shift` has been applied. When combined with `extsize` this
@@ -166,6 +166,8 @@ addReproduciblePeakSet <- function(
    genomeSize <- 2.7e9
  }else if(grepl("mm9|mm10", getGenome(ArchRProj), ignore.case = TRUE)){
    genomeSize <- 1.87e9
+  }else {
+    stop("Non-standard genome detected. Argument genomeSize is required!")
  }
}
@@ -211,7 +213,6 @@ addReproduciblePeakSet <- function(
  #####################################################
  # BSgenome for Add Nucleotide Frequencies!
  #####################################################
-  .requirePackage(genomeAnnotation$genome)
  .requirePackage("Biostrings",source="bioc")
  BSgenome <- eval(parse(text = genomeAnnotation$genome))
  BSgenome <- validBSgenome(BSgenome)
@@ -596,7 +597,7 @@ addReproduciblePeakSet <- function(
  #Validate
  peaks <- .validGRanges(peaks)
-  peakSummits <- resize(peaks,1,"center")
+  peakSummits <- GenomicRanges::resize(peaks,1,"center")
  geneAnnotation$genes <- .validGRanges(geneAnnotation$genes)
  geneAnnotation$exons <- .validGRanges(geneAnnotation$exons)
  geneAnnotation$TSS <- .validGRanges(geneAnnotation$TSS)
@@ -604,11 +605,11 @@ addReproduciblePeakSet <- function(
  #First Lets Get Distance to Nearest Gene Start
  .logMessage("Annotating Peaks : Nearest Gene", logFile = logFile)
-  distPeaks <- distanceToNearest(peakSummits, resize(geneAnnotation$genes, 1, "start"), ignore.strand = TRUE)
+  distPeaks <- distanceToNearest(peakSummits, GenomicRanges::resize(geneAnnotation$genes, 1, "start"), ignore.strand = TRUE)
  mcols(peaks)$distToGeneStart <- mcols(distPeaks)$distance
  mcols(peaks)$nearestGene <- mcols(geneAnnotation$genes)$symbol[subjectHits(distPeaks)]
  .logMessage("Annotating Peaks : Gene", logFile = logFile)
-  promoters <- extendGR(resize(geneAnnotation$genes, 1, "start"), upstream = promoterRegion[1], downstream = promoterRegion[2])
+  promoters <- extendGR(GenomicRanges::resize(geneAnnotation$genes, 1, "start"), upstream = promoterRegion[1], downstream = promoterRegion[2])
  op <- overlapsAny(peakSummits, promoters, ignore.strand = TRUE)
  og <- overlapsAny(peakSummits, geneAnnotation$genes, ignore.strand = TRUE)
  oe <- overlapsAny(peakSummits, geneAnnotation$exons, ignore.strand = TRUE)
@@ -620,12 +621,12 @@ addReproduciblePeakSet <- function(
  #First Lets Get Distance to Nearest TSS's
  .logMessage("Annotating Peaks : TSS", logFile = logFile)
-  distTSS <- distanceToNearest(peakSummits, resize(geneAnnotation$TSS, 1, "start"), ignore.strand = TRUE)
+  distTSS <- distanceToNearest(peakSummits, GenomicRanges::resize(geneAnnotation$TSS, 1, "start"), ignore.strand = TRUE)
  mcols(peaks)$distToTSS <- mcols(distTSS)$distance
  if("symbol" %in% colnames(mcols(geneAnnotation$TSS))){
-    mcols(peaks)$nearestTSS <- mcols(geneAnnotation$TSS)$symbol[subjectHits(distPeaks)]
+    mcols(peaks)$nearestTSS <- mcols(geneAnnotation$TSS)$symbol[subjectHits(distTSS)]
  }else if("tx_name" %in% colnames(mcols(geneAnnotation$TSS))){
-    mcols(peaks)$nearestTSS <- mcols(geneAnnotation$TSS)$tx_name[subjectHits(distPeaks)]
+    mcols(peaks)$nearestTSS <- mcols(geneAnnotation$TSS)$tx_name[subjectHits(distTSS)]
  }
  #Get NucleoTide Content
@@ -661,7 +662,7 @@ addReproduciblePeakSet <- function(
  summits <- Reduce("c", as(summits, "GRangesList"))
  .logMessage(paste0(prefix, " Extending Summits"), logFile = logFile)
-  extendedSummits <- resize(summits, extendSummits * 2 + 1, "center")
+  extendedSummits <- GenomicRanges::resize(summits, extendSummits * 2 + 1, "center")
  extendedSummits <- lapply(split(extendedSummits, extendedSummits$GroupReplicate), function(x){
    nonES <- nonOverlappingGR(x, by = "score", decreasing = TRUE)
    nonES$replicateScoreQuantile <- round(.getQuantiles(nonES$score),3)
@@ -834,7 +835,7 @@ findMacs2 <- function(){
  if(search2[1] != "ERROR"){
    path2Install <- gsub("Location: ","",search2[grep("Location", search2, ignore.case=TRUE)])
    path2Bin <- gsub("lib/python/site-packages", "bin/macs2",path2Install)
-    if(.suppressAll(.checkPath(path2Bin, throwError = error))){
+    if(.suppressAll(.checkPath(path2Bin, throwError = FALSE))){
      message("Found with pip!")
      return(path2Bin)
    }
@@ -847,7 +848,7 @@ findMacs2 <- function(){
  if(search3[1] != "ERROR"){
    path2Install <- gsub("Location: ","",search3[grep("Location", search3, ignore.case=TRUE)])
    path2Bin <- gsub("lib/python/site-packages", "bin/macs2",path2Install)
-    if(.suppressAll(.checkPath(path2Bin, throwError = error))){
+    if(.suppressAll(.checkPath(path2Bin, throwError = FALSE))){
      message("Found with pip3!")
      return(path2Bin)
    }
diff --git a/R/VisualizeData.R b/R/VisualizeData.R
index d675abd4..ae9eff3e 100644
--- a/R/VisualizeData.R
+++ b/R/VisualizeData.R
@@ -173,7 +173,13 @@ plotPDF <- function(
 #' @param log2Norm A boolean value indicating whether a log2 transformation should be performed on the values (if continuous) in plotting.
 #' @param imputeWeights The weights to be used for imputing numerical values for each cell as a linear combination of other cells values.
 #' See `addImputationWeights()` and `getImutationWeights()` for more information.
-#' @param pal A custom palette (see `paletteDiscrete` or `ArchRPalettes`) used to override discreteSet/continuousSet for coloring vector.
+#' @param pal A custom palette used to override discreteSet/continuousSet for coloring cells. Typically created using `paletteDiscrete()` or `paletteContinuous()`.
+#' To make a custom palette, you must construct it following these strict specifications. If the coloring is for discrete data (e.g. "Clusters"),
+#' then this palette must be a named vector of colors where each color is named for the corresponding group (e.g. `"C1" = "#F97070"`). If the coloring
+#' is for continuous data, then it just needs to be a vector of colors. If you are using `pal` in conjunction with `highlightCells`, your palette
+#' must be a named vector with two entries, one named for the value of the cells in the `name` column of `cellColData` and the other named
+#' "Non.Highlighted". For example, `pal=c("Mono" = "green", "Non.Highlighted" = "lightgrey")` would be used to change the color of cells with the value
+#' "Mono" in the `cellColData` column indicated by `name`. Because of this, the cells indicated by `highlightCells` must also match this value in the `name` column.
 #' @param size A number indicating the size of the points to plot if `plotAs` is set to "points".
 #' @param sampleCells A numeric describing number of cells to use for plot. If using impute weights, this will occur after imputation.
 #' @param highlightCells A character vector of cellNames describing which cells to hightlight if using `plotAs = "points"` (default if discrete).
@@ -415,8 +421,10 @@ plotEmbedding <- function( if(!plotParamsx$discrete){ - plotParamsx$color <- .quantileCut(plotParamsx$color, min(quantCut), max(quantCut)) - + if(!is.null(quantCut)){ + plotParamsx$color <- .quantileCut(plotParamsx$color, min(quantCut), max(quantCut)) + } + plotParamsx$pal <- paletteContinuous(set = plotParamsx$continuousSet) if(!is.null(pal)){ diff --git a/README.md b/README.md index 01464e8f..0fee95a7 100755 --- a/README.md +++ b/README.md @@ -34,22 +34,22 @@ ArchR is a full-featured R package for processing and analyzing single-cell ATAC For a full walk through of installation and frequently related issues please visit www.ArchRProject.com. **First, install devtools (for installing GitHub packages) if it isn't already installed:** -```{r} +``` r if (!requireNamespace("devtools", quietly = TRUE)) install.packages("devtools") ``` **Then, install BiocManager (for installing bioconductor packages) if it isn't already installed:** -```{r} +``` r if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") ``` **Then, install ArchR:** -```{r} +``` r devtools::install_github("GreenleafLab/ArchR", ref="master", repos = BiocManager::repositories()) ``` **Lastly, install all of the ArchR dependencies that aren't installed by default:** -```{r} +``` r library(ArchR) ArchR::installExtraPackages() ``` @@ -58,7 +58,7 @@ If any of these steps fails, you should identify the offending package and troub # Issues using ArchR? ArchR is currently in __beta__. We expect there to be bumps in the road. If you think you have found a bug, please first install the latest version of ArchR via -```{r} +``` r devtools::install_github("GreenleafLab/ArchR", ref="master", repos = BiocManager::repositories()) ``` If this does not fix your problem, please [report an issue on Github](https://github.com/GreenleafLab/ArchR/issues) with the __Bug Report__ form. diff --git a/man/ArchRBrowser.Rd b/man/ArchRBrowser.Rd index 62f8fb45..bbe9fa5e 100644 --- a/man/ArchRBrowser.Rd +++ b/man/ArchRBrowser.Rd @@ -55,5 +55,6 @@ To install try devtools::install_github("rstudio/shinythemes").} } \description{ This function will open an interactive shiny session in style of a browser track. It allows for normalization of the signal which -enables direct comparison across samples. +enables direct comparison across samples. Note that the genes displayed in this browser are derived from your \code{geneAnnotation} +(i.e. the \code{BSgenome} object you used) so they may not match other online genome browsers that use different gene annotations. } diff --git a/man/addGeneExpressionMatrix.Rd b/man/addGeneExpressionMatrix.Rd index e728ac04..06365823 100644 --- a/man/addGeneExpressionMatrix.Rd +++ b/man/addGeneExpressionMatrix.Rd @@ -13,6 +13,7 @@ addGeneExpressionMatrix( verbose = TRUE, threads = getArchRThreads(), parallelParam = NULL, + strictMatch = FALSE, force = TRUE, logFile = createLogFile("addGeneExpressionMatrix") ) @@ -36,6 +37,10 @@ for Seurat Objects (see \code{Seurat::as.SingleCellExperiment}). The provided va \item{parallelParam}{A list of parameters to be passed for biocparallel/batchtools parallel computing.} +\item{strictMatch}{A boolean value indicating whether every cell in \code{input} must be represented in \code{seRNA}. 
If set to \code{FALSE}
+and this \code{GeneExpressionMatrix} is used for certain downstream analyses such as \code{addIterativeLSI()}, then errors may occur
+because not all cells will have relevant information.}
+
\item{force}{A boolean value indicating whether to force the matrix indicated by \code{matrixName} to be overwritten if it already exist in the given \code{input}.}

\item{logFile}{The path to a file to be used for logging ArchR output.}
diff --git a/man/addGroupCoverages.Rd b/man/addGroupCoverages.Rd
index be1d3792..3622b119 100644
--- a/man/addGroupCoverages.Rd
+++ b/man/addGroupCoverages.Rd
@@ -8,6 +8,7 @@ addGroupCoverages(
   ArchRProj = NULL,
   groupBy = "Clusters",
   useLabels = TRUE,
+  sampleLabels = "Sample",
   minCells = 40,
   maxCells = 500,
   maxFragments = 25 * 10^6,
@@ -30,6 +31,11 @@ addGroupCoverages(
\item{useLabels}{A boolean value indicating whether to use sample labels to create sample-aware subgroupings during as pseudo-bulk replicate generation.}

+\item{sampleLabels}{The name of a column in \code{cellColData} to use to identify samples. In most cases, this parameter should be left as the default and you
+should only use this parameter if you do not want to use the default sample labels stored in \code{cellColData$Sample}. However, if your individual Arrow
+files do not map to individual samples, then you should set this parameter to accurately identify your samples. This is the case in (for example)
+multiplexing applications where cells from different biological samples are mixed into the same reaction and demultiplexed based on a lipid barcode or genotype.}
+
\item{minCells}{The minimum number of cells required in a given cell group to permit insertion coverage file generation.}

\item{maxCells}{The maximum number of cells to use during insertion coverage file generation.}
diff --git a/man/addHarmony.Rd b/man/addHarmony.Rd
index 4f82147e..260514d4 100644
--- a/man/addHarmony.Rd
+++ b/man/addHarmony.Rd
@@ -34,7 +34,9 @@ to sequencing depth that is greater than the \code{corCutOff}, it will be exclud
\item{name}{The name to store harmony output as a \code{reducedDims} in the \code{ArchRProject} object.}

-\item{groupBy}{The name of the column in \code{cellColData} to use for grouping cells together for vars in harmony batch correction.}
+\item{groupBy}{The name of the column in \code{cellColData} to use for grouping cells together for vars in harmony batch correction.
+The value of \code{groupBy} is passed to the \code{vars_use} parameter in \code{harmony::HarmonyMatrix()}. When run through ArchR, this parameter
+defines which variables to correct for during batch correction. See \code{harmony::HarmonyMatrix()} for more information.}

\item{verbose}{A boolean value indicating whether to use verbose output during execution of this function. Can be set to FALSE for a cleaner output.}
diff --git a/man/addIterativeLSI.Rd b/man/addIterativeLSI.Rd
index 40afe0b2..ac601e66 100644
--- a/man/addIterativeLSI.Rd
+++ b/man/addIterativeLSI.Rd
@@ -51,9 +51,9 @@ addIterativeLSI(
\item{iterations}{The number of LSI iterations to perform.}

-\item{clusterParams}{A list of Additional parameters to be passed to \code{addClusters()} for clustering within each iteration.
+\item{clusterParams}{A list of additional parameters to be passed to \code{addClusters()} for clustering within each iteration.
These params can be constant across each iteration, or specified for each iteration individually. Thus each param must be of
-length == 1 or the total number of \code{iterations} - 1.
PLEASE NOTE - We have updated these params to \code{resolution=2} and \code{maxClusters=6}! To use previous settings use \code{resolution=0.2} and \code{maxClusters=NULL}.} +length == 1 or the total number of \code{iterations} - 1. If you want to use \code{scran} for clustering, you would pass this as \code{method="scran"}.} \item{firstSelection}{First iteration selection method for features to use for LSI. Either "Top" for the top accessible/average or "Var" for the top variable features. "Top" should be used for all scATAC-seq data (binary) while "Var" should be used for all scRNA/other-seq data types (non-binary).} @@ -70,8 +70,7 @@ Possible values are: 1 or "tf-logidf", 2 or "log(tf-idf)", and 3 or "logtf-logid \item{scaleDims}{A boolean that indicates whether to z-score the reduced dimensions for each cell. This is useful forminimizing the contribution of strong biases (dominating early PCs) and lowly abundant populations. However, this may lead to stronger sample-specific biases since -it is over-weighting latent PCs. If set to \code{NULL} this will scale the dimensions based on the value of \code{scaleDims} when the \code{reducedDims} were -originally created during dimensionality reduction. This idea was introduced by Timothy Stuart.} +it is over-weighting latent PCs.} \item{corCutOff}{A numeric cutoff for the correlation of each dimension to the sequencing depth. If the dimension has a correlation to sequencing depth that is greater than the \code{corCutOff}, it will be excluded from analysis.} @@ -103,12 +102,12 @@ variance calculation and TF-IDF normalization.} \item{totalFeatures}{The number of features to consider for use in LSI after ranking the features by the total number of insertions. These features are the only ones used throught the variance identification and LSI. These are an equivalent when using a \code{TileMatrix} to a defined peakSet.} -\item{filterQuantile}{A number \link{0,1} that indicates the quantile above which features should be removed based on insertion counts prior} - -\item{excludeChr}{A string of chromosomes to exclude for iterativeLSI procedure. +\item{filterQuantile}{A number \link{0,1} that indicates the quantile above which features should be removed based on insertion counts prior to the first iteration of the iterative LSI paradigm. For example, if \code{filterQuantile = 0.99}, any features above the 99th percentile in insertion counts will be ignored for the first LSI iteration.} +\item{excludeChr}{A string of chromosomes to exclude for iterativeLSI procedure.} + \item{saveIterations}{A boolean value indicating whether the results of each LSI iterations should be saved as compressed \code{.rds} files in the designated \code{outDir}.} diff --git a/man/addModuleScore.Rd b/man/addModuleScore.Rd index d1a2964e..2a27fe91 100644 --- a/man/addModuleScore.Rd +++ b/man/addModuleScore.Rd @@ -19,16 +19,28 @@ addModuleScore( \arguments{ \item{ArchRProj}{An \code{ArchRProject} object.} -\item{seed}{A number to be used as the seed for random number generation. It is recommended to keep track of the seed used so that you can -reproduce results downstream.} +\item{useMatrix}{The name of the matrix to be used for calculation of the module score. See \code{getAvailableMatrices()} to view available options.} + +\item{name}{The name to be given to the designated module. If \code{features} is a list, this name will be prepended to the feature set names given in the list as shown below.} + +\item{features}{A list of feature names to be grouped into modules. 
For example, \code{list(BScore = c("MS4A1", "CD79A", "CD74"), TScore = c("CD3D", "CD8A", "GZMB", "CCR7", "LEF1"))}. +Each named element in this list will be stored as a separate module. The examples given in these parameters would yield two modules called \code{Module.Bscore} and \code{Module.Tscore}. +If the elements of this list are not named, they will be numbered in order, i.e. \code{Module1}, \code{Module2}.} + +\item{nBin}{The number of bins to use to divide all features for identification of signal-matched features for background calculation} + +\item{nBgd}{The number of background features to use for signal normalization.} + +\item{seed}{A number to be used as the seed for random number generation required when sampling cells for the background set. It is recommended +to keep track of the seed used so that you can reproduce results downstream.} \item{threads}{The number of threads to be used for parallel computing.} \item{logFile}{The path to a file to be used for logging ArchR output.} } \description{ -This function computes imputations weights that describe each cell as a linear combination of many cells based on a MAGIC diffusion matrix. -} -\details{ -RRR +This function calculates a module score from a set of features across all cells. This allows for +grouping of multiple features together into a single quantitative measurement. Currently, this +function only works for modules derived from the \code{GeneScoreMatrix}. Each module is added as a +new column in \code{cellColData} } diff --git a/man/addMotifAnnotations.Rd b/man/addMotifAnnotations.Rd index e78aa964..296da8cc 100644 --- a/man/addMotifAnnotations.Rd +++ b/man/addMotifAnnotations.Rd @@ -7,7 +7,7 @@ addMotifAnnotations( ArchRProj = NULL, motifSet = "cisbp", - name = "Motif", + annoName = "Motif", species = NULL, collection = "CORE", motifPWMs = NULL, @@ -23,16 +23,19 @@ addMotifAnnotations( \item{ArchRProj}{An \code{ArchRProject} object.} \item{motifSet}{The motif set to be used for annotation. Options include: (i) "JASPAR2016", "JASPAR2018", "JASPAR2020" -which gives the 2016, 2018 or 2020 version of JASPAR motifs or (ii) one of "cisbp", "encode", or "homer" which gives the -corresponding motif sets from the \code{chromVAR} package.} +which gives the 2016, 2018 or 2020 version of JASPAR motifs, (ii) one of "cisbp", "encode", or "homer" which gives the +corresponding motif sets from the \code{chromVAR} package, or (iii) "vierstra" which gives the clustered archetype motifs +created by Jeff Vierstra (https://github.com/jvierstra/motif-clustering).} -\item{name}{The name of the \code{peakAnnotation} object to be stored in the provided \code{ArchRProject}} +\item{annoName}{The name of the \code{peakAnnotation} object to be stored in the provided \code{ArchRProject}} \item{species}{The name of the species relevant to the supplied \code{ArchRProject}. This is used for identifying which motif to be used from CisBP/JASPAR. By default, this function will attempt to guess the species based on the value from \code{getGenome()}.} \item{collection}{If one of the JASPAR motif sets is used via \code{motifSet}, this parameter allows you to indicate the JASPAR -collection to be used. See \code{getMatrixSet()} from \code{TFBSTools} for all options to supply for collection.} +collection to be used. See \code{getMatrixSet()} from \code{TFBSTools} for all options to supply for collection. 
If \code{motifSet} is +"vierstra", then this must either be "archetype" (for the v2.1 clustered models) or "individual" (for the original v1 individual motif models). +NOTE: vierstra archetype motifs are currently in beta and have not been finalized by Jeff Vierstra.} \item{motifPWMs}{A custom set of motif PWMs as a PWMList for adding motif annotations.} @@ -43,7 +46,7 @@ collection to be used. See \code{getMatrixSet()} from \code{TFBSTools} for all o \item{version}{An integer specifying version 1 or version 2 of chromVARmotifs see github for more info GreenleafLab/chromVARmotifs.} -\item{force}{A boolean value indicating whether to force the \code{peakAnnotation} object indicated by \code{name} to be overwritten if +\item{force}{A boolean value indicating whether to force the \code{peakAnnotation} object indicated by \code{annoName} to be overwritten if it already exists in the given \code{ArchRProject}.} \item{logFile}{The path to a file to be used for logging ArchR output.} diff --git a/man/addPeak2GeneLinks.Rd b/man/addPeak2GeneLinks.Rd index 11edf142..891f9ff8 100644 --- a/man/addPeak2GeneLinks.Rd +++ b/man/addPeak2GeneLinks.Rd @@ -31,6 +31,8 @@ addPeak2GeneLinks( \item{reducedDims}{The name of the \code{reducedDims} object (i.e. "IterativeLSI") to retrieve from the designated \code{ArchRProject}.} +\item{useMatrix}{The name of the matrix containing gene expression information to be used for determining peak-to-gene links. See \code{getAvailableMatrices(ArchRProj)}} + \item{dimsToUse}{A vector containing the dimensions from the \code{reducedDims} object to use in clustering.} \item{scaleDims}{A boolean value that indicates whether to z-score the reduced dimensions for each cell. This is useful for minimizing diff --git a/man/addReproduciblePeakSet.Rd b/man/addReproduciblePeakSet.Rd index 29bea491..7bd22e82 100644 --- a/man/addReproduciblePeakSet.Rd +++ b/man/addReproduciblePeakSet.Rd @@ -56,7 +56,7 @@ This is important to allow for exclusion of pseudo-bulk replicates derived from \item{pathToMacs2}{The full path to the MACS2 executable.} -\item{genomeSize}{The genome size to be used for MACS2 peak calling (see MACS2 documentation).} +\item{genomeSize}{The genome size to be used for MACS2 peak calling (see MACS2 documentation). This is required if genome is not hg19, hg38, mm9, or mm10.} \item{shift}{The number of basepairs to shift each Tn5 insertion. When combined with \code{extsize} this allows you to create proper fragments, centered at the Tn5 insertion site, for use with MACS2 (see MACS2 documentation).} diff --git a/man/addTileMatrix.Rd b/man/addTileMatrix.Rd index 9c8849ea..b1e71406 100644 --- a/man/addTileMatrix.Rd +++ b/man/addTileMatrix.Rd @@ -42,5 +42,4 @@ is to retrieve this from the \code{ArchRProject} using \code{getBlacklist()}.} } \description{ This function, for each sample, will independently compute counts for each tile -per cell in the ArrowFile } diff --git a/man/createArrowFiles.Rd b/man/createArrowFiles.Rd index 14648b84..f1bd3018 100644 --- a/man/createArrowFiles.Rd +++ b/man/createArrowFiles.Rd @@ -14,6 +14,8 @@ createArrowFiles( minTSS = 4, minFrags = 1000, maxFrags = 1e+05, + minFragSize = 10, + maxFragSize = 2000, QCDir = "QualityControl", nucLength = 147, promoterRegion = c(2000, 100), @@ -69,6 +71,10 @@ Cells containing greater than or equal to \code{minFrags} total fragments wll be \item{maxFrags}{The maximum number of mapped ATAC-seq fragments required per cell to pass filtering for use in downstream analyses. 
Cells containing greater than or equal to \code{maxFrags} total fragments wll be retained.}

+\item{minFragSize}{The minimum fragment size to be included in the Arrow file. Fragments shorter than this number are discarded. Must be less than maxFragSize.}
+
+\item{maxFragSize}{The maximum fragment size to be included in the Arrow file. Fragments longer than this number are discarded. Must be greater than minFragSize.}
+
\item{QCDir}{The relative path to the output directory for QC-level information and plots for each sample/ArrowFile.}

\item{nucLength}{The length in basepairs that wraps around a nucleosome. This number is used for identifying fragments as
@@ -100,9 +106,11 @@ gsubExpression would be ":.*". This would retrieve the string after the colon as
\code{scanBamFlag} passed to \code{ScanBam} in Rsamtools.}

\item{offsetPlus}{The numeric offset to apply to a "+" stranded Tn5 insertion to account for the precise Tn5 binding site.
+This parameter only applies to bam file input. Fragment file input is assumed to have already been offset, which is the standard for 10x output.
See Buenrostro et al. Nature Methods 2013.}

\item{offsetMinus}{The numeric offset to apply to a "-" stranded Tn5 insertion to account for the precise Tn5 binding site.
+This parameter only applies to bam file input. Fragment file input is assumed to have already been offset, which is the standard for 10x output.
See Buenrostro et al. Nature Methods 2013.}

\item{addTileMat}{A boolean value indicating whether to add a "Tile Matrix" to each ArrowFile. A Tile Matrix is a counts matrix that,
diff --git a/man/createGenomeAnnotation.Rd b/man/createGenomeAnnotation.Rd
index f90fe5a9..e52160c0 100644
--- a/man/createGenomeAnnotation.Rd
+++ b/man/createGenomeAnnotation.Rd
@@ -20,10 +20,11 @@ createGenomeAnnotation(
\item{blacklist}{A \code{GRanges} object containing regions that should be excluded from analyses due to unwanted biases.}

\item{filter}{A boolean value indicating whether non-standard chromosome scaffolds should be excluded.
-These "non-standard" chromosomes are defined by \code{filterChrGR()}.}
+These "non-standard" chromosomes are defined by \code{filterChrGR()} and by manual annotation using the \code{filterChr} parameter.}

\item{filterChr}{A character vector indicating the seqlevels that should be removed if manual removal is desired for certain seqlevels.
-If no manual removal is desired, \code{filterChr} should be set to \code{NULL}.}
+If no manual removal is desired, \code{filterChr} should be set to \code{NULL}. If \code{filter} is set to \code{TRUE} but \code{filterChr} is set to \code{NULL},
+non-standard chromosomes will still be removed as defined in \code{filterChrGR()}.}
}
\description{
This function will create a genome annotation object that can be used for creating ArrowFiles or an ArchRProject, etc.
}
diff --git a/man/dot-DollarNames.ArchRProject.Rd b/man/dot-DollarNames.ArchRProject.Rd
index 2a965405..d0905606 100644
--- a/man/dot-DollarNames.ArchRProject.Rd
+++ b/man/dot-DollarNames.ArchRProject.Rd
@@ -4,7 +4,7 @@
\alias{.DollarNames.ArchRProject}
\title{Accessing cellColData directly from dollar.sign accessor}
\usage{
-\method{.DollarNames}{ArchRProject}(x, pattern = "")
+.DollarNames.ArchRProject(x, pattern = "")
}
\description{
This function will allow direct access to cellColData with a \code{$} accessor.
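The new `minFragSize`/`maxFragSize` filters and the clarified `filter`/`filterChr` behavior above are easiest to see in a short usage sketch. This is illustrative only: the fragment-file paths and sample names are hypothetical, and it assumes the hg38 genome packages are installed.

``` r
library(ArchR)
addArchRGenome("hg38")

# Hypothetical fragment files / sample names, for illustration only.
ArrowFiles <- createArrowFiles(
  inputFiles = c("sampleA_fragments.tsv.gz", "sampleB_fragments.tsv.gz"),
  sampleNames = c("sampleA", "sampleB"),
  minTSS = 4,
  minFrags = 1000,
  minFragSize = 10,   # added in this patch: fragments shorter than this are discarded
  maxFragSize = 2000  # added in this patch: fragments longer than this are discarded
)

# createGenomeAnnotation(): with filter = TRUE, non-standard scaffolds are removed
# via filterChrGR() even when filterChr = NULL; filterChr adds manual exclusions.
genomeAnno <- createGenomeAnnotation(
  genome = "BSgenome.Hsapiens.UCSC.hg38",
  filter = TRUE,
  filterChr = c("chrM")
)
```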
diff --git a/man/getCoAccessibility.Rd b/man/getCoAccessibility.Rd index ce8b1c10..b97163e2 100644 --- a/man/getCoAccessibility.Rd +++ b/man/getCoAccessibility.Rd @@ -16,7 +16,8 @@ getCoAccessibility( \item{corCutOff}{A numeric describing the minimum numeric peak-to-peak correlation to return.} -\item{resolution}{A numeric describing the bp resolution to return loops as. This helps with overplotting of correlated regions.} +\item{resolution}{A numeric describing the bp resolution to use when returning loops. This helps with overplotting of correlated regions. +This only takes affect if \code{returnLoops = TRUE}.} \item{returnLoops}{A boolean indicating to return the co-accessibility signal as a \code{GRanges} "loops" object designed for use with the \code{ArchRBrowser()} or as an \code{ArchRBrowserTrack()}.} diff --git a/man/getGroupBW.Rd b/man/getGroupBW.Rd index 10e80ef0..f90166fb 100644 --- a/man/getGroupBW.Rd +++ b/man/getGroupBW.Rd @@ -24,7 +24,8 @@ user-supplied \code{cellColData} metadata columns (for example, "Clusters"). Cel column will be grouped together and the average signal will be plotted.} \item{normMethod}{The name of the column in \code{cellColData} by which normalization should be performed. The recommended and default value -is "ReadsInTSS" which simultaneously normalizes tracks based on sequencing depth and sample data quality.} +is "ReadsInTSS" which simultaneously normalizes tracks based on sequencing depth and sample data quality. Accepted values are +"None", "ReadsInTSS", "nCells", "ReadsInPromoter", or "nFrags".} \item{tileSize}{The numeric width of the tile/bin in basepairs for plotting ATAC-seq signal tracks. All insertions in a single bin will be summed.} diff --git a/man/getMatches.Rd b/man/getMatches.Rd index 3c6f7211..36e369ab 100644 --- a/man/getMatches.Rd +++ b/man/getMatches.Rd @@ -14,5 +14,6 @@ getMatches(ArchRProj = NULL, name = NULL, annoName = NULL) \item{annoName}{The name of a specific annotation to subset within the \code{peakAnnotation}.} } \description{ -This function gets peak annotation matches from a given ArchRProject. +This function gets peak annotation matches from a given ArchRProject. The peaks in the returned object are in the +same order as the peaks returned by \code{getPeakSet()}. } diff --git a/man/getMatrixFromProject.Rd b/man/getMatrixFromProject.Rd index 96b01a92..2e96d395 100644 --- a/man/getMatrixFromProject.Rd +++ b/man/getMatrixFromProject.Rd @@ -23,10 +23,16 @@ getMatrixFromProject( \item{verbose}{A boolean value indicating whether to use verbose output during execution of this function. Can be set to FALSE for a cleaner output.} -\item{binarize}{A boolean value indicating whether the matrix should be binarized before return. This is often desired when working with insertion counts.} +\item{binarize}{A boolean value indicating whether the matrix should be binarized before return. +This is often desired when working with insertion counts. Note that if the matrix has already been binarized previously, this should be set to \code{TRUE}.} \item{logFile}{The path to a file to be used for logging ArchR output.} } \description{ -This function gets a given data matrix from an \code{ArchRProject}. +This function gets a given data matrix from an \code{ArchRProject} and returns it as a \code{SummarizedExperiment}. +This function will return the matrix you ask it for, without altering that matrix unless you tell it to. 
+For example, if you added your \code{PeakMatrix} using \code{addPeakMatrix()} with \code{binarize = TRUE}, then
+\code{getMatrixFromProject()} will return a binarized \code{PeakMatrix}. Alternatively, you could set \code{binarize = TRUE}
+in the parameters passed to \code{getMatrixFromProject()} and the \code{PeakMatrix} will be binarized as you pull
+it out. No other normalization is applied to the matrix by this function.
}
diff --git a/man/import10xFeatureMatrix.Rd b/man/import10xFeatureMatrix.Rd
index 79f000fc..61261bd5 100644
--- a/man/import10xFeatureMatrix.Rd
+++ b/man/import10xFeatureMatrix.Rd
@@ -7,6 +7,8 @@ import10xFeatureMatrix(
   input = NULL,
   names = NULL,
+  strictMatch = TRUE,
+  verbose = TRUE,
   featureType = "Gene Expression"
 )
}
\arguments{
\item{names}{A character of sample names associated with each input file.}

+\item{strictMatch}{Only relevant when multiple input files are used. A boolean that indicates whether rows (genes) that do not match perfectly in the matrices
+should be removed (\code{strictMatch = TRUE}) or coerced (\code{strictMatch = FALSE}). CellRanger seems to occasionally use different Ensembl IDs for the same gene across
+different samples. If you are comfortable tolerating such mismatches, you can coerce all matrices to fit together, in which case the gene metadata present in
+the first listed sample will be applied to all matrices for that particular gene entry. Regardless of what value is used for \code{strictMatch}, this function
+cannot tolerate mismatched gene names, only mismatched metadata for the same gene.}
+
+\item{verbose}{Only relevant when multiple input files are used. A boolean that indicates whether messaging about mismatches should be verbose (\code{TRUE}) or minimal (\code{FALSE}).}
+
\item{featureType}{The name of the feature to extract from the 10x feature file.
See https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/advanced/h5_matrices for more information.}
}
diff --git a/man/plotBrowserTrack.Rd b/man/plotBrowserTrack.Rd
index f3374f1b..ada70e90 100644
--- a/man/plotBrowserTrack.Rd
+++ b/man/plotBrowserTrack.Rd
@@ -87,7 +87,7 @@ is "ReadsInTSS" which simultaneously normalizes tracks based on sequencing depth
\item{threads}{The number of threads to use for parallel execution.}

-\item{ylim}{The numeric quantile y-axis limit to be used for for "bulkTrack" plotting. If not provided, the y-axis limit will be c(0, 0.999).}
+\item{ylim}{The numeric quantile y-axis limit to be used for "bulkTrack" plotting. This should be expressed as \verb{c(lower limit, upper limit)} such as \code{c(0,0.99)}. If not provided, the y-axis limit will be c(0, 0.999).}

\item{pal}{A custom palette (see \code{paletteDiscrete} or \code{ArchRPalettes}) used to override coloring for groups.}

@@ -113,5 +113,6 @@ is "ReadsInTSS" which simultaneously normalizes tracks based on sequencing depth
}
\description{
This function will plot the coverage at an input region in the style of a browser track. It allows for normalization of the signal
-which enables direct comparison across samples.
+which enables direct comparison across samples. Note that the genes displayed in these plots are derived from your \code{geneAnnotation}
+(i.e. the \code{BSgenome} object you used) so they may not match other online genome browsers that use different gene annotations.
}
}
diff --git a/man/plotEmbedding.Rd b/man/plotEmbedding.Rd
index 147d5cd5..4ed4836a 100644
--- a/man/plotEmbedding.Rd
+++ b/man/plotEmbedding.Rd
@@ -46,7 +46,13 @@ is "GeneScoreMatrix" then \code{name} refers to a gene name which can be listed
\item{imputeWeights}{The weights to be used for imputing numerical values for each cell as a linear combination of other cells values.
See \code{addImputationWeights()} and \code{getImutationWeights()} for more information.}

-\item{pal}{A custom palette (see \code{paletteDiscrete} or \code{ArchRPalettes}) used to override discreteSet/continuousSet for coloring vector.}
+\item{pal}{A custom palette used to override discreteSet/continuousSet for coloring cells. Typically created using \code{paletteDiscrete()} or \code{paletteContinuous()}.
+To make a custom palette, you must construct it following these strict specifications. If the coloring is for discrete data (e.g. "Clusters"),
+then this palette must be a named vector of colors where each color is named for the corresponding group (e.g. \code{"C1" = "#F97070"}). If the coloring
+is for continuous data, then it just needs to be a vector of colors. If you are using \code{pal} in conjunction with \code{highlightCells}, your palette
+must be a named vector with two entries, one named for the value of the cells in the \code{name} column of \code{cellColData} and the other named
+"Non.Highlighted". For example, \code{pal=c("Mono" = "green", "Non.Highlighted" = "lightgrey")} would be used to change the color of cells with the value
+"Mono" in the \code{cellColData} column indicated by \code{name}. Because of this, the cells indicated by \code{highlightCells} must also match this value in the \code{name} column.}

\item{size}{A number indicating the size of the points to plot if \code{plotAs} is set to "points".}
diff --git a/man/plotMarkers.Rd b/man/plotMarkers.Rd
index f0baad6b..6b1a0078 100644
--- a/man/plotMarkers.Rd
+++ b/man/plotMarkers.Rd
@@ -9,7 +9,8 @@ plotMarkers(
   name = NULL,
   cutOff = "FDR <= 0.01 & abs(Log2FC) >= 0.5",
   plotAs = "Volcano",
-  scaleTo = 10^4
+  scaleTo = 10^4,
+  rastr = TRUE
 )
}
\arguments{
@@ -22,6 +23,9 @@ To see available options try \code{colnames(seMarker)}.}
\code{cutoff} can contain any of the \code{assayNames} from \code{seMarker}.}

\item{plotAs}{A string indicating whether to plot a volcano plot ("Volcano") or an MA plot ("MA").}
+
+\item{rastr}{A boolean value that indicates whether the plot should be rasterized using \code{ggrastr}. This does not rasterize
+lines and labels, just the internal portions of the plot.}
}
\description{
This function will plot one group/column of a differential markers as an MA or Volcano plot.
}
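Because the `pal`/`highlightCells` contract documented above is strict, a minimal sketch of the discrete and highlighting cases may help. It assumes an existing `ArchRProject` named `proj` with a "Clusters" column in `cellColData` (containing a group called "Mono") and a "UMAP" embedding; these names are illustrative, not part of this patch.

``` r
library(ArchR)

# Discrete coloring: pal must be a named vector with one color per group.
clusterPal <- paletteDiscrete(values = getCellColData(proj, "Clusters", drop = TRUE))
p1 <- plotEmbedding(ArchRProj = proj, colorBy = "cellColData", name = "Clusters",
  embedding = "UMAP", pal = clusterPal)

# Highlighting: the palette names must be the highlighted value and "Non.Highlighted",
# and highlightCells must contain cells whose "Clusters" value matches that name.
monoCells <- getCellNames(proj)[getCellColData(proj, "Clusters", drop = TRUE) == "Mono"]
p2 <- plotEmbedding(ArchRProj = proj, colorBy = "cellColData", name = "Clusters",
  embedding = "UMAP", highlightCells = monoCells,
  pal = c("Mono" = "green", "Non.Highlighted" = "lightgrey"))
```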
diff --git a/man/projectBulkATAC.Rd b/man/projectBulkATAC.Rd index 62c15c6f..31859ab2 100644 --- a/man/projectBulkATAC.Rd +++ b/man/projectBulkATAC.Rd @@ -12,17 +12,18 @@ projectBulkATAC( n = 250, verbose = TRUE, threads = getArchRThreads(), + force = FALSE, logFile = createLogFile("projectBulkATAC") ) } \arguments{ \item{ArchRProj}{An \code{ArchRProject} object containing the dimensionality reduction matrix passed by \code{reducedDims}.} -\item{seATAC}{Bulk ATAC Summarized Experiment.} +\item{seATAC}{A \code{SummarizedExperiment} object containing bulk ATAC-seq data.} -\item{reducedDims}{A string specifying the reducedDims.} +\item{reducedDims}{A string specifying the name of the \code{reducedDims} object to be used.} -\item{embedding}{A string specifying embedding.} +\item{embedding}{A string specifying the name of the \code{embedding} object to be used.} \item{n}{An integer specifying the number of subsampled "pseudo single cells" per bulk sample.} @@ -30,6 +31,8 @@ projectBulkATAC( \item{threads}{The number of threads used for parallel execution} +\item{force}{A boolean value indicating whether to force the projection of bulk ATAC data even if fewer than 25\% of the features are present in the bulk ATAC data set.} + \item{logFile}{The path to a file to be used for logging ArchR output.} } \description{ diff --git a/src/Correlation.cpp b/src/Correlation.cpp index 419da16d..e86ea2c4 100644 --- a/src/Correlation.cpp +++ b/src/Correlation.cpp @@ -3,7 +3,7 @@ using namespace Rcpp; using namespace std; -// Adapted from https://github.com/AEBilgrau/correlateR/blob/master/src/auxiliary_functions.cpp +// Pearson Correlation, Adapted from https://github.com/AEBilgrau/correlateR/blob/master/src/auxiliary_functions.cpp // [[Rcpp::export]] Rcpp::NumericVector rowCorCpp(IntegerVector idxX, IntegerVector idxY, Rcpp::NumericMatrix X, Rcpp::NumericMatrix Y) { diff --git a/src/Footprinting_utils.cpp b/src/Footprinting_utils.cpp index 8698e16f..566fefb6 100644 --- a/src/Footprinting_utils.cpp +++ b/src/Footprinting_utils.cpp @@ -104,6 +104,9 @@ IntegerVector rleSumsStranded(List rleList, List grList, int width, Function as_ IntegerVector strand, debug, start; IntegerVector out = IntegerVector(width); + // Clone grList + grList = Rcpp::clone(grList); + int n = grList.size(); int shift = floor(width/2); diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index adae0404..308aeb3c 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -5,6 +5,11 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // rowCorCpp Rcpp::NumericVector rowCorCpp(IntegerVector idxX, IntegerVector idxY, Rcpp::NumericMatrix X, Rcpp::NumericMatrix Y); RcppExport SEXP _ArchR_rowCorCpp(SEXP idxXSEXP, SEXP idxYSEXP, SEXP XSEXP, SEXP YSEXP) {
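To close out the patch, two brief usage sketches for functions whose interfaces changed above. Both assume an existing `ArchRProject` named `proj`; the object names and gene lists are illustrative only. First, the reworked `addModuleScore()`, which computes module scores from a `GeneScoreMatrix` and stores them in `cellColData`:

``` r
library(ArchR)

# Assumes `proj` contains a GeneScoreMatrix; see getAvailableMatrices(proj).
proj <- addModuleScore(
  ArchRProj = proj,
  useMatrix = "GeneScoreMatrix",
  name = "Module",
  features = list(
    BScore = c("MS4A1", "CD79A", "CD74"),
    TScore = c("CD3D", "CD8A", "GZMB", "CCR7", "LEF1")
  ),
  nBin = 25,
  nBgd = 100
)
# One new cellColData column is added per module, prefixed with `name`.
```

Second, `projectBulkATAC()` with its new `force` argument, which controls whether projection proceeds when fewer than 25% of features overlap the bulk data. Here `seBulk` stands in for a user-supplied `SummarizedExperiment` of bulk ATAC-seq counts:

``` r
bulkProjection <- projectBulkATAC(
  ArchRProj = proj,
  seATAC = seBulk,            # bulk ATAC-seq SummarizedExperiment (user-supplied)
  reducedDims = "IterativeLSI",
  embedding = "UMAP",
  n = 250,
  force = FALSE               # set TRUE to project despite low feature overlap
)
```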