diff --git a/.DS_Store b/.DS_Store index ddbe2f5d..c0f50e6f 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/R/AllClasses.R b/R/AllClasses.R index 466aadaa..299b1655 100644 --- a/R/AllClasses.R +++ b/R/AllClasses.R @@ -391,16 +391,20 @@ loadArchRProject <- function( #Postions if(!is.null(ArchRProj@peakAnnotation[[i]]$Positions)){ - PositionsNew <- gsub(outputDir, outputDirNew, ArchRProj@peakAnnotation[[i]]$Positions) - if(!all(file.exists(PositionsNew))){ - if(force){ - keepAnno[i] <- FALSE - message("Positions for peakAnnotation do not exist in saved ArchRProject!") - }else{ - stop("Positions for peakAnnotation do not exist in saved ArchRProject!") + if(tolower(ArchRProj@peakAnnotation[[i]]$Positions) != "none"){ + + PositionsNew <- gsub(outputDir, outputDirNew, ArchRProj@peakAnnotation[[i]]$Positions) + if(!all(file.exists(PositionsNew))){ + if(force){ + keepAnno[i] <- FALSE + message("Positions for peakAnnotation do not exist in saved ArchRProject!") + }else{ + stop("Positions for peakAnnotation do not exist in saved ArchRProject!") + } } + ArchRProj@peakAnnotation[[i]]$Positions <- PositionsNew + } - ArchRProj@peakAnnotation[[i]]$Positions <- PositionsNew } diff --git a/R/AnnotationPeaks.R b/R/AnnotationPeaks.R index 5413120a..eda4bb88 100644 --- a/R/AnnotationPeaks.R +++ b/R/AnnotationPeaks.R @@ -254,6 +254,7 @@ addMotifAnnotations <- function( name = "Motif", species = NULL, collection = "CORE", + motifPWMs = NULL, cutOff = 5e-05, width = 7, version = 2, @@ -263,7 +264,7 @@ addMotifAnnotations <- function( ){ .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj")) - .validInput(input = motifSet, name = "motifSet", valid = c("character")) + .validInput(input = motifSet, name = "motifSet", valid = c("character", "null")) .validInput(input = name, name = "name", valid = c("character")) .validInput(input = species, name = "species", valid = c("character", "null")) .validInput(input = collection, name = "collection", valid = c("character", "null")) @@ -272,6 +273,17 @@ addMotifAnnotations <- function( .validInput(input = force, name = "force", valid = c("boolean")) .validInput(input = logFile, name = "logFile", valid = c("character")) + if(!is.null(motifPWMs)){ + if(!is(motifPWMs, "PWMatrixList")){ + stop("User Supplied motifPWMS must be a PWMatrixList!") + } + motifSet <- "Custom" + } + + if(is.null(motifSet)){ + stop("Must provide motifSet or motifPWMs!") + } + .requirePackage("motifmatchr", installInfo='BiocManager::install("motifmatchr")') tstart <- Sys.time() @@ -389,6 +401,12 @@ addMotifAnnotations <- function( motifs <- obj$motifs motifSummary <- obj$motifSummary + }else if(tolower(motifSet)=="custom"){ + + obj <- NULL + motifs <- motifPWMs + motifSummary <- NULL + }else{ stop("Error MotifSet Not Recognized!") @@ -501,6 +519,10 @@ addMotifAnnotations <- function( motifNames <- lapply(seq_along(motifs), function(x){ namex <- make.names(motifs[[x]]@name) + if(grepl("LINE", namex)){ + splitNamex <- stringr::str_split(motifs[[x]]@ID, pattern="\\_", simplify = TRUE) + namex <- splitNamex[1, grep("LINE",splitNamex[1,]) + 1] + } if(substr(namex,nchar(namex),nchar(namex))=="."){ namex <- substr(namex,1,nchar(namex)-1) } @@ -508,10 +530,22 @@ addMotifAnnotations <- function( namex }) %>% unlist(.) + motifNames2 <- lapply(seq_along(motifs), function(x){ + namex <- make.names(motifs[[x]]@name) + if(grepl("LINE", namex)){ + splitNamex <- stringr::str_split(motifs[[x]]@ID, pattern="\\_", simplify = TRUE) + namex <- splitNamex[1, grep("LINE",splitNamex[1,]) + 1] + } + if(substr(namex,nchar(namex),nchar(namex))=="."){ + namex <- substr(namex,1,nchar(namex)-1) + } + namex + }) %>% unlist(.) + motifDF <- lapply(seq_along(motifs), function(x){ df <- data.frame( row.names = motifNames[x], - name = motifs[[x]]@name[[1]], + name = motifNames2[[x]], ID = motifs[[x]]@ID, strand = motifs[[x]]@strand, stringsAsFactors = FALSE diff --git a/R/GroupExport.R b/R/GroupExport.R index c99b6465..c9025f57 100644 --- a/R/GroupExport.R +++ b/R/GroupExport.R @@ -184,7 +184,7 @@ getGroupBW <- function( ArrowFiles <- getArrowFiles(ArchRProj) Groups <- getCellColData(ArchRProj = ArchRProj, select = groupBy, drop = TRUE) - if(tolower(normMethod) %in% c("ReadsInTSS", "ReadsInPromoter", "nFrags")){ + if(tolower(normMethod) %in% tolower(c("ReadsInTSS", "ReadsInPromoter", "nFrags"))){ normBy <- getCellColData(ArchRProj = ArchRProj, select = normMethod) }else{ normBy <- NULL @@ -292,7 +292,8 @@ getGroupBW <- function( #Cells cellGroupi <- cellGroups[[i]] - + #print(sum(normBy[cellGroupi, 1])) + #Bigwig File! covFile <- file.path(bwDir, paste0(make.names(names(cellGroups)[i]), "-TileSize-",tileSize,"-normMethod-",normMethod,"-ArchR.bw")) rmf <- .suppressAll(file.remove(covFile)) @@ -345,7 +346,7 @@ getGroupBW <- function( tilesk$reads <- mat if(tolower(normMethod) %in% c("ReadsInTSS", "ReadsInPromoter", "nFrags")){ - tilesk$reads <- tilesk$reads * 10^6 / sum(normBy[cellGroupi, 1]) + tilesk$reads <- tilesk$reads * 10^4 / sum(normBy[cellGroupi, 1]) }else if(tolower(normMethod) %in% c("nCells")){ tilesk$reads <- tilesk$reads / length(cellGroupi) } diff --git a/R/IntegrativeAnalysis.R b/R/IntegrativeAnalysis.R index e4b6f936..69425cca 100644 --- a/R/IntegrativeAnalysis.R +++ b/R/IntegrativeAnalysis.R @@ -668,6 +668,7 @@ correlateTrajectories <- function( #' `reducedDims` were originally created during dimensionality reduction. This idea was introduced by Timothy Stuart. #' @param corCutOff A numeric cutoff for the correlation of each dimension to the sequencing depth. If the dimension has a correlation to #' sequencing depth that is greater than the `corCutOff`, it will be excluded from analysis. +#' @param cellsToUse A character vector of cellNames to compute coAccessibility on if desired to run on a subset of the total cells. #' @param k The number of k-nearest neighbors to use for creating single-cell groups for correlation analyses. #' @param knnIteration The number of k-nearest neighbor groupings to test for passing the supplied `overlapCutoff`. #' @param overlapCutoff The maximum allowable overlap between the current group and all previous groups to permit the current group be @@ -688,6 +689,7 @@ addCoAccessibility <- function( dimsToUse = 1:30, scaleDims = NULL, corCutOff = 0.75, + cellsToUse = NULL, k = 100, knnIteration = 500, overlapCutoff = 0.8, @@ -705,6 +707,7 @@ addCoAccessibility <- function( .validInput(input = dimsToUse, name = "dimsToUse", valid = c("numeric", "null")) .validInput(input = scaleDims, name = "scaleDims", valid = c("boolean", "null")) .validInput(input = corCutOff, name = "corCutOff", valid = c("numeric", "null")) + .validInput(input = cellsToUse, name = "cellsToUse", valid = c("character", "null")) .validInput(input = k, name = "k", valid = c("integer")) .validInput(input = knnIteration, name = "knnIteration", valid = c("integer")) .validInput(input = overlapCutoff, name = "overlapCutoff", valid = c("numeric")) @@ -726,6 +729,9 @@ addCoAccessibility <- function( #Get Reduced Dims rD <- getReducedDims(ArchRProj, reducedDims = reducedDims, corCutOff = corCutOff, dimsToUse = dimsToUse) + if(!is.null(cellsToUse)){ + rD <- rD[cellsToUse, ,drop=FALSE] + } #Subsample idx <- sample(seq_len(nrow(rD)), knnIteration, replace = !nrow(rD) >= knnIteration) @@ -762,7 +768,7 @@ addCoAccessibility <- function( o$seqnames <- seqnames(peakSet)[o[,1]] o$idx1 <- peakSet$idx[o[,1]] o$idx2 <- peakSet$idx[o[,2]] - o$correlation <- NA + o$correlation <- -999 #Peak Matrix ColSums cS <- .getColSums(getArrowFiles(ArchRProj), chri, verbose = FALSE, useMatrix = "PeakMatrix") @@ -795,7 +801,10 @@ addCoAccessibility <- function( #Correlations idx <- BiocGenerics::which(o$seqnames==chri[x]) - o[idx,]$correlation <- rowCorCpp(idxX = o[idx,]$idx1, idxY = o[idx,]$idx2, X = as.matrix(groupMat), Y = as.matrix(groupMat)) + corVals <- rowCorCpp(idxX = o[idx,]$idx1, idxY = o[idx,]$idx2, X = as.matrix(groupMat), Y = as.matrix(groupMat)) + .logThis(head(corVals), paste0("SubsetCorVals-", x), logFile = logFile) + + o[idx,]$correlation <- as.numeric(corVals) .logThis(groupMat, paste0("SubsetGroupMat-", x), logFile = logFile) .logThis(o[idx,], paste0("SubsetCoA-", x), logFile = logFile) @@ -1425,7 +1434,7 @@ plotPeak2GeneHeatmap <- function( KNNx <- KNNList[[x]] names(sort(table(ccd[KNNx, 1, drop = TRUE]), decreasing = TRUE))[1] }) %>% unlist - cD <- DataFrame(row.names=paste0("K", seq_len(ncol(mATAC))), groupBy = KNNGroups) + cD <- DataFrame(row.names=paste0("K_", seq_len(ncol(mATAC))), groupBy = KNNGroups) pal <- paletteDiscrete(values=gtools::mixedsort(unique(ccd[,1]))) if(!is.null(palGroup)){ pal[names(palGroup)[names(palGroup) %in% names(pal)]] <- palGroup[names(palGroup) %in% names(pal)] diff --git a/R/LoggerUtils.R b/R/LoggerUtils.R index d593ddc9..4df2e5b3 100644 --- a/R/LoggerUtils.R +++ b/R/LoggerUtils.R @@ -547,15 +547,18 @@ createLogFile <- function( } rL <- readLines(logFile) - t1 <- gsub("Start Time : ","", grep("Start Time", rL, ignore.case = TRUE, value = TRUE)) - mn <- as.numeric(difftime(Sys.time(), as.POSIXct(t1), units = "mins")) - hr <- as.numeric(difftime(Sys.time(), as.POSIXct(t1), units = "hours")) - cat("\n------- Completed\n\n", file = logFile, append = TRUE) - cat(paste0("End Time : ",Sys.time(),"\n"), file = logFile, append = TRUE) - cat(paste0("Elapsed Time Minutes = ", mn), file = logFile, append = TRUE) - cat(paste0("\nElapsed Time Hours = ", hr), file = logFile, append = TRUE) - cat("\n\n-------\n\n\n\n", file = logFile, append = TRUE) - message("ArchR logging successful to : ", logFile) + o <- tryCatch({ + t1 <- gsub("Start Time : ","", grep("Start Time", rL, ignore.case = TRUE, value = TRUE)) + mn <- as.numeric(difftime(Sys.time(), as.POSIXct(t1), units = "mins")) + hr <- as.numeric(difftime(Sys.time(), as.POSIXct(t1), units = "hours")) + cat("\n------- Completed\n\n", file = logFile, append = TRUE) + cat(paste0("End Time : ",Sys.time(),"\n"), file = logFile, append = TRUE) + cat(paste0("Elapsed Time Minutes = ", mn), file = logFile, append = TRUE) + cat(paste0("\nElapsed Time Hours = ", hr), file = logFile, append = TRUE) + cat("\n\n-------\n\n\n\n", file = logFile, append = TRUE) + message("ArchR logging successful to : ", logFile) + }, error = function(x){ + }) # tryCatch({ # R.utils::gzip(logFile, paste0(logFile, ".gz")) diff --git a/R/MarkerFeatures.R b/R/MarkerFeatures.R index 4ce95035..c6e9a38e 100644 --- a/R/MarkerFeatures.R +++ b/R/MarkerFeatures.R @@ -251,6 +251,7 @@ getMarkerFeatures <- function( Log2FC = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$log2FC)) %>% Reduce("cbind",.), Mean = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1)) %>% Reduce("cbind",.), FDR = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$fdr)) %>% Reduce("cbind",.), + Pval = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$pval)) %>% Reduce("cbind",.), MeanDiff = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1 - diffList[[x]]$mean2)) %>% Reduce("cbind",.), AUC = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$auc)) %>% Reduce("cbind",.), MeanBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean2)) %>% Reduce("cbind",.) @@ -265,6 +266,7 @@ getMarkerFeatures <- function( Mean = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1)) %>% Reduce("cbind",.), Variance = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$var1)) %>% Reduce("cbind",.), FDR = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$fdr)) %>% Reduce("cbind",.), + Pval = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$pval)) %>% Reduce("cbind",.), MeanDiff = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1 - diffList[[x]]$mean2)) %>% Reduce("cbind",.), MeanBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean2)) %>% Reduce("cbind",.), VarianceBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$var2)) %>% Reduce("cbind",.) @@ -278,6 +280,7 @@ getMarkerFeatures <- function( Log2FC = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$log2FC)) %>% Reduce("cbind",.), Mean = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1)) %>% Reduce("cbind",.), FDR = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$fdr)) %>% Reduce("cbind",.), + Pval = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$pval)) %>% Reduce("cbind",.), MeanDiff = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1 - diffList[[x]]$mean2)) %>% Reduce("cbind",.), MeanBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean2)) %>% Reduce("cbind",.) ), @@ -863,8 +866,8 @@ plotMarkerHeatmap <- function( .logThis(mat, "mat", logFile = logFile) idx <- which(rowSums(passMat, na.rm = TRUE) > 0 & matrixStats::rowVars(mat) != 0 & !is.na(matrixStats::rowVars(mat))) - mat <- mat[idx,] - passMat <- passMat[idx,] + mat <- mat[idx,,drop=FALSE] + passMat <- passMat[idx,,drop=FALSE] if(nrow(mat) == 0){ stop("No Makers Found!") @@ -887,7 +890,7 @@ plotMarkerHeatmap <- function( #identify to remove if(!is.null(grepExclude) & !is.null(rownames(mat))){ idx2 <- which(!grepl(grepExclude, rownames(mat))) - mat <- mat[idx2,] + mat <- mat[idx2,,drop=FALSE] } if(nrow(mat)==0){ @@ -906,11 +909,11 @@ plotMarkerHeatmap <- function( if(binaryClusterRows){ if(invert){ - bS <- .binarySort(-mat, lmat = passMat[rownames(mat), colnames(mat)], clusterCols = clusterCols) - mat <- -bS[[1]][,colnames(mat)] + bS <- .binarySort(-mat, lmat = passMat[rownames(mat), colnames(mat),drop=FALSE], clusterCols = clusterCols) + mat <- -bS[[1]][,colnames(mat),drop=FALSE] }else{ - bS <- .binarySort(mat, lmat = passMat[rownames(mat), colnames(mat)], clusterCols = clusterCols) - mat <- bS[[1]][,colnames(mat)] + bS <- .binarySort(mat, lmat = passMat[rownames(mat), colnames(mat),drop=FALSE], clusterCols = clusterCols) + mat <- bS[[1]][,colnames(mat),drop=FALSE] } clusterRows <- FALSE clusterCols <- bS[[2]] @@ -945,9 +948,9 @@ plotMarkerHeatmap <- function( #mat <- t(mat[rev(seq_len(nrow(mat))), rev(clusterCols$order)]) if(!is.null(clusterCols)){ - mat <- t(mat[seq_len(nrow(mat)), clusterCols$order]) + mat <- t(mat[seq_len(nrow(mat)), clusterCols$order, drop = FALSE]) }else{ - mat <- t(mat[seq_len(nrow(mat)), ]) + mat <- t(mat[seq_len(nrow(mat)), , drop = FALSE]) } if(!is.null(labelMarkers)){ diff --git a/R/MatrixGeneScores.R b/R/MatrixGeneScores.R index 10aca1ef..5231d596 100644 --- a/R/MatrixGeneScores.R +++ b/R/MatrixGeneScores.R @@ -14,12 +14,21 @@ #' @param matrixName The name to be used for storage of the gene activity score matrix in the provided `ArchRProject` or ArrowFiles. #' @param extendUpstream The minimum and maximum number of basepairs upstream of the transcription start site to consider for gene #' activity score calculation. -#' @param extendDownstream The minimum and maximum number of basepairs downstream of the transcription start site to consider for gene activity score calculation. -#' @param tileSize The size of the tiles used for binning counts prior to gene activity score calculation. -#' @param ceiling The maximum counts per tile allowed. This is used to prevent large biases in tile counts. +#' @param extendDownstream The minimum and maximum number of basepairs downstream of the transcription start site or transcription termination site +#' (based on 'useTSS') to consider for gene activity score calculation. #' @param useGeneBoundaries A boolean value indicating whether gene boundaries should be employed during gene activity score #' calculation. Gene boundaries refers to the process of preventing tiles from contributing to the gene score of a given gene #' if there is a second gene's transcription start site between the tile and the gene of interest. +#' @param geneUpstream An integer describing the number of bp upstream the gene to extend the gene body. This effectively makes the gene body larger as there +#' are proximal peaks that should be weighted equally to the gene body. This parameter is used if 'useTSS=FALSE'. +#' @param geneDownstream An integer describing the number of bp downstream the gene to extend the gene body.This effectively makes the gene body larger as there +#' are proximal peaks that should be weighted equally to the gene body. This parameter is used if 'useTSS=FALSE'. +#' @param useTSS A boolean describing whether to build gene model based on gene TSS or the gene body. +#' @param tileSize The size of the tiles used for binning counts prior to gene activity score calculation. +#' @param ceiling The maximum counts per tile allowed. This is used to prevent large biases in tile counts. +#' @param geneScaleFactor A numeric scaling factor to weight genes based on the inverse of there length i.e. [(Scale Factor)/(Gene Length)]. This +#' is scaled from 1 to the scale factor. Small genes will be the scale factor while extremely large genes will be closer to 1. This scaling helps with +#' the relative gene score value. #' @param scaleTo Each column in the calculated gene score matrix will be normalized to a column sum designated by `scaleTo`. #' @param excludeChr A character vector containing the `seqnames` of the chromosomes that should be excluded from this analysis. #' @param blacklist A `GRanges` object containing genomic regions to blacklist that may be extremeley over-represented and thus diff --git a/R/MatrixTiles.R b/R/MatrixTiles.R index a4f8be2d..b00901fd 100644 --- a/R/MatrixTiles.R +++ b/R/MatrixTiles.R @@ -96,6 +96,7 @@ addTileMatrix <- function( excludeChr = c("chrM", "chrY"), blacklist = NULL, chromLengths = NULL, + chromSizes = NULL, force = FALSE, subThreads = 1, tstart = NULL, diff --git a/R/RNAIntegration.R b/R/RNAIntegration.R index 7c4b1be4..f9773a29 100644 --- a/R/RNAIntegration.R +++ b/R/RNAIntegration.R @@ -160,8 +160,8 @@ addGeneIntegrationMatrix <- function( } if(!all(nCell == 1)){ - .logMessage(paste0("Missing ", length(which(nCell == 0)), " Overlapping ", length(which(nCell > 1))," cells from ArchRProj in groupList!"), logFile = logFile) - stop("Missing ", length(which(nCell == 0)), " Overlapping ", length(which(nCell > 1))," cells from ArchRProj in groupList!") + .logMessage(paste0("Missing ", length(which(nCell == 0)), " cells. Found ", length(which(nCell > 1))," overlapping cells from ArchRProj in groupList! Cannot have overlapping/missing cells in ATAC input, check 'groupList' argument!"), logFile = logFile) + stop("Missing ", length(which(nCell == 0)), " cells. Found ", length(which(nCell > 1))," overlapping cells from ArchRProj in groupList! Cannot have overlapping/missing cells in ATAC input, check 'groupList' argument!") } #########################################################################################