Dev 053120 (GreenleafLab#242)

* update error messages in RNA integration * update motif names and gene score matrix documentation * bugfix positions * attempt to error handle co-accessibility error * add cellsToUse option to addCoAccessibility * bug fix args chromSizes * bugfix plotPeak2GeneHeatmap * bugfix bigwig not being normalized * add tryCatch to endLogging * add pvalue to output of differential testing * updates
LornaWessels · Jul 13, 2020 · ddcaae4 · ddcaae4
1 parent 5f855d7
commit ddcaae4
Show file tree

Hide file tree

Showing 10 changed files with 103 additions and 39 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/R/AllClasses.R b/R/AllClasses.R
@@ -391,16 +391,20 @@ loadArchRProject <- function(
       #Postions
       if(!is.null(ArchRProj@peakAnnotation[[i]]$Positions)){
 
-        PositionsNew <- gsub(outputDir, outputDirNew, ArchRProj@peakAnnotation[[i]]$Positions)
-        if(!all(file.exists(PositionsNew))){
-          if(force){
-            keepAnno[i] <- FALSE
-            message("Positions for peakAnnotation do not exist in saved ArchRProject!")
-          }else{
-            stop("Positions for peakAnnotation do not exist in saved ArchRProject!")
+        if(tolower(ArchRProj@peakAnnotation[[i]]$Positions) != "none"){
+
+          PositionsNew <- gsub(outputDir, outputDirNew, ArchRProj@peakAnnotation[[i]]$Positions)
+          if(!all(file.exists(PositionsNew))){
+            if(force){
+              keepAnno[i] <- FALSE
+              message("Positions for peakAnnotation do not exist in saved ArchRProject!")
+            }else{
+              stop("Positions for peakAnnotation do not exist in saved ArchRProject!")
+            }
           }
+          ArchRProj@peakAnnotation[[i]]$Positions <- PositionsNew
+
         }
-        ArchRProj@peakAnnotation[[i]]$Positions <- PositionsNew
 
       }
 

diff --git a/R/AnnotationPeaks.R b/R/AnnotationPeaks.R
@@ -254,6 +254,7 @@ addMotifAnnotations <- function(
   name = "Motif",
   species = NULL,
   collection = "CORE",
+  motifPWMs = NULL,
   cutOff = 5e-05, 
   width = 7,
   version = 2,
@@ -263,7 +264,7 @@ addMotifAnnotations <- function(
   ){
 
   .validInput(input = ArchRProj, name = "ArchRProj", valid = c("ArchRProj"))
-  .validInput(input = motifSet, name = "motifSet", valid = c("character"))
+  .validInput(input = motifSet, name = "motifSet", valid = c("character", "null"))
   .validInput(input = name, name = "name", valid = c("character"))
   .validInput(input = species, name = "species", valid = c("character", "null"))
   .validInput(input = collection, name = "collection", valid = c("character", "null"))
@@ -272,6 +273,17 @@ addMotifAnnotations <- function(
   .validInput(input = force, name = "force", valid = c("boolean"))
   .validInput(input = logFile, name = "logFile", valid = c("character"))
 
+  if(!is.null(motifPWMs)){
+    if(!is(motifPWMs, "PWMatrixList")){
+      stop("User Supplied motifPWMS must be a PWMatrixList!")
+    }
+    motifSet <- "Custom"
+  }
+
+  if(is.null(motifSet)){
+    stop("Must provide motifSet or motifPWMs!")
+  }
+
   .requirePackage("motifmatchr", installInfo='BiocManager::install("motifmatchr")')
 
   tstart <- Sys.time()
@@ -389,6 +401,12 @@ addMotifAnnotations <- function(
     motifs <- obj$motifs
     motifSummary <- obj$motifSummary
 
+  }else if(tolower(motifSet)=="custom"){
+
+    obj <- NULL
+    motifs <- motifPWMs
+    motifSummary <- NULL
+
   }else{
 
     stop("Error MotifSet Not Recognized!")
@@ -501,17 +519,33 @@ addMotifAnnotations <- function(
 
   motifNames <- lapply(seq_along(motifs), function(x){
     namex <- make.names(motifs[[x]]@name)
+    if(grepl("LINE", namex)){
+      splitNamex <- stringr::str_split(motifs[[x]]@ID, pattern="\\_", simplify = TRUE)
+      namex <- splitNamex[1, grep("LINE",splitNamex[1,]) + 1]
+    }
     if(substr(namex,nchar(namex),nchar(namex))=="."){
       namex <- substr(namex,1,nchar(namex)-1)
     }
     namex <- paste0(namex, "_", x)
     namex
   }) %>% unlist(.)
 
+  motifNames2 <- lapply(seq_along(motifs), function(x){
+    namex <- make.names(motifs[[x]]@name)
+    if(grepl("LINE", namex)){
+      splitNamex <- stringr::str_split(motifs[[x]]@ID, pattern="\\_", simplify = TRUE)
+      namex <- splitNamex[1, grep("LINE",splitNamex[1,]) + 1]
+    }
+    if(substr(namex,nchar(namex),nchar(namex))=="."){
+      namex <- substr(namex,1,nchar(namex)-1)
+    }
+    namex
+  }) %>% unlist(.)
+
   motifDF <- lapply(seq_along(motifs), function(x){
     df <- data.frame(
       row.names = motifNames[x],
-      name = motifs[[x]]@name[[1]],
+      name = motifNames2[[x]],
       ID = motifs[[x]]@ID,
       strand = motifs[[x]]@strand,
       stringsAsFactors = FALSE

diff --git a/R/GroupExport.R b/R/GroupExport.R
@@ -184,7 +184,7 @@ getGroupBW <- function(
   ArrowFiles <- getArrowFiles(ArchRProj)
   Groups <- getCellColData(ArchRProj = ArchRProj, select = groupBy, drop = TRUE)
 
-  if(tolower(normMethod) %in% c("ReadsInTSS", "ReadsInPromoter", "nFrags")){
+  if(tolower(normMethod) %in% tolower(c("ReadsInTSS", "ReadsInPromoter", "nFrags"))){
     normBy <- getCellColData(ArchRProj = ArchRProj, select = normMethod)
   }else{
     normBy <- NULL
@@ -292,7 +292,8 @@ getGroupBW <- function(
 
   #Cells
   cellGroupi <- cellGroups[[i]]
-
+  #print(sum(normBy[cellGroupi, 1]))
+
   #Bigwig File!
   covFile <- file.path(bwDir, paste0(make.names(names(cellGroups)[i]), "-TileSize-",tileSize,"-normMethod-",normMethod,"-ArchR.bw"))
   rmf <- .suppressAll(file.remove(covFile))
@@ -345,7 +346,7 @@ getGroupBW <- function(
       tilesk$reads <- mat
 
       if(tolower(normMethod) %in% c("ReadsInTSS", "ReadsInPromoter", "nFrags")){
-        tilesk$reads <- tilesk$reads * 10^6 / sum(normBy[cellGroupi, 1])
+        tilesk$reads <- tilesk$reads * 10^4 / sum(normBy[cellGroupi, 1])
       }else if(tolower(normMethod) %in% c("nCells")){
         tilesk$reads <- tilesk$reads / length(cellGroupi)
       }

diff --git a/R/IntegrativeAnalysis.R b/R/IntegrativeAnalysis.R
@@ -668,6 +668,7 @@ correlateTrajectories <- function(
 #' `reducedDims` were originally created during dimensionality reduction. This idea was introduced by Timothy Stuart.
 #' @param corCutOff A numeric cutoff for the correlation of each dimension to the sequencing depth. If the dimension has a correlation to
 #' sequencing depth that is greater than the `corCutOff`, it will be excluded from analysis.
+#' @param cellsToUse A character vector of cellNames to compute coAccessibility on if desired to run on a subset of the total cells.
 #' @param k The number of k-nearest neighbors to use for creating single-cell groups for correlation analyses.
 #' @param knnIteration The number of k-nearest neighbor groupings to test for passing the supplied `overlapCutoff`.
 #' @param overlapCutoff The maximum allowable overlap between the current group and all previous groups to permit the current group be
@@ -688,6 +689,7 @@ addCoAccessibility <- function(
   dimsToUse = 1:30,
   scaleDims = NULL,
   corCutOff = 0.75,
+  cellsToUse = NULL,
   k = 100, 
   knnIteration = 500, 
   overlapCutoff = 0.8, 
@@ -705,6 +707,7 @@ addCoAccessibility <- function(
   .validInput(input = dimsToUse, name = "dimsToUse", valid = c("numeric", "null"))
   .validInput(input = scaleDims, name = "scaleDims", valid = c("boolean", "null"))
   .validInput(input = corCutOff, name = "corCutOff", valid = c("numeric", "null"))
+  .validInput(input = cellsToUse, name = "cellsToUse", valid = c("character", "null"))
   .validInput(input = k, name = "k", valid = c("integer"))
   .validInput(input = knnIteration, name = "knnIteration", valid = c("integer"))
   .validInput(input = overlapCutoff, name = "overlapCutoff", valid = c("numeric"))
@@ -726,6 +729,9 @@ addCoAccessibility <- function(
 
   #Get Reduced Dims
   rD <- getReducedDims(ArchRProj, reducedDims = reducedDims, corCutOff = corCutOff, dimsToUse = dimsToUse)
+  if(!is.null(cellsToUse)){
+    rD <- rD[cellsToUse, ,drop=FALSE]
+  }
 
   #Subsample
   idx <- sample(seq_len(nrow(rD)), knnIteration, replace = !nrow(rD) >= knnIteration)
@@ -762,7 +768,7 @@ addCoAccessibility <- function(
   o$seqnames <- seqnames(peakSet)[o[,1]]
   o$idx1 <- peakSet$idx[o[,1]]
   o$idx2 <- peakSet$idx[o[,2]]
-  o$correlation <- NA
+  o$correlation <- -999
 
   #Peak Matrix ColSums
   cS <- .getColSums(getArrowFiles(ArchRProj), chri, verbose = FALSE, useMatrix = "PeakMatrix")
@@ -795,7 +801,10 @@ addCoAccessibility <- function(
 
     #Correlations
     idx <- BiocGenerics::which(o$seqnames==chri[x])
-    o[idx,]$correlation <- rowCorCpp(idxX = o[idx,]$idx1, idxY = o[idx,]$idx2, X = as.matrix(groupMat), Y = as.matrix(groupMat))
+    corVals <- rowCorCpp(idxX = o[idx,]$idx1, idxY = o[idx,]$idx2, X = as.matrix(groupMat), Y = as.matrix(groupMat))
+    .logThis(head(corVals), paste0("SubsetCorVals-", x), logFile = logFile)
+
+    o[idx,]$correlation <- as.numeric(corVals)
 
     .logThis(groupMat, paste0("SubsetGroupMat-", x), logFile = logFile)
     .logThis(o[idx,], paste0("SubsetCoA-", x), logFile = logFile)
@@ -1425,7 +1434,7 @@ plotPeak2GeneHeatmap <- function(
     KNNx <- KNNList[[x]]
     names(sort(table(ccd[KNNx, 1, drop = TRUE]), decreasing = TRUE))[1]
   }) %>% unlist
-  cD <- DataFrame(row.names=paste0("K", seq_len(ncol(mATAC))), groupBy = KNNGroups)
+  cD <- DataFrame(row.names=paste0("K_", seq_len(ncol(mATAC))), groupBy = KNNGroups)
   pal <- paletteDiscrete(values=gtools::mixedsort(unique(ccd[,1])))
   if(!is.null(palGroup)){
     pal[names(palGroup)[names(palGroup) %in% names(pal)]] <- palGroup[names(palGroup) %in% names(pal)]

diff --git a/R/LoggerUtils.R b/R/LoggerUtils.R
@@ -547,15 +547,18 @@ createLogFile <- function(
   }
 
   rL <- readLines(logFile)
-  t1 <- gsub("Start Time : ","", grep("Start Time", rL, ignore.case = TRUE, value = TRUE))
-  mn <- as.numeric(difftime(Sys.time(), as.POSIXct(t1), units = "mins"))
-  hr <- as.numeric(difftime(Sys.time(), as.POSIXct(t1), units = "hours"))
-  cat("\n------- Completed\n\n", file = logFile, append = TRUE)
-  cat(paste0("End Time : ",Sys.time(),"\n"), file = logFile, append = TRUE)
-  cat(paste0("Elapsed Time Minutes = ", mn), file = logFile, append = TRUE)
-  cat(paste0("\nElapsed Time Hours = ", hr), file = logFile, append = TRUE)
-  cat("\n\n-------\n\n\n\n", file = logFile, append = TRUE)
-  message("ArchR logging successful to : ", logFile)
+  o <- tryCatch({
+    t1 <- gsub("Start Time : ","", grep("Start Time", rL, ignore.case = TRUE, value = TRUE))
+    mn <- as.numeric(difftime(Sys.time(), as.POSIXct(t1), units = "mins"))
+    hr <- as.numeric(difftime(Sys.time(), as.POSIXct(t1), units = "hours"))
+    cat("\n------- Completed\n\n", file = logFile, append = TRUE)
+    cat(paste0("End Time : ",Sys.time(),"\n"), file = logFile, append = TRUE)
+    cat(paste0("Elapsed Time Minutes = ", mn), file = logFile, append = TRUE)
+    cat(paste0("\nElapsed Time Hours = ", hr), file = logFile, append = TRUE)
+    cat("\n\n-------\n\n\n\n", file = logFile, append = TRUE)
+    message("ArchR logging successful to : ", logFile)
+  }, error = function(x){
+  })
 
   # tryCatch({
   #   R.utils::gzip(logFile, paste0(logFile, ".gz"))

diff --git a/R/MarkerFeatures.R b/R/MarkerFeatures.R
@@ -251,6 +251,7 @@ getMarkerFeatures <- function(
               Log2FC = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$log2FC)) %>% Reduce("cbind",.),
               Mean = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1)) %>% Reduce("cbind",.),
               FDR = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$fdr)) %>% Reduce("cbind",.),
+              Pval = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$pval)) %>% Reduce("cbind",.),
               MeanDiff = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1 - diffList[[x]]$mean2)) %>% Reduce("cbind",.),
               AUC = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$auc)) %>% Reduce("cbind",.),
               MeanBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean2)) %>% Reduce("cbind",.)
@@ -265,6 +266,7 @@ getMarkerFeatures <- function(
               Mean = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1)) %>% Reduce("cbind",.),
               Variance = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$var1)) %>% Reduce("cbind",.),
               FDR = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$fdr)) %>% Reduce("cbind",.),
+              Pval = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$pval)) %>% Reduce("cbind",.),
               MeanDiff = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1 - diffList[[x]]$mean2)) %>% Reduce("cbind",.),
               MeanBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean2)) %>% Reduce("cbind",.),
               VarianceBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$var2)) %>% Reduce("cbind",.)
@@ -278,6 +280,7 @@ getMarkerFeatures <- function(
               Log2FC = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$log2FC)) %>% Reduce("cbind",.),
               Mean = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1)) %>% Reduce("cbind",.),
               FDR = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$fdr)) %>% Reduce("cbind",.),
+              Pval = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$pval)) %>% Reduce("cbind",.),
               MeanDiff = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1 - diffList[[x]]$mean2)) %>% Reduce("cbind",.),
               MeanBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean2)) %>% Reduce("cbind",.)
             ),
@@ -863,8 +866,8 @@ plotMarkerHeatmap <- function(
   .logThis(mat, "mat", logFile = logFile) 
 
   idx <- which(rowSums(passMat, na.rm = TRUE) > 0 & matrixStats::rowVars(mat) != 0 & !is.na(matrixStats::rowVars(mat)))
-  mat <- mat[idx,]
-  passMat <- passMat[idx,]
+  mat <- mat[idx,,drop=FALSE]
+  passMat <- passMat[idx,,drop=FALSE]
 
   if(nrow(mat) == 0){
     stop("No Makers Found!")
@@ -887,7 +890,7 @@ plotMarkerHeatmap <- function(
   #identify to remove
   if(!is.null(grepExclude) & !is.null(rownames(mat))){
     idx2 <- which(!grepl(grepExclude, rownames(mat)))
-    mat <- mat[idx2,]
+    mat <- mat[idx2,,drop=FALSE]
   }
 
   if(nrow(mat)==0){
@@ -906,11 +909,11 @@ plotMarkerHeatmap <- function(
 
   if(binaryClusterRows){
     if(invert){
-      bS <- .binarySort(-mat, lmat = passMat[rownames(mat), colnames(mat)], clusterCols = clusterCols)
-      mat <- -bS[[1]][,colnames(mat)]
+      bS <- .binarySort(-mat, lmat = passMat[rownames(mat), colnames(mat),drop=FALSE], clusterCols = clusterCols)
+      mat <- -bS[[1]][,colnames(mat),drop=FALSE]
     }else{
-      bS <- .binarySort(mat, lmat = passMat[rownames(mat), colnames(mat)], clusterCols = clusterCols)
-      mat <- bS[[1]][,colnames(mat)]
+      bS <- .binarySort(mat, lmat = passMat[rownames(mat), colnames(mat),drop=FALSE], clusterCols = clusterCols)
+      mat <- bS[[1]][,colnames(mat),drop=FALSE]
     }
     clusterRows <- FALSE
     clusterCols <- bS[[2]]
@@ -945,9 +948,9 @@ plotMarkerHeatmap <- function(
 
     #mat <- t(mat[rev(seq_len(nrow(mat))), rev(clusterCols$order)])
     if(!is.null(clusterCols)){
-      mat <- t(mat[seq_len(nrow(mat)), clusterCols$order])
+      mat <- t(mat[seq_len(nrow(mat)), clusterCols$order, drop = FALSE])
     }else{
-      mat <- t(mat[seq_len(nrow(mat)), ])
+      mat <- t(mat[seq_len(nrow(mat)), , drop = FALSE])
     }
 
     if(!is.null(labelMarkers)){

diff --git a/R/MatrixGeneScores.R b/R/MatrixGeneScores.R
@@ -14,12 +14,21 @@
 #' @param matrixName The name to be used for storage of the gene activity score matrix in the provided `ArchRProject` or ArrowFiles.
 #' @param extendUpstream The minimum and maximum number of basepairs upstream of the transcription start site to consider for gene
 #' activity score calculation.
-#' @param extendDownstream The minimum and maximum number of basepairs downstream of the transcription start site to consider for gene activity score calculation.
-#' @param tileSize The size of the tiles used for binning counts prior to gene activity score calculation.
-#' @param ceiling The maximum counts per tile allowed. This is used to prevent large biases in tile counts.
+#' @param extendDownstream The minimum and maximum number of basepairs downstream of the transcription start site or transcription termination site 
+#' (based on 'useTSS') to consider for gene activity score calculation.
 #' @param useGeneBoundaries A boolean value indicating whether gene boundaries should be employed during gene activity score
 #' calculation. Gene boundaries refers to the process of preventing tiles from contributing to the gene score of a given gene
 #' if there is a second gene's transcription start site between the tile and the gene of interest.
+#' @param geneUpstream An integer describing the number of bp upstream of the gene to extend the gene body. This effectively makes the gene body larger as there
+#' are proximal peaks that should be weighted equally to the gene body. This parameter is used if 'useTSS=FALSE'.
+#' @param geneDownstream An integer describing the number of bp downstream of the gene to extend the gene body. This effectively makes the gene body larger as there
+#' are proximal peaks that should be weighted equally to the gene body. This parameter is used if 'useTSS=FALSE'.
+#' @param useTSS A boolean describing whether to build gene model based on gene TSS or the gene body.
+#' @param tileSize The size of the tiles used for binning counts prior to gene activity score calculation.
+#' @param ceiling The maximum counts per tile allowed. This is used to prevent large biases in tile counts.
+#' @param geneScaleFactor A numeric scaling factor to weight genes based on the inverse of their length i.e. [(Scale Factor)/(Gene Length)]. This
+#' is scaled from 1 to the scale factor. Small genes will be the scale factor while extremely large genes will be closer to 1. This scaling helps with
+#' the relative gene score value.
 #' @param scaleTo Each column in the calculated gene score matrix will be normalized to a column sum designated by `scaleTo`.
 #' @param excludeChr A character vector containing the `seqnames` of the chromosomes that should be excluded from this analysis.
 #' @param blacklist A `GRanges` object containing genomic regions to blacklist that may be extremeley over-represented and thus

diff --git a/R/MatrixTiles.R b/R/MatrixTiles.R
@@ -96,6 +96,7 @@ addTileMatrix <- function(
   excludeChr = c("chrM", "chrY"), 
   blacklist = NULL, 
   chromLengths = NULL, 
+  chromSizes = NULL,
   force = FALSE,
   subThreads = 1,
   tstart = NULL,

diff --git a/R/RNAIntegration.R b/R/RNAIntegration.R
@@ -160,8 +160,8 @@ addGeneIntegrationMatrix <- function(
   }
 
   if(!all(nCell == 1)){
-    .logMessage(paste0("Missing ", length(which(nCell == 0)), " Overlapping ", length(which(nCell > 1))," cells from ArchRProj in groupList!"), logFile = logFile)
-    stop("Missing ", length(which(nCell == 0)), " Overlapping ", length(which(nCell > 1))," cells from ArchRProj in groupList!")
+    .logMessage(paste0("Missing ", length(which(nCell == 0)), " cells. Found ", length(which(nCell > 1))," overlapping cells from ArchRProj in groupList! Cannot have overlapping/missing cells in ATAC input, check 'groupList' argument!"), logFile = logFile)
+    stop("Missing ", length(which(nCell == 0)), " cells. Found ", length(which(nCell > 1))," overlapping cells from ArchRProj in groupList! Cannot have overlapping/missing cells in ATAC input, check 'groupList' argument!")
   }
 
   #########################################################################################