From ddcaae4a6093685875052219141e5ea41030fc55 Mon Sep 17 00:00:00 2001 From: jgranja24 Date: Mon, 13 Jul 2020 11:17:44 -0700 Subject: [PATCH] Dev 053120 (#242) * update error meesages in RNA integration * update motif names and gene score matrix documentation * bugfix positions * attempt to error handle co-accessibiltiy error * add cellsToUse option to addCoAccessibility * bug fix args chromSizes * bugfix plotPeak2GeneHeatmap * bugfix bigwig not being normalized * add tryCatch to endLogging * add pvalue to output of differential testing * updates --- .DS_Store | Bin 12292 -> 12292 bytes R/AllClasses.R | 20 ++++++++++++-------- R/AnnotationPeaks.R | 38 ++++++++++++++++++++++++++++++++++++-- R/GroupExport.R | 7 ++++--- R/IntegrativeAnalysis.R | 15 ++++++++++++--- R/LoggerUtils.R | 21 ++++++++++++--------- R/MarkerFeatures.R | 21 ++++++++++++--------- R/MatrixGeneScores.R | 15 ++++++++++++--- R/MatrixTiles.R | 1 + R/RNAIntegration.R | 4 ++-- 10 files changed, 103 insertions(+), 39 deletions(-) diff --git a/.DS_Store b/.DS_Store index ddbe2f5d6cfd576ddafdbee0242a81c1a2287bac..c0f50e6f66facda6c8a73b019cf69d3547665ea7 100644 GIT binary patch delta 1396 zcmdUtO-xi*6vxkh7~o#H$i*4p0n&kve5hlb44`6&l+M(Ops0mtZHvtKp2>^D49<*1 z#db_ELlckNlc zcklh*bI-~7{SOWw96r*I(uoZ>%TPQiUM{%pFLsgl=N1dn_D%%p=oijePvf9S0>Bn39qQMp4VvEt)^}yGUg7`NbzzQ zd>|1WG4fZmYOR(@Kijf9X71W;X7=Xh$?~$g*EI3I9?cl*lDeIx%ZH3R_x)NUlULb$ zwi-?u>Z?1oTC_!6CfGpQ+-Kw;2`mv}V5gbTWpJH;wijsV}XJvlO>T2DL2-CCH*yKKlV-WN@0`bUk#Y<4qoxY9Nh#S$z>7rGI_TUf0qY{WZ?#2|*S4N=8oCr0u# zc3~g(WAGznaR8s;a~xJgzEnhx=b4SFL%{%RKIh1HZc_OX{B#)fw?<#vC&u%8_Xa{`kL*1^!8Q%hUf~*M9>0<1S_Z delta 2310 zcmd6oYfw~W7{{OgvcS1;Ie@t5tirA=$i-E5VF48t5WIsL(s&KI?AEL<3$lQtX4}Q4 zOe>v}Syp4(Ok>fPRGdn?*axf8YEnwn{7^GFQy=m}O;(d*?|ILHm$drSv>*1J_jz{C z`<(ap|NnPO%a)cMi;(u_;-jgM4<#vkExOD3Xv$=-+ga)@+3a?1@_Kyw5oe_v)EJ8N z)wYE?+lVYj+qy!*NRP*J9+o(3d_tliO0t}+&Yd@ZL4AY1&zhmXm9b8*wPx5vu|LH& zc3fUjv2V(Z+QC#MEj?qDEbFJNMNtyP!E_}pRn^ong5naE^|l4q^#*%3@SkO6tMV#; 
zYfqq|(;vJy81O3tg2+2L*{ZtI=+t%x*tcI0_+#XJwNO-Ag{DYwW5BGqFv5u4Tq z!+!H9uUsl9e(~;5Yg;rBGHd1XWIB7N7uJTdKPsB!fdRM+4;tM)v>Xb8@(F8-xx}8Rrl^7;48Vzg{Udq+7S)7 zsy${w0SS~v9-2w>shPTH18t*cX)hh70Xjk-(nfCW~>BLz0( zVm$Iu057Iv2C6U@^RNK*xC<+=68E4P?FgX*o#;XzqIeJw;}Jx+;Ti10b9f&6(T{g< z4DaGSe1tD>8fP$wi@1c#_!(EtAuWhazi{|1%oW$zY6_$)CXknJFAxV8Na1Ql>5T*$ z^#j(VVVx+VcrL{hN9JgoGp|UOY%3t^0b71_jAmCIIgVU`-LxVM8e2Kv!B7tf3i#HM z@0chEie*sM>|=^O4q1s~IuQBRS0+1@c!pfy+bL6JS+O!rnB5swtee2#$DUg!&lb}x zBq}sQFM0=j*XjK&yHDseeZ#o@PS+SM3F(ZNlksw+oUy7#E$T1}3$cjNYD5#K)q>U- zu`s$;VFNZ{Gag{z9>){djwkUHb~9=(;6=QIS8)KZGk$Mi07vk4jN=J>jv-t$35}3z zxP{y${j8dw7)KH6rgmL*?#w7**7te2^-DdI-Oh4fg?`LgRu=os4I!|5nDNjJA)w8h zFV!y#ws(e?FcUJmlk^9bQ9Kc9teK-n1TXd{WoG4!pWrE*T2-g-8&{gNl@S?jQ#i7j+gc~wpBnS-u2Sjdr zjdI9G(`Yd*CqJ#H?X-(oe}9zvne&g)3HprAF!!IOA*Q46=`#JswDbpEMFLVW7TItx zJr$yi>8XP0X%^;i|2JYOmUI96v4*=pjP*=H4{`5rj~RatUco^e=F)i|AK*An;uOBb zc}%$Q585%bHLRR%iQ|%I&ynixNIvU2`EyCewHbc{nc1#_i6s@&YV=f_syF8rCf`I? ze=K)q)ShS07YwIc40UKyk=<=%yb{N^_F~Cv#JLi0c70O0k@t$#>`sv?jbK+2%% unlist(.) + motifNames2 <- lapply(seq_along(motifs), function(x){ + namex <- make.names(motifs[[x]]@name) + if(grepl("LINE", namex)){ + splitNamex <- stringr::str_split(motifs[[x]]@ID, pattern="\\_", simplify = TRUE) + namex <- splitNamex[1, grep("LINE",splitNamex[1,]) + 1] + } + if(substr(namex,nchar(namex),nchar(namex))=="."){ + namex <- substr(namex,1,nchar(namex)-1) + } + namex + }) %>% unlist(.) 
+ motifDF <- lapply(seq_along(motifs), function(x){ df <- data.frame( row.names = motifNames[x], - name = motifs[[x]]@name[[1]], + name = motifNames2[[x]], ID = motifs[[x]]@ID, strand = motifs[[x]]@strand, stringsAsFactors = FALSE diff --git a/R/GroupExport.R b/R/GroupExport.R index c99b6465..c9025f57 100644 --- a/R/GroupExport.R +++ b/R/GroupExport.R @@ -184,7 +184,7 @@ getGroupBW <- function( ArrowFiles <- getArrowFiles(ArchRProj) Groups <- getCellColData(ArchRProj = ArchRProj, select = groupBy, drop = TRUE) - if(tolower(normMethod) %in% c("ReadsInTSS", "ReadsInPromoter", "nFrags")){ + if(tolower(normMethod) %in% tolower(c("ReadsInTSS", "ReadsInPromoter", "nFrags"))){ normBy <- getCellColData(ArchRProj = ArchRProj, select = normMethod) }else{ normBy <- NULL @@ -292,7 +292,8 @@ getGroupBW <- function( #Cells cellGroupi <- cellGroups[[i]] - + #print(sum(normBy[cellGroupi, 1])) + #Bigwig File! covFile <- file.path(bwDir, paste0(make.names(names(cellGroups)[i]), "-TileSize-",tileSize,"-normMethod-",normMethod,"-ArchR.bw")) rmf <- .suppressAll(file.remove(covFile)) @@ -345,7 +346,7 @@ getGroupBW <- function( tilesk$reads <- mat if(tolower(normMethod) %in% c("ReadsInTSS", "ReadsInPromoter", "nFrags")){ - tilesk$reads <- tilesk$reads * 10^6 / sum(normBy[cellGroupi, 1]) + tilesk$reads <- tilesk$reads * 10^4 / sum(normBy[cellGroupi, 1]) }else if(tolower(normMethod) %in% c("nCells")){ tilesk$reads <- tilesk$reads / length(cellGroupi) } diff --git a/R/IntegrativeAnalysis.R b/R/IntegrativeAnalysis.R index e4b6f936..69425cca 100644 --- a/R/IntegrativeAnalysis.R +++ b/R/IntegrativeAnalysis.R @@ -668,6 +668,7 @@ correlateTrajectories <- function( #' `reducedDims` were originally created during dimensionality reduction. This idea was introduced by Timothy Stuart. #' @param corCutOff A numeric cutoff for the correlation of each dimension to the sequencing depth. 
If the dimension has a correlation to #' sequencing depth that is greater than the `corCutOff`, it will be excluded from analysis. +#' @param cellsToUse A character vector of cellNames to compute coAccessibility on if desired to run on a subset of the total cells. #' @param k The number of k-nearest neighbors to use for creating single-cell groups for correlation analyses. #' @param knnIteration The number of k-nearest neighbor groupings to test for passing the supplied `overlapCutoff`. #' @param overlapCutoff The maximum allowable overlap between the current group and all previous groups to permit the current group be @@ -688,6 +689,7 @@ addCoAccessibility <- function( dimsToUse = 1:30, scaleDims = NULL, corCutOff = 0.75, + cellsToUse = NULL, k = 100, knnIteration = 500, overlapCutoff = 0.8, @@ -705,6 +707,7 @@ addCoAccessibility <- function( .validInput(input = dimsToUse, name = "dimsToUse", valid = c("numeric", "null")) .validInput(input = scaleDims, name = "scaleDims", valid = c("boolean", "null")) .validInput(input = corCutOff, name = "corCutOff", valid = c("numeric", "null")) + .validInput(input = cellsToUse, name = "cellsToUse", valid = c("character", "null")) .validInput(input = k, name = "k", valid = c("integer")) .validInput(input = knnIteration, name = "knnIteration", valid = c("integer")) .validInput(input = overlapCutoff, name = "overlapCutoff", valid = c("numeric")) @@ -726,6 +729,9 @@ addCoAccessibility <- function( #Get Reduced Dims rD <- getReducedDims(ArchRProj, reducedDims = reducedDims, corCutOff = corCutOff, dimsToUse = dimsToUse) + if(!is.null(cellsToUse)){ + rD <- rD[cellsToUse, ,drop=FALSE] + } #Subsample idx <- sample(seq_len(nrow(rD)), knnIteration, replace = !nrow(rD) >= knnIteration) @@ -762,7 +768,7 @@ addCoAccessibility <- function( o$seqnames <- seqnames(peakSet)[o[,1]] o$idx1 <- peakSet$idx[o[,1]] o$idx2 <- peakSet$idx[o[,2]] - o$correlation <- NA + o$correlation <- -999 #Peak Matrix ColSums cS <- 
.getColSums(getArrowFiles(ArchRProj), chri, verbose = FALSE, useMatrix = "PeakMatrix") @@ -795,7 +801,10 @@ addCoAccessibility <- function( #Correlations idx <- BiocGenerics::which(o$seqnames==chri[x]) - o[idx,]$correlation <- rowCorCpp(idxX = o[idx,]$idx1, idxY = o[idx,]$idx2, X = as.matrix(groupMat), Y = as.matrix(groupMat)) + corVals <- rowCorCpp(idxX = o[idx,]$idx1, idxY = o[idx,]$idx2, X = as.matrix(groupMat), Y = as.matrix(groupMat)) + .logThis(head(corVals), paste0("SubsetCorVals-", x), logFile = logFile) + + o[idx,]$correlation <- as.numeric(corVals) .logThis(groupMat, paste0("SubsetGroupMat-", x), logFile = logFile) .logThis(o[idx,], paste0("SubsetCoA-", x), logFile = logFile) @@ -1425,7 +1434,7 @@ plotPeak2GeneHeatmap <- function( KNNx <- KNNList[[x]] names(sort(table(ccd[KNNx, 1, drop = TRUE]), decreasing = TRUE))[1] }) %>% unlist - cD <- DataFrame(row.names=paste0("K", seq_len(ncol(mATAC))), groupBy = KNNGroups) + cD <- DataFrame(row.names=paste0("K_", seq_len(ncol(mATAC))), groupBy = KNNGroups) pal <- paletteDiscrete(values=gtools::mixedsort(unique(ccd[,1]))) if(!is.null(palGroup)){ pal[names(palGroup)[names(palGroup) %in% names(pal)]] <- palGroup[names(palGroup) %in% names(pal)] diff --git a/R/LoggerUtils.R b/R/LoggerUtils.R index d593ddc9..4df2e5b3 100644 --- a/R/LoggerUtils.R +++ b/R/LoggerUtils.R @@ -547,15 +547,18 @@ createLogFile <- function( } rL <- readLines(logFile) - t1 <- gsub("Start Time : ","", grep("Start Time", rL, ignore.case = TRUE, value = TRUE)) - mn <- as.numeric(difftime(Sys.time(), as.POSIXct(t1), units = "mins")) - hr <- as.numeric(difftime(Sys.time(), as.POSIXct(t1), units = "hours")) - cat("\n------- Completed\n\n", file = logFile, append = TRUE) - cat(paste0("End Time : ",Sys.time(),"\n"), file = logFile, append = TRUE) - cat(paste0("Elapsed Time Minutes = ", mn), file = logFile, append = TRUE) - cat(paste0("\nElapsed Time Hours = ", hr), file = logFile, append = TRUE) - cat("\n\n-------\n\n\n\n", file = logFile, append = 
TRUE) - message("ArchR logging successful to : ", logFile) + o <- tryCatch({ + t1 <- gsub("Start Time : ","", grep("Start Time", rL, ignore.case = TRUE, value = TRUE)) + mn <- as.numeric(difftime(Sys.time(), as.POSIXct(t1), units = "mins")) + hr <- as.numeric(difftime(Sys.time(), as.POSIXct(t1), units = "hours")) + cat("\n------- Completed\n\n", file = logFile, append = TRUE) + cat(paste0("End Time : ",Sys.time(),"\n"), file = logFile, append = TRUE) + cat(paste0("Elapsed Time Minutes = ", mn), file = logFile, append = TRUE) + cat(paste0("\nElapsed Time Hours = ", hr), file = logFile, append = TRUE) + cat("\n\n-------\n\n\n\n", file = logFile, append = TRUE) + message("ArchR logging successful to : ", logFile) + }, error = function(x){ + }) # tryCatch({ # R.utils::gzip(logFile, paste0(logFile, ".gz")) diff --git a/R/MarkerFeatures.R b/R/MarkerFeatures.R index 4ce95035..c6e9a38e 100644 --- a/R/MarkerFeatures.R +++ b/R/MarkerFeatures.R @@ -251,6 +251,7 @@ getMarkerFeatures <- function( Log2FC = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$log2FC)) %>% Reduce("cbind",.), Mean = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1)) %>% Reduce("cbind",.), FDR = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$fdr)) %>% Reduce("cbind",.), + Pval = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$pval)) %>% Reduce("cbind",.), MeanDiff = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1 - diffList[[x]]$mean2)) %>% Reduce("cbind",.), AUC = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$auc)) %>% Reduce("cbind",.), MeanBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean2)) %>% Reduce("cbind",.) 
@@ -265,6 +266,7 @@ getMarkerFeatures <- function( Mean = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1)) %>% Reduce("cbind",.), Variance = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$var1)) %>% Reduce("cbind",.), FDR = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$fdr)) %>% Reduce("cbind",.), + Pval = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$pval)) %>% Reduce("cbind",.), MeanDiff = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1 - diffList[[x]]$mean2)) %>% Reduce("cbind",.), MeanBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean2)) %>% Reduce("cbind",.), VarianceBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$var2)) %>% Reduce("cbind",.) @@ -278,6 +280,7 @@ getMarkerFeatures <- function( Log2FC = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$log2FC)) %>% Reduce("cbind",.), Mean = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1)) %>% Reduce("cbind",.), FDR = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$fdr)) %>% Reduce("cbind",.), + Pval = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$pval)) %>% Reduce("cbind",.), MeanDiff = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean1 - diffList[[x]]$mean2)) %>% Reduce("cbind",.), MeanBGD = lapply(seq_along(diffList), function(x) data.frame(x = diffList[[x]]$mean2)) %>% Reduce("cbind",.) 
), @@ -863,8 +866,8 @@ plotMarkerHeatmap <- function( .logThis(mat, "mat", logFile = logFile) idx <- which(rowSums(passMat, na.rm = TRUE) > 0 & matrixStats::rowVars(mat) != 0 & !is.na(matrixStats::rowVars(mat))) - mat <- mat[idx,] - passMat <- passMat[idx,] + mat <- mat[idx,,drop=FALSE] + passMat <- passMat[idx,,drop=FALSE] if(nrow(mat) == 0){ stop("No Makers Found!") @@ -887,7 +890,7 @@ plotMarkerHeatmap <- function( #identify to remove if(!is.null(grepExclude) & !is.null(rownames(mat))){ idx2 <- which(!grepl(grepExclude, rownames(mat))) - mat <- mat[idx2,] + mat <- mat[idx2,,drop=FALSE] } if(nrow(mat)==0){ @@ -906,11 +909,11 @@ plotMarkerHeatmap <- function( if(binaryClusterRows){ if(invert){ - bS <- .binarySort(-mat, lmat = passMat[rownames(mat), colnames(mat)], clusterCols = clusterCols) - mat <- -bS[[1]][,colnames(mat)] + bS <- .binarySort(-mat, lmat = passMat[rownames(mat), colnames(mat),drop=FALSE], clusterCols = clusterCols) + mat <- -bS[[1]][,colnames(mat),drop=FALSE] }else{ - bS <- .binarySort(mat, lmat = passMat[rownames(mat), colnames(mat)], clusterCols = clusterCols) - mat <- bS[[1]][,colnames(mat)] + bS <- .binarySort(mat, lmat = passMat[rownames(mat), colnames(mat),drop=FALSE], clusterCols = clusterCols) + mat <- bS[[1]][,colnames(mat),drop=FALSE] } clusterRows <- FALSE clusterCols <- bS[[2]] @@ -945,9 +948,9 @@ plotMarkerHeatmap <- function( #mat <- t(mat[rev(seq_len(nrow(mat))), rev(clusterCols$order)]) if(!is.null(clusterCols)){ - mat <- t(mat[seq_len(nrow(mat)), clusterCols$order]) + mat <- t(mat[seq_len(nrow(mat)), clusterCols$order, drop = FALSE]) }else{ - mat <- t(mat[seq_len(nrow(mat)), ]) + mat <- t(mat[seq_len(nrow(mat)), , drop = FALSE]) } if(!is.null(labelMarkers)){ diff --git a/R/MatrixGeneScores.R b/R/MatrixGeneScores.R index 10aca1ef..5231d596 100644 --- a/R/MatrixGeneScores.R +++ b/R/MatrixGeneScores.R @@ -14,12 +14,21 @@ #' @param matrixName The name to be used for storage of the gene activity score matrix in the provided 
`ArchRProject` or ArrowFiles. #' @param extendUpstream The minimum and maximum number of basepairs upstream of the transcription start site to consider for gene #' activity score calculation. -#' @param extendDownstream The minimum and maximum number of basepairs downstream of the transcription start site to consider for gene activity score calculation. -#' @param tileSize The size of the tiles used for binning counts prior to gene activity score calculation. -#' @param ceiling The maximum counts per tile allowed. This is used to prevent large biases in tile counts. +#' @param extendDownstream The minimum and maximum number of basepairs downstream of the transcription start site or transcription termination site +#' (based on 'useTSS') to consider for gene activity score calculation. #' @param useGeneBoundaries A boolean value indicating whether gene boundaries should be employed during gene activity score #' calculation. Gene boundaries refers to the process of preventing tiles from contributing to the gene score of a given gene #' if there is a second gene's transcription start site between the tile and the gene of interest. +#' @param geneUpstream An integer describing the number of bp upstream of the gene to extend the gene body. This effectively makes the gene body larger as there +#' are proximal peaks that should be weighted equally to the gene body. This parameter is used if 'useTSS=FALSE'. +#' @param geneDownstream An integer describing the number of bp downstream of the gene to extend the gene body. This effectively makes the gene body larger as there +#' are proximal peaks that should be weighted equally to the gene body. This parameter is used if 'useTSS=FALSE'. +#' @param useTSS A boolean describing whether to build the gene model based on gene TSS or the gene body. +#' @param tileSize The size of the tiles used for binning counts prior to gene activity score calculation. +#' @param ceiling The maximum counts per tile allowed. 
This is used to prevent large biases in tile counts. +#' @param geneScaleFactor A numeric scaling factor to weight genes based on the inverse of their length i.e. [(Scale Factor)/(Gene Length)]. This +#' is scaled from 1 to the scale factor. Small genes will be the scale factor while extremely large genes will be closer to 1. This scaling helps with +#' the relative gene score value. #' @param scaleTo Each column in the calculated gene score matrix will be normalized to a column sum designated by `scaleTo`. #' @param excludeChr A character vector containing the `seqnames` of the chromosomes that should be excluded from this analysis. #' @param blacklist A `GRanges` object containing genomic regions to blacklist that may be extremeley over-represented and thus diff --git a/R/MatrixTiles.R b/R/MatrixTiles.R index a4f8be2d..b00901fd 100644 --- a/R/MatrixTiles.R +++ b/R/MatrixTiles.R @@ -96,6 +96,7 @@ addTileMatrix <- function( excludeChr = c("chrM", "chrY"), blacklist = NULL, chromLengths = NULL, + chromSizes = NULL, force = FALSE, subThreads = 1, tstart = NULL, diff --git a/R/RNAIntegration.R b/R/RNAIntegration.R index 7c4b1be4..f9773a29 100644 --- a/R/RNAIntegration.R +++ b/R/RNAIntegration.R @@ -160,8 +160,8 @@ addGeneIntegrationMatrix <- function( } if(!all(nCell == 1)){ - .logMessage(paste0("Missing ", length(which(nCell == 0)), " Overlapping ", length(which(nCell > 1))," cells from ArchRProj in groupList!"), logFile = logFile) - stop("Missing ", length(which(nCell == 0)), " Overlapping ", length(which(nCell > 1))," cells from ArchRProj in groupList!") + .logMessage(paste0("Missing ", length(which(nCell == 0)), " cells. Found ", length(which(nCell > 1))," overlapping cells from ArchRProj in groupList! Cannot have overlapping/missing cells in ATAC input, check 'groupList' argument!"), logFile = logFile) + stop("Missing ", length(which(nCell == 0)), " cells. Found ", length(which(nCell > 1))," overlapping cells from ArchRProj in groupList! 
Cannot have overlapping/missing cells in ATAC input, check 'groupList' argument!") } #########################################################################################