From ee3ea2b80f6b8552982c4c77ee8b9976992522c1 Mon Sep 17 00:00:00 2001
From: jiwen90 <70122688+jiwen90@users.noreply.github.com>
Date: Sat, 28 Jan 2023 01:37:05 -0500
Subject: [PATCH 1/3] parallel implementation

---
 NAMESPACE                        |   1 +
 R/ComparePathways.R              | 169 +++++++++++++++++++++++++++++++
 man/compare_pathways_parallel.Rd |  54 ++++++++++
 3 files changed, 224 insertions(+)
 create mode 100644 man/compare_pathways_parallel.Rd

diff --git a/NAMESPACE b/NAMESPACE
index a9eba47..b7dab19 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export("%>%")
 export(compare_pathways)
+export(compare_pathways_parallel)
 export(compare_sce)
 export(compare_seurat)
 export(format_pathways)
diff --git a/R/ComparePathways.R b/R/ComparePathways.R
index c7b469b..4b6e01d 100644
--- a/R/ComparePathways.R
+++ b/R/ComparePathways.R
@@ -163,5 +163,174 @@ compare_pathways <- function(samples,
 
 }
 
+#' Use SCPA to compare gene sets
+#'
+#' This function takes an input of samples and pathways
+#' to compare gene set perturbations over different conditions with SCPA.
+#'
+#' @param samples List of samples, each supplied as an expression matrix with cells in columns
+#'     and genes in rows.
+#' @param pathways Pathways and their genes with each pathway in a separate list. For formatting of
+#'     gene lists, see documentation at https://jackbibby1.github.io/SCPA/articles/using_gene_sets.html
+#' @param downsample Option to downsample cell numbers. Defaults to 500 cells per condition. If a population
+#'     has fewer cells than the downsample value, all cells from that condition are used.
+#' @param min_genes Gene sets with fewer than this number of genes will be excluded
+#' @param max_genes Gene sets with more than this number of genes will be excluded
+#' @param cores Number of cores to use for parallel processing
+#'
+#' @examples \dontrun{
+#' scpa_result <- compare_pathways_parallel(
+#'      list(sample1, sample2, sample3),
+#'      pathways = pathways,
+#'      cores = 2)
+#' }
+#'
+#' @return Statistical results from the SCPA analysis. The qval should be the
+#' primary metric that is used to interpret pathway differences i.e. a higher
+#' qval translates to larger pathway differences between conditions.
+#' If only two samples are provided, a fold change (FC) enrichment score will also be
+#' calculated. The FC statistic is generated from a running sum of mean changes in gene
+#' expression from all genes of the pathway. It's calculated from average pathway
+#' expression in population1 - population2, so a negative FC means the pathway is
+#' higher in population2.
+#'
+#' @export
+
+compare_pathways_parallel <- function(samples,
+                             pathways,
+                             downsample = 500,
+                             min_genes = 15,
+                             max_genes = 500,
+                             cores = 2) {
+
+  # get pathways for analysis
+  if (class(pathways)[1] == "character") {
+    pathways <- get_paths(pathways)
+  }
+
+  # define the number of cells in each condition
+  cell_number <- sapply(samples, function(x) ncol(x))
+
+  for (i in 1:length(cell_number)) {
+    message(paste("Cell numbers in population", i, "=", cell_number[i]))
+  }
+
+  message("- If greater than ", downsample,
+          " cells, these populations will be downsampled", "\n")
+
+  # randomly sample cells
+  for (i in 1:length(samples)) {
+
+    samples[[i]] <- random_cells(samples[[i]], ifelse(cell_number[i] < 500, cell_number[i], downsample))
+
+  }
+
+  # only take shared genes
+  genes <- lapply(samples, function(x) rownames(x))
+  genes <- table(unlist(genes))
+  genes <- genes[genes == length(samples)]
+  genes <- names(genes)
+  samples <- lapply(samples, function(x) x[rownames(x) %in% genes, ])
+
+  # filter out pathways
+  gene_numbers <- lapply(pathways, function(x) nrow(samples[[1]][rownames(samples[[1]]) %in% x$Genes, ]))
+  keep_pathway <- sapply(gene_numbers, function(x) any(x >= min_genes & x <= max_genes))
+  excluded_pathways <- sapply(pathways[!keep_pathway], function(x) unique(dplyr::pull(x, Pathway)))
+  pathways_filtered <- pathways[keep_pathway]
 
+  if (length(pathways_filtered) == 0) {
 
+    stop(call. = F, "No pathways passed the min/max genes threshold")
+
+  } else if (length(excluded_pathways) > 0) {
+
+    message("Excluding ", length(excluded_pathways),
+            " pathway(s) based on min/max genes parameter: ",
+            paste(utils::head(excluded_pathways, 5), collapse = ", "), "...", "\n")
+
+  } else {
+
+    message("All ", length(pathways), " pathways passed the min/max genes threshold", "\n")
+
+  }
+
+
+  if (length(samples) > 2) {
+
+    message("Performing a multisample analysis with SCPA...")
+
+  } else {
+
+    message("Calculating pathway fold changes...", "\n")
+    message("Performing a two-sample analysis with SCPA...")
+
+  }
+
+  if (!require(doParallel)) {
+        stop('doParallel library not loaded. Please exeucte library("doParallel").')
+  } else {
+    cluster <- makeCluster(cores, type = "PSOCK")
+    registerDoParallel(cluster)
+
+    scpa_result <- foreach(pathway = pathways_filtered) %dopar% {
+      res <- tryCatch(
+        expr = {
+          # subset data to get one pathway
+          path_subset <- lapply(samples, function(x) x[rownames(x) %in% pathway$Genes, ])
+          path_subset <- lapply(path_subset, function(x) t(x))
+          path_subset <- lapply(path_subset, function(x) x[, sort(colnames(x))])
+
+          if (length(path_subset) == 2) {
+
+            avg_expression <- lapply(path_subset, function(x) data.frame(colMeans(x)))
+            samp_combined <- cbind(avg_expression[[1]], avg_expression[[2]])
+            samp_combined <- magrittr::set_colnames(samp_combined, c("Pop1", "Pop2"))
+            samp_combined <- cbind(samp_combined, logFC = samp_combined[, "Pop1"]-samp_combined[, "Pop2"])
+            path_fc <- sum(samp_combined[, "logFC"])
+
+            multicross::mcm(path_subset, level = 0.05) %>%
+              data.frame() %>%
+              t() %>%
+              data.frame() %>%
+              dplyr::mutate(FC = path_fc) %>%
+              dplyr::mutate(Pathway = pathway$Pathway[1]) %>%
+              dplyr::select(-X2) %>%
+              dplyr::mutate(Pval = as.numeric(X1)) %>%
+              dplyr::select(-X1) %>%
+              dplyr::mutate(adjPval = stats::p.adjust(Pval , method = "bonferroni",
+                                              n = length(pathways_filtered))) %>%
+              dplyr::mutate(qval = sqrt(-log10(adjPval))) %>%
+              dplyr::select(Pathway, Pval, adjPval, qval, FC)
+
+          } else {
+
+            multicross::mcm(path_subset, level = 0.05) %>%
+              data.frame() %>%
+              t() %>%
+              data.frame() %>%
+              dplyr::mutate(Pathway = pathway$Pathway[1]) %>%
+              dplyr::select(-X2) %>%
+              dplyr::mutate(Pval = as.numeric(X1)) %>%
+              dplyr::select(-X1) %>%
+              dplyr::mutate(adjPval = stats::p.adjust(Pval , method = "bonferroni",
+                                              n = length(pathways_filtered))) %>%
+              dplyr::mutate(qval = sqrt(-log10(adjPval))) %>%
+              dplyr::select(Pathway, Pval, adjPval, qval)
+          }
+        },
+        error = function(e) {
+          return("Error in pathway ", pathway$Pathway[1], ": ", e)
+        })
+    }
+
+    stopCluster(cluster)
+
+    scpa_result <- scpa_result %>%
+      dplyr::bind_rows() %>%
+      tibble::remove_rownames() %>%
+      dplyr::arrange(desc(qval))
+
+    return(scpa_result)
+
+  }
+}
diff --git a/man/compare_pathways_parallel.Rd b/man/compare_pathways_parallel.Rd
new file mode 100644
index 0000000..9551dd2
--- /dev/null
+++ b/man/compare_pathways_parallel.Rd
@@ -0,0 +1,54 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ComparePathways.R
+\name{compare_pathways_parallel}
+\alias{compare_pathways_parallel}
+\title{Use SCPA to compare gene sets}
+\usage{
+compare_pathways_parallel(
+  samples,
+  pathways,
+  downsample = 500,
+  min_genes = 15,
+  max_genes = 500,
+  cores = 2
+)
+}
+\arguments{
+\item{samples}{List of samples, each supplied as an expression matrix with cells in columns
+and genes in rows.}
+
+\item{pathways}{Pathways and their genes with each pathway in a separate list. For formatting of
+gene lists, see documentation at https://jackbibby1.github.io/SCPA/articles/using_gene_sets.html}
+
+\item{downsample}{Option to downsample cell numbers. Defaults to 500 cells per condition. If a population
+has fewer cells than the downsample value, all cells from that condition are used.}
+
+\item{min_genes}{Gene sets with fewer than this number of genes will be excluded}
+
+\item{max_genes}{Gene sets with more than this number of genes will be excluded}
+
+\item{cores}{Number of cores to use for parallel processing}
+}
+\value{
+Statistical results from the SCPA analysis. The qval should be the
+primary metric that is used to interpret pathway differences i.e. a higher
+qval translates to larger pathway differences between conditions.
+If only two samples are provided, a fold change (FC) enrichment score will also be
+calculated. The FC statistic is generated from a running sum of mean changes in gene
+expression from all genes of the pathway. It's calculated from average pathway
+expression in population1 - population2, so a negative FC means the pathway is
+higher in population2.
+}
+\description{
+This function takes an input of samples and pathways
+to compare gene set perturbations over different conditions with SCPA.
+}
+\examples{
+\dontrun{
+scpa_result <- compare_pathways_parallel(
+     list(sample1, sample2, sample3),
+     pathways = pathways,
+     cores = 2)
+}
+
+}

From da5b7bf3a11abbf071ca5e2a9c5743a3a9f320fb Mon Sep 17 00:00:00 2001
From: jiwen90 <70122688+jiwen90@users.noreply.github.com>
Date: Sun, 29 Jan 2023 04:13:19 -0500
Subject: [PATCH 2/3] fix downsample

---
 R/ComparePathways.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/ComparePathways.R b/R/ComparePathways.R
index 4b6e01d..68b5d97 100644
--- a/R/ComparePathways.R
+++ b/R/ComparePathways.R
@@ -53,7 +53,7 @@ compare_pathways <- function(samples,
   # randomly sample cells
   for (i in 1:length(samples)) {
 
-    samples[[i]] <- random_cells(samples[[i]], ifelse(cell_number[i] < 500, cell_number[i], downsample))
+    samples[[i]] <- random_cells(samples[[i]], ifelse(cell_number[i] < downsample, cell_number[i], downsample))
 
   }
 
@@ -221,7 +221,7 @@ compare_pathways_parallel <- function(samples,
   # randomly sample cells
   for (i in 1:length(samples)) {
 
-    samples[[i]] <- random_cells(samples[[i]], ifelse(cell_number[i] < 500, cell_number[i], downsample))
+    samples[[i]] <- random_cells(samples[[i]], ifelse(cell_number[i] < downsample, cell_number[i], downsample))
 
   }
 

From ed704d7e5107336f60fa2519b8c7c269faae5493 Mon Sep 17 00:00:00 2001
From: jiwen90 <70122688+jiwen90@users.noreply.github.com>
Date: Mon, 30 Jan 2023 14:29:08 -0500
Subject: [PATCH 3/3] fix catch

---
 R/ComparePathways.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/R/ComparePathways.R b/R/ComparePathways.R
index 68b5d97..aaa5e1f 100644
--- a/R/ComparePathways.R
+++ b/R/ComparePathways.R
@@ -319,7 +319,6 @@ compare_pathways_parallel <- function(samples,
           }
         },
         error = function(e) {
-          return("Error in pathway ", pathway$Pathway[1], ": ", e)
         })
     }