From ee3ea2b80f6b8552982c4c77ee8b9976992522c1 Mon Sep 17 00:00:00 2001 From: jiwen90 <70122688+jiwen90@users.noreply.github.com> Date: Sat, 28 Jan 2023 01:37:05 -0500 Subject: [PATCH 1/3] parallel implementation --- NAMESPACE | 1 + R/ComparePathways.R | 169 +++++++++++++++++++++++++++++++ man/compare_pathways_parallel.Rd | 54 ++++++++++ 3 files changed, 224 insertions(+) create mode 100644 man/compare_pathways_parallel.Rd diff --git a/NAMESPACE b/NAMESPACE index a9eba47..b7dab19 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export("%>%") export(compare_pathways) +export(compare_pathways_parallel) export(compare_sce) export(compare_seurat) export(format_pathways) diff --git a/R/ComparePathways.R b/R/ComparePathways.R index c7b469b..4b6e01d 100644 --- a/R/ComparePathways.R +++ b/R/ComparePathways.R @@ -163,5 +163,174 @@ compare_pathways <- function(samples, } +#' Use SCPA to compare gene sets +#' +#' This function takes an input of samples and pathways +#' to compare gene set perturbations over different conditions with SCPA. +#' +#' @param samples List of samples, each supplied as an expression matrix with cells in columns +#' and genes in rows. +#' @param pathways Pathways and their genes with each pathway in a separate list. For formatting of +#' gene lists, see documentation at https://jackbibby1.github.io/SCPA/articles/using_gene_sets.html +#' @param downsample Option to downsample cell numbers. Defaults to 500 cells per condition. If a population +#' has < 500 cells, all cells from that condition are used. +#' @param min_genes Gene sets with fewer than this number of genes will be excluded +#' @param max_genes Gene sets with more than this number of genes will be excluded +#' @param cores Number of cores to use for parallel processing +#' +#' @examples \dontrun{ +#' scpa_result <- compare_pathways_parallel( +#' list(sample1, sample2, sample3), +#' pathways = pathways), +#' cores = 2) +#' } +#' +#' @return Statistical results from the SCPA analysis. The qval should be the +#' primary metric that is used to interpret pathway differences i.e. a higher +#' qval translates to larger pathway differences between conditions. +#' If only two samples are provided, a fold change (FC) enrichment score will also be +#' calculated. The FC statistic is generated from a running sum of mean changes in gene +#' expression from all genes of the pathway. It's calculated from average pathway +#' expression in population1 - population2, so a negative FC means the pathway is +#' higher in population2. +#' +#' @export + +compare_pathways_parallel <- function(samples, + pathways, + downsample = 500, + min_genes = 15, + max_genes = 500, + cores = 2) { + + # get pathways for analysis + if (class(pathways)[1] == "character") { + pathways <- get_paths(pathways) + } + + # define the number of cells in each condition + cell_number <- sapply(samples, function(x) ncol(x)) + + for (i in 1:length(cell_number)) { + message(paste("Cell numbers in population", i, "=", cell_number[i])) + } + + message("- If greater than ", downsample, + " cells, these populations will be downsampled", "\n") + + # randomly sample cells + for (i in 1:length(samples)) { + + samples[[i]] <- random_cells(samples[[i]], ifelse(cell_number[i] < 500, cell_number[i], downsample)) + + } + + # only take shared genes + genes <- lapply(samples, function(x) rownames(x)) + genes <- table(unlist(genes)) + genes <- genes[genes == length(samples)] + genes <- names(genes) + samples <- lapply(samples, function(x) x[rownames(x) %in% genes, ]) + + # filter out pathways + gene_numbers <- lapply(pathways, function(x) nrow(samples[[1]][rownames(samples[[1]]) %in% x$Genes, ])) + keep_pathway <- sapply(gene_numbers, function(x) any(x >= min_genes & x <= max_genes)) + excluded_pathways <- sapply(pathways[!keep_pathway], function(x) unique(dplyr::pull(x, Pathway))) + pathways_filtered <- pathways[keep_pathway] + if (length(pathways_filtered) == 0) { + stop(call. = F, "No pathways passed the min/max genes threshold") + + } else if (length(excluded_pathways) > 0) { + + message("Excluding ", length(excluded_pathways), + " pathway(s) based on min/max genes parameter: ", + paste(utils::head(excluded_pathways, 5), collapse = ", "), "...", "\n") + + } else { + + message("All ", length(pathways), " pathways passed the min/max genes threshold", "\n") + + } + + + if (length(samples) > 2) { + + message("Performing a multisample analysis with SCPA...") + + } else { + + message("Calculating pathway fold changes...", "\n") + message("Performing a two-sample analysis with SCPA...") + + } + + if (!require(doParallel)) { + stop('doParallel library not loaded. Please exeucte library("doParallel").') + } else { + cluster <- makeCluster(cores, type = "PSOCK") + registerDoParallel(cluster) + + scpa_result <- foreach(pathway = pathways_filtered) %dopar% { + res <- tryCatch( + expr = { + # subset data to get one pathway + path_subset <- lapply(samples, function(x) x[rownames(x) %in% pathway$Genes, ]) + path_subset <- lapply(path_subset, function(x) t(x)) + path_subset <- lapply(path_subset, function(x) x[, sort(colnames(x))]) + + if (length(path_subset) == 2) { + + avg_expression <- lapply(path_subset, function(x) data.frame(colMeans(x))) + samp_combined <- cbind(avg_expression[[1]], avg_expression[[2]]) + samp_combined <- magrittr::set_colnames(samp_combined, c("Pop1", "Pop2")) + samp_combined <- cbind(samp_combined, logFC = samp_combined[, "Pop1"]-samp_combined[, "Pop2"]) + path_fc <- sum(samp_combined[, "logFC"]) + + multicross::mcm(path_subset, level = 0.05) %>% + data.frame() %>% + t() %>% + data.frame() %>% + dplyr::mutate(FC = path_fc) %>% + dplyr::mutate(Pathway = pathway$Pathway[1]) %>% + dplyr::select(-X2) %>% + dplyr::mutate(Pval = as.numeric(X1)) %>% + dplyr::select(-X1) %>% + dplyr::mutate(adjPval = stats::p.adjust(Pval , method = "bonferroni", + n = length(pathways_filtered))) %>% + dplyr::mutate(qval = sqrt(-log10(adjPval))) %>% + dplyr::select(Pathway, Pval, adjPval, qval, FC) + + } else { + + multicross::mcm(path_subset, level = 0.05) %>% + data.frame() %>% + t() %>% + data.frame() %>% + dplyr::mutate(Pathway = pathway$Pathway[1]) %>% + dplyr::select(-X2) %>% + dplyr::mutate(Pval = as.numeric(X1)) %>% + dplyr::select(-X1) %>% + dplyr::mutate(adjPval = stats::p.adjust(Pval , method = "bonferroni", + n = length(pathways_filtered))) %>% + dplyr::mutate(qval = sqrt(-log10(adjPval))) %>% + dplyr::select(Pathway, Pval, adjPval, qval) + } + }, + error = function(e) { + return("Error in pathway ", pathway$Pathway[1], ": ", e) + }) + } + + stopCluster(cluster) + + scpa_result <- scpa_result %>% + dplyr::bind_rows() %>% + tibble::remove_rownames() %>% + dplyr::arrange(desc(qval)) + + return(scpa_result) + + } +} diff --git a/man/compare_pathways_parallel.Rd b/man/compare_pathways_parallel.Rd new file mode 100644 index 0000000..9551dd2 --- /dev/null +++ b/man/compare_pathways_parallel.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ComparePathways.R +\name{compare_pathways_parallel} +\alias{compare_pathways_parallel} +\title{Use SCPA to compare gene sets} +\usage{ +compare_pathways_parallel( + samples, + pathways, + downsample = 500, + min_genes = 15, + max_genes = 500, + cores = 2 +) +} +\arguments{ +\item{samples}{List of samples, each supplied as an expression matrix with cells in columns +and genes in rows.} + +\item{pathways}{Pathways and their genes with each pathway in a separate list. For formatting of +gene lists, see documentation at https://jackbibby1.github.io/SCPA/articles/using_gene_sets.html} + +\item{downsample}{Option to downsample cell numbers. Defaults to 500 cells per condition. If a population +has < 500 cells, all cells from that condition are used.} + +\item{min_genes}{Gene sets with fewer than this number of genes will be excluded} + +\item{max_genes}{Gene sets with more than this number of genes will be excluded} + +\item{cores}{Number of cores to use for parallel processing} +} +\value{ +Statistical results from the SCPA analysis. The qval should be the +primary metric that is used to interpret pathway differences i.e. a higher +qval translates to larger pathway differences between conditions. +If only two samples are provided, a fold change (FC) enrichment score will also be +calculated. The FC statistic is generated from a running sum of mean changes in gene +expression from all genes of the pathway. It's calculated from average pathway +expression in population1 - population2, so a negative FC means the pathway is +higher in population2. +} +\description{ +This function takes an input of samples and pathways +to compare gene set perturbations over different conditions with SCPA. +} +\examples{ +\dontrun{ +scpa_result <- compare_pathways_parallel( + list(sample1, sample2, sample3), + pathways = pathways), + cores = 2) +} + +} From da5b7bf3a11abbf071ca5e2a9c5743a3a9f320fb Mon Sep 17 00:00:00 2001 From: jiwen90 <70122688+jiwen90@users.noreply.github.com> Date: Sun, 29 Jan 2023 04:13:19 -0500 Subject: [PATCH 2/3] fix downsample --- R/ComparePathways.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/ComparePathways.R b/R/ComparePathways.R index 4b6e01d..68b5d97 100644 --- a/R/ComparePathways.R +++ b/R/ComparePathways.R @@ -53,7 +53,7 @@ compare_pathways <- function(samples, # randomly sample cells for (i in 1:length(samples)) { - samples[[i]] <- random_cells(samples[[i]], ifelse(cell_number[i] < 500, cell_number[i], downsample)) + samples[[i]] <- random_cells(samples[[i]], ifelse(cell_number[i] < downsample, cell_number[i], downsample)) } @@ -221,7 +221,7 @@ compare_pathways_parallel <- function(samples, # randomly sample cells for (i in 1:length(samples)) { - samples[[i]] <- random_cells(samples[[i]], ifelse(cell_number[i] < 500, cell_number[i], downsample)) + samples[[i]] <- random_cells(samples[[i]], ifelse(cell_number[i] < downsample, cell_number[i], downsample)) } From ed704d7e5107336f60fa2519b8c7c269faae5493 Mon Sep 17 00:00:00 2001 From: jiwen90 <70122688+jiwen90@users.noreply.github.com> Date: Mon, 30 Jan 2023 14:29:08 -0500 Subject: [PATCH 3/3] fix catch --- R/ComparePathways.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/ComparePathways.R b/R/ComparePathways.R index 68b5d97..aaa5e1f 100644 --- a/R/ComparePathways.R +++ b/R/ComparePathways.R @@ -319,7 +319,6 @@ compare_pathways_parallel <- function(samples, } }, error = function(e) { - return("Error in pathway ", pathway$Pathway[1], ": ", e) }) }