add multivariate arg [skip ci]

neurogenomics · Mar 8, 2024 · d5194b5 · d5194b5
1 parent cdb539b
commit d5194b5
Show file tree

Hide file tree

Showing 10 changed files with 420 additions and 91 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -19,6 +19,7 @@ SystemRequirements: Python (>= 3.7.0)
 biocViews:
     Software
 Imports: 
+    Rdimtools,
     scKirby,
     utils,
     methods,
@@ -85,7 +86,8 @@ Remotes:
     github::neurogenomics/HPOExplorer,
     github::neurogenomics/MultiEWCE,
     github::RajLabMSSM/echotabix,
-    github::RajLabMSSM/downloadR
+    github::RajLabMSSM/downloadR,
+    github::satijalab/seurat-wrappers
 RoxygenNote: 7.3.1
 VignetteBuilder: knitr 
 Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
@@ -26,6 +26,7 @@ export(phenomix_merge)
 export(phenomix_query)
 export(phenomix_query_batched)
 export(plot_enrichment)
+export(plot_hpo_pseudotime)
 export(plot_reduction)
 export(plot_trait_cor)
 export(plot_variancePartition)

diff --git a/R/iterate_lm.R b/R/iterate_lm.R
@@ -4,11 +4,18 @@
 #' @param ymat gene x celltype matrix.
 #' @param test_method Association testing method to use.
 #' @param correction_method Multiple-testing correction 
-#' method to be passed to \code{stats::p.adjust}.
+#' method to be passed to \link[stats]{p.adjust}.
 #' @param qvalue_thresh q-value threshold to use when reporting
 #'  the summary of significant results.
-#' @param quantize A named list where the values of "x" and "y" indicaite the 
-#' number of quantiles to bin the respective "xmat" and "ymat" datasets into.
+#' @param quantize A named list where the values of "x" and "y" indicate the 
+#' number of quantiles to bin the respective \code{xmat} and 
+#' \code{ymat} datasets into.
+#' @param multivariate If \code{TRUE}, runs tests with each column in 
+#' \code{xmat} as a multivariate predictor in a single model 
+#' (one model per column in \code{ymat}). If \code{FALSE},
+#' runs tests with each column in \code{xmat} as a univariate predictor in
+#' separate models (one model per column in \code{ymat}). 
+#' @param ... Additional parameters passed to the statistical test function.
 #' @inheritParams set_cores
 #'
 #' @export
@@ -29,14 +36,16 @@
 #'                      workers = 1)
 iterate_lm <- function(xmat,
                        ymat,
-                       test_method = c("glm","anova"),
+                       test_method = c("glm","anova","lm.ridge","rlm"), 
+                       multivariate = FALSE,
                        correction_method = "BH",
                        qvalue_thresh = .05,
                        quantize = list(x=NULL,
                                        y=NULL),
                        progressbar = TRUE,
                        workers = NULL,
-                       verbose=TRUE) {  
+                       verbose = TRUE,
+                       ...) {  
     p <- q <- NULL;
     test_method <- match.arg(test_method)
     data.table::setDTthreads(threads = 1)
@@ -71,25 +80,34 @@ iterate_lm <- function(xmat,
     lm_res <- iterate_lm_long(xmat = xmat,
                               ymat = ymat, 
                               cores = cores,
-                              method = test_method)
+                              method = test_method,
+                              multivariate = multivariate,
+                              ...)
     ### Multiple-testing correction 
     if(test_method=="anova"){  
         lm_res[,q:=stats::p.adjust(p = p, method = correction_method)] 
     } else { 
+        grep_terms <- if(isTRUE(multivariate)) "^x[:]" else "^x$"
         model_p <- model_q <- term <- xvar <- NULL;
         lm_res|>data.table::setnames("p.value","p")
         model_res <- lm_res[term %in% c("(Intercept)")]|>
             data.table:::setnames(c("p","estimate","statistic"),
                                   c("model_p","model_estimate","model_statistic"))
         model_res[,model_q:=stats::p.adjust(p = model_p, 
                                             method = correction_method)]  
-        lm_res <- merge(lm_res[grepl("^x[:]",term)] ,
+        lm_res <- merge(lm_res[grepl(grep_terms,term)] ,
                         model_res[,c("model_id","model_p","model_q",
                                      "model_estimate","model_statistic")], 
                         by="model_id")
-        lm_res[,xvar:=gsub("^x\\:xvar","",term)]|>
-            data.table::setcolorder("xvar",3)
-        lm_res[,q:=ifelse(model_q<0.05,p,min(1,model_q+p))] 
+        if(isTRUE(multivariate)){
+            lm_res[,xvar:=gsub("^x\\:xvar","",term)]|>
+                data.table::setcolorder("xvar",3)
+            ## by=.I is EXTREMELY important! Without it, min() is not applied
+            ## row-wise and instead returns the minimum across ALL rows.
+            lm_res[,q:=ifelse(model_q<0.05,p,min(1,model_q+p)), by=.I] 
+        } else {
+            lm_res[,q:=stats::p.adjust(p = p, method = correction_method)] 
+        }
     }   
     #### Report ####
     messager(formatC(nrow(lm_res[q<qvalue_thresh,]), big.mark = ","),

diff --git a/R/iterate_lm_long.R b/R/iterate_lm_long.R
@@ -1,8 +1,10 @@
 iterate_lm_long <- function(xmat,
                             ymat, 
                             cores, 
-                            method){
-    x <- y <- NULL; 
+                            method,
+                            multivariate,
+                            ...){
+    x <- y <- xvar <- NULL; 
     progressbar <- cores$params$progressbar
     add_model_id <- function(res,i){ 
         mid <- gsub("file",paste0("model",i,"_"),basename(tempfile()))
@@ -30,29 +32,49 @@ iterate_lm_long <- function(xmat,
             } 
             ## Run tests: glm   
             if(method=="glm"){
-                messager("Method: glm",v=!progressbar)
-                mod <- stats::glm(data = dt,
-                                  formula = y~x*xvar)
-                res <- broom::tidy(mod) |>
-                    data.table::data.table() 
-                add_model_id(res,i)
+                if(isTRUE(multivariate)){
+                    messager("Method: glm (multivariate)",v=!progressbar)
+                    mod <- stats::glm(data = dt,
+                                      formula = y~x*xvar,
+                                      ...)
+                    res <- broom::tidy(mod) |>
+                        data.table::data.table()  
+                    add_model_id(res,i)
+                } else {
+                    res <- lapply(stats::setNames(unique(dt$xvar),
+                                                  unique(dt$xvar)),
+                                  function(xv){
+                        messager("Method: glm (univariate)",v=!progressbar)
+                        mod <- stats::glm(data = dt[xvar==xv],
+                                          formula = y~x,
+                                          ...)
+                        res <- broom::tidy(mod) |>
+                            data.table::data.table() 
+                        add_model_id(res,i)
+                    }) |> 
+                        data.table::rbindlist(idcol = "xvar",
+                                              fill = TRUE)
+                }
             } else if(method=="anova") {
             ## Run tests: ANOVA 
-                messager("Method: ANOVA",v=!progressbar)
+                messager("Method: ANOVA",v=!progressbar) 
                 res <- dt |>
                 rstatix::group_by(xvar) |>
-                rstatix::anova_test(formula = y ~ x) |>
+                rstatix::anova_test(formula = y ~ x,
+                                    ...) |>
                 data.table::data.table()
             } else if(method=="lm.ridge"){ 
             ### Ridge regression (MASS::lm.ridge) ####
                 mod <- MASS::lm.ridge(formula= y~x+xvar,
-                                      data=dt)
+                                      data=dt,
+                                      ...)
                 res <- broom::tidy(mod) |>
                     data.table::data.table() 
                 add_model_id(res,i)
             } else if(method=="rlm"){ 
                 mod <- MASS::rlm(formula= y~x+xvar,
-                                  data=dt)
+                                 data=dt,
+                                 ...)
                 res <- broom::tidy(mod) |>
                     data.table::data.table() 
                 add_model_id(res,i)

diff --git a/R/plot_hpo_pseudotime.R b/R/plot_hpo_pseudotime.R
@@ -0,0 +1,104 @@
+#' Plot HPO pseudotime
+#' 
+#' Plot the Human Phenotype Ontology (HPO) in genomic latent space,
+#' then compute pseudotime trajectories between a subset of phenotypes that
+#' are symptoms of a given disease (or set of diseases).
+#' @param obj \link{Seurat} object of HPO terms generated by \link{prepare_hpo}.
+#' @param disease_ids One or more disease IDs found within \code{dt_genes}.
+#' @inheritParams prepare_hpo
+#' @inheritParams monocle3::learn_graph
+#' @inheritDotParams monocle3::plot_cells
+#' @returns A named list with two elements: 
+#' \itemize{
+#' \item{\code{data}: }{The object returned by 
+#' \code{SeuratWrappers::as.cell_data_set(obj)}, with the principal graph 
+#' learned on the disease-associated phenotypes attached to it.}
+#' \item{\code{plot}: }{The output of \link[monocle3]{plot_cells}.}
+#' }
+#' @export
+plot_hpo_pseudotime <- function(obj,
+                                dt_genes = HPOExplorer::load_phenotype_to_genes(1),
+                                disease_ids=dt_genes$disease_id[1:3],
+                                learn_graph_control=list(prune_graph=FALSE),
+                                ...
+                                ){
+    ## Both packages are optional dependencies: fail early with a clear 
+    ## message instead of ignoring the FALSE returned by requireNamespace.
+    for(pkg in c("monocle3","SeuratWrappers")){
+        if(!requireNamespace(pkg, quietly = TRUE)){
+            stopper("Package",shQuote(pkg),
+                    "must be installed to use plot_hpo_pseudotime.")
+        }
+    }
+    gene_symbol <- disease_id <- NULL;
+
+    #### Add disease metadata
+    ## Using gene overlap (670998 p2d pairs) ##
+    # nrow(unique(dt_genes[,c("hpo_id","disease_id")]))
+    ## Count the unique genes per phenotype-disease pair, 
+    ## restricted to the requested diseases.
+    p2d <- dt_genes[disease_id %in% disease_ids,
+                    list(count=data.table::uniqueN(gene_symbol)),
+                    by=c("hpo_id","disease_id")]#|>
+        # data.table::dcast.data.table(formula = hpo_id~disease_id,
+        #                              value.var = "count")
+    if(nrow(p2d)==0) stopper("No hpo_ids found for the given disease_ids.")
+    # p2d <- p2d[colnames(ref),]
+    # ## Using annotation overlap (258276 p2d pairs)
+    # nrow(unique(dt_annot[,c("hpo_id","disease_id")]))
+    # p2d <- dt_annot[,count:=1]|>
+    #     data.table::dcast.data.table(formula = hpo_id~disease_id,
+    #                                  value.var = "count",
+    #                                  )
+    #### Merge with rest of annotations ####
+    ## make Seurat way too slow having this many columns...
+    # dt_annot_melt <- data.table::merge.data.table(dt_annot_melt,
+    #                                               p2d,
+    #                                               by.x = "id",
+    #                                               by.y = "hpo_id",
+    #                                               all.x = TRUE)
+
+    ## Only phenotypes that are also samples (columns) in obj can be used.
+    hpo_ids <- intersect(unique(p2d$hpo_id),
+                         colnames(obj))
+    if(length(hpo_ids)==0) {
+        stopper("No hpo_ids overlapping with samples (colnames) in obj.")
+    }
+    cds <- SeuratWrappers::as.cell_data_set(obj)
+    ## Cluster and learn the trajectory graph on the disease-associated 
+    ## phenotypes only.
+    cds_sub <- monocle3::cluster_cells(
+        cds = cds[,hpo_ids],
+        k = max(as.integer(cds@colData$seurat_clusters)))
+    cds_sub <- monocle3::learn_graph(cds_sub, 
+                                     learn_graph_control=learn_graph_control)
+    cds_sub <- monocle3::order_cells(cds_sub, root_cells = colnames(cds_sub))
+    ## Transfer the graph learned on the subset back onto the full object, 
+    ## so that it is drawn over all phenotypes.
+    monocle3::principal_graph(cds) <- monocle3::principal_graph(cds_sub)
+    monocle3::principal_graph_aux(cds) <- monocle3::principal_graph_aux(cds_sub) 
+    ## Forward ... to plot_cells, as documented by @inheritDotParams 
+    ## (e.g. color_cells_by = "ancestor_name_abnormality",
+    ##       group_cells_by = "top_celltype").
+    plt <- monocle3::plot_cells(cds,
+                                ...)
+    return(
+        list(data=cds,
+             plot=plt)
+    )
+    # pseudo_dt <- t(cds@principal_graph_aux$UMAP$pr_graph_cell_proj_dist)|>`colnames<-`(c("umap1","umap2"))
+
+    # gm <- ref@graphs$freq_nn[highlights$hpo_id,
+    #                           highlights$hpo_id]
+    # g <- igraph::graph_from_adjacency_matrix(gm)
+    # gdt <- KGExplorer::graph_to_dt(g)[object!=subject]
+    # ggraph(g, layout = 'stress') + 
+    #     geom_edge_density() + 
+    #     geom_edge_link(alpha = 0.25)
+    # dp <- Seurat::DimPlot(ref, 
+    #                       group.by = disease_id,
+    #                       cols.highlight = "red",
+    #                       alpha = .7,
+    #                       sizes.highlight = highlights[[disease_id]],
+    #                       cells.highlight = list( highlights[["hpo_id"]])|>
+    #                           `names<-`(disease_id))
+    # highlight_df <- subset(dp[[1]]$data,highlight!="Unselected")
+    # 
+    # highlight_dt <- merge(gdt,
+    #                       data.table::data.table(highlight_df,keep.rownames = "subject"),
+    #                       by="subject")|>
+    #     merge(data.table::data.table(highlight_df,keep.rownames = "object"),
+    #           by=c("object","highlight",disease_id)) |>
+    #     data.table::setnames(c("umap_1.x","umap_2.x","umap_1.y","umap_2.y"),
+    #                          c("x","y","xend","yend"))
+    # # ggplot2::ggplot()
+    # dp + ggplot2::geom_segment(data = highlight_dt,
+    #                            mapping = ggplot2::aes(x = x,
+    #                                                   xend = xend,
+    #                                                   y = y,
+    #                                                   yend = yend),
+    #                            inherit.aes = FALSE)
+    # dp+
+    #     ggplot2::geom_step(data = highlight_df, 
+    #                        ggplot2::aes(x=umap_1,
+    #                                     y=umap_2)) 
+}