diff --git a/NAMESPACE b/NAMESPACE index 4433ec8..d8f0fbb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,7 +3,9 @@ export(get_terminal_nodes) export(lefsePlotFeat) export(lefser) +export(lefserAllRanks) export(lefserPlot) +export(lefserPlotClad) export(relativeAb) import(SummarizedExperiment) import(ggplot2) @@ -14,6 +16,7 @@ importFrom(coin,wilcox_test) importFrom(dplyr,"%>%") importFrom(dplyr,arrange) importFrom(dplyr,mutate) +importFrom(ggtree,"%<+%") importFrom(methods,as) importFrom(methods,is) importFrom(stats,kruskal.test) diff --git a/R/lefser.R b/R/lefser.R index 1c0ba3e..37c0931 100644 --- a/R/lefser.R +++ b/R/lefser.R @@ -331,6 +331,12 @@ lefser <- attr(res_scores, "blk") <- blockCol attr(res_scores, "method") <- method attr(res_scores, "lgroupf") <- lgroupf[1] + attr(res_scores, "case") <- lgroupf[2] + + ## Some more attributes to create the cladogram. + pathStrings <- .selectPathStrings(relab, res_scores) + attr(res_scores, "pathStrings") <- pathStrings + attr(res_scores, "tree") <- .toTree(pathStrings) res_scores } diff --git a/R/lefserPlotClad.R b/R/lefserPlotClad.R new file mode 100644 index 0000000..25e0d49 --- /dev/null +++ b/R/lefserPlotClad.R @@ -0,0 +1,326 @@ + +# Functions for plotting a cladogram -------------------------------------- + +#' LEfSer plot cladogram +#' +#' \code{lefserPlotClad} plots a cladogram from the results of +#' `lefser` or `lefserAllRanks` +#' +#' @param df An object of class "lefser_df" or "lefser_df_all". +#' @param colors Colors corresponding to class 0 and 1. +#' Options: "c" (colorblind), "l" (lefse), "g" (greyscale). +#' Defaults to "c". This argument also accepts a character(2) with two color names. +#' @param showTipLabels Logical. If TRUE, show tip labels. Default is FALSE. +#' @param showNodeLabels Options: "p" = phylum, "c" = class, "o" = order, +#' "f" = family, "g" = genus. It can accept several options, e.g., +#' c("p", "c"). +#' +#' @importFrom ggtree %<+% +#' +#' @return A ggtree object. 
+#' @export
+#'
+#' @examples
+#' data("zeller14")
+#' z14 <- zeller14[, zeller14$study_condition != "adenoma"]
+#' tn <- get_terminal_nodes(rownames(z14))
+#' z14tn <- z14[tn, ]
+#' z14tn_ra <- relativeAb(z14tn)
+#' resAll <- lefserAllRanks(relab = z14tn_ra, groupCol = "study_condition")
+#' ggt <- lefserPlotClad(df = resAll)
+lefserPlotClad <- function(
+    df, colors = "c", showTipLabels = FALSE, showNodeLabels = "p"
+) {
+    ## Accept both result classes. 'lefser_df_all' is tested first so that an
+    ## object carrying both classes is reported as the all-ranks result.
+    if (inherits(df, "lefser_df_all")) {
+        message("Working with lefser_df_all")
+    } else if (inherits(df, "lefser_df")) {
+        message("Working with lefser_df. Consider using lefserAllRanks.")
+    } else {
+        stop(
+            "You need an object of class 'lefser_df' or 'lefser_df_all'",
+            call. = FALSE
+        )
+    }
+
+    ## The cladogram and the group labels travel as attributes of 'df'.
+    tree <- attr(df, "tree")
+    if (is.null(tree)) {
+        stop(
+            "'df' has no \"tree\" attribute; it must be the unmodified ",
+            "output of lefser() or lefserAllRanks()",
+            call. = FALSE
+        )
+    }
+    controlVar <- attr(df, "lgroupf")
+    caseVar <- attr(df, "case")
+
+    ## Tree labels hold only the last element of each taxonomy path, so the
+    ## feature names are reduced to that same element before matching.
+    df$features <- .extracTips(df$features)
+
+    colors <- .selectPalette(colors)
+
+    res <- df |>
+        dplyr::mutate(
+            sample = dplyr::case_when(
+                ## This assumes positive values always mean enriched in
+                ## the case condition.
+                .data[["scores"]] > 0 ~ .env[["caseVar"]],
+                TRUE ~ .env[["controlVar"]]
+            )
+        ) |>
+        dplyr::mutate(abs = abs(.data[["scores"]])) |>
+        as.data.frame()
+
+    ## Map every feature to its node number: tips come first, followed by
+    ## the internal nodes, in the order stored in the tree.
+    labels <- c(tree$tip.label, tree$node.label)
+    res$node <- match(res$features, labels)
+    dat <- dplyr::relocate(res, "node")
+
+    ## Internal nodes with no reported feature among their tips are collapsed.
+    ## seq_len() keeps this correct for a tree without internal nodes.
+    internalNodes <- ape::Ntip(tree) + seq_len(ape::Nnode(tree))
+    collapseThem <- purrr::map_int(internalNodes, ~ {
+        chNods <- treeio::offspring(.data = tree, .node = .x, type = "tips")
+        if (any(chNods %in% dat$node)) {
+            NA_integer_
+        } else {
+            as.integer(.x)
+        }
+    }) |>
+        purrr::discard(is.na)
+
+    nodLab <- match.arg(
+        arg = showNodeLabels,
+        choices = c("p", "c", "o", "f", "g"),
+        several.ok = TRUE
+    )
+    ## Anchored so that only the rank prefix ("p__", "c__", ...) is tested,
+    ## never a "<letter>__" sequence inside a taxon name.
+    nodLabRgx <- paste0("^[", paste0(nodLab, collapse = ""), "]__")
+    treeData <- dat |>
+        dplyr::mutate(
+            showNodeLabs = dplyr::case_when(
+                grepl(nodLabRgx, .data[["features"]]) ~ .data[["features"]],
+                TRUE ~ NA_character_
+            )
+        )
+
+    ## Attach the per-node data to the tree plot.
+    gt <- ggtree::ggtree(
+        tree, layout = "circular", branch.length = "none", size = 0.2
+    ) %<+% treeData
+
+    if (showTipLabels) {
+        gt <- gt +
+            ggtree::geom_tiplab(
+                mapping = ggtree::aes(label = features), size = 2,
+                geom = "text", na.rm = TRUE
+            )
+    }
+
+    gt2 <- gt +
+        ggtree::geom_tippoint(
+            mapping = ggtree::aes(fill = sample, size = abs), shape = 21,
+            na.rm = TRUE
+        ) +
+        ggtree::geom_nodepoint(
+            mapping = ggtree::aes(fill = sample, size = abs), shape = 21,
+            na.rm = TRUE
+        ) +
+        ggrepel::geom_label_repel(
+            mapping = ggtree::aes(label = showNodeLabs),
+            na.rm = TRUE
+        ) +
+        ggtree::scale_fill_manual(
+            values = colors, breaks = c(controlVar, caseVar),
+            name = "Sample", na.value = NA
+        ) +
+        ggplot2::scale_size(name = "Absolute\nscore") +
+        ggtree::theme(legend.position = "right")
+
+    ## Collapse one node at a time; only warnings that mention "collapse"
+    ## are muffled, any other warning still reaches the caller.
+    for (i in collapseThem) {
+        gt2 <- withCallingHandlers(
+            ggtree::collapse(gt2, node = i),
+            warning = function(w) {
+                if (grepl("collapse", conditionMessage(w))) {
+                    invokeRestart("muffleWarning")
+                }
+            }
+        )
+    }
+    gt2
+}
+
+# Run lefser at all taxonomic levels 
--------------------------------------
+
+#' Run lefser on all taxonomic levels
+#'
+#' Runs \code{lefser} once per taxonomic rank (kingdom excluded) and once on
+#' the original input, then combines the results into a single data frame.
+#'
+#' @param relab A SummarizedExperiment. Row names must be full taxonomy
+#' path strings, e.g., "k__Bacteria|p__Firmicutes|...".
+#' @param ... Arguments passed to the \code{lefser} function.
+#'
+#' @return An object of class 'lefser_df_all' and 'data.frame'. It carries
+#' the attributes "pathStrings", "tree", "lgroupf" and "case".
+#' @export
+#'
+#' @examples
+#'
+#' data("zeller14")
+#' z14 <- zeller14[, zeller14$study_condition != "adenoma"]
+#' tn <- get_terminal_nodes(rownames(z14))
+#' z14tn <- z14[tn, ]
+#' z14tn_ra <- relativeAb(z14tn)
+#'
+#' resAll <- lefserAllRanks(relab = z14tn_ra, groupCol = "study_condition")
+#'
+lefserAllRanks <- function(relab, ...) {
+    ## Feature names should have the full taxonomy
+    se <- .rowNames2RowData(relab)
+    seL <- mia::splitByRanks(se)
+    ## The kingdom level is not needed
+    ## The mia package doesn't support strain.
+    seL <- seL[names(seL) != "kingdom"]
+    ## Drop rowData columns that are entirely NA at each rank.
+    seL <- purrr::map(seL, ~ {
+        seVar <- .x
+        rowDat <- as.data.frame(SummarizedExperiment::rowData(seVar))
+        rowDat <- purrr::discard(rowDat, function(x) all(is.na(x)))
+        rowDat <- S4Vectors::DataFrame(rowDat)
+        SummarizedExperiment::rowData(seVar) <- rowDat
+        seVar
+    })
+    ## Replace the short row names by "|"-separated taxonomy paths.
+    for (i in seq_along(seL)) {
+        rownames(seL[[i]]) <- .lognRowNames(seL[[i]])
+    }
+
+    res <- seL |>
+        purrr::map(function(x, ...) lefser(relab = x, ...), ...) |>
+        dplyr::bind_rows()
+    resOriginal <- lefser(relab, ...)
+    res <- res |>
+        ## Avoid repeating features.
+        dplyr::filter(!.data[["features"]] %in% resOriginal$features) |>
+        ## Features not supported by mia are added (strain, OTUs, etc.)
+        dplyr::bind_rows(resOriginal)
+
+    controlVar <- attr(resOriginal, "lgroupf")
+    caseVar <- attr(resOriginal, "case")
+
+    class(res) <- c("lefser_df_all", class(res))
+
+    ## These pathStrings could be used in the plotting function instead (or not)
+    pathStrings <- .selectPathStrings(relab, res)
+    attr(res, "pathStrings") <- pathStrings
+    attr(res, "tree") <- .toTree(pathStrings)
+
+    attr(res, "lgroupf") <- controlVar
+    attr(res, "case") <- caseVar
+    res
+}
+
+## Add taxonomic information to rowData
+## This step is necessary for mia to work
+## Returns the input with its rowData replaced by one column per rank found
+## in the row names.
+.rowNames2RowData <- function(x) {
+    se <- x
+    taxonomy <- .getTaxonomyFromPathStr(rownames(se))
+    ## The number of ranks is usually seven, so 10 columns should be more
+    ## than enough; columns that end up entirely NA are dropped.
+    dataFrame <- data.frame(tax = taxonomy) |>
+        tidyr::separate(
+            col = "tax", into = paste0("col", 1:10),
+            sep = "\\|", extra = "merge", fill = "right"
+        ) |>
+        purrr::discard(~ all(is.na(.x)))
+    ## purrr::map_chr ensures that a single prefix is used per column.
+    ## Having two or more prefixes would trigger an error from map_chr.
+    firstLetter <- purrr::map_chr(dataFrame, ~ {
+        taxLvl <- stringr::str_extract(.x, "\\w__")
+        unique(taxLvl[which(!is.na(taxLvl))])
+    })
+    rankNames <- dplyr::case_when(
+        firstLetter == "k__" ~ "kingdom",
+        firstLetter == "p__" ~ "phylum",
+        firstLetter == "c__" ~ "class",
+        firstLetter == "o__" ~ "order",
+        firstLetter == "f__" ~ "family",
+        firstLetter == "g__" ~ "genus",
+        firstLetter == "s__" ~ "species",
+        firstLetter == "t__" ~ "strain",
+    )
+    colnames(dataFrame) <- rankNames
+    DF <- S4Vectors::DataFrame(dataFrame)
+    SummarizedExperiment::rowData(se) <- DF
+    se
+}
+
+## This function makes sure that only the taxonomy
+## is used for the rowData.
+## OTUs or other non-typical taxonomic ranks will not be included.
+## Kingdom through family are required by the pattern; genus, species and
+## strain are optional. Strings that do not match yield NA.
+.getTaxonomyFromPathStr <- function(pathStrings) {
+    rgx <- "^k__[^|]+\\|p__[^|]+\\|c__[^|]+\\|o__[^|]+\\|f__[^|]+(\\|g__[^|]+)?(\\|s__[^|]+)?(\\|t__[^|]+)?"
+    stringr::str_extract(pathStrings, pattern = rgx)
+}
+
+## This function selects pathStrings containing only
+## taxa that are differentially abundant
+## A row name of 'se' is kept when it contains at least one feature of 'res'.
+.selectPathStrings <- function(se, res) {
+    pathStrings <- rownames(se)
+    ## Features are matched literally. As regular expressions their "|"
+    ## separators would act as alternation, so a feature such as
+    ## "k__Bacteria|p__Firmicutes" would match every path with "k__Bacteria".
+    index <- res$features |>
+        purrr::map(
+            ~ which(stringr::str_detect(pathStrings, stringr::fixed(.x)))
+        ) |>
+        unlist() |>
+        unique() |>
+        sort()
+    pathStrings[index]
+}
+
+# Create cladogram --------------------------------------------------------
+## Convert a character vector with pathStrings into a cladogram
+## These could come from the rownames of a SummarizedExperiment with
+## terminal nodes
+## Returns a list of class "phylo": tips are numbered first, in the order of
+## 'pathStrs', followed by the internal nodes (starting with "ROOT").
+.toTree <- function(pathStrs) {
+    edgeDF <- pathStrs |>
+        purrr::map(.pathString2EdgeList) |>
+        dplyr::bind_rows() |>
+        dplyr::distinct()
+    tipLabels <- stringr::str_extract(pathStrs, "[^|]+$")
+    nodeLabels <- unique(edgeDF$from)
+    ## seq_len() avoids the c(1, 0) sequence that 1:0 would produce.
+    idMap <- seq_len(length(tipLabels) + length(nodeLabels))
+    names(idMap) <- c(tipLabels, nodeLabels)
+    edgeMat <- matrix(
+        data = c(idMap[edgeDF$from], idMap[edgeDF$to]),
+        ncol = 2
+    )
+    tr <- list(
+        edge = edgeMat,
+        tip.label = tipLabels,
+        node.label = nodeLabels,
+        Nnode = length(nodeLabels),
+        Ntip = length(tipLabels)
+    )
+    class(tr) <- "phylo"
+    tr
+}
+
+## Helper function for .toTree
+## Input is a single path string, e.g., "k__bacteria|p_Fusobacteria..."
+.pathString2EdgeList <- function(pathStr) {
+    ## Prefix a common root so that all path strings share one ancestor.
+    pathStrRoot <- stringr::str_c("ROOT|", pathStr)
+    chr_vct <- stringr::str_split(pathStrRoot, "\\|")[[1]]
+    ## Each element is the parent of the next one. Negative indexing replaces
+    ## `1:length(x)-1`, which parses as `(1:length(x)) - 1` and only worked
+    ## because the resulting index 0 is silently dropped.
+    data.frame(
+        from = chr_vct[-length(chr_vct)],
+        to = chr_vct[-1]
+    )
+}
+
+## This function extracts only the last element of the taxonomy
+## e.g., "k__A|p__B" becomes "p__B".
+.extracTips <- function(pathStrs) {
+    stringr::str_extract(pathStrs, "[^|]+$")
+}
+
+
+# Utils -------------------------------------------------------------------
+## Build long row names by joining all rowData columns of 'se' with "|".
+## The row names must equal the last rowData column; otherwise an error is
+## raised (previously this surfaced as "object 'output' not found").
+.lognRowNames <- function(se) {
+    dat <- SummarizedExperiment::rowData(se) |>
+        as.data.frame() |>
+        tibble::rownames_to_column(var = "rowname")
+    lastCol <- dat[[ncol(dat)]]
+    ## isTRUE() also covers the NA produced when the last column has
+    ## missing values.
+    if (!isTRUE(all(lastCol == dat[["rowname"]]))) {
+        stop(
+            "Row names must be identical to the last column of rowData",
+            call. = FALSE
+        )
+    }
+    ## drop = FALSE keeps a data frame when a single column remains.
+    dat <- dat[, colnames(dat) != "rowname", drop = FALSE]
+    dat |>
+        tidyr::unite(
+            col = "features", tidyselect::everything(),
+            sep = "|", remove = TRUE
+        ) |>
+        dplyr::pull("features")
+}
diff --git a/inst/scripts/cladogramPlot.R b/inst/scripts/cladogramPlot.R
new file mode 100644
index 0000000..19e4480
--- /dev/null
+++ b/inst/scripts/cladogramPlot.R
@@ -0,0 +1,18 @@
+suppressPackageStartupMessages(library(lefser))
+data("zeller14")
+z14 <- zeller14[, zeller14$study_condition != "adenoma"]
+tn <- get_terminal_nodes(rownames(z14))
+z14tn <- z14[tn, ]
+z14tn_ra <- relativeAb(z14tn)
+
+resAll <- lefserAllRanks(relab = z14tn_ra, groupCol = "study_condition")
+ggt <- lefserPlotClad(df = resAll)
+# y
+# z <- lefserPlotClad(df = resAll, showTipLabels = TRUE, showNodeLabels = c("c"))
+# z
+# sessioninfo::session_info()
+
+
+# res <- lefser(z14tn_ra, groupCol = "study_condition")
+# x <- lefserPlotClad(df = res)
+# x
\ No newline at end of file
diff --git a/man/lefserAllRanks.Rd b/man/lefserAllRanks.Rd
new file mode 100644
index 0000000..2503c68
--- /dev/null
+++ b/man/lefserAllRanks.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in 
R/lefserPlotClad.R +\name{lefserAllRanks} +\alias{lefserAllRanks} +\title{Run lefser on all taxonomic levels} +\usage{ +lefserAllRanks(relab, ...) +} +\arguments{ +\item{relab}{A SummarizedExperiment.} + +\item{...}{Arguments passed to the \code{lefser} function.} +} +\value{ +An object of class 'lefser_df_all' and 'data.frame'. +} +\description{ +Run lefser on all taxonomic levels +} +\examples{ + +data("zeller14") +z14 <- zeller14[, zeller14$study_condition != "adenoma"] +tn <- get_terminal_nodes(rownames(z14)) +z14tn <- z14[tn, ] +z14tn_ra <- relativeAb(z14tn) + +resAll <- lefserAllRanks(relab = z14tn_ra, groupCol = "study_condition") + +} diff --git a/man/lefserPlot.Rd b/man/lefserPlot.Rd index 71c92b4..14f936a 100644 --- a/man/lefserPlot.Rd +++ b/man/lefserPlot.Rd @@ -13,7 +13,8 @@ lefserPlot( ) } \arguments{ -\item{df}{Data frame produced by \code{lefser}.} +\item{df}{Data frame produced by \code{lefser}. This data frame contains +two columns labeled as \code{c("features", "scores")}.} \item{colors}{Colors corresponding to class 0 and 1. Options: "c" (colorblind), "l" (lefse), "g" (greyscale). Defaults to "c". diff --git a/man/lefserPlotClad.Rd b/man/lefserPlotClad.Rd new file mode 100644 index 0000000..44c97e2 --- /dev/null +++ b/man/lefserPlotClad.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lefserPlotClad.R +\name{lefserPlotClad} +\alias{lefserPlotClad} +\title{LEfSer plot cladogram} +\usage{ +lefserPlotClad(df, colors = "c", showTipLabels = FALSE, showNodeLabels = "p") +} +\arguments{ +\item{df}{An object of class "lefser_df" or "lefser_df_all".} + +\item{colors}{Colors corresponding to class 0 and 1. +Options: "c" (colorblind), "l" (lefse), "g" (greyscale). +Defaults to "c". This argument also accepts a character(2) with two color names.} + +\item{showTipLabels}{Logical. If TRUE, show tip labels. 
Default is FALSE.}
+
+\item{showNodeLabels}{Options: "p" = phylum, "c" = class, "o" = order,
+"f" = family, "g" = genus. It can accept several options, e.g.,
+c("p", "c").}
+}
+\value{
+A ggtree object.
+}
+\description{
+\code{lefserPlotClad} plots a cladogram from the results of
+\code{lefser} or \code{lefserAllRanks}
+}
+\examples{
+data("zeller14")
+z14 <- zeller14[, zeller14$study_condition != "adenoma"]
+tn <- get_terminal_nodes(rownames(z14))
+z14tn <- z14[tn, ]
+z14tn_ra <- relativeAb(z14tn)
+resAll <- lefserAllRanks(relab = z14tn_ra, groupCol = "study_condition")
+ggt <- lefserPlotClad(df = resAll)
+}
diff --git a/tests/testthat/test-lefserPlotClad.R b/tests/testthat/test-lefserPlotClad.R
new file mode 100644
index 0000000..7cbc3f5
--- /dev/null
+++ b/tests/testthat/test-lefserPlotClad.R
@@ -0,0 +1,30 @@
+data("zeller14")
+z14 <- zeller14[, zeller14$study_condition != "adenoma"]
+tn <- get_terminal_nodes(rownames(z14))
+z14tn <- z14[tn, ]
+z14tn_ra <- relativeAb(z14tn)
+
+res <- lefser(z14tn_ra, groupCol = "study_condition")
+resAll <- lefserAllRanks(relab = z14tn_ra, groupCol = "study_condition")
+
+test_that("lefser attaches the attributes needed for the cladogram", {
+    expect_s3_class(attr(res, "tree"), "phylo")
+    expect_type(attr(res, "pathStrings"), "character")
+})
+
+test_that("lefserAllRanks works", {
+    expect_s3_class(resAll, "lefser_df_all")
+    expect_s3_class(attr(resAll, "tree"), "phylo")
+})
+
+test_that("lefserPlotClad works", {
+    ggt <- lefserPlotClad(df = resAll)
+    expect_s3_class(ggt, "ggtree")
+})
+
+test_that("lefserPlotClad rejects unsupported input", {
+    expect_error(
+        lefserPlotClad(df = data.frame(features = "a", scores = 1)),
+        "lefser_df_all"
+    )
+})