diff --git a/DESCRIPTION b/DESCRIPTION index 4e71aa1..cace762 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: tfpscanner Title: Transmission fitness polymorphism scanner -Version: 0.3.0 -Date: 2023-01-18 +Version: 0.3.2 +Date: 2023-07-10 Author: Erik Volz, Olivia Boyd Maintainer: Erik Volz Description: A pipeline for scanning a SARS-CoV-2 phylogeny for clades with outlying growth diff --git a/NEWS.md b/NEWS.md index b454e46..2289030 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# tfpscanner 0.3.1 _2023-07-10_ + +- Add `extract_tree_dataframe(sc0, tr2, branch_cols)`, which extracts the node-annotation + data-frame used when making interactive tree plots; this logic was moved out of `treeview()`. + # tfpscanner 0.3.0 _2023-07-06_ - Feature: function `create_browser_data()` was added to generate all treeview illustrations and diff --git a/R/create_browser_data.R b/R/create_browser_data.R index 1287e01..d3b1cce 100644 --- a/R/create_browser_data.R +++ b/R/create_browser_data.R @@ -216,9 +216,9 @@ available_treeview <- function(data_dir) { ) ) all_trees <- as.character(sort(all_trees)) - names(all_trees) <- all_trees %>% - stringr::str_replace_all("_|-|\\.rds", " ") %>% - stringr::str_trim() %>% + names(all_trees) <- all_trees |> + stringr::str_replace_all("_|-|\\.rds", " ") |> + stringr::str_trim() |> stringr::str_to_title() return(all_trees) } diff --git a/R/extract_tree_dataframe.R b/R/extract_tree_dataframe.R new file mode 100644 index 0000000..b2543ad --- /dev/null +++ b/R/extract_tree_dataframe.R @@ -0,0 +1,83 @@ +#' Extracts node-specific data and annotates whether they are internal or a tip in the tree +#' +#' @param sc0 data.frame. +#' @param tr2 phylo. +#' @param branch_cols Character vector defining the statistics that must be extracted from +#' \code{sc0} for all nodes in the tree. +#' @return data.frame. Containing a subset of the columns from \code{sc0} (including all +#' \code{branch_cols}) and some additional columns. 
The additional columns are \code{lineages}, +#' \code{cocirc_summary}, \code{node}, \code{internal}, \code{lineages1}. + +extract_tree_dataframe <- function(sc0, + tr2, + branch_cols) { + tdvars <- unique(c( + branch_cols, + "logistic_growth_rate", + "clock_outlier", + "cluster_size", + "date_range", + "cluster_id", + "region_summary", + "cocirc_lineage_summary", + "lineage", + "tr2mrca" + )) + + sc2 <- sc0[!is.na(sc0$tr2mrca), ] + sc2$date_range <- sapply( + seq_len(nrow(sc2)), + function(i) glue::glue("{sc2$least_recent_tip[i]} -> {sc2$most_recent_tip[i]}") + ) + + ## tips + td0 <- sc2[sc2$tr2mrca <= ape::Ntip(tr2), tdvars] + td0$lineages <- td0$lineage + td0$cocirc_summary <- td0$cocirc_lineage_summary + td0$node <- td0$tr2mrca + td0$internal <- "N" + + ## internal + td1 <- sc2[sc2$tr2mrca > ape::Ntip(tr2), tdvars] + if (nrow(td1) > 0) { + td1$lineages <- td1$lineage + td1$cocirc_summary <- td1$cocirc_lineage_summary + td1$node <- td1$tr2mrca + td1$internal <- "Y" + td1$cluster_size <- 0 + x <- setdiff( + (ape::Ntip(tr2) + 1):(ape::Ntip(tr2) + ape::Nnode(tr2)), + td1$node + ) # make sure every node represented + td1 <- merge(td1, + data.frame(node = x), + all = TRUE + ) + td <- rbind(td0, td1) + } else { + td <- td0 + } + td <- td[order(td$node), ] # important + + # rescale clock ? 
+ td$clock_outlier <- scale(td$clock_outlier) / 2 + + # interpolate missing values & repair cluster sizes + td$logistic_growth_rate[(td$node <= ape::Ntip(tr2)) & (is.na(td$logistic_growth_rate))] <- 0 + td$clock_outlier[(td$node <= ape::Ntip(tr2)) & (is.na(td$clock_outlier))] <- 0 + for (ie in ape::postorder(tr2)) { + a <- tr2$edge[ie, 1] + u <- tr2$edge[ie, 2] + td$cluster_size[a] <- td$cluster_size[a] + td$cluster_size[u] + for (vn in branch_cols) { + if (is.na(td[[vn]][a])) { + td[[vn]][a] <- td[[vn]][u] + } + } + } + + # lineages for clade labels + td$lineages1 <- sapply(strsplit(td$lineages, split = "\\|"), "[", 1) + + td +} diff --git a/R/plot_tree.R b/R/plot_tree.R index 3dbfa03..cfe4692 100644 --- a/R/plot_tree.R +++ b/R/plot_tree.R @@ -47,16 +47,15 @@ create_trees <- function(ggtree_data, tree_list$with_interactivity_data <- append_interactivity_data( tree_list[["noninteractive"]], - branch_col = branch_col, - sc0 = sc0, - cmuts = cmuts, - mut_regex = mut_regex + branch_col = branch_col ) genotype <- extract_genotype_data( ggobj = tree_list[["with_interactivity_data"]], n_leaves = n_leaves, - mut_regex = mut_regex + mut_regex = mut_regex, + sc0 = sc0, + cmuts = cmuts ) tree_list$with_heatmap <- append_heatmap( @@ -220,82 +219,43 @@ create_noninteractive_ggtree <- function(ggtree_data, gtr1.1 } -#' Adds data to a ggtree object to allow mouse-over tooltips etc when presented interactively +#' Adds data to a \code{ggtree} object to allow mouse-over tooltips etc when presented interactively #' #' @param ggobj A \code{ggtree} object. #' @param branch_col Scalar string. The name of a column within \code{ggobj$data} defining the #' statistic under study here (`logistic_growth_rate`, `clock_outlier`). -#' @inheritParams create_trees #' #' @return A ggtree object. The \code{$data} entry has additional entries (\code{mouseover}, -#' \code{colour_var}, \code{defmuts}, \code{allmuts}) that are used when presented interactively -#' by \code{ggiraph}. 
+#' \code{colour_var}) that are used when presented interactively by +#' \code{ggiraph}. append_interactivity_data <- function(ggobj, - branch_col, - sc0, - cmuts, - mut_regex) { + branch_col) { # make mouseover info - ## standard meta data - ttdfs <- apply(ggobj$data, 1, FUN = function(x) { - z <- as.list(x) - lgr <- as.numeric(z$logistic_growth_rate) - # TODO: replace with() with explicit z$cluster_id etc - y <- with( - z, - data.frame( - `Cluster ID` = glue::glue("#{cluster_id}"), - `Cluster size` = cluster_size, - `Date range` = date_range, - `Example sequence` = label, - `Logistic growth` = paste0( - ifelse(lgr > 0, "+", ""), - round(lgr * 100), "%" - ), - `Mol clock outlier` = clock_outlier, - `Lineages` = lineages - ) - ) - y <- t(y) - colnames(y) <- "" - tryCatch( - paste(knitr::kable(y, "simple"), collapse = "\n"), - error = function(e) paste(knitr::kable(y, "markdown"), collapse = "\n") - ) - }) + ggobj$data$mouseover <- glue::glue( + "Cluster ID: {ggobj$data$cluster_id}" + ) + ggobj$data$colour_var <- ggobj$data[[branch_col]] - ## table with geo composition - ttregtabs <- ggobj$data$region_summary # - ## cocirc - ttcocirc <- ggobj$data$cocirc_summary # + ggobj +} - ## defining muts - ttdefmuts <- sapply(match(ggobj$data$cluster_id, sc0$cluster_id), function(isc0) { - if (is.na(isc0)) { - return("") - } - paste( - sep = "\n", - "Cluster branch mutations:", - gsub( - x = tryCatch( - stringr::str_wrap( - paste( - collapse = " ", - sort_mutations(cmuts[[as.character(sc0$node_number[isc0])]]$defining) - ), - width = 60 - ), - error = function(e) browser() - ), - pattern = " ", - replacement = ", " - ), - "\n" - ) - }) # end of sapply +#' Extract the data about viral genotypes from a \code{ggtree} object +#' +#' @param ggobj A \code{ggtree} object, as generated by \code{append_interactivity_data}. The +#' \code{data} entry for this object should contain the columns "node", "label" and a column for +#' each of the \code{mut_regex} values. 
+#' @param mut_regex String. Regular expression defining the mutations under study here. This +#' should be a subset of the column-names in \code{ggobj$data}. +#' @inheritParams create_trees +#' +#' @return Data-frame. +extract_genotype_data <- function(ggobj, + n_leaves, + mut_regex, + sc0, + cmuts) { ttallmuts <- sapply(match(ggobj$data$cluster_id, sc0$cluster_id), function(isc0) { if (is.na(isc0)) { return("") @@ -318,7 +278,6 @@ append_interactivity_data <- function(ggobj, ) }) # end of sapply - ggobj$data$defmuts <- ttdefmuts ggobj$data$allmuts <- ttallmuts if (!is.null(mut_regex)) { for (mre in mut_regex) { @@ -327,37 +286,6 @@ append_interactivity_data <- function(ggobj, } } - # make html widget - ggobj$data$mouseover <- sapply(seq_along(ttdfs), function(i) { - paste0( - "Statistics:\n", ttdfs[i], - "\n\nGeography:\n", ttregtabs[i], - "\n\nCo-circulating with:\n", ttcocirc[i], - "\n\n", ttdefmuts[i], - "\n", ttallmuts[i], - "\n", - collapse = "\n" - ) - }) - ggobj$data$colour_var <- ggobj$data[[branch_col]] - - ggobj -} - -#' Extract the data about viral genotypes from a \code{ggtree} object -#' -#' @param ggobj A \code{ggtree} object, as generated by \code{append_interactivity_data}. The -#' \code{data} entry for this object should contain the columns "node", "label" and a column for -#' each of the \code{mut_regex} values. -#' @param mut_regex String. Regular expression defining the mutations under study here. This -#' should be a subset of the column-names in \code{ggobj$data}. -#' @inheritParams create_trees -#' -#' @return Data-frame. 
- -extract_genotype_data <- function(ggobj, - n_leaves, - mut_regex) { genotype <- as.data.frame( ggobj$data[ ggobj$data$node <= n_leaves, diff --git a/R/tfpscanner.R b/R/tfpscanner.R index fa13b95..fbb77f9 100644 --- a/R/tfpscanner.R +++ b/R/tfpscanner.R @@ -661,7 +661,7 @@ tfpscan <- function(tre, gtr1 <- gtr1 + ggtree::geom_tiplab(align = TRUE) gtr2 <- gtr1 - if (length(allsegregating) < 100) { + if ((length(allsegregating) < 100) & (length(allsegregating) > 0)) { gtr2 <- ggtree::gheatmap(gtr1, as.data.frame(aas), width = .66, diff --git a/R/treeview.R b/R/treeview.R index 7b9a405..17969fc 100644 --- a/R/treeview.R +++ b/R/treeview.R @@ -110,74 +110,12 @@ treeview <- function(e0, ) # tree data frames - ## tips - sc2 <- sc0[!is.na(sc0$tr2mrca), ] - sc2$date_range <- sapply( - seq_len(nrow(sc2)), - function(i) glue::glue("{sc2$least_recent_tip[i]} -> {sc2$most_recent_tip[i]}") - ) - tdvars <- unique(c( - branch_cols, - "logistic_growth_rate", - "clock_outlier", - "cluster_size", - "date_range", - "cluster_id", - "region_summary", - "cocirc_lineage_summary", - "lineage", - "tr2mrca" - )) - - td0 <- sc2[sc2$tr2mrca <= ape::Ntip(tr2), tdvars] - td0$lineages <- td0$lineage - td0$cocirc_summary <- td0$cocirc_lineage_summary - td0$node <- td0$tr2mrca - td0$internal <- "N" - ## internal - td1 <- sc2[sc2$tr2mrca > ape::Ntip(tr2), tdvars] - if (nrow(td1) > 0) { - td1$lineages <- td1$lineage - td1$cocirc_summary <- td1$cocirc_lineage_summary - td1$node <- td1$tr2mrca - td1$internal <- "Y" - td1$cluster_size <- 0 - x <- setdiff( - (ape::Ntip(tr2) + 1):(ape::Ntip(tr2) + ape::Nnode(tr2)), - td1$node - ) # make sure every node represented - td1 <- merge(td1, - data.frame(node = x), - all = TRUE - ) - td <- rbind(td0, td1) - } else { - td <- td0 - } - td <- td[order(td$node), ] # important - - # rescale clock ? 
- td$clock_outlier <- scale(td$clock_outlier) / 2 - - # interpolate missing values & repair cluster sizes - td$logistic_growth_rate[(td$node <= ape::Ntip(tr2)) & (is.na(td$logistic_growth_rate))] <- 0 - td$clock_outlier[(td$node <= ape::Ntip(tr2)) & (is.na(td$clock_outlier))] <- 0 - for (ie in ape::postorder(tr2)) { - a <- tr2$edge[ie, 1] - u <- tr2$edge[ie, 2] - td$cluster_size[a] <- td$cluster_size[a] + td$cluster_size[u] - for (vn in branch_cols) { - if (is.na(td[[vn]][a])) { - td[[vn]][a] <- td[[vn]][u] - } - } - } + td <- extract_tree_dataframe(sc0 = sc0, tr2 = tr2, branch_cols = branch_cols) # cols for continuous stats cols <- rev(c("red", "orange", "green", "cyan", "blue")) # lineages for clade labels - td$lineages1 <- sapply(strsplit(td$lineages, split = "\\|"), "[", 1) sc0$lineage1 <- sapply(strsplit(sc0$lineage, split = "\\|"), "[", 1) tablin <- table(td$lineages1[!(sc0$lineage1 %in% c("None", "B.1"))]) diff --git a/man/append_interactivity_data.Rd b/man/append_interactivity_data.Rd index 291ce48..b1525b2 100644 --- a/man/append_interactivity_data.Rd +++ b/man/append_interactivity_data.Rd @@ -2,25 +2,21 @@ % Please edit documentation in R/plot_tree.R \name{append_interactivity_data} \alias{append_interactivity_data} -\title{Adds data to a ggtree object to allow mouse-over tooltips etc when presented interactively} +\title{Adds data to a \code{ggtree} object to allow mouse-over tooltips etc when presented interactively} \usage{ -append_interactivity_data(ggobj, branch_col, sc0, cmuts, mut_regex) +append_interactivity_data(ggobj, branch_col) } \arguments{ \item{ggobj}{A \code{ggtree} object.} \item{branch_col}{Scalar string. The name of a column within \code{ggobj$data} defining the statistic under study here (`logistic_growth_rate`, `clock_outlier`).} - -\item{sc0, cmuts}{Data-frames.} - -\item{mut_regex}{Regular expression. Defines the mutations under study here.} } \value{ A ggtree object. 
The \code{$data} entry has additional entries (\code{mouseover}, - \code{colour_var}, \code{defmuts}, \code{allmuts}) that are used when presented interactively - by \code{ggiraph}. + \code{colour_var}) that are used when presented interactively by + \code{ggiraph}. } \description{ -Adds data to a ggtree object to allow mouse-over tooltips etc when presented interactively +Adds data to a \code{ggtree} object to allow mouse-over tooltips etc when presented interactively } diff --git a/man/extract_genotype_data.Rd b/man/extract_genotype_data.Rd index 99442fc..75864d7 100644 --- a/man/extract_genotype_data.Rd +++ b/man/extract_genotype_data.Rd @@ -4,7 +4,7 @@ \alias{extract_genotype_data} \title{Extract the data about viral genotypes from a \code{ggtree} object} \usage{ -extract_genotype_data(ggobj, n_leaves, mut_regex) +extract_genotype_data(ggobj, n_leaves, mut_regex, sc0, cmuts) } \arguments{ \item{ggobj}{A \code{ggtree} object, as generated by \code{append_interactivity_data}. The @@ -15,6 +15,8 @@ each of the \code{mut_regex} values.} \item{mut_regex}{String. Regular expression defining the mutations under study here. This should be a subset of the column-names in \code{ggobj$data}.} + +\item{sc0, cmuts}{Data-frames.} } \value{ Data-frame. 
diff --git a/man/extract_tree_dataframe.Rd b/man/extract_tree_dataframe.Rd new file mode 100644 index 0000000..7dd68c9 --- /dev/null +++ b/man/extract_tree_dataframe.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/extract_tree_dataframe.R +\name{extract_tree_dataframe} +\alias{extract_tree_dataframe} +\title{Extracts node-specific data and annotates whether they are internal or a tip in the tree} +\usage{ +extract_tree_dataframe(sc0, tr2, branch_cols) +} +\arguments{ +\item{sc0}{data.frame.} + +\item{tr2}{phylo.} + +\item{branch_cols}{Character vector defining the statistics that must be extracted from +\code{sc0} for all nodes in the tree.} +} +\value{ +data.frame. Containing a subset of the columns from \code{sc0} (including all + \code{branch_cols}) and some additional columns. The additional columns are \code{lineages}, + \code{cocirc_summary}, \code{node}, \code{internal}, \code{lineages1}. +} +\description{ +Extracts node-specific data and annotates whether they are internal or a tip in the tree +}