jumpingrivers · russHyde · Dec 5, 2022 · Dec 5, 2022 · Dec 6, 2022 · Jan 12, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: tfpscanner
 Title: Transmission fitness polymorphism scanner 
-Version: 0.3.0
-Date: 2023-01-18
+Version: 0.3.2
+Date: 2023-07-10
 Author: Erik Volz, Olivia Boyd
 Maintainer: Erik Volz <[email protected]>
 Description: A pipeline for scanning a SARS-CoV-2 phylogeny for clades with outlying growth 

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# tfpscanner 0.3.1 _2023-07-10_
+
+- Add a function, `extract_tree_dataframe(sc0, tr2, branch_cols)`, that extracts the node-annotation
+  data-frame used when making interactive tree plots; this code was moved out of `treeview()`
+
 # tfpscanner 0.3.0 _2023-07-06_
 
 - Feature: function `create_browser_data()` was added to generate all treeview illustrations and

diff --git a/R/create_browser_data.R b/R/create_browser_data.R
@@ -216,9 +216,9 @@ available_treeview <- function(data_dir) {
     )
   )
   all_trees <- as.character(sort(all_trees))
-  names(all_trees) <- all_trees %>%
-    stringr::str_replace_all("_|-|\\.rds", " ") %>%
-    stringr::str_trim() %>%
+  names(all_trees) <- all_trees |>
+    stringr::str_replace_all("_|-|\\.rds", " ") |>
+    stringr::str_trim() |>
     stringr::str_to_title()
   return(all_trees)
 }

diff --git a/R/extract_tree_dataframe.R b/R/extract_tree_dataframe.R
@@ -0,0 +1,83 @@
+#' Extracts node-specific data and annotates whether they are internal or a tip in the tree
+#'
+#' Builds a single node-annotation table for \code{tr2} from the rows of \code{sc0}. Rows of
+#' \code{sc0} without a \code{tr2mrca} value are dropped, a \code{date_range} string is derived,
+#' rows are labelled as tip or internal node, internal nodes that have no row in \code{sc0} are
+#' added as placeholder rows, and cluster sizes and missing \code{branch_cols} values are passed
+#' from child to parent along the edges of \code{tr2}.
+#'
+#' @param   sc0   data.frame. The code reads the columns \code{tr2mrca} (used as the node number
+#'   in \code{tr2}), \code{least_recent_tip}, \code{most_recent_tip}, \code{logistic_growth_rate},
+#'   \code{clock_outlier}, \code{cluster_size}, \code{cluster_id}, \code{region_summary},
+#'   \code{cocirc_lineage_summary}, \code{lineage} and every column named in \code{branch_cols}.
+#' @param   tr2   phylo. Queried with \code{ape::Ntip()}, \code{ape::Nnode()} and
+#'   \code{ape::postorder()}; its \code{edge} matrix supplies parent / child node numbers.
+#' @param   branch_cols   Character vector defining the statistics that must be extracted from
+#'   \code{sc0} for all nodes in the tree.
+#' @return   data.frame. Containing a subset of the columns from \code{sc0} (including all
+#'   \code{branch_cols}) and some additional columns. The additional columns are \code{lineages},
+#'   \code{cocirc_summary}, \code{node}, \code{internal}, \code{lineages1}. Rows are sorted by
+#'   \code{node}; \code{internal} is "N" for tips and "Y" for internal nodes taken from \code{sc0}.
+
+extract_tree_dataframe <- function(sc0,
+                                   tr2,
+                                   branch_cols) {
+  # Columns carried over from `sc0`; unique() guards against `branch_cols` repeating a fixed name.
+  tdvars <- unique(c(
+    branch_cols,
+    "logistic_growth_rate",
+    "clock_outlier",
+    "cluster_size",
+    "date_range",
+    "cluster_id",
+    "region_summary",
+    "cocirc_lineage_summary",
+    "lineage",
+    "tr2mrca"
+  ))
+
+  # Keep only the rows that have a node number in `tr2`, then build the "<first> -> <last>"
+  # date-range label row by row.
+  sc2 <- sc0[!is.na(sc0$tr2mrca), ]
+  sc2$date_range <- sapply(
+    seq_len(nrow(sc2)),
+    function(i) glue::glue("{sc2$least_recent_tip[i]} -> {sc2$most_recent_tip[i]}")
+  )
+
+  ## tips
+  # Node numbers up to Ntip(tr2) are treated as tips.
+  td0 <- sc2[sc2$tr2mrca <= ape::Ntip(tr2), tdvars]
+  td0$lineages <- td0$lineage
+  td0$cocirc_summary <- td0$cocirc_lineage_summary
+  td0$node <- td0$tr2mrca
+  td0$internal <- "N"
+
+  ## internal
+  # Node numbers above Ntip(tr2) are treated as internal nodes.
+  td1 <- sc2[sc2$tr2mrca > ape::Ntip(tr2), tdvars]
+  if (nrow(td1) > 0) {
+    td1$lineages <- td1$lineage
+    td1$cocirc_summary <- td1$cocirc_lineage_summary
+    td1$node <- td1$tr2mrca
+    td1$internal <- "Y"
+    # Internal sizes start at zero; they are re-accumulated from the children in the loop below.
+    td1$cluster_size <- 0
+    # Internal node numbers of `tr2` that have no row yet.
+    x <- setdiff(
+      (ape::Ntip(tr2) + 1):(ape::Ntip(tr2) + ape::Nnode(tr2)),
+      td1$node
+    ) # make sure every node represented
+    # Full outer merge on the shared `node` column: each missing node gets a row whose other
+    # columns are NA.
+    # NOTE(review): that includes `cluster_size` and `internal` (NA rather than 0 / "Y") for the
+    # added rows, so the cluster-size sum below becomes NA for them -- confirm this is intended.
+    td1 <- merge(td1,
+      data.frame(node = x),
+      all = TRUE
+    )
+    td <- rbind(td0, td1)
+  } else {
+    # NOTE(review): in this branch `td` holds tip rows only, while the loop below still indexes
+    # internal node numbers -- verify this case against the callers.
+    td <- td0
+  }
+  # Sorting by node matters because the loop below indexes rows of `td` by node number.
+  # NOTE(review): this presumes row i describes node i (i.e. every tip has a row) -- TODO confirm.
+  td <- td[order(td$node), ] # important
+
+  # rescale clock ?
+  # scale() centres and scales with its defaults; the result is then halved.
+  # NOTE(review): scale() returns a matrix, so this column is stored as a one-column matrix.
+  td$clock_outlier <- scale(td$clock_outlier) / 2
+
+  # interpolate missing values  &  repair cluster sizes
+  # Tips with a missing growth rate or clock-outlier value are set to 0.
+  td$logistic_growth_rate[(td$node <= ape::Ntip(tr2)) & (is.na(td$logistic_growth_rate))] <- 0
+  td$clock_outlier[(td$node <= ape::Ntip(tr2)) & (is.na(td$clock_outlier))] <- 0
+  # Visit the edges in the order given by ape::postorder(); `a` is the parent node of the edge
+  # and `u` the child node.
+  for (ie in ape::postorder(tr2)) {
+    a <- tr2$edge[ie, 1]
+    u <- tr2$edge[ie, 2]
+    # The parent's size accumulates the size of each of its children.
+    td$cluster_size[a] <- td$cluster_size[a] + td$cluster_size[u]
+    # A parent only takes a child's value while its own is still NA, so existing values are
+    # never overwritten.
+    for (vn in branch_cols) {
+      if (is.na(td[[vn]][a])) {
+        td[[vn]][a] <- td[[vn]][u]
+      }
+    }
+  }
+
+  # lineages for clade labels
+  # Keep the first "|"-separated entry of each `lineages` string.
+  td$lineages1 <- sapply(strsplit(td$lineages, split = "\\|"), "[", 1)
+
+  td
+}
diff --git a/R/plot_tree.R b/R/plot_tree.R
@@ -47,16 +47,15 @@ create_trees <- function(ggtree_data,
 
   tree_list$with_interactivity_data <- append_interactivity_data(
     tree_list[["noninteractive"]],
-    branch_col = branch_col,
-    sc0 = sc0,
-    cmuts = cmuts,
-    mut_regex = mut_regex
+    branch_col = branch_col
   )
 
   genotype <- extract_genotype_data(
     ggobj = tree_list[["with_interactivity_data"]],
     n_leaves = n_leaves,
-    mut_regex = mut_regex
+    mut_regex = mut_regex,
+    sc0 = sc0,
+    cmuts = cmuts
   )
 
   tree_list$with_heatmap <- append_heatmap(
@@ -220,82 +219,43 @@ create_noninteractive_ggtree <- function(ggtree_data,
   gtr1.1
 }
 
-#' Adds data to a ggtree object to allow mouse-over tooltips etc when presented interactively
+#' Adds data to a \code{ggtree} object to allow mouse-over tooltips etc when presented interactively
 #'
 #' @param   ggobj   A \code{ggtree} object.
 #' @param   branch_col    Scalar string. The name of a column within \code{ggobj$data} defining the
 #'   statistic under study here (`logistic_growth_rate`, `clock_outlier`).
-#' @inheritParams   create_trees
 #'
 #' @return  A ggtree object. The \code{$data} entry has additional entries (\code{mouseover},
-#'   \code{colour_var}, \code{defmuts}, \code{allmuts}) that are used when presented interactively
-#'   by \code{ggiraph}.
+#'   \code{colour_var}) that are used when presented interactively by
+#'   \code{ggiraph}.
 
 append_interactivity_data <- function(ggobj,
-                                      branch_col,
-                                      sc0,
-                                      cmuts,
-                                      mut_regex) {
+                                      branch_col) {
   # make mouseover info
-  ## standard meta data
-  ttdfs <- apply(ggobj$data, 1, FUN = function(x) {
-    z <- as.list(x)
-    lgr <- as.numeric(z$logistic_growth_rate)
-    # TODO: replace with() with explicit z$cluster_id etc
-    y <- with(
-      z,
-      data.frame(
-        `Cluster ID` = glue::glue("#{cluster_id}"),
-        `Cluster size` = cluster_size,
-        `Date range` = date_range,
-        `Example sequence` = label,
-        `Logistic growth` = paste0(
-          ifelse(lgr > 0, "+", ""),
-          round(lgr * 100), "%"
-        ),
-        `Mol clock outlier` = clock_outlier,
-        `Lineages` = lineages
-      )
-    )
-    y <- t(y)
-    colnames(y) <- ""
-    tryCatch(
-      paste(knitr::kable(y, "simple"), collapse = "\n"),
-      error = function(e) paste(knitr::kable(y, "markdown"), collapse = "\n")
-    )
-  })
+  ggobj$data$mouseover <- glue::glue(
+    "Cluster ID: {ggobj$data$cluster_id}"
+  )
+  ggobj$data$colour_var <- ggobj$data[[branch_col]]
 
-  ## table with geo composition
-  ttregtabs <- ggobj$data$region_summary #
-  ## cocirc
-  ttcocirc <- ggobj$data$cocirc_summary #
+  ggobj
+}
 
-  ## defining muts
-  ttdefmuts <- sapply(match(ggobj$data$cluster_id, sc0$cluster_id), function(isc0) {
-    if (is.na(isc0)) {
-      return("")
-    }
-    paste(
-      sep = "\n",
-      "Cluster branch mutations:",
-      gsub(
-        x = tryCatch(
-          stringr::str_wrap(
-            paste(
-              collapse = " ",
-              sort_mutations(cmuts[[as.character(sc0$node_number[isc0])]]$defining)
-            ),
-            width = 60
-          ),
-          error = function(e) browser()
-        ),
-        pattern = " ",
-        replacement = ", "
-      ),
-      "\n"
-    )
-  }) # end of sapply
+#' Extract the data about viral genotypes from a \code{ggtree} object
+#'
+#' @param   ggobj   A \code{ggtree} object, as generated by \code{append_interactivity_data}. The
+#'   \code{data} entry for this object should contain the columns "node", "label" and a column for
+#'   each of the \code{mut_regex} values.
+#' @param   mut_regex   String. Regular expression defining the mutations under study here. This
+#'   should be a subset of the column-names in \code{ggobj$data}.
+#' @inheritParams   create_trees
+#'
+#' @return   Data-frame.
 
+extract_genotype_data <- function(ggobj,
+                                  n_leaves,
+                                  mut_regex,
+                                  sc0,
+                                  cmuts) {
   ttallmuts <- sapply(match(ggobj$data$cluster_id, sc0$cluster_id), function(isc0) {
     if (is.na(isc0)) {
       return("")
@@ -318,7 +278,6 @@ append_interactivity_data <- function(ggobj,
     )
   }) # end of sapply
 
-  ggobj$data$defmuts <- ttdefmuts
   ggobj$data$allmuts <- ttallmuts
   if (!is.null(mut_regex)) {
     for (mre in mut_regex) {
@@ -327,37 +286,6 @@ append_interactivity_data <- function(ggobj,
     }
   }
 
-  # make html widget
-  ggobj$data$mouseover <- sapply(seq_along(ttdfs), function(i) {
-    paste0(
-      "Statistics:\n", ttdfs[i],
-      "\n\nGeography:\n", ttregtabs[i],
-      "\n\nCo-circulating with:\n", ttcocirc[i],
-      "\n\n", ttdefmuts[i],
-      "\n", ttallmuts[i],
-      "\n",
-      collapse = "\n"
-    )
-  })
-  ggobj$data$colour_var <- ggobj$data[[branch_col]]
-
-  ggobj
-}
-
-#' Extract the data about viral genotypes from a \code{ggtree} object
-#'
-#' @param   ggobj   A \code{ggtree} object, as generated by \code{append_interactivity_data}. The
-#'   \code{data} entry for this object should contain the columns "node", "label" and a column for
-#'   each of the \code{mut_regex} values.
-#' @param   mut_regex   String. Regular expression defining the mutations under study here. This
-#'   should be a subset of the column-names in \code{ggobj$data}.
-#' @inheritParams   create_trees
-#'
-#' @return   Data-frame.
-
-extract_genotype_data <- function(ggobj,
-                                  n_leaves,
-                                  mut_regex) {
   genotype <- as.data.frame(
     ggobj$data[
       ggobj$data$node <= n_leaves,

diff --git a/R/tfpscanner.R b/R/tfpscanner.R
@@ -661,7 +661,7 @@ tfpscan <- function(tre,
     gtr1 <- gtr1 + ggtree::geom_tiplab(align = TRUE)
     gtr2 <- gtr1
 
-    if (length(allsegregating) < 100) {
+    if ((length(allsegregating) < 100) & (length(allsegregating) > 0)) {
       gtr2 <- ggtree::gheatmap(gtr1,
         as.data.frame(aas),
         width = .66,

diff --git a/R/treeview.R b/R/treeview.R
@@ -110,74 +110,12 @@ treeview <- function(e0,
   )
 
   # tree data frames
-  ## tips
-  sc2 <- sc0[!is.na(sc0$tr2mrca), ]
-  sc2$date_range <- sapply(
-    seq_len(nrow(sc2)),
-    function(i) glue::glue("{sc2$least_recent_tip[i]} -> {sc2$most_recent_tip[i]}")
-  )
-  tdvars <- unique(c(
-    branch_cols,
-    "logistic_growth_rate",
-    "clock_outlier",
-    "cluster_size",
-    "date_range",
-    "cluster_id",
-    "region_summary",
-    "cocirc_lineage_summary",
-    "lineage",
-    "tr2mrca"
-  ))
-
-  td0 <- sc2[sc2$tr2mrca <= ape::Ntip(tr2), tdvars]
-  td0$lineages <- td0$lineage
-  td0$cocirc_summary <- td0$cocirc_lineage_summary
-  td0$node <- td0$tr2mrca
-  td0$internal <- "N"
-  ## internal
-  td1 <- sc2[sc2$tr2mrca > ape::Ntip(tr2), tdvars]
-  if (nrow(td1) > 0) {
-    td1$lineages <- td1$lineage
-    td1$cocirc_summary <- td1$cocirc_lineage_summary
-    td1$node <- td1$tr2mrca
-    td1$internal <- "Y"
-    td1$cluster_size <- 0
-    x <- setdiff(
-      (ape::Ntip(tr2) + 1):(ape::Ntip(tr2) + ape::Nnode(tr2)),
-      td1$node
-    ) # make sure every node represented
-    td1 <- merge(td1,
-      data.frame(node = x),
-      all = TRUE
-    )
-    td <- rbind(td0, td1)
-  } else {
-    td <- td0
-  }
-  td <- td[order(td$node), ] # important
-
-  # rescale clock ?
-  td$clock_outlier <- scale(td$clock_outlier) / 2
-
-  # interpolate missing values  &  repair cluster sizes
-  td$logistic_growth_rate[(td$node <= ape::Ntip(tr2)) & (is.na(td$logistic_growth_rate))] <- 0
-  td$clock_outlier[(td$node <= ape::Ntip(tr2)) & (is.na(td$clock_outlier))] <- 0
-  for (ie in ape::postorder(tr2)) {
-    a <- tr2$edge[ie, 1]
-    u <- tr2$edge[ie, 2]
-    td$cluster_size[a] <- td$cluster_size[a] + td$cluster_size[u]
-    for (vn in branch_cols) {
-      if (is.na(td[[vn]][a])) {
-        td[[vn]][a] <- td[[vn]][u]
-      }
-    }
-  }
+  td <- extract_tree_dataframe(sc0 = sc0, tr2 = tr2, branch_cols = branch_cols)
 
   # cols for continuous stats
   cols <- rev(c("red", "orange", "green", "cyan", "blue"))
 
   # lineages for clade labels
-  td$lineages1 <- sapply(strsplit(td$lineages, split = "\\|"), "[", 1)
   sc0$lineage1 <- sapply(strsplit(sc0$lineage, split = "\\|"), "[", 1)
 
   tablin <- table(td$lineages1[!(sc0$lineage1 %in% c("None", "B.1"))])