Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Trim tooltip #20

Open
wants to merge 16 commits into
base: staging-wp5
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: tfpscanner
Title: Transmission fitness polymorphism scanner
Version: 0.3.0
Date: 2023-01-18
Version: 0.3.2
Date: 2023-07-10
Author: Erik Volz, Olivia Boyd
Maintainer: Erik Volz <[email protected]>
Description: A pipeline for scanning a SARS-CoV-2 phylogeny for clades with outlying growth
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# tfpscanner 0.3.1 _2023-07-10_

- Refactor: the code that builds the node-annotation data-frame used when making interactive
tree plots was moved out of `treeview()` into a new function,
`extract_tree_dataframe(sc0, tr2, branch_cols)`

# tfpscanner 0.3.0 _2023-07-06_

- Feature: function `create_browser_data()` was added to generate all treeview illustrations and
Expand Down
6 changes: 3 additions & 3 deletions R/create_browser_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -216,9 +216,9 @@ available_treeview <- function(data_dir) {
)
)
all_trees <- as.character(sort(all_trees))
names(all_trees) <- all_trees %>%
stringr::str_replace_all("_|-|\\.rds", " ") %>%
stringr::str_trim() %>%
names(all_trees) <- all_trees |>
stringr::str_replace_all("_|-|\\.rds", " ") |>
stringr::str_trim() |>
stringr::str_to_title()
return(all_trees)
}
Expand Down
83 changes: 83 additions & 0 deletions R/extract_tree_dataframe.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#' Extracts node-specific data and annotates whether they are internal or a tip in the tree
#'
#' Rows of \code{sc0} are matched to the nodes of \code{tr2} through the \code{tr2mrca} column.
#' The result is ordered by node number, and the post-processing steps index it by node number,
#' so each node of \code{tr2} is expected to end up on exactly one row.
#'
#' @param sc0 data.frame. Must contain the columns \code{tr2mrca}, \code{least_recent_tip},
#'   \code{most_recent_tip}, \code{logistic_growth_rate}, \code{clock_outlier},
#'   \code{cluster_size}, \code{cluster_id}, \code{region_summary},
#'   \code{cocirc_lineage_summary}, \code{lineage} and every column named in \code{branch_cols}.
#'   Rows where \code{tr2mrca} is \code{NA} are dropped.
#' @param tr2 phylo. The tree whose node numbers are referred to by \code{sc0$tr2mrca}.
#' @param branch_cols Character vector defining the statistics that must be extracted from
#'   \code{sc0} for all nodes in the tree.
#' @return data.frame. Containing a subset of the columns from \code{sc0} (including all
#'   \code{branch_cols}) and some additional columns. The additional columns are \code{lineages},
#'   \code{cocirc_summary}, \code{node}, \code{internal}, \code{lineages1}. Internal nodes of
#'   \code{tr2} that have no row in \code{sc0} are added as placeholder rows: their
#'   \code{branch_cols} values are filled in from a descendant where available, their other
#'   columns are \code{NA}.

extract_tree_dataframe <- function(sc0,
                                   tr2,
                                   branch_cols) {
  tdvars <- unique(c(
    branch_cols,
    "logistic_growth_rate",
    "clock_outlier",
    "cluster_size",
    "date_range",
    "cluster_id",
    "region_summary",
    "cocirc_lineage_summary",
    "lineage",
    "tr2mrca"
  ))
  n_tip <- ape::Ntip(tr2)
  n_internal <- ape::Nnode(tr2)

  sc2 <- sc0[!is.na(sc0$tr2mrca), ]
  # glue() is vectorised, so no per-row loop is needed; as.character() gives a plain character
  # column, and a zero-row `sc2` yields character(0) rather than the list() that sapply() returns
  sc2$date_range <- as.character(
    glue::glue("{sc2$least_recent_tip} -> {sc2$most_recent_tip}")
  )

  ## tips
  td0 <- sc2[sc2$tr2mrca <= n_tip, tdvars]
  td0$lineages <- td0$lineage
  td0$cocirc_summary <- td0$cocirc_lineage_summary
  td0$node <- td0$tr2mrca
  # rep() keeps the assignment valid when there are zero rows
  td0$internal <- rep("N", nrow(td0))

  ## internal
  td1 <- sc2[sc2$tr2mrca > n_tip, tdvars]
  td1$lineages <- td1$lineage
  td1$cocirc_summary <- td1$cocirc_lineage_summary
  td1$node <- td1$tr2mrca
  td1$internal <- rep("Y", nrow(td1))
  # annotated internal nodes start at zero; sizes are accumulated from their children below
  td1$cluster_size <- rep(0, nrow(td1))

  # Make sure every internal node is represented. This is done even when no internal node is
  # annotated in `sc0`: the post-order pass below indexes rows by node number and would
  # otherwise run past the end of the table.
  # `n_tip + seq_len(n_internal)` is empty when the tree has no internal nodes, whereas
  # `(n_tip + 1):(n_tip + n_internal)` would count backwards.
  missing_nodes <- setdiff(n_tip + seq_len(n_internal), td1$node)
  td1 <- merge(td1,
    data.frame(node = missing_nodes),
    all = TRUE
  )
  td <- rbind(td0, td1)
  td <- td[order(td$node), ] # important: below, row i is treated as node i

  # rescale clock ?
  # NOTE: scale() returns a one-column matrix, so `clock_outlier` becomes a matrix column
  td$clock_outlier <- scale(td$clock_outlier) / 2

  # interpolate missing values & repair cluster sizes
  td$logistic_growth_rate[(td$node <= n_tip) & (is.na(td$logistic_growth_rate))] <- 0
  td$clock_outlier[(td$node <= n_tip) & (is.na(td$clock_outlier))] <- 0
  # walk the edges in post-order so that each child is visited before its parent is used as a
  # child in turn
  for (ie in ape::postorder(tr2)) {
    a <- tr2$edge[ie, 1] # parent node
    u <- tr2$edge[ie, 2] # child node
    td$cluster_size[a] <- td$cluster_size[a] + td$cluster_size[u]
    for (vn in branch_cols) {
      # a parent without a value inherits the value of the child on this edge
      if (is.na(td[[vn]][a])) {
        td[[vn]][a] <- td[[vn]][u]
      }
    }
  }

  # lineages for clade labels: keep the first entry of each "|"-separated lineage string
  td$lineages1 <- vapply(strsplit(td$lineages, split = "\\|"), "[", character(1), 1)

  td
}
130 changes: 29 additions & 101 deletions R/plot_tree.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,15 @@ create_trees <- function(ggtree_data,

tree_list$with_interactivity_data <- append_interactivity_data(
tree_list[["noninteractive"]],
branch_col = branch_col,
sc0 = sc0,
cmuts = cmuts,
mut_regex = mut_regex
branch_col = branch_col
)

genotype <- extract_genotype_data(
ggobj = tree_list[["with_interactivity_data"]],
n_leaves = n_leaves,
mut_regex = mut_regex
mut_regex = mut_regex,
sc0 = sc0,
cmuts = cmuts
)

tree_list$with_heatmap <- append_heatmap(
Expand Down Expand Up @@ -220,82 +219,43 @@ create_noninteractive_ggtree <- function(ggtree_data,
gtr1.1
}

#' Adds data to a ggtree object to allow mouse-over tooltips etc when presented interactively
#' Adds data to a \code{ggtree} object to allow mouse-over tooltips etc when presented interactively
#'
#' @param ggobj A \code{ggtree} object.
#' @param branch_col Scalar string. The name of a column within \code{ggobj$data} defining the
#' statistic under study here (`logistic_growth_rate`, `clock_outlier`).
#' @inheritParams create_trees
#'
#' @return A ggtree object. The \code{$data} entry has additional entries (\code{mouseover},
#' \code{colour_var}, \code{defmuts}, \code{allmuts}) that are used when presented interactively
#' by \code{ggiraph}.
#' \code{colour_var}, \code{allmuts}) that are used when presented interactively by
#' \code{ggiraph}.

append_interactivity_data <- function(ggobj,
branch_col,
sc0,
cmuts,
mut_regex) {
branch_col) {
# make mouseover info
## standard meta data
ttdfs <- apply(ggobj$data, 1, FUN = function(x) {
z <- as.list(x)
lgr <- as.numeric(z$logistic_growth_rate)
# TODO: replace with() with explicit z$cluster_id etc
y <- with(
z,
data.frame(
`Cluster ID` = glue::glue("#{cluster_id}"),
`Cluster size` = cluster_size,
`Date range` = date_range,
`Example sequence` = label,
`Logistic growth` = paste0(
ifelse(lgr > 0, "+", ""),
round(lgr * 100), "%"
),
`Mol clock outlier` = clock_outlier,
`Lineages` = lineages
)
)
y <- t(y)
colnames(y) <- ""
tryCatch(
paste(knitr::kable(y, "simple"), collapse = "\n"),
error = function(e) paste(knitr::kable(y, "markdown"), collapse = "\n")
)
})
ggobj$data$mouseover <- glue::glue(
"Cluster ID: {ggobj$data$cluster_id}"
)
ggobj$data$colour_var <- ggobj$data[[branch_col]]

## table with geo composition
ttregtabs <- ggobj$data$region_summary #
## cocirc
ttcocirc <- ggobj$data$cocirc_summary #
ggobj
}

## defining muts
ttdefmuts <- sapply(match(ggobj$data$cluster_id, sc0$cluster_id), function(isc0) {
if (is.na(isc0)) {
return("")
}
paste(
sep = "\n",
"Cluster branch mutations:",
gsub(
x = tryCatch(
stringr::str_wrap(
paste(
collapse = " ",
sort_mutations(cmuts[[as.character(sc0$node_number[isc0])]]$defining)
),
width = 60
),
error = function(e) browser()
),
pattern = " ",
replacement = ", "
),
"\n"
)
}) # end of sapply
#' Extract the data about viral genotypes from a \code{ggtree} object
#'
#' @param ggobj A \code{ggtree} object, as generated by \code{append_interactivity_data}. The
#' \code{data} entry for this object should contain the columns "node", "label" and a column for
#' each of the \code{mut_regex} values.
#' @param mut_regex String. Regular expression defining the mutations under study here. This
#' should be a subset of the column-names in \code{ggobj$data}.
#' @inheritParams create_trees
#'
#' @return Data-frame.

extract_genotype_data <- function(ggobj,
n_leaves,
mut_regex,
sc0,
cmuts) {
ttallmuts <- sapply(match(ggobj$data$cluster_id, sc0$cluster_id), function(isc0) {
if (is.na(isc0)) {
return("")
Expand All @@ -318,7 +278,6 @@ append_interactivity_data <- function(ggobj,
)
}) # end of sapply

ggobj$data$defmuts <- ttdefmuts
ggobj$data$allmuts <- ttallmuts
if (!is.null(mut_regex)) {
for (mre in mut_regex) {
Expand All @@ -327,37 +286,6 @@ append_interactivity_data <- function(ggobj,
}
}

# make html widget
ggobj$data$mouseover <- sapply(seq_along(ttdfs), function(i) {
paste0(
"Statistics:\n", ttdfs[i],
"\n\nGeography:\n", ttregtabs[i],
"\n\nCo-circulating with:\n", ttcocirc[i],
"\n\n", ttdefmuts[i],
"\n", ttallmuts[i],
"\n",
collapse = "\n"
)
})
ggobj$data$colour_var <- ggobj$data[[branch_col]]

ggobj
}

#' Extract the data about viral genotypes from a \code{ggtree} object
#'
#' @param ggobj A \code{ggtree} object, as generated by \code{append_interactivity_data}. The
#' \code{data} entry for this object should contain the columns "node", "label" and a column for
#' each of the \code{mut_regex} values.
#' @param mut_regex String. Regular expression defining the mutations under study here. This
#' should be a subset of the column-names in \code{ggobj$data}.
#' @inheritParams create_trees
#'
#' @return Data-frame.

extract_genotype_data <- function(ggobj,
n_leaves,
mut_regex) {
genotype <- as.data.frame(
ggobj$data[
ggobj$data$node <= n_leaves,
Expand Down
2 changes: 1 addition & 1 deletion R/tfpscanner.R
Original file line number Diff line number Diff line change
Expand Up @@ -661,7 +661,7 @@ tfpscan <- function(tre,
gtr1 <- gtr1 + ggtree::geom_tiplab(align = TRUE)
gtr2 <- gtr1

if (length(allsegregating) < 100) {
if ((length(allsegregating) < 100) & (length(allsegregating) > 0)) {
gtr2 <- ggtree::gheatmap(gtr1,
as.data.frame(aas),
width = .66,
Expand Down
64 changes: 1 addition & 63 deletions R/treeview.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,74 +110,12 @@ treeview <- function(e0,
)

# tree data frames
## tips
sc2 <- sc0[!is.na(sc0$tr2mrca), ]
sc2$date_range <- sapply(
seq_len(nrow(sc2)),
function(i) glue::glue("{sc2$least_recent_tip[i]} -> {sc2$most_recent_tip[i]}")
)
tdvars <- unique(c(
branch_cols,
"logistic_growth_rate",
"clock_outlier",
"cluster_size",
"date_range",
"cluster_id",
"region_summary",
"cocirc_lineage_summary",
"lineage",
"tr2mrca"
))

td0 <- sc2[sc2$tr2mrca <= ape::Ntip(tr2), tdvars]
td0$lineages <- td0$lineage
td0$cocirc_summary <- td0$cocirc_lineage_summary
td0$node <- td0$tr2mrca
td0$internal <- "N"
## internal
td1 <- sc2[sc2$tr2mrca > ape::Ntip(tr2), tdvars]
if (nrow(td1) > 0) {
td1$lineages <- td1$lineage
td1$cocirc_summary <- td1$cocirc_lineage_summary
td1$node <- td1$tr2mrca
td1$internal <- "Y"
td1$cluster_size <- 0
x <- setdiff(
(ape::Ntip(tr2) + 1):(ape::Ntip(tr2) + ape::Nnode(tr2)),
td1$node
) # make sure every node represented
td1 <- merge(td1,
data.frame(node = x),
all = TRUE
)
td <- rbind(td0, td1)
} else {
td <- td0
}
td <- td[order(td$node), ] # important

# rescale clock ?
td$clock_outlier <- scale(td$clock_outlier) / 2

# interpolate missing values & repair cluster sizes
td$logistic_growth_rate[(td$node <= ape::Ntip(tr2)) & (is.na(td$logistic_growth_rate))] <- 0
td$clock_outlier[(td$node <= ape::Ntip(tr2)) & (is.na(td$clock_outlier))] <- 0
for (ie in ape::postorder(tr2)) {
a <- tr2$edge[ie, 1]
u <- tr2$edge[ie, 2]
td$cluster_size[a] <- td$cluster_size[a] + td$cluster_size[u]
for (vn in branch_cols) {
if (is.na(td[[vn]][a])) {
td[[vn]][a] <- td[[vn]][u]
}
}
}
td <- extract_tree_dataframe(sc0 = sc0, tr2 = tr2, branch_cols = branch_cols)

# cols for continuous stats
cols <- rev(c("red", "orange", "green", "cyan", "blue"))

# lineages for clade labels
td$lineages1 <- sapply(strsplit(td$lineages, split = "\\|"), "[", 1)
sc0$lineage1 <- sapply(strsplit(sc0$lineage, split = "\\|"), "[", 1)

tablin <- table(td$lineages1[!(sc0$lineage1 %in% c("None", "B.1"))])
Expand Down
Loading