Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve getTaxonomyLabels #672

Merged
merged 5 commits into from
Jan 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: mia
Type: Package
Version: 1.15.13
Version: 1.15.14
Authors@R:
c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"),
email = "[email protected]",
Expand Down
135 changes: 95 additions & 40 deletions R/taxonomy.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,43 +33,50 @@
#'
#' @param with.rank \code{Logical scalar}. Should the level be add as a
#' suffix? For example: "Phylum:Crenarchaeota". (Default: \code{FALSE})
#'
#'
#' @param with_rank Deprecated. Use \code{with.rank} instead.
#'
#' @param make.unique \code{Logical scalar}. Should the labels be made
#' unique, if there are any duplicates? (Default: \code{TRUE})
#'
#'
#' @param make_unique Deprecated. Use \code{make.unique} instead.
#'
#' @param resolve.loops \code{Logical scalar}. Should \code{resolveLoops}
#' be applied to the taxonomic data? Please note that has only an effect,
#' if the data is unique. (Default: \code{TRUE})
#'
#'
#' @param resolve_loops Deprecated. Use \code{resolve.loops} instead.
#'
#' @param taxa \code{Character vector}. Used for subsetting the
#' @param taxa \code{Character vector}. Used for subsetting the
#' taxonomic information. If no information is found,\code{NULL} is returned
#' for the individual element. (Default: \code{NULL})
#'
#' @param from
#' @param from
#' \itemize{
#' \item For \code{mapTaxonomy}: \code{character scalar}. A value which
#' \item For \code{mapTaxonomy}: \code{character scalar}. A value which
#' must be a valid taxonomic rank. (Default: \code{NULL})
#' \item otherwise a \code{Taxa} object as returned by
#' \item otherwise a \code{Taxa} object as returned by
#' \code{\link[DECIPHER:IdTaxa]{IdTaxa}}
#' }
#'
#' @param to \code{Character Scalar}. Must be a valid
#' @param to \code{Character Scalar}. Must be a valid
#' taxonomic rank. (Default: \code{NULL})
#'
#'
#' @param use.grepl \code{Logical}. Should pattern matching via
#' \code{grepl} be used? Otherwise literal matching is used.
#' (Default: \code{FALSE})
#'
#'
#' @param use_grepl Deprecated. Use \code{use.grepl} instead.
#'
#' @param ... optional arguments not used currently.
#'
#' @param ... additional arguments
#' \itemize{
#' \item \code{lowest.rank}: A lowest taxonomy level to be considered in
#' \code{getTaxonomyLabels}. Ranks lower than this will be collapsed into rank
#' specified by \code{lowest.rank}. For example, if genus level is specified,
#' species will be collapsed into genus. If \code{NULL}, the data is not
#' collapsed. (Default: \code{NULL})
#' }
#'
#' @param ranks \code{Character vector}. A vector of ranks to be set.
#' @details
#' Taxonomic information from the \code{IdTaxa} function of \code{DECIPHER}
Expand All @@ -84,9 +91,9 @@
#' \item \code{taxonomyRanks}: a \code{character} vector with all the
#' taxonomic ranks found in \code{colnames(rowData(x))}
#' \item \code{taxonomyRankEmpty}: a \code{logical} value
#' \item \code{mapTaxonomy}: a \code{list} per element of taxa. Each
#' \item \code{mapTaxonomy}: a \code{list} per element of taxa. Each
#' element is either a \code{DataFrame}, a \code{character} or \code{NULL}.
#' If all \code{character} results have the length of one, a single
#' If all \code{character} results have the length of one, a single
#' \code{character} vector is returned.
#' }
#'
Expand All @@ -107,22 +114,27 @@
#' table(taxonomyRankEmpty(GlobalPatterns,"Species"))
#'
#' getTaxonomyLabels(GlobalPatterns[1:20,])
#'
#' # Taxonomy labels represent the lowest taxonomy name that identifies each
#' # taxa. For instance, they can represent OTUs which does no necessarily
#' # tell much. In this case, you might want to get the labels with higher
#' # taxonomy rank
#' getTaxonomyLabels(GlobalPatterns[1:20,], lowest.rank = "Class")
#'
#' # mapTaxonomy
#' ## returns the unique taxonomic information
#' mapTaxonomy(GlobalPatterns)
#' # returns specific unique taxonomic information
#' mapTaxonomy(GlobalPatterns, taxa = "Escherichia")
#' # returns information on a single output
#' mapTaxonomy(GlobalPatterns, taxa = "Escherichia",to="Family")
#'
#'
#' # setTaxonomyRanks
#' tse <- GlobalPatterns
#' colnames(rowData(tse))[1] <- "TAXA1"
#'
#'
#' setTaxonomyRanks(colnames(rowData(tse)))
#' # Taxonomy ranks set to: taxa1 phylum class order family genus species
#'
#' # Taxonomy ranks set to: taxa1 phylum class order family genus species
#'
#' # getTaxonomyRanks is to get/check if the taxonomic ranks is set to "TAXA1"
#' getTaxonomyRanks()
NULL
Expand Down Expand Up @@ -227,7 +239,7 @@ setMethod("checkTaxonomy", signature = c(x = "SummarizedExperiment"),
setTaxonomyRanks <- function(ranks) {
ranks <- tolower(ranks)
# Check if rank is a character vector with length >= 1
if (!is.character(ranks) || length(ranks) < 1
if (!is.character(ranks) || length(ranks) < 1
|| any(ranks == "" | ranks == " " | ranks == "\t" | ranks == "-" |
ranks == "_")
|| any(grepl("\\s{2,}", ranks))) {
Expand Down Expand Up @@ -316,9 +328,15 @@ setMethod("getTaxonomyLabels", signature = c(x = "SummarizedExperiment"),
stop("'resolve.loops' must be TRUE or FALSE.", call. = FALSE)
}
#
dup <- duplicated(rowData(x)[,taxonomyRanks(x)])
# Collapse taxonomy ranks if user has specified so
x <- .collapse_lowest_taxonomy_ranks(x, ...)

dup <- duplicated(rowData(x)[,taxonomyRanks(x), drop = FALSE])
if(any(dup)){
td <- apply(rowData(x)[,taxonomyRanks(x)],1L,paste,collapse = "___")
td <- apply(
rowData(x)[,taxonomyRanks(x), drop = FALSE],
1L,
paste, collapse = "___")
td_non_dup <- td[!dup]
m <- match(td, td_non_dup)
}
Expand All @@ -339,6 +357,41 @@ setMethod("getTaxonomyLabels", signature = c(x = "SummarizedExperiment"),
}
)

# This function is for collapsing the lowest taxonomy ranks into single value.
# For instance, if user specifies genus rank, genus and species are collapsed
# into one, and species rank is removed from the taxonomy table. If family is
# specified, along with these two, also family is incorporated into this value.
.collapse_lowest_taxonomy_ranks <- function(
x, lowest.rank = NULL, empty.fields = c(NA, "", " ", "\t", "-", "_"),
sep = "_", ...){
if( !.is_a_string(sep) ){
stop("'sep' must be a single character value.", call. = FALSE)
}
# By default, we keep the taxonomy table untouched.
if( !is.null(lowest.rank) ){
.check_taxonomic_rank(lowest.rank, x)
# Get available taxonomy ranks
available_ranks <- taxonomyRanks(x)
# Get indices of ranks that we are going to collapse into one
mod_ranks <- seq(
which(available_ranks == lowest.rank), length(available_ranks))
# For each row, collapse ranks into one
new_rank <- apply(rowData(x)[, mod_ranks, drop = FALSE], 1, function(x){
# Check if empty, and replace with NA if it is
x[ x %in% empty.fields ] <- NA
# Collapse values
x <- paste(na.omit(x), collapse = sep)
return(x)
})
# Remove the collapsed ranks from the original table
rowData(x) <- rowData(x)[, -mod_ranks, drop = FALSE]
# Add the collapsed values to the taxonomy table
new_rank <- unname(new_rank)
rowData(x)[[lowest.rank]] <- new_rank
}
return(x)
}

#' @importFrom IRanges CharacterList LogicalList
.get_tax_ranks_selected <- function(x,rd, tax_cols, empty.fields){
# We need DataFrame here to handle cases with a single entry in tax_cols
Expand Down Expand Up @@ -415,46 +468,48 @@ setMethod("getTaxonomyLabels", signature = c(x = "SummarizedExperiment"),
}

#' Calculate hierarchy tree
#'
#'
#' These functions generate a hierarchy tree using taxonomic information from a
#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{SummarizedExperiment}}
#' object and add this hierarchy tree into the \code{rowTree}.
#'
#'
#' @inheritParams taxonomy-methods
#'
#'
#' @param ... optional arguments not used currently.
#'
#' @details
#'
#' \code{addHierarchyTree} calculates a hierarchy tree from the available
#'
#' \code{addHierarchyTree} calculates a hierarchy tree from the available
#' taxonomic information and add it to \code{rowTree}.
#'
#'
#' \code{getHierarchyTree} generates a hierarchy tree from the available
#' taxonomic information. Internally it uses
#' \code{\link[TreeSummarizedExperiment:toTree]{toTree}} and
#' \code{\link[TreeSummarizedExperiment:resolveLoop]{resolveLoop}} to sanitize
#' data if needed.
#'
#'
#' Please note that a hierarchy tree is not an actual phylogenetic tree.
#' A phylogenetic tree represents evolutionary relationships among features.
#' On the other hand, a hierarchy tree organizes species into a hierarchical
#' structure based on their taxonomic ranks.
#'
#' structure based on their taxonomic ranks.
#'
#' @return
#' \itemize{
#' \item \code{addHierarchyTree}: a \code{TreeSummarizedExperiment} whose
#' \code{phylo} tree represents the hierarchy among available taxonomy
#' \code{phylo} tree represents the hierarchy among available taxonomy
#' information.
#' \item \code{getHierarchyTree}: a \code{phylo} tree representing the
#' \item \code{getHierarchyTree}: a \code{phylo} tree representing the
#' hierarchy among available taxonomy information.
#' }
#'
#'
#' @name hierarchy-tree
#'
#'
#' @examples
#' # Generate a tree based on taxonomic rank hierarchy (a hierarchy tree).
#' data(GlobalPatterns)
#' tse <- GlobalPatterns
#' getHierarchyTree(tse)
#'
#'
#' # Add a hierarchy tree to a TreeSummarizedExperiment.
#' # Please note that any tree already stored in rowTree() will be overwritten.
#' tse <- addHierarchyTree(tse)
Expand All @@ -470,7 +525,7 @@ setMethod("getHierarchyTree", signature = c(x = "SummarizedExperiment"),
# Input check
# If there is no rowData it is not possible to create rowTree
if( ncol(rowData(x)) == 0L ){
stop("'x' does not have rowData. Tree cannot be created.",
stop("'x' does not have rowData. Tree cannot be created.",
call. = FALSE)
}
# If there are no taxonomy ranks
Expand Down Expand Up @@ -601,7 +656,7 @@ setMethod("mapTaxonomy", signature = c(x = "SummarizedExperiment"),
stop("'from' must be an element of taxonomyRanks(x).",
call. = FALSE)
}
}
}
if(!is.null(to)){
if(!.is_a_string(to)){
stop("'to' must be a single character value.",
Expand Down Expand Up @@ -683,13 +738,13 @@ setMethod("mapTaxonomy", signature = c(x = "SummarizedExperiment"),
}

#' @importFrom SummarizedExperiment rowData
.get_tax_groups <- function(x, col, ignore.taxonomy = onRankOnly,
.get_tax_groups <- function(x, col, ignore.taxonomy = onRankOnly,
onRankOnly = FALSE, ...){
# input check
if(!.is_a_bool(ignore.taxonomy)){
stop("'ignore.taxonomy' must be TRUE or FALSE.", call. = FALSE)
}

tax_cols <- .get_tax_cols_from_se(x)
tax_col_n <- seq_along(tax_cols)
if(length(tax_col_n) < col){
Expand Down
16 changes: 14 additions & 2 deletions man/taxonomy-methods.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions tests/testthat/test-2taxonomy.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,18 @@ test_that("taxonomy", {
expect_equal(getTaxonomyLabels(xtse, resolve.loops = TRUE),
c("Family:j","Phylum:a","Family:k","Family:l","Family:m",
"Family:n","Family:o_1","Phylum:c","Family:o_2"))
# Check that lowest.rank works
data(GlobalPatterns)
tse <- GlobalPatterns
labs <- getTaxonomyLabels(tse, lowest.rank = "Kingdom", with.rank = TRUE)
expect_true( all(unlist(lapply(labs, grepl, pattern = "Kingdom:"))) )
labs <- getTaxonomyLabels(tse, lowest.rank = "Class", with.rank = TRUE)
expect_true( all(!unlist(lapply(labs, grepl, pattern = "Order|FamilySpecies|Genus"))) )
#
tse <- agglomerateByRank(tse, rank = "Phylum")
labs <- getTaxonomyLabels(tse)
labs1 <- getTaxonomyLabels(tse, lowest.rank = "Class")
expect_equal(labs, labs1)

# addHierarchyTree
data(GlobalPatterns, package="mia")
Expand Down
Loading