diff --git a/DESCRIPTION b/DESCRIPTION index eeff2a4e9..38c4c3c37 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: mia Type: Package -Version: 1.15.4 +Version: 1.15.5 Authors@R: c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"), email = "tuomas.v.borman@utu.fi", diff --git a/NAMESPACE b/NAMESPACE index b8c444f21..43c48df48 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -289,7 +289,9 @@ importFrom(DelayedArray,DelayedArray) importFrom(DelayedArray,colSums) importFrom(DelayedArray,getAutoBPPARAM) importFrom(DelayedArray,rowSums) +importFrom(DelayedArray,rowsum) importFrom(DelayedArray,setAutoBPPARAM) +importFrom(DelayedArray,type) importFrom(DelayedMatrixStats,colMeans2) importFrom(DelayedMatrixStats,colQuantiles) importFrom(DelayedMatrixStats,colSums2) @@ -375,7 +377,6 @@ importFrom(rbiom,unifrac) importFrom(rlang,":=") importFrom(rlang,sym) importFrom(scuttle,sumCountsAcrossFeatures) -importFrom(scuttle,summarizeAssayByGroup) importFrom(stats,TukeyHSD) importFrom(stats,anova) importFrom(stats,as.dist) diff --git a/NEWS b/NEWS index 053a02399..17936409b 100644 --- a/NEWS +++ b/NEWS @@ -159,3 +159,4 @@ computation Changes in version 1.15.x + subsetBy*: added update.tree argument ++ agglomerateBy*: Add na.rm option for excluding NA counts diff --git a/R/agglomerate.R b/R/agglomerate.R index bb95f9fe5..78f183af7 100644 --- a/R/agglomerate.R +++ b/R/agglomerate.R @@ -1,5 +1,10 @@ -#' Agglomerate or merge data using taxonomic information -#' +#' @name +#' agglomerate-methods +#' +#' @title +#' Agglomerate data using taxonomic information or other grouping +#' +#' @description #' Agglomeration functions can be used to sum-up data based on specific criteria #' such as taxonomic ranks, variables or prevalence. #' @@ -16,12 +21,38 @@ #' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{assay}} are #' agglomerated, i.e. summed up. 
If the assay contains values other than counts #' or absolute values, this can lead to meaningless values being produced. +#' +#' @details +#' Agglomeration sums up the values of assays at the specified taxonomic level. +#' With certain assays, e.g. those that include binary or negative values, this +#' summing can produce meaningless values. In those cases, consider performing +#' agglomeration first, and then applying the transformation afterwards. +#' +#' \code{agglomerateByVariable} works similarly to +#' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}. +#' However, additional support for \code{TreeSummarizedExperiment} was added and +#' science field agnostic names were used. In addition the \code{archetype} +#' argument lets the user select how to preserve row or column data. +#' +#' For merging the assay data, functions from \code{scuttle} are used. +#' +#' @return +#' \code{agglomerateByRank} returns a taxonomically-agglomerated, +#' optionally-pruned object of the same class as \code{x}. +#' \code{agglomerateByVariable} returns an object of the same class as \code{x} +#' with the specified entries merged into one entry in all relevant components. +#' \code{agglomerateByRank} returns a taxonomically-agglomerated, +#' optionally-pruned object of the same class as \code{x}. #' #' @inheritParams getPrevalence #' #' @param empty.fields \code{Character vector}. Defines which values should be #' regarded as empty. (Default: \code{c(NA, "", " ", "\t")}). They will be #' removed if \code{na.rm = TRUE} before agglomeration. +#' +#' @param empty.rm \code{Logical scalar}. Defines whether rows including +#' \code{empty.fields} in specified \code{rank} will be excluded. +#' (Default: \code{TRUE}) #' #' @param agglomerateTree Deprecated. Use \code{update.tree} instead. #' @@ -37,28 +68,36 @@ #' \item \code{empty.ranks.rm}: \code{Logical scalar}. Determines #' whether to remove those columns of rowData that include only NAs after #' agglomeration. 
(Default: \code{FALSE}) +#' +#' \item \code{empty.rm}: \code{Logical scalar}. Determines +#' whether to remove rows that do not belong to any group, i.e., that +#' have \code{NA} value. (Default: \code{FALSE}) +#' #' \item \code{make.unique}: \code{Logical scalar}. Determines #' whether to make rownames unique. (Default: \code{TRUE}) +#' #' \item \code{detection}: The threshold value for determining presence #' or absence. A value in \code{x} must exceed this threshold to be #' considered present. +#' #' \item \code{assay.type}: \code{Character scalar}. Specifies the assay -#' used to -#' calculate prevalence. (Default: \code{"counts"}) +#' used to calculate prevalence. (Default: \code{"counts"}) +#' #' \item \code{prevalence}: Prevalence threshold (in 0 to 1). The #' required prevalence is strictly greater by default. To include the #' limit, set \code{include.lowest} to \code{TRUE}. +#' #' \item \code{update.refseq}: \code{Logical scalar}. Should a #' consensus sequence be calculated? If set to \code{FALSE}, the result #' from \code{archetype} is returned; If set to \code{TRUE} the result #' from #' \code{\link[DECIPHER:ConsensusSequence]{DECIPHER::ConsensusSequence}} #' is returned. (Default: \code{FALSE}) +#' #' \item \code{archetype} Of each level of \code{group}, which element -#' should -#' be regarded as the archetype and metadata in the columns or rows kept, -#' while merging? This can be single integer value or an integer vector -#' of the same length as \code{levels(group)}. (Default: +#' should be regarded as the archetype and metadata in the columns or +#' rows kept, while merging? This can be single integer value or an +#' integer vector of the same length as \code{levels(group)}. (Default: #' \code{1L}, which means the first element encountered per #' factor level will be kept) #' } @@ -92,43 +131,6 @@ #' #' @param mergeTree Deprecated. Use \code{update.tree} instead. 
#' -#' @details -#' -#' Agglomeration sums up the values of assays at the specified taxonomic level. -#' With -#' certain assays, e.g. those that include binary or negative values, this -#' summing -#' can produce meaningless values. In those cases, consider performing -#' agglomeration -#' first, and then applying the transformation afterwards. -#' -#' \code{agglomerateByVariable} works similarly to -#' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}. -#' However, additional support for \code{TreeSummarizedExperiment} was added and -#' science field agnostic names were used. In addition the \code{archetype} -#' argument lets the user select how to preserve row or column data. -#' -#' For merge data of assays the function from \code{scuttle} are used. -#' -#' @return -#' \code{agglomerateByRank} returns a taxonomically-agglomerated, -#' optionally-pruned object of the same class as \code{x}. -#' \code{agglomerateByVariable} returns an object of the same class as \code{x} -#' with the specified entries merged into one entry in all relevant components. -#' \code{agglomerateByRank} returns a taxonomically-agglomerated, -#' optionally-pruned object of the same class as \code{x}. 
-#' -#' @name agglomerate-methods -#' -#' @seealso -#' \code{\link[=splitOn]{splitOn}} -#' \code{\link[=unsplitOn]{unsplitOn}} -#' \code{\link[=agglomerate-methods]{agglomerateByVariable}}, -#' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}, -#' \code{\link[=agglomerate-methods]{agglomerateByRank}}, -#' \code{\link[SingleCellExperiment:altExps]{altExps}}, -#' \code{\link[SingleCellExperiment:splitAltExps]{splitAltExps}} -#' #' @examples #' #' ### Agglomerate data based on taxonomic information @@ -158,9 +160,9 @@ #' tse <- agglomerateByRank(tse, rank = "Genus") #' tse <- transformAssay(tse, method = "pa") #' -#' # removing empty labels by setting na.rm = TRUE +#' # Removing empty labels by setting empty.rm = TRUE #' sum(is.na(rowData(GlobalPatterns)$Family)) -#' x3 <- agglomerateByRank(GlobalPatterns, rank="Family", na.rm = TRUE) +#' x3 <- agglomerateByRank(GlobalPatterns, rank="Family", empty.rm = TRUE) #' nrow(x3) # different from x2 #' #' # Because all the rownames are from the same rank, rownames do not include @@ -172,20 +174,19 @@ #' print(rownames(x3[1:3,])) #' #' # use 'empty.ranks.rm' to remove columns that include only NAs -#' x4 <- agglomerateByRank(GlobalPatterns, rank="Phylum", -#' empty.ranks.rm = TRUE) +#' x4 <- agglomerateByRank( +#' GlobalPatterns, rank="Phylum", empty.ranks.rm = TRUE) #' head(rowData(x4)) #' -#' # If the assay contains NAs, you might want to consider replacing them, +#' # If the assay contains NAs, you might want to specify na.rm=TRUE, #' # since summing-up NAs lead to NA #' x5 <- GlobalPatterns #' # Replace first value with NA #' assay(x5)[1,1] <- NA #' x6 <- agglomerateByRank(x5, "Kingdom") #' head( assay(x6) ) -#' # Replace NAs with 0. This is justified when we are summing-up counts. -#' assay(x5)[ is.na(assay(x5)) ] <- 0 -#' x6 <- agglomerateByRank(x5, "Kingdom") +#' # Use na.rm=TRUE +#' x6 <- agglomerateByRank(x5, "Kingdom", na.rm = TRUE) #' head( assay(x6) ) #' #' ## Look at enterotype dataset... 
@@ -199,44 +200,94 @@ #' data(esophagus) #' esophagus #' plot(rowTree(esophagus)) -#' # get a factor for merging +#' # Get a factor for merging #' f <- factor(regmatches(rownames(esophagus), -#' regexpr("^[0-9]*_[0-9]*",rownames(esophagus)))) -#' merged <- agglomerateByVariable(esophagus, by = "rows", f, -#' update.tree = TRUE) +#' regexpr("^[0-9]*_[0-9]*",rownames(esophagus)))) +#' merged <- agglomerateByVariable( +#' esophagus, by = "rows", f, update.tree = TRUE) #' plot(rowTree(merged)) #' # #' data(GlobalPatterns) #' GlobalPatterns -#' merged <- agglomerateByVariable(GlobalPatterns, by = "cols", -#' colData(GlobalPatterns)$SampleType) +#' merged <- agglomerateByVariable( +#' GlobalPatterns, by = "cols", colData(GlobalPatterns)$SampleType) #' merged +#' +#' @seealso +#' \code{\link[=splitOn]{splitOn}} +#' \code{\link[=unsplitOn]{unsplitOn}} +#' \code{\link[=agglomerate-methods]{agglomerateByVariable}}, +#' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}, +#' \code{\link[=agglomerate-methods]{agglomerateByRank}}, +#' \code{\link[SingleCellExperiment:altExps]{altExps}}, +#' \code{\link[SingleCellExperiment:splitAltExps]{splitAltExps}} +#' NULL #' @rdname agglomerate-methods #' @export -setGeneric("agglomerateByRank", - signature = "x", - function(x, ...) - standardGeneric("agglomerateByRank")) +setGeneric("agglomerateByRank", signature = "x", function(x, ...) + standardGeneric("agglomerateByRank")) #' @rdname agglomerate-methods -#' @aliases agglomerateByVariable #' @export -setGeneric("agglomerateByVariable", - signature = "x", - function(x, ...) - standardGeneric("agglomerateByVariable")) +setMethod( + "agglomerateByRank", signature = c(x = "TreeSummarizedExperiment"), + function(x, rank = taxonomyRanks(x)[1], update.tree = agglomerateTree, + agglomerate.tree = agglomerateTree, agglomerateTree = FALSE, ...){ + # Input check + if(!.is_a_bool(update.tree)){ + stop("'update.tree' must be TRUE or FALSE.", call. 
= FALSE) + } + # + # If there are multiple rowTrees, it might be that multiple + # trees are preserved after agglomeration even though the + # dataset could be presented with one tree. + # --> order the data so that the taxa are searched from one tree + # first. + if( length(rowTreeNames(x)) > 1 ){ + x <- .order_based_on_trees(x) + } + # Agglomerate data by using SCE method + x <- callNextMethod(x, rank = rank, update.tree = update.tree, ...) + return(x) + } +) + +#' @rdname agglomerate-methods +#' @importFrom SingleCellExperiment altExp altExp<- altExps<- +#' @export +setMethod( + "agglomerateByRank", signature = c(x = "SingleCellExperiment"), + function(x, rank = taxonomyRanks(x)[1], altexp = NULL, + altexp.rm = strip_altexp, strip_altexp = TRUE, ...){ + # Input check + if(!.is_a_bool(altexp.rm)){ + stop("'altexp.rm' must be TRUE or FALSE.", call. = FALSE) + } + # + # Get altexp if specified + x <- .check_and_get_altExp(x, altexp) + # Remove altexps if user specified so. As we agglomerate data, they do + # not necessarily represent the "high-level" data anymore. I.e., usually + # altExp includes subsets of TreeSE, but that is not the case anymore. + # That is why we clear the altexp slot. + if( altexp.rm ){ + altExps(x) <- NULL + } + # Agglomerate the data by using SE method + x <- callNextMethod(x, rank = rank, ...) + return(x) + } +) #' @rdname agglomerate-methods -#' #' @importFrom SummarizedExperiment rowData rowData<- -#' #' @export setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"), - function(x, rank = taxonomyRanks(x)[1], na.rm = TRUE, + function(x, rank = taxonomyRanks(x)[1], empty.rm = TRUE, empty.fields = c(NA, "", " ", "\t", "-", "_"), ...){ - # input check + # Input check if(nrow(x) == 0L){ stop("No data available in `x` ('x' has nrow(x) == 0L.)", call. = FALSE) @@ -245,8 +296,8 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"), stop("'rank' must be a non-empty single character value", call. 
= FALSE) } - if(!.is_a_bool(na.rm)){ - stop("'na.rm' must be TRUE or FALSE.", call. = FALSE) + if(!.is_a_bool(empty.rm)){ + stop("'empty.rm' must be TRUE or FALSE.", call. = FALSE) } if(ncol(rowData(x)) == 0L){ stop("taxonomyData needs to be populated.", call. = FALSE) @@ -254,146 +305,110 @@ setMethod("agglomerateByRank", signature = c(x = "SummarizedExperiment"), .check_taxonomic_rank(rank, x) .check_for_taxonomic_data_order(x) # - - # Make a vector from the taxonomic data. - col <- which( taxonomyRanks(x) %in% rank ) + # Get the index of which taxonomy rank is detected and used for + # agglomeration + col_idx <- which( taxonomyRanks(x) %in% rank ) + # Get the indices of detected rank columns from rowData tax_cols <- .get_tax_cols_from_se(x) - - # if na.rm is TRUE, remove the empty, white-space, NA values from - # tree will be pruned later, if update.tree = TRUE - if( na.rm ){ - x <- .remove_with_empty_taxonomic_info(x, tax_cols[col], - empty.fields) + + # if empty.rm is TRUE, remove those rows that have empty, + # white-space, NA values in rank information. I.e., they do not have + # taxonomy information in specified taxonomy level. + if( empty.rm ){ + x <- .remove_with_empty_taxonomic_info( + x, tax_cols[col_idx], empty.fields) } # If rank is the only rank that is available and this data is unique, # then the data is already 'aggregated' and no further operations # are needed. - if (length(taxonomyRanks(x)) == 1L && - !anyDuplicated(rowData(x)[,taxonomyRanks(x)])) { + if( length(taxonomyRanks(x)) == 1L && + !anyDuplicated(rowData(x)[,taxonomyRanks(x)]) ){ return(x) } - - # get groups of taxonomy entries - tax_factors <- .get_tax_groups(x, col = col, ...) - # Convert to factors. Use na.rm so that NA values are not preserved. - # i.e. they are not converted into character values. + + # Get groups of taxonomy entries, i.e., get the specified rank + # column from rowData + tax_factors <- .get_tax_groups(x, col = col_idx, ...) + # Convert to factors. 
Use empty.rm so that NA values are not + # preserved. i.e. they are not converted into character values. # NA values are handled earlier in this function. - tax_factors <- .norm_f(nrow(x), tax_factors, na.rm = TRUE) - - # merge taxa - x <- agglomerateByVariable( - x, by = "rows", group = tax_factors, na.rm = TRUE, ...) - - # "Empty" the values to the right of the rank, using NA_character_. - if( col < length(taxonomyRanks(x)) ){ - badcolumns <- tax_cols[seq_along(tax_cols) > col] - if(length(badcolumns) > 0L){ - row_data <- rowData(x) - row_data[, badcolumns] <- NA_character_ - rowData(x) <- row_data - } + tax_factors <- .norm_f(nrow(x), tax_factors, empty.rm = TRUE) + + # Agglomerate data by utilizing agglomerateByVariable + args <- c(list( + x, by = "rows", group = tax_factors, empty.rm = TRUE), list(...)) + x <- do.call(agglomerateByVariable, args) + + # Replace the values to the right of the rank with NA_character_. + # These columns no longer represent the agglomerated data, as they + # previously corresponded to specific lower taxonomic ranks that are + # now aggregated at the current level. + badcolumns <- tax_cols[seq_along(tax_cols) > col_idx] + if( length(badcolumns) > 0L ){ + rowData(x)[, badcolumns] <- NA_character_ } - # adjust rownames - rownames(x) <- getTaxonomyLabels(x, empty.fields, ..., - with.rank = FALSE, - resolve.loops = FALSE) + # Adjust rownames + rownames(x) <- getTaxonomyLabels( + x, empty.fields, with.rank = FALSE, resolve.loops = FALSE, ...) # Remove those columns from rowData that include only NAs x <- .remove_NA_cols_from_rowdata(x, ...) 
+ # Add agglomeration info to metadata x <- .add_values_to_metadata(x, "agglomerated_by_rank", rank) - # Order the data in alphabetical order x <- x[ order(rownames(x)), ] + return(x) } ) #' @rdname agglomerate-methods #' @aliases agglomerateByVariable #' @export -setMethod("agglomerateByVariable", signature = c(x = "SummarizedExperiment"), - function(x, by, group = f, f, ...){ - by <- .check_MARGIN(by) - FUN <- switch(by, .merge_rows, .merge_cols) - x <- FUN(x, group, ...) - return(x) - } -) +setGeneric("agglomerateByVariable", signature = "x", function(x, ...) + standardGeneric("agglomerateByVariable")) #' @rdname agglomerate-methods #' @aliases agglomerateByVariable #' @export setMethod("agglomerateByVariable", - signature = c(x = "TreeSummarizedExperiment"), - function(x, by, group = f, f, update.tree = mergeTree, - mergeTree = FALSE, ...){ - # Check by - by <- .check_MARGIN(by) - # Get function based on by - FUN <- switch(by, .merge_rows_TSE, .merge_cols_TSE) - # Agglomerate - x <- FUN(x, group, update.tree = update.tree, ...) - return(x) - } -) - -#' @rdname agglomerate-methods -#' @importFrom SingleCellExperiment altExp altExp<- altExps<- -#' @export -setMethod("agglomerateByRank", signature = c(x = "SingleCellExperiment"), - function(x, ..., altexp = NULL, altexp.rm = strip_altexp, - strip_altexp = TRUE){ - # input check - if(!.is_a_bool(altexp.rm)){ - stop("'altexp.rm' mus be TRUE or FALSE.", call. = FALSE) - } - # - if (!is.null(altexp)) { - x <- altExp(x, altexp) - } - if(altexp.rm && is(x, "SingleCellExperiment")){ - altExps(x) <- NULL - } - callNextMethod(x, ...) + signature = c(x = "TreeSummarizedExperiment"), + function(x, by, group = f, f, update.tree = mergeTree, mergeTree = FALSE, + ...){ + # Check by + by <- .check_MARGIN(by) + # Get function based on by + FUN <- switch(by, .merge_rows_TSE, .merge_cols_TSE) + # Agglomerate + x <- FUN(x, group, update.tree = update.tree, ...) 
+ return(x) } ) #' @rdname agglomerate-methods +#' @aliases agglomerateByVariable #' @export -setMethod( - "agglomerateByRank", signature = c(x = "TreeSummarizedExperiment"), - function( - x, ..., update.tree = agglomerateTree, - agglomerate.tree = agglomerateTree, agglomerateTree = FALSE){ - # input check - if(!.is_a_bool(update.tree)){ - stop("'update.tree' must be TRUE or FALSE.", - call. = FALSE) - } - # If there are multipe rowTrees, it might be that multiple - # trees are preserved after agglomeration even though the - # dataset could be presented with one tree. - # --> order the data so that the taxa are searched from one tree - # first. - if( length(rowTreeNames(x)) > 1 ){ - x <- .order_based_on_trees(x) - } - # Agglomerate data - x <- callNextMethod(x, update.tree = update.tree, ...) - return(x) - } +setMethod("agglomerateByVariable", signature = c(x = "SummarizedExperiment"), + function(x, by, group = f, f, ...){ + # Check by + by <- .check_MARGIN(by) + # Agglomerate the data + x <- .merge_rows_or_cols(x, group, by, ...) + return(x) + } ) ################################ HELP FUNCTIONS ################################ -.remove_with_empty_taxonomic_info <- - function(x, column, empty.fields = c(NA,""," ","\t","-","_")) - { - tax <- as.character(rowData(x)[,column]) - f <- !(tax %in% empty.fields) - if(any(!f)){ - x <- x[f, , drop=FALSE] - } - x +# This function subsets the data so that rows that do not have taxonomy +# information in the specified rank are removed. +.remove_with_empty_taxonomic_info <- function( + x, column, empty.fields = c(NA,""," ","\t","-","_")){ + tax <- as.character(rowData(x)[,column]) + f <- !(tax %in% empty.fields) + if(any(!f)){ + x <- x[f, , drop=FALSE] } + return(x) +} # This function removes empty rank columns from rowdata. 
(Those that include # only NA values) diff --git a/R/getPrevalence.R b/R/getPrevalence.R index fca790187..6ccf3a850 100644 --- a/R/getPrevalence.R +++ b/R/getPrevalence.R @@ -9,11 +9,11 @@ #' #' @param assay_name Deprecated. Use \code{assay.type} instead. #' -#' @param detection \code{Numeric scalar}. Detection threshold for absence/presence. -#' If \code{as_relative = FALSE}, -#' it sets the counts threshold for a taxon to be considered present. -#' If \code{as_relative = TRUE}, it sets the relative abundance threshold -#' for a taxon to be considered present. (Default: \code{0}) +#' @param detection \code{Numeric scalar}. Detection threshold for +#' absence/presence. If \code{as_relative = FALSE}, +#' it sets the counts threshold for a taxon to be considered present. +#' If \code{as_relative = TRUE}, it sets the relative abundance threshold +#' for a taxon to be considered present. (Default: \code{0}) #' #' @param include.lowest \code{Logical scalar}. Should the lower boundary of the #' detection and prevalence cutoffs be included? (Default: \code{FALSE}) @@ -23,11 +23,11 @@ #' @param sort \code{Logical scalar}. Should the result be sorted by prevalence? #' (Default: \code{FALSE}) #' -#' @param rank \code{Character scalar}. Defines a taxonomic rank. Must be a value of -#' \code{taxonomyRanks()} function. +#' @param rank \code{Character scalar}. Defines a taxonomic rank. Must be a +#' value of \code{taxonomyRanks()} function. #' -#' @param na.rm \code{Logical scalar}. Should NA values be omitted when calculating -#' prevalence? (Default: \code{TRUE}) +#' @param na.rm \code{Logical scalar}. Should NA values be omitted? +#' (Default: \code{TRUE}) #' #' @param update.tree \code{Logical scalar}. Should #' \code{rowTree()} also be agglomerated? (Default: \code{FALSE}) @@ -37,8 +37,6 @@ #' \item If \code{!is.null(rank)} arguments are passed on to #' \code{\link[=agglomerate-methods]{agglomerateByRank}}. 
See #' \code{\link[=agglomerate-methods]{?agglomerateByRank}} for more details. -#' Note that you can specify whether to remove empty ranks with -#' \code{agg.na.rm} instead of \code{na.rm}. (default: \code{FALSE}) #' #' \item for \code{getPrevalent}, \code{getRare}, \code{subsetByPrevalent} #' and \code{subsetByRare} additional parameters passed to @@ -175,8 +173,8 @@ NULL #' @rdname getPrevalence #' @export setGeneric("getPrevalence", signature = "x", - function(x, ...) - standardGeneric("getPrevalence")) + function(x, ...) + standardGeneric("getPrevalence")) #' @rdname getPrevalence #' @export @@ -186,8 +184,7 @@ setMethod("getPrevalence", signature = c(x = "ANY"), function( # input check if (!.is_numeric_string(detection)) { stop("'detection' must be a single numeric value or coercible to ", - "one.", - call. = FALSE) + "one.", call. = FALSE) } # if(!.is_a_bool(na.rm)){ @@ -227,36 +224,6 @@ setMethod("getPrevalence", signature = c(x = "ANY"), function( } ) -.agg_for_prevalence <- function( - x, rank, relabel = FALSE, make.unique = TRUE, na.rm = FALSE, - agg.na.rm = TRUE, ...){ - # Check na.rm. It is not used in this function, it is only caught so that - # it can be passed to getPrevalence(matrix) and not use it here in - # agglomerateByRank function. - if(!.is_a_bool(na.rm)){ - stop("'na.rm' must be TRUE or FALSE.", call. = FALSE) - } - # - # Check drop.empty.rank - if(!.is_a_bool(agg.na.rm)){ - stop("'agg.na.rm' must be TRUE or FALSE.", call. 
= FALSE) - } - # - if(!is.null(rank)){ - .check_taxonomic_rank(rank, x) - args <- c(list(x = x, rank = rank, na.rm = agg.na.rm), list(...)) - argNames <- c( - "x","rank","ignore.taxonomy","na.rm","empty.fields", "archetype", - "update.tree","average","BPPARAM", "update.refseq") - args <- args[names(args) %in% argNames] - x <- do.call(agglomerateByRank, args) - if(relabel){ - rownames(x) <- getTaxonomyLabels(x, make.unique = make.unique) - } - } - x -} - #' @rdname getPrevalence #' @export setMethod("getPrevalence", signature = c(x = "SummarizedExperiment"), @@ -264,7 +231,7 @@ setMethod("getPrevalence", signature = c(x = "SummarizedExperiment"), rank = NULL, ...){ # check assay .check_assay_present(assay.type, x) - x <- .agg_for_prevalence(x, rank = rank, ...) + x <- .merge_features(x, rank = rank, ...) mat <- assay(x, assay.type) # Calculate abundance mat <- .to_rel_abund(mat, ...) @@ -286,8 +253,8 @@ setMethod("getPrevalence", signature = c(x = "SummarizedExperiment"), #' #' @export setGeneric("getPrevalent", signature = "x", - function(x, ...) - standardGeneric("getPrevalent")) + function(x, ...) + standardGeneric("getPrevalent")) .norm_rownames <- function(x){ if(is.null(rownames(x))){ @@ -303,8 +270,7 @@ setGeneric("getPrevalent", signature = "x", # input check if (!.is_numeric_string(prevalence)) { stop("'prevalence' must be a single numeric value or coercible to ", - "one.", - call. = FALSE) + "one.", call. = FALSE) } prevalence <- as.numeric(prevalence) @@ -337,7 +303,7 @@ setGeneric("getPrevalent", signature = "x", .get_prevalent_taxa <- function(x, rank = NULL, ...){ if(is(x,"SummarizedExperiment")){ - x <- .agg_for_prevalence(x, rank = rank, ...) + x <- .merge_features(x, rank = rank, ...) } indices <- .get_prevalent_indices(x, ...) # If named input return named output @@ -381,8 +347,8 @@ setMethod("getPrevalent", signature = c(x = "SummarizedExperiment"), #' #' @export setGeneric("getRare", signature = "x", - function(x, ...) 
- standardGeneric("getRare")) + function(x, ...) + standardGeneric("getRare")) .get_rare_indices <- function(x, ...){ indices <- .get_prevalent_indices(x = x, ...) @@ -395,7 +361,7 @@ setGeneric("getRare", signature = "x", .get_rare_taxa <- function(x, rank = NULL, ...){ if(is(x,"SummarizedExperiment")){ - x <- .agg_for_prevalence(x, rank = rank, ...) + x <- .merge_features(x, rank = rank, ...) } indices <- .get_rare_indices(x, ...) # @@ -434,14 +400,14 @@ setMethod("getRare", signature = c(x = "SummarizedExperiment"), #' @rdname getPrevalence #' @export setGeneric("subsetByPrevalent", signature = "x", - function(x, ...) - standardGeneric("subsetByPrevalent")) + function(x, ...) + standardGeneric("subsetByPrevalent")) #' @rdname getPrevalence #' @export setMethod("subsetByPrevalent", signature = c(x = "SummarizedExperiment"), function(x, rank = NULL, ...){ - x <- .agg_for_prevalence(x, rank = rank, ...) + x <- .merge_features(x, rank = rank, ...) prevalent_indices <- .get_prevalent_indices(x, ...) x[prevalent_indices, ] } @@ -470,14 +436,14 @@ setMethod("subsetByPrevalent", signature = c(x = "TreeSummarizedExperiment"), #' @rdname getPrevalence #' @export setGeneric("subsetByRare", signature = "x", - function(x, ...) - standardGeneric("subsetByRare")) + function(x, ...) + standardGeneric("subsetByRare")) #' @rdname getPrevalence #' @export setMethod("subsetByRare", signature = c(x = "SummarizedExperiment"), function(x, rank = NULL, ...){ - x <- .agg_for_prevalence(x, rank = rank, ...) + x <- .merge_features(x, rank = rank, ...) rare_indices <- .get_rare_indices(x, ...) x[rare_indices, ] } @@ -506,8 +472,8 @@ setMethod("subsetByRare", signature = c(x = "TreeSummarizedExperiment"), #' @rdname getPrevalence #' @export setGeneric("getPrevalentAbundance", signature = "x", - function(x, assay.type = assay_name, assay_name = "relabundance", ...) - standardGeneric("getPrevalentAbundance")) + function(x, assay.type = assay_name, assay_name = "relabundance", ...) 
+ standardGeneric("getPrevalentAbundance")) #' @rdname getPrevalence #' @export @@ -517,9 +483,8 @@ setMethod("getPrevalentAbundance", signature = c(x = "ANY"), cm <- getPrevalent(x, ...) if (length(cm) == 0) { stop("With the given abundance and prevalence thresholds, no taxa ", - "were found. Try to change detection and prevalence ", - "parameters.", - call. = FALSE) + "were found. Try to change detection and prevalence ", + "parameters.", call. = FALSE) } colSums(x[cm, ,drop=FALSE]) } @@ -586,23 +551,24 @@ setMethod("getPrevalentAbundance", signature = c(x = "SummarizedExperiment"), #' #' @export setGeneric("agglomerateByPrevalence", signature = "x", - function(x, ...) - standardGeneric("agglomerateByPrevalence")) + function(x, ...) + standardGeneric("agglomerateByPrevalence")) #' @rdname agglomerateByPrevalence #' @export setMethod("agglomerateByPrevalence", signature = c(x = "SummarizedExperiment"), - function(x, rank = NULL, other.name = other_label, other_label = "Other", ...){ + function(x, rank = NULL, other.name = other_label, other_label = "Other", + ...){ # input check if(!.is_a_string(other.name)){ stop("'other.name' must be a single character value.", - call. = FALSE) + call. = FALSE) } # # Check assays that they can be merged safely - mapply(.check_assays_for_merge, assayNames(x), assays(x)) + temp <- mapply(.check_assays_for_merge, assayNames(x), assays(x)) # - x <- .agg_for_prevalence(x, rank, check.assays = FALSE, ...) + x <- .merge_features(x, rank, check.assays = FALSE, ...) pr <- getPrevalent(x, rank = NULL, ...) 
f <- rownames(x) %in% pr if(any(!f)){ @@ -624,12 +590,12 @@ setMethod("agglomerateByPrevalence", signature = c(x = "SummarizedExperiment"), #' @rdname agglomerateByPrevalence #' @export setMethod("agglomerateByPrevalence", - signature = c(x = "TreeSummarizedExperiment"), + signature = c(x = "TreeSummarizedExperiment"), function(x, rank = NULL, other.name = other_label, other_label = "Other", update.tree = FALSE, ...){ # input check if(!.is_a_bool(update.tree)){ - stop("'update.tree' must be TRUE or FALSE.", call. = FALSE) + stop("'update.tree' must be TRUE or FALSE.", call. = FALSE) } # update.refseq is a hidden parameter as for all other agglomeration # methods from the agglomerate-methods man page. @@ -647,7 +613,7 @@ setMethod("agglomerateByPrevalence", # sequences are only subsetted without finding consensus sequences. if( merge_refseq && !is.null(referenceSeq(x)) ){ # If user wants to agglomerate based on rank - x <- .agg_for_prevalence(x, rank, check.assays = FALSE, ...) + x <- .merge_features(x, rank, check.assays = FALSE, ...) # Find groups that will be used to agglomerate the data f <- rownames(x)[ match(rownames(x), rownames(res)) ] f[ is.na(f) ] <- other.name @@ -661,7 +627,7 @@ setMethod("agglomerateByPrevalence", res <- .agglomerate_trees(res, 1) } return(res) - } + } ) # Get abundance. Determines if relative abundance is calculated or not. diff --git a/R/merge.R b/R/merge.R index 2c1dde0ac..6151106ef 100644 --- a/R/merge.R +++ b/R/merge.R @@ -1,6 +1,9 @@ -.norm_f <- function(i, f, dim.type = c("rows","columns"), na.rm = FALSE, ...){ - if(!.is_a_bool(na.rm)){ - stop("'na.rm' must be TRUE or FALSE.", call. = FALSE) +# This function can be used to unify the group id vector. It can be any +# kind of vector, but this converts it to factor. +.norm_f <- function( + i, f, dim.type = c("rows","columns"), empty.rm = FALSE, ...){ + if(!.is_a_bool(empty.rm)){ + stop("'empty.rm' must be TRUE or FALSE.", call. 
= FALSE) } dim.type <- match.arg(dim.type) if(!is.character(f) && !is.factor(f)){ @@ -13,7 +16,7 @@ call. = FALSE) } # This is done otherwise we lose NA values - if( !na.rm && any(is.na(f)) ){ + if( !empty.rm && any(is.na(f)) ){ f <- as.character(f) f[ is.na(f) ] <- "NA" } @@ -23,6 +26,9 @@ f } +# When we merge rows or columns, first member of group is kept as default +# (in colData or rowData). This function controls this and allows user to +# specify some other element than the first one. .norm_archetype <- function(f, archetype){ if(length(archetype) > 1L){ if(length(levels(f)) != length(archetype)){ @@ -49,6 +55,8 @@ archetype } +# This function returns the index/position of rows/columns that are kept +# after merging. #' @importFrom S4Vectors splitAsList .get_element_pos <- function(f, archetype){ archetype <- as.list(archetype) @@ -58,13 +66,12 @@ f_pos } +# This function merges assays and row/colData. #' @importFrom S4Vectors SimpleList #' @importFrom scuttle sumCountsAcrossFeatures -.merge_rows <- function(x, f, archetype = 1L, - average = FALSE, - BPPARAM = SerialParam(), - check.assays = TRUE, - ...){ +.merge_rows_or_cols <- function( + x, f, by, archetype = 1L, average = FALSE, BPPARAM = SerialParam(), + check.assays = TRUE, na.rm = FALSE, ...){ # input check if( !.is_a_bool(average) ){ stop("'average' must be TRUE or FALSE.", call. = FALSE) @@ -72,95 +79,109 @@ if( !.is_a_bool(check.assays) ){ stop("'check.assays' must be TRUE or FALSE.", call. = FALSE) } - if( .is_a_string(f) && f %in% colnames(rowData(x)) ){ - f <- rowData(x)[[ f ]] + if( !.is_a_bool(na.rm) ){ + stop("'na.rm' must be TRUE or FALSE.", call. = FALSE) } - f <- .norm_f(nrow(x), f, ...) 
- if(length(levels(f)) == nrow(x)){ + # + # Get correct functions based on whether we agglomerate rows or cols + rowData_FUN <- switch(by, rowData, colData) + nrow_FUN <- switch(by, nrow, ncol) + rownames_FUN <- switch(by, rownames, colnames) + rownames_ass_FUN <- switch(by, `rownames<-`, `colnames<-`) + # If user specified column name from row/colData, get the values + if( .is_a_string(f) && f %in% colnames(rowData_FUN(x)) ){ + f <- rowData_FUN(x)[[ f ]] + } + # Check that the group ID vector is specifying groups for each element + f <- .norm_f(nrow_FUN(x), f, ...) + # If the data is already agglomerated at each group + if(length(levels(f)) == nrow_FUN(x)){ return(x) } - + # In merging, first element of certain group is kept by default. archetype, + # can control this behavior; it can specify the preserved rows for every + # group or index. archetype <- .norm_archetype(f, archetype) - # merge assays + + # Get assays assays <- assays(x) + # We check whether the assays include values that cannot be summed. For + # instance, summing negative values do not make sense. if( check.assays ){ - mapply(.check_assays_for_merge, names(assays), assays) - } - assays <- S4Vectors::SimpleList(lapply(assays, - scuttle::sumCountsAcrossFeatures, - ids = f, - subset.row = NULL, - subset.col = NULL, - average = average, - BPPARAM = BPPARAM)) - names(assays) <- names(assays(x)) - # merge to result - x <- x[.get_element_pos(f, archetype = archetype),] + temp <- lapply(seq_len(length(assays)), function(i) + .check_assays_for_merge(names(assays)[[i]], assays[[i]])) + } + + # Transpose if we are merging columns + if( by == 2L ){ + assays <- lapply(assays, function(mat) t(mat)) + } + # Get the aggregation function based on whether user wants to exclude NAs + # and if there are any NAs. scuttle::sumCountsAcrossFeatures cannot handle + # NAs so if user wants to exclude them, we use own implementation. 
+ FUN <- if( na.rm && anyNA(assays[[1]])) .sum_counts_accross_features_na else + sumCountsAcrossFeatures + # Agglomerate assays + assays <- lapply(assays, FUN, average = average, ids = f, BPPARAM = BPPARAM) + # Transpose back to original orientation + if( by == 2L ){ + assays <- lapply(assays, function(mat) t(mat)) + } + # Convert to SimpleList + assays <- assays |> SimpleList() + + # Now we have agglomerated assays, but TreeSE has still the original form. + # We take specified rows/columns from the TreeSE. + idx <- .get_element_pos(f, archetype = archetype) + if( by == 1L ){ + x <- x[idx, ] + } else{ + x <- x[ , idx] + } + + # Add assays back to TreeSE assays(x, withDimnames = FALSE) <- assays - # Change rownames to group names - rownames(x) <- rownames(assays[[1]]) - x + # Change row/colnames. Currently, they have same names as in original data + # but just certain rows. Change them to represent groups + x <- rownames_ass_FUN(x, rownames_FUN(assays[[1]])) + return(x) } -#' @importFrom scuttle sumCountsAcrossFeatures +# This function works similarly to scuttle::sumCountsAcrossFeatures but this +# excludes NAs from the data. The scuttle function cannot handle NAs. +#' @importFrom DelayedArray DelayedArray type rowsum +.sum_counts_accross_features_na <- function(x, average, ids, ...){ + # Which cell is not NA? + is_not_na <- !is.na(x) + type(is_not_na) <- "integer" + # Aggregate data to certain groups + x <- rowsum(x, ids, na.rm = TRUE) + # Calculate average if specified + if( average ){ + x <- x/rowsum(is_not_na, ids) + } + return(x) +} + +# This functions checks if assay has negative or binary values. It does not +# make sense to sum them, so we give warning to user. 
.check_assays_for_merge <- function(assay.type, assay){ # Check if assays include binary or negative values if( all(assay == 0 | assay == 1) ){ - warning("'",assay.type,"'", " includes binary values.", + warning("'", assay.type, "'", " includes binary values.", "\nAgglomeration of it might lead to meaningless values.", "\nCheck the assay, and consider doing transformation again", "manually with agglomerated data.", call. = FALSE) } if( !all( assay >= 0 | is.na(assay) ) ){ - warning("'",assay.type,"'", " includes negative values.", + warning("'", assay.type, "'", " includes negative values.", "\nAgglomeration of it might lead to meaningless values.", "\nCheck the assay, and consider doing transformation again", "manually with agglomerated data.", call. = FALSE) } -} - -#' @importFrom S4Vectors SimpleList -#' @importFrom scuttle summarizeAssayByGroup -.merge_cols <- function(x, f, archetype = 1L, ...){ - # input check - if( .is_a_string(f) && f %in% colnames(colData(x)) ){ - f <- colData(x)[[ f ]] - } - f <- .norm_f(ncol(x), f, "columns", ...) - - if(length(levels(f)) == ncol(x)){ - return(x) - } - archetype <- .norm_archetype(f, archetype) - # merge col data - element_pos <- .get_element_pos(f, archetype = archetype) - col_data <- colData(x)[element_pos,,drop=FALSE] - # merge assays - assays <- assays(x) - mapply(.check_assays_for_merge, names(assays), assays) - FUN <- function(mat, ...){ - temp <- scuttle::summarizeAssayByGroup(mat, - statistics = "sum", - ...) 
- # "sum" includes agglomerated (summed up) data - mat <- assay(temp, "sum") - return(mat) - } - assays <- S4Vectors::SimpleList(lapply(assays, - FUN = FUN, - ids = f, - subset.row = NULL, - subset.col = NULL, - ...)) - names(assays) <- names(assays(x)) - # merge to result - x <- x[,.get_element_pos(f, archetype = archetype)] - assays(x, withDimnames = FALSE) <- assays - # Change colnames to group names - colnames(x) <- colnames(assays[[1]]) - x + return(assay) } #' @importFrom Biostrings DNAStringSetList @@ -203,7 +224,7 @@ refSeq <- referenceSeq(x) } # - x <- .merge_rows(x, f, archetype = 1L, ...) + x <- .merge_rows_or_cols(x, f, by = 1L, archetype = 1L, ...) # optionally merge rowTree if( update.tree ){ x <- .agglomerate_trees(x, 1, ...) @@ -221,7 +242,7 @@ stop("'update.tree' must be TRUE or FALSE.", call. = FALSE) } # - x <- .merge_cols(x, f, archetype = 1L, ...) + x <- .merge_rows_or_cols(x, f, by = 2L, archetype = 1L, ...) # optionally merge colTree if( update.tree ){ x <- .agglomerate_trees(x, 2, ...) diff --git a/R/utils.R b/R/utils.R index fd7c52b3d..dee2dd391 100644 --- a/R/utils.R +++ b/R/utils.R @@ -521,13 +521,12 @@ ################################################################################ # internal wrappers for agglomerateByRank/agglomerateByVariable -.merge_features <- function(x, merge.by, ...) { +.merge_features <- function(x, merge.by = rank, rank = NULL, ...) { # Check if merge.by parameter belongs to taxonomyRanks - if (is.character(merge.by) && length(merge.by) == 1 && - merge.by %in% taxonomyRanks(x)) { + if( .is_a_string(merge.by) && merge.by %in% taxonomyRanks(x) ){ # Merge using agglomerateByRank x <- agglomerateByRank(x, rank = merge.by, ...) - } else { + } else if( !is.null(merge.by) ){ # Merge using agglomerateByVariable x <- agglomerateByVariable(x, by = "rows", group = merge.by, ...) 
} diff --git a/man/agglomerate-methods.Rd b/man/agglomerate-methods.Rd index 462928742..78e3f47db 100644 --- a/man/agglomerate-methods.Rd +++ b/man/agglomerate-methods.Rd @@ -3,12 +3,12 @@ \name{agglomerate-methods} \alias{agglomerate-methods} \alias{agglomerateByRank} -\alias{agglomerateByVariable} +\alias{agglomerateByRank,TreeSummarizedExperiment-method} +\alias{agglomerateByRank,SingleCellExperiment-method} \alias{agglomerateByRank,SummarizedExperiment-method} -\alias{agglomerateByVariable,SummarizedExperiment-method} +\alias{agglomerateByVariable} \alias{agglomerateByVariable,TreeSummarizedExperiment-method} -\alias{agglomerateByRank,SingleCellExperiment-method} -\alias{agglomerateByRank,TreeSummarizedExperiment-method} +\alias{agglomerateByVariable,SummarizedExperiment-method} \alias{agglomerateByRanks} \alias{agglomerateByRanks,SummarizedExperiment-method} \alias{agglomerateByRanks,SingleCellExperiment-method} @@ -17,21 +17,37 @@ \alias{unsplitByRanks} \alias{unsplitByRanks,SingleCellExperiment-method} \alias{unsplitByRanks,TreeSummarizedExperiment-method} -\title{Agglomerate or merge data using taxonomic information} +\title{Agglomerate data using taxonomic information or other grouping} \usage{ agglomerateByRank(x, ...) -agglomerateByVariable(x, ...) +\S4method{agglomerateByRank}{TreeSummarizedExperiment}( + x, + rank = taxonomyRanks(x)[1], + update.tree = agglomerateTree, + agglomerate.tree = agglomerateTree, + agglomerateTree = FALSE, + ... +) + +\S4method{agglomerateByRank}{SingleCellExperiment}( + x, + rank = taxonomyRanks(x)[1], + altexp = NULL, + altexp.rm = strip_altexp, + strip_altexp = TRUE, + ... +) \S4method{agglomerateByRank}{SummarizedExperiment}( x, rank = taxonomyRanks(x)[1], - na.rm = TRUE, + empty.rm = TRUE, empty.fields = c(NA, "", " ", "\\t", "-", "_"), ... ) -\S4method{agglomerateByVariable}{SummarizedExperiment}(x, by, group = f, f, ...) +agglomerateByVariable(x, ...) 
\S4method{agglomerateByVariable}{TreeSummarizedExperiment}( x, @@ -43,21 +59,7 @@ agglomerateByVariable(x, ...) ... ) -\S4method{agglomerateByRank}{SingleCellExperiment}( - x, - ..., - altexp = NULL, - altexp.rm = strip_altexp, - strip_altexp = TRUE -) - -\S4method{agglomerateByRank}{TreeSummarizedExperiment}( - x, - ..., - update.tree = agglomerateTree, - agglomerate.tree = agglomerateTree, - agglomerateTree = FALSE -) +\S4method{agglomerateByVariable}{SummarizedExperiment}(x, by, group = f, f, ...) agglomerateByRanks(x, ...) @@ -112,11 +114,29 @@ unsplitByRanks(x, ...) \code{SummarizedExperiment} objects and other functions. See \code{\link[=agglomerate-methods]{agglomerateByRank}} for more details.} -\item{rank}{\code{Character scalar}. Defines a taxonomic rank. Must be a value of -\code{taxonomyRanks()} function.} +\item{rank}{\code{Character scalar}. Defines a taxonomic rank. Must be a +value of \code{taxonomyRanks()} function.} + +\item{update.tree}{\code{Logical scalar}. Should +\code{rowTree()} also be merged? (Default: \code{FALSE})} + +\item{agglomerate.tree}{Deprecated. Use \code{update.tree} instead.} + +\item{agglomerateTree}{Deprecated. Use \code{update.tree} instead.} + +\item{altexp}{\code{Character scalar} or \code{integer scalar}. +Specifies an alternative experiment containing the input data.} + +\item{altexp.rm}{\code{Logical scalar}. Should alternative +experiments be removed prior to agglomeration? This prevents too many +nested alternative experiments by default. (Default: +\code{TRUE})} + +\item{strip_altexp}{Deprecated. Use \code{altexp.rm} instead.} -\item{na.rm}{\code{Logical scalar}. Should NA values be omitted when calculating -prevalence? (Default: \code{TRUE})} +\item{empty.rm}{\code{Logical scalar}. Defines whether rows including +\code{empty.fields} in specified \code{rank} will be excluded. +(Default: \code{TRUE})} \item{empty.fields}{\code{Character vector}. Defines which values should be regarded as empty. 
(Default: \code{c(NA, "", " ", "\t")}). They will be @@ -136,28 +156,14 @@ returned unchanged.} \item{f}{Deprecated. Use \code{group} instead.} -\item{update.tree}{\code{Logical scalar}. Should -\code{rowTree()} also be merged? (Default: \code{FALSE})} - \item{mergeTree}{Deprecated. Use \code{update.tree} instead.} -\item{altexp}{\code{Character scalar} or \code{integer scalar}. -Specifies an alternative experiment containing the input data.} - -\item{altexp.rm}{\code{Logical scalar}. Should alternative -experiments be removed prior to agglomeration? This prevents too many -nested alternative experiments by default. (Default: -\code{TRUE})} - -\item{strip_altexp}{Deprecated. Use \code{altexp.rm} instead.} - -\item{agglomerate.tree}{Deprecated. Use \code{update.tree} instead.} - -\item{agglomerateTree}{Deprecated. Use \code{update.tree} instead.} - \item{ranks}{\code{Character vector}. Defines taxonomic ranks. Must all be values of \code{taxonomyRanks()} function.} +\item{na.rm}{\code{Logical scalar}. Should NA values be omitted? +(Default: \code{TRUE})} + \item{as.list}{\code{Logical scalar}. Should the list of \code{SummarizedExperiment} objects be returned by the function \code{agglomerateByRanks} as a SimpleList or stored in altExps? @@ -194,14 +200,6 @@ and any existing \code{rowTree} is dropped as well, since existing Agglomeration functions can be used to sum-up data based on specific criteria such as taxonomic ranks, variables or prevalence. -\code{agglomerateByRanks} takes a \code{SummarizedExperiment}, splits it along the -taxonomic ranks, aggregates the data per rank, converts the input to a -\code{SingleCellExperiment} objects and stores the aggregated data as -alternative experiments. \code{unsplitByRanks} takes these alternative -experiments and flattens them again into a single -\code{SummarizedExperiment}. 
-} -\details{ \code{agglomerateByRank} can be used to sum up data based on associations with certain taxonomic ranks, as defined in \code{rowData}. Only available \code{\link{taxonomyRanks}} can be used. @@ -216,13 +214,18 @@ retained as defined by \code{archetype}. agglomerated, i.e. summed up. If the assay contains values other than counts or absolute values, this can lead to meaningless values being produced. +\code{agglomerateByRanks} takes a \code{SummarizedExperiment}, splits it along the +taxonomic ranks, aggregates the data per rank, converts the input to a +\code{SingleCellExperiment} objects and stores the aggregated data as +alternative experiments. \code{unsplitByRanks} takes these alternative +experiments and flattens them again into a single +\code{SummarizedExperiment}. +} +\details{ Agglomeration sums up the values of assays at the specified taxonomic level. -With -certain assays, e.g. those that include binary or negative values, this -summing -can produce meaningless values. In those cases, consider performing -agglomeration -first, and then applying the transformation afterwards. +With certain assays, e.g. those that include binary or negative values, this +summing can produce meaningless values. In those cases, consider performing +agglomeration first, and then applying the transformation afterwards. \code{agglomerateByVariable} works similarly to \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}}. 
@@ -277,9 +280,9 @@ tse <- transformAssay(GlobalPatterns, method = "pa") tse <- agglomerateByRank(tse, rank = "Genus") tse <- transformAssay(tse, method = "pa") -# removing empty labels by setting na.rm = TRUE +# Removing empty labels by setting empty.rm = TRUE sum(is.na(rowData(GlobalPatterns)$Family)) -x3 <- agglomerateByRank(GlobalPatterns, rank="Family", na.rm = TRUE) +x3 <- agglomerateByRank(GlobalPatterns, rank="Family", empty.rm = TRUE) nrow(x3) # different from x2 # Because all the rownames are from the same rank, rownames do not include @@ -291,20 +294,19 @@ rownames(x3) <- getTaxonomyLabels(x3, with.rank = TRUE) print(rownames(x3[1:3,])) # use 'empty.ranks.rm' to remove columns that include only NAs -x4 <- agglomerateByRank(GlobalPatterns, rank="Phylum", - empty.ranks.rm = TRUE) +x4 <- agglomerateByRank( + GlobalPatterns, rank="Phylum", empty.ranks.rm = TRUE) head(rowData(x4)) -# If the assay contains NAs, you might want to consider replacing them, +# If the assay contains NAs, you might want to specify na.rm=TRUE, # since summing-up NAs lead to NA x5 <- GlobalPatterns # Replace first value with NA assay(x5)[1,1] <- NA x6 <- agglomerateByRank(x5, "Kingdom") head( assay(x6) ) -# Replace NAs with 0. This is justified when we are summing-up counts. -assay(x5)[ is.na(assay(x5)) ] <- 0 -x6 <- agglomerateByRank(x5, "Kingdom") +# Use na.rm=TRUE +x6 <- agglomerateByRank(x5, "Kingdom", na.rm = TRUE) head( assay(x6) ) ## Look at enterotype dataset... 
@@ -318,18 +320,19 @@ taxonomyRanks(enterotype) data(esophagus) esophagus plot(rowTree(esophagus)) -# get a factor for merging +# Get a factor for merging f <- factor(regmatches(rownames(esophagus), - regexpr("^[0-9]*_[0-9]*",rownames(esophagus)))) -merged <- agglomerateByVariable(esophagus, by = "rows", f, - update.tree = TRUE) + regexpr("^[0-9]*_[0-9]*",rownames(esophagus)))) +merged <- agglomerateByVariable( + esophagus, by = "rows", f, update.tree = TRUE) plot(rowTree(merged)) # data(GlobalPatterns) GlobalPatterns -merged <- agglomerateByVariable(GlobalPatterns, by = "cols", - colData(GlobalPatterns)$SampleType) +merged <- agglomerateByVariable( + GlobalPatterns, by = "cols", colData(GlobalPatterns)$SampleType) merged + data(GlobalPatterns) # print the available taxonomic ranks taxonomyRanks(GlobalPatterns) diff --git a/man/agglomerateByPrevalence.Rd b/man/agglomerateByPrevalence.Rd index ec3247212..26495b8af 100644 --- a/man/agglomerateByPrevalence.Rd +++ b/man/agglomerateByPrevalence.Rd @@ -32,8 +32,8 @@ agglomerateByPrevalence(x, ...) \code{SummarizedExperiment} objects and other functions. See \code{\link[=agglomerate-methods]{agglomerateByRank}} for more details.} -\item{rank}{\code{Character scalar}. Defines a taxonomic rank. Must be a value of -\code{taxonomyRanks()} function.} +\item{rank}{\code{Character scalar}. Defines a taxonomic rank. Must be a +value of \code{taxonomyRanks()} function.} \item{other.name}{\code{Character scalar}. Used as the label for the summary of non-prevalent taxa. (default: \code{"Other"})} diff --git a/man/getPrevalence.Rd b/man/getPrevalence.Rd index 370cb3a6d..a7414df2f 100644 --- a/man/getPrevalence.Rd +++ b/man/getPrevalence.Rd @@ -115,8 +115,6 @@ getPrevalentAbundance( \item If \code{!is.null(rank)} arguments are passed on to \code{\link[=agglomerate-methods]{agglomerateByRank}}. See \code{\link[=agglomerate-methods]{?agglomerateByRank}} for more details. 
-Note that you can specify whether to remove empty ranks with -\code{agg.na.rm} instead of \code{na.rm}. (default: \code{FALSE}) \item for \code{getPrevalent}, \code{getRare}, \code{subsetByPrevalent} and \code{subsetByRare} additional parameters passed to @@ -126,8 +124,8 @@ and \code{subsetByRare} additional parameters passed to \code{getPrevalent} }} -\item{detection}{\code{Numeric scalar}. Detection threshold for absence/presence. -If \code{as_relative = FALSE}, +\item{detection}{\code{Numeric scalar}. Detection threshold for +absence/presence. If \code{as_relative = FALSE}, it sets the counts threshold for a taxon to be considered present. If \code{as_relative = TRUE}, it sets the relative abundance threshold for a taxon to be considered present. (Default: \code{0})} @@ -140,16 +138,16 @@ detection and prevalence cutoffs be included? (Default: \code{FALSE})} \item{sort}{\code{Logical scalar}. Should the result be sorted by prevalence? (Default: \code{FALSE})} -\item{na.rm}{\code{Logical scalar}. Should NA values be omitted when calculating -prevalence? (Default: \code{TRUE})} +\item{na.rm}{\code{Logical scalar}. Should NA values be omitted? +(Default: \code{TRUE})} \item{assay.type}{\code{Character scalar}. Specifies which assay to use for calculation. (Default: \code{"counts"})} \item{assay_name}{Deprecated. Use \code{assay.type} instead.} -\item{rank}{\code{Character scalar}. Defines a taxonomic rank. Must be a value of -\code{taxonomyRanks()} function.} +\item{rank}{\code{Character scalar}. Defines a taxonomic rank. Must be a +value of \code{taxonomyRanks()} function.} \item{prevalence}{Prevalence threshold (in 0 to 1). The required prevalence is strictly greater by default. To include the diff --git a/man/summaries.Rd b/man/summaries.Rd index 5919a0436..7874cd78a 100644 --- a/man/summaries.Rd +++ b/man/summaries.Rd @@ -55,8 +55,8 @@ assay used in calculation. (Default: \code{"counts"})} \item{assay_name}{Deprecated. 
Use \code{assay.type} instead.} -\item{na.rm}{\code{Logical scalar}. Should NA values be omitted when calculating -prevalence? (Default: \code{TRUE})} +\item{na.rm}{\code{Logical scalar}. Should NA values be omitted? +(Default: \code{TRUE})} \item{...}{Additional arguments passed on to \code{agglomerateByRank()} when \code{rank} is specified for \code{summarizeDominance}.} diff --git a/man/taxonomy-methods.Rd b/man/taxonomy-methods.Rd index f125981fa..3617c6e2b 100644 --- a/man/taxonomy-methods.Rd +++ b/man/taxonomy-methods.Rd @@ -71,8 +71,8 @@ IdTaxaToDataFrame(from) \arguments{ \item{x}{\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}.} -\item{rank}{\code{Character scalar}. Defines a taxonomic rank. Must be a value of -\code{taxonomyRanks()} function.} +\item{rank}{\code{Character scalar}. Defines a taxonomic rank. Must be a +value of \code{taxonomyRanks()} function.} \item{empty.fields}{\code{Character vector}. Defines which values should be regarded as empty. (Default: \code{c(NA, "", " ", "\t")}). 
They will be diff --git a/tests/testthat/test-2merge.R b/tests/testthat/test-2merge.R index 7189d4ec3..48e49139b 100644 --- a/tests/testthat/test-2merge.R +++ b/tests/testthat/test-2merge.R @@ -45,20 +45,20 @@ test_that("merge", { actual <- mia:::.get_element_pos(f, archetype = c(2,1)) expect_equal(actual,c(a = 2, b = 4)) - # .merge_rows + # .merge_rows_or_cols mat <- matrix(1:60, nrow = 6) gr <- GRanges("chr1",rep("1-6",6)) df <- DataFrame(n = c(1:6)) mcols(gr) <- df grl <- splitAsList(gr,1:6) - expect_error(mia:::.merge_rows(), + expect_error(mia:::.merge_rows_or_cols(by = 1L), 'argument "f" is missing') x <- SummarizedExperiment(assays = list(mat = mat)) xr <- SummarizedExperiment(assays = list(mat = mat), rowRanges = gr) xrl <- SummarizedExperiment(assays = list(mat = mat), rowRanges = unname(grl)) - expect_error(mia:::.merge_rows(x), + expect_error(mia:::.merge_rows_or_cols(x, by = 1L), 'argument "f" is missing') FUN_check_x <- function(x,archetype=1){ actual <- agglomerateByVariable(x, by = "rows", f, archetype) @@ -84,9 +84,124 @@ test_that("merge", { } lapply(list(xtse),FUN_check_x) lapply(list(xtse),FUN_check_x,archetype=2) + + # Check that average works as expected. average parameter controls whether + # to calculate mean or sum. 
Check that mean is correctly calculated when + # there are NAs + # + # Calculate average and sum for each row group + summary_FUN_rows <- function(x, col.var){ + # Loop through groups and calculate statistics + groups <- unique(rowData(x)[[col.var]]) |> sort() + res <- lapply(groups, function(group) { + mat_sub <- assay(x[rowData(x)[[col.var]] == group, ]) + list( + sum = colSums(mat_sub, na.rm = FALSE), + sum_na = colSums(mat_sub, na.rm = TRUE), + mean = colMeans(mat_sub, na.rm = FALSE), + mean_na = colMeans(mat_sub, na.rm = TRUE) + ) + }) + # Combine results for each statistic across groups + res <- lapply(c("sum", "sum_na", "mean", "mean_na"), function(stat) { + do.call(rbind, lapply(res, `[[`, stat)) + }) + names(res) <- c("sum", "sum_na", "mean", "mean_na") + return(res) + } + # Generate data + tse <- mockSCE() + rowData(tse)[["group"]] <- sample(LETTERS, nrow(tse), replace = TRUE) + colData(tse)[["group"]] <- sample(LETTERS, ncol(tse), replace=TRUE) + # Create a data with NAs + n_value <- nrow(tse)*ncol(tse) + assay(tse)[c(1, 5, 3, 6)] <- NA + # Test with NAs + res_sum <- agglomerateByVariable(tse, by = 1, group = "group", average = FALSE, na.rm = FALSE) + res_sum_na <- agglomerateByVariable(tse, by = 1, group = "group", average = FALSE, na.rm = TRUE) + res_mean <- agglomerateByVariable(tse, by = 1, group = "group", average = TRUE, na.rm = FALSE) + res_mean_na <- agglomerateByVariable(tse, by = 1, group = "group", average = TRUE, na.rm = TRUE) + ref <- summary_FUN_rows(tse, "group") + # + expect_equal(assay(res_sum), ref[["sum"]], check.attributes = FALSE) + expect_equal(assay(res_sum_na), ref[["sum_na"]], check.attributes = FALSE) + expect_equal(assay(res_mean), ref[["mean"]], check.attributes = FALSE) + expect_equal(assay(res_mean_na), ref[["mean_na"]], check.attributes = FALSE) + # Calculate average and sum for each column group + summary_FUN_cols <- function(x, col.var){ + # Loop through groups and calculate statistics + groups <- 
unique(colData(x)[[col.var]]) |> sort() + res <- lapply(groups, function(group){ + mat_sub <- assay(x[, colData(x)[[col.var]] == group ]) + list( + sum = rowSums(mat_sub, na.rm = FALSE), + sum_na = rowSums(mat_sub, na.rm = TRUE), + mean = rowMeans(mat_sub, na.rm = FALSE), + mean_na = rowMeans(mat_sub, na.rm = TRUE) + ) + }) + # Combine results for each statistic across groups + res <- lapply(c("sum", "sum_na", "mean", "mean_na"), function(stat){ + do.call(cbind, lapply(res, `[[`, stat)) + }) + names(res) <- c("sum", "sum_na", "mean", "mean_na") + return(res) + } + # Test with NAs + res_sum <- agglomerateByVariable(tse, by = 2, group = "group", average = FALSE, na.rm = FALSE) + res_sum_na <- agglomerateByVariable(tse, by = 2, group = "group", average = FALSE, na.rm = TRUE) + res_mean <- agglomerateByVariable(tse, by = 2, group = "group", average = TRUE, na.rm = FALSE) + res_mean_na <- agglomerateByVariable(tse, by = 2, group = "group", average = TRUE, na.rm = TRUE) + ref <- summary_FUN_cols(tse, "group") + # + expect_equal(assay(res_sum), ref[["sum"]], check.attributes = FALSE) + expect_equal(assay(res_sum_na), ref[["sum_na"]], check.attributes = FALSE) + expect_equal(assay(res_mean), ref[["mean"]], check.attributes = FALSE) + expect_equal(assay(res_mean_na), ref[["mean_na"]], check.attributes = FALSE) + + # Check that agglomerateByRank and agglomerateByVariable work correctly + # with na.rm + data(GlobalPatterns, package="mia") + tse <- GlobalPatterns + col_idx <- sample(seq_len(ncol(tse)), 1) + row_idx <- sample(seq_len(nrow(tse)), 1) + tse_mod <- tse + assay(tse_mod)[row_idx, col_idx] <- NA + group <- colData(tse)[col_idx, "SampleType"] + # na.rm = FALSE + tse_sub <- agglomerateByVariable(tse_mod, by = "cols", group = "SampleType", na.rm = FALSE) + test_mat <- tse_sub |> assay() + # na.rm = TRUE + tse_sub <- agglomerateByVariable(tse_mod, by = "cols", group = "SampleType", na.rm = TRUE) + ref_mat <- tse_sub |> assay() + group_idx <- which(colnames(tse_sub) == 
group) + expect_true( is.na(test_mat[row_idx, group_idx]) ) + expect_true( !is.na(ref_mat[row_idx, group_idx]) ) + expect_equal(test_mat[-row_idx, -group_idx], ref_mat[-row_idx, -group_idx]) + # + # na.rm = FALSE + group <- rowData(tse)[row_idx, "Kingdom"] + tse_sub <- agglomerateByVariable(tse_mod, by = "rows", group = "Kingdom", na.rm = FALSE) + test_mat <- tse_sub |> assay() + # na.rm = TRUE + tse_sub <- agglomerateByVariable(tse_mod, by = "rows", group = "Kingdom", na.rm = TRUE) + ref_mat <- tse_sub |> assay() + group_idx <- which(rownames(tse_sub) == group) + expect_true( is.na(test_mat[group_idx, col_idx]) ) + expect_true( !is.na(ref_mat[group_idx, col_idx]) ) + expect_equal(test_mat[-group_idx, -col_idx], ref_mat[-group_idx, -col_idx]) + # + # na.rm = FALSE + tse_sub <- agglomerateByRank(tse_mod, rank = "Kingdom", na.rm = FALSE) + test_mat2 <- tse_sub |> assay() + # na.rm = TRUE + tse_sub <- agglomerateByRank(tse_mod, rank = "Kingdom", na.rm = TRUE) + ref_mat2 <- tse_sub |> assay() + expect_equal(test_mat, test_mat2) + expect_equal(ref_mat, ref_mat2) + # Check multiple rowTrees data(esophagus, package="mia") - data(GlobalPatterns, package="mia") # Add arbitrary groups rowData(esophagus)$group <- c(rep(c("A", "B", "C"), each = nrow(esophagus)/3), rep("A", nrow(esophagus)-round(nrow(esophagus)/3)*3) ) diff --git a/tests/testthat/test-3agglomerate.R b/tests/testthat/test-3agglomerate.R index cf029ab06..3c03151a6 100644 --- a/tests/testthat/test-3agglomerate.R +++ b/tests/testthat/test-3agglomerate.R @@ -27,27 +27,27 @@ test_that("agglomerate", { expect_equal(assays(actual)$mat[2,1],c(b = 36)) expect_equal(assays(actual)$mat[3,1],c(c = 24)) # - expect_error(agglomerateByRank(xtse,"",na.rm=FALSE), + expect_error(agglomerateByRank(xtse,"",empty.rm=FALSE), "'rank' must be a non-empty single character value") - expect_error(agglomerateByRank(xtse,"Family",na.rm=""), - "'na.rm' must be TRUE or FALSE") + expect_error(agglomerateByRank(xtse,"Family",empty.rm=""), + 
"'empty.rm' must be TRUE or FALSE") expect_error( - agglomerateByRank(xtse,"Family",na.rm=FALSE,update.tree=""), + agglomerateByRank(xtse,"Family",empty.rm=FALSE,update.tree=""), "'update.tree' must be TRUE or FALSE") xtse2 <- xtse rowData(xtse2) <- NULL - expect_error(agglomerateByRank(xtse2,"Family",na.rm=FALSE), + expect_error(agglomerateByRank(xtse2,"Family",empty.rm=FALSE), "taxonomyData needs to be populated") # - actual <- agglomerateByRank(xtse,"Family",na.rm=FALSE) + actual <- agglomerateByRank(xtse,"Family",empty.rm=FALSE) expect_equivalent(rowData(actual),rowData(actual_family)) - actual <- agglomerateByRank(xtse,"Phylum",na.rm=FALSE) + actual <- agglomerateByRank(xtse,"Phylum",empty.rm=FALSE) expect_equivalent(rowData(actual),rowData(actual_phylum)) # - actual <- agglomerateByRank(xtse,"Family", ignore.taxonomy = FALSE, na.rm = TRUE) + actual <- agglomerateByRank(xtse,"Family", ignore.taxonomy = FALSE, empty.rm = TRUE) expect_equal(dim(actual),c(6,10)) expect_equal(rowData(actual)$Family,c("c","d","e","f","g","h")) - actual <- agglomerateByRank(xtse,"Family", ignore.taxonomy = FALSE, na.rm = FALSE) # the default + actual <- agglomerateByRank(xtse,"Family", ignore.taxonomy = FALSE, empty.rm = FALSE) expect_equal(dim(actual),c(8,10)) expect_equal(rowData(actual)$Family,c("c","d","e","f","g","h",NA,NA)) actual <- agglomerateByRank(xtse,"Phylum") @@ -63,22 +63,22 @@ test_that("agglomerate", { data(enterotype, package="mia") expect_equal(length(unique(rowData(enterotype)[,"Genus"])), nrow(agglomerateByRank(enterotype,"Genus", ignore.taxonomy = FALSE, - na.rm = FALSE))) + empty.rm = FALSE))) # agglomeration in all its forms data(GlobalPatterns, package="mia") se <- GlobalPatterns actual <- agglomerateByRank(se, rank = "Family", - ignore.taxonomy = FALSE, na.rm = FALSE) + ignore.taxonomy = FALSE, empty.rm = FALSE) expect_equal(dim(actual),c(603,26)) expect_equal(length(rowTree(actual)$tip.label), length(rowTree(se)$tip.label)) actual <- agglomerateByRank(se, 
rank = "Family", - ignore.taxonomy = FALSE, na.rm = FALSE, update.tree = TRUE) + ignore.taxonomy = FALSE, empty.rm = FALSE, update.tree = TRUE) expect_equal(dim(actual),c(603,26)) expect_equal(length(rowTree(actual)$tip.label), 603) actual <- agglomerateByRank(se, rank = "Family", - ignore.taxonomy = FALSE, na.rm = FALSE, update.tree = TRUE) + ignore.taxonomy = FALSE, empty.rm = FALSE, update.tree = TRUE) expect_equal(dim(actual),c(603,26)) expect_equal(length(rowTree(actual)$tip.label), nrow(actual)) # Test that warning occurs when assay contian binary or negative values @@ -92,30 +92,30 @@ test_that("agglomerate", { data(GlobalPatterns, package="mia") tse <- GlobalPatterns - # Check that na.rm works + # Check that empty.rm works # Get all phyla all_phyla <- unique( rowData(tse)$Phylum ) - # When na.rm = FALSE, then phyla should also include NA --> one extra row - test0 <- agglomerateByVariable(tse, by = 1, group = "Phylum", na.rm = FALSE) - test1 <- agglomerateByRank(tse, rank = "Phylum", na.rm = FALSE) + # When empty.rm = FALSE, then phyla should also include NA --> one extra row + test0 <- agglomerateByVariable(tse, by = 1, group = "Phylum", empty.rm = FALSE) + test1 <- agglomerateByRank(tse, rank = "Phylum", empty.rm = FALSE) # Test that dimentionality is the same for merging object by agglomerateByRank # and agglomerateByVariable. expect_equal(nrow(test0), length(all_phyla)) expect_equal(nrow(test1), length(all_phyla)) - # When na.rm = TRUE, there should be as many rows as there are non-NA phyla - test0 <- agglomerateByVariable(tse, by = 1, group = "Phylum", na.rm = TRUE) - test1 <- agglomerateByRank(tse, rank = "Phylum", na.rm = TRUE) + # When empty.rm = TRUE, there should be as many rows as there are non-NA phyla + test0 <- agglomerateByVariable(tse, by = 1, group = "Phylum", empty.rm = TRUE) + test1 <- agglomerateByRank(tse, rank = "Phylum", empty.rm = TRUE) - # Test that dimentionality is the same when NA values are removed. 
+ # Test that dimensionality is the same when NA values are removed. expect_equal(nrow(test0), length( all_phyla[!is.na(all_phyla)] )) expect_equal(nrow(test1), length( all_phyla[!is.na(all_phyla)] )) # Check that there are more taxa when agglomeration is to "Species" level - test0 <- agglomerateByVariable(tse, by = 1, group = "Species", na.rm = FALSE) - test1 <- agglomerateByRank(tse, rank = "Species", na.rm = FALSE) + test0 <- agglomerateByVariable(tse, by = 1, group = "Species", empty.rm = FALSE) + test1 <- agglomerateByRank(tse, rank = "Species", empty.rm = FALSE) expect_equal(nrow(test0), 945) expect_equal(nrow(test1), 2307) @@ -143,9 +143,9 @@ test_that("agglomerate", { expect_equal(rd1[, cols], rd2[, cols]) expect_true( ncol(rd1) > ncol(rd2) ) # Test that make.unique work - uniq <- agglomerateByRank(tse, rank = "Species", na.rm = FALSE) + uniq <- agglomerateByRank(tse, rank = "Species", empty.rm = FALSE) not_uniq <- agglomerateByRank( - tse, rank = "Species", make.unique = FALSE, na.rm = FALSE) + tse, rank = "Species", make.unique = FALSE, empty.rm = FALSE) expect_true( !any( duplicated(rownames(uniq)) ) ) expect_true( any( duplicated(rownames(not_uniq)) ) ) diff --git a/tests/testthat/test-5dominantTaxa.R b/tests/testthat/test-5dominantTaxa.R index f415cd6e9..ee47df59b 100644 --- a/tests/testthat/test-5dominantTaxa.R +++ b/tests/testthat/test-5dominantTaxa.R @@ -21,8 +21,6 @@ test_that("getDominant", { expect_equal(getDominant(tse)[1:15], exp.vals.one) # Test at taxonomic level for values are passed to agglomerateRanks - getDominant(tse, rank = "Genus", na.rm = FALSE) - exp.vals.two <- c("Genus:CandidatusSolibacter", "Genus:MC18", "Class:Chloracidobacteria", "Genus:Bacteroides", "Genus:Bacteroides", "Genus:Streptococcus", @@ -31,21 +29,17 @@ test_that("getDominant", { "Genus:Dolichospermum", "Family:ACK-M1", "Order:Stramenopiles","Order:Stramenopiles","Order:Stramenopiles") names(exp.vals.two) <- exp.names.one - expect_equal(getDominant(tse, - rank = 
"Genus", - ignore.taxonomy = FALSE, - na.rm = FALSE)[1:15], - exp.vals.two) + expect_equal( + getDominant(tse, rank = "Genus", ignore.taxonomy = FALSE, empty.rm = FALSE)[1:15], + exp.vals.two) # Check if DominantTaxa is added to coldata - expect_equal(colData(addDominant(tse, - name="dominant"))$dominant[1:15], - exp.vals.one) - expect_equal(colData(addDominant(tse, - rank = "Genus", - na.rm = FALSE, - name="dominant"))$dominant[1:15], - exp.vals.two) + expect_equal( + colData(addDominant(tse, name="dominant"))$dominant[1:15], + exp.vals.one) + expect_equal( + colData(addDominant(tse,rank = "Genus", empty.rm = FALSE, name="dominant"))$dominant[1:15], + exp.vals.two) # Check if DominantTaxa is added when factor is passed exp.vals.three <- c( @@ -57,10 +51,9 @@ test_that("getDominant", { names(exp.vals.three) <- exp.names.one test <- tse rowData(test)$group <- rowData(tse)$Genus - expect_equal(colData(addDominant(test, rank = "group", na.rm = TRUE, - name="dominant"))$dominant[1:15], - exp.vals.three) - + expect_equal( + colData(addDominant(test, rank = "group", empty.rm = TRUE, name = "dominant"))$dominant[1:15], + exp.vals.three) tse1 <- tse # Now data contains 2 dominant taxa in one sample diff --git a/tests/testthat/test-5prevalence.R b/tests/testthat/test-5prevalence.R index a77ea05d2..bec4a4185 100644 --- a/tests/testthat/test-5prevalence.R +++ b/tests/testthat/test-5prevalence.R @@ -106,16 +106,16 @@ test_that("getPrevalence", { remove <- c(15, 200) assay(tse, "counts")[remove, ] <- NA # Check that agglomeration works - tse_agg <- agglomerateByRank(tse, ignore.taxonomy = FALSE, na.rm = FALSE, rank = rank) + tse_agg <- agglomerateByRank(tse, ignore.taxonomy = FALSE, empty.rm = FALSE, rank = rank) expect_warning(ref <- getPrevalence(tse_agg, na.rm = FALSE)) - expect_warning(res <- getPrevalence(tse, rank = "Genus", agg.na.rm = FALSE)) + expect_warning(res <- getPrevalence(tse, rank = "Genus", empty.rm = FALSE)) expect_true( all(res == ref, na.rm = TRUE) ) # 
tse_agg <- agglomerateByRank( - tse, ignore.taxonomy = FALSE, na.rm = TRUE, rank = rank) + tse, ignore.taxonomy = FALSE, empty.rm = TRUE, rank = rank) ref <- getPrevalence(tse_agg, na.rm = TRUE) res <- getPrevalence( - tse, na.rm = TRUE, rank = "Genus", agg.na.rm = TRUE) + tse, na.rm = TRUE, rank = "Genus", empty.rm = TRUE) expect_true( all(res == ref, na.rm = TRUE) ) }) diff --git a/vignettes/mia.Rmd b/vignettes/mia.Rmd index a4db08db9..da7bfd655 100644 --- a/vignettes/mia.Rmd +++ b/vignettes/mia.Rmd @@ -137,13 +137,13 @@ becomes very easy. altExp(tse, "family") <- x2 ``` -Keep in mind, that if you set `na.rm = TRUE`, rows with `NA` or similar value +Keep in mind that if you set `empty.rm = TRUE`, rows with `NA` or a similar value (defined via the `empty.fields` argument) will be removed. Depending on these settings different number of rows will be returned. ```{r} -x1 <- agglomerateByRank(tse, rank = "Species", na.rm = TRUE) -altExp(tse,"species") <- agglomerateByRank(tse, rank = "Species", na.rm = FALSE) +x1 <- agglomerateByRank(tse, rank = "Species", empty.rm = TRUE) +altExp(tse,"species") <- agglomerateByRank(tse, rank = "Species", empty.rm = FALSE) dim(x1) dim(altExp(tse,"species")) ```