From d1aedf2cb9eb1d3d10fa0ecf184ab6dddb85d91a Mon Sep 17 00:00:00 2001 From: Julie Laffy Date: Tue, 26 Sep 2023 16:18:32 +0300 Subject: [PATCH] aggr_dup_genes add function and doc --- NAMESPACE | 5 +++++ R/aggr_dup_genes.R | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 R/aggr_dup_genes.R diff --git a/NAMESPACE b/NAMESPACE index b7d44ab..d25e109 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ export(Jaccard) export(Overlap) export(Unlist) export(add_to_dataframe) +export(aggr_dup_genes) export(aggr_gene_expr) export(alias2ensembl) export(alias2entrez) @@ -226,6 +227,10 @@ importFrom(igraph,cluster_walktrap) importFrom(irlba,prcomp_irlba) importFrom(limma,alias2Symbol) importFrom(magrittr,"%>%") +importFrom(matrixStats,colMaxs) +importFrom(matrixStats,colMeans2) +importFrom(matrixStats,colMedians) +importFrom(matrixStats,colSums2) importFrom(matrixStats,rowSds) importFrom(mixtools,normalmixEM) importFrom(msigdbr,msigdbr) diff --git a/R/aggr_dup_genes.R b/R/aggr_dup_genes.R new file mode 100644 index 0000000..e53db1e --- /dev/null +++ b/R/aggr_dup_genes.R @@ -0,0 +1,45 @@ +### summary statistics for dealing with duplicated gene names in expression matrix + +#' @title Aggregate duplicate gene rows +#' @description Aggregate duplicate gene rows. Duplicate rows are collapsed into one by taking the colSums, colMeans, colMaxs or colMedians. +#' @param m matrix with genes as rows +#' @param stat statistic to aggregate duplicate gene rows. Defaults to 'sum'. 
#'  Default: c("sum", "mean", "max", "median")
#' @param na.rm logical, passed on as \code{na.rm} to the matrixStats function
#'  that aggregates each group of duplicate rows. Default: TRUE
#' @return matrix with unique gene rows. Rows of genes that were not duplicated
#'  come first, in their original order, followed by one aggregated row per
#'  duplicated gene. If \code{m} has no duplicated rownames it is returned
#'  unchanged.
#' @seealso
#'  \code{\link[matrixStats]{rowSums2}},\code{\link[matrixStats]{rowMeans2}},\code{\link[matrixStats]{rowRanges}},\code{\link[matrixStats]{rowMedians}}
#' @rdname aggr_dup_genes
#' @export
#' @importFrom matrixStats colSums2 colMeans2 colMaxs colMedians
aggr_dup_genes = function(m, stat = c('sum', 'mean', 'max', 'median'), na.rm = TRUE) {

  statFUN = switch(match.arg(stat),
                   sum = matrixStats::colSums2,
                   mean = matrixStats::colMeans2,
                   max = matrixStats::colMaxs,
                   median = matrixStats::colMedians)

  genes = rownames(m)
  # without rownames there are no gene identities to aggregate on; fail loudly
  # rather than silently returning an empty matrix
  if (is.null(genes)) {
    stop("`m` must have gene names as rownames", call. = FALSE)
  }

  # genes that occur more than once, in order of their first repeat
  dup.genes = unique(genes[duplicated(genes)])
  if (length(dup.genes) == 0) {
    return(m)
  }

  # collapse all rows of one gene into a single numeric vector (one value per
  # column); drop = FALSE keeps a matrix even when `m` has a single column
  aggr_dup_gene = function(gene) {
    statFUN(m[genes %in% gene, , drop = FALSE], na.rm = na.rm)
  }

  mNew = do.call(rbind, lapply(dup.genes, aggr_dup_gene))
  # set dimnames explicitly: whether matrixStats keeps column names depends on
  # the installed version
  dimnames(mNew) = list(dup.genes, colnames(m))

  # drop = FALSE matters when only one non-duplicated gene is left: otherwise
  # the row collapses to a vector and loses its gene name in rbind()
  # NOTE: the original row order is not restored; aggregated rows are appended
  rbind(m[!genes %in% dup.genes, , drop = FALSE], mNew)
}