From 7a7dfb30e620705a98b1beef18f3e0af5b9e27ee Mon Sep 17 00:00:00 2001 From: Witold Wolski Date: Fri, 5 Apr 2024 17:16:25 +0200 Subject: [PATCH] adding nr obs aggregation to aggregator --- NAMESPACE | 3 +- R/AnalysisConfiguration.R | 7 ++- R/LFQDataAggregator.R | 6 ++- R/simulate_LFQ_data.R | 17 ++++--- R/tidyMS_R6_TransitionCorrelations.R | 56 +++++++++++++++--------- R/tidyMS_aggregation.R | 38 +++++++++++++--- man/LFQDataAggregator.Rd | 1 + man/aggregate_intensity_topN.Rd | 4 +- man/estimate_intensity.Rd | 2 - man/get_robscales.Rd | 6 +-- man/nr_obs.Rd | 20 +++++++++ man/response_matrix_as_tibble.Rd | 10 ++--- man/sim_lfq_data_peptide_config.Rd | 1 + man/tidy_to_wide_config.Rd | 32 ++++++++++---- man/{add_missing.Rd => which_missing.Rd} | 6 +-- 15 files changed, 143 insertions(+), 66 deletions(-) create mode 100644 man/nr_obs.Rd rename man/{add_missing.Rd => which_missing.Rd} (82%) diff --git a/NAMESPACE b/NAMESPACE index 8e846645b..d058e9bf3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,7 +30,6 @@ export(ProteinAnnotation) export(R6_extract_values) export(UpSet_interaction_missing_stats) export(UpSet_missing_stats) -export(add_missing) export(adjust_p_values) export(aggregate_contrast) export(aggregate_intensity_topN) @@ -103,6 +102,7 @@ export(names_to_matrix) export(normalize_log2_robscale) export(nr_B_in_A) export(nr_B_in_A_per_sample) +export(nr_obs) export(old2new) export(pairs_smooth) export(pairs_w_abline) @@ -174,6 +174,7 @@ export(tidy_to_wide) export(tidy_to_wide_config) export(transform_work_intensity) export(volcano_plotly) +export(which_missing) import(dplyr) import(ggplot2) importFrom(MASS,rlm) diff --git a/R/AnalysisConfiguration.R b/R/AnalysisConfiguration.R index 01b52a2c5..dd74c7437 100644 --- a/R/AnalysisConfiguration.R +++ b/R/AnalysisConfiguration.R @@ -129,6 +129,8 @@ R6_extract_values <- function(r6class){ setup_analysis <- function(data, configuration, cc = TRUE, from_factors = FALSE){ configuration <- configuration$clone(deep = TRUE) table <- configuration$table + if (is.null(table$fileName)) { stop("fileName column is not specified in configuration.")} + if (!table$fileName %in% colnames(data)) { stop("File name column :" , table$fileName , ", is missing in data.")} # extract hierarchy columns for (i in seq_along(table$hierarchy)) @@ -194,7 +196,7 @@ setup_analysis <- function(data, configuration, cc = TRUE, from_factors = FALSE txd <- data |> group_by(!!!syms(c(table$fileName, table$hierarchy_keys(), table$isotopeLabel))) |> summarize(n = n()) - if (length(table(txd$n)) > 1) { + if (any(txd$n > 1)) { str <- paste("There is more than ONE observations for each : ", paste( table$hierarchy_keys(), collapse = ", "), ",\n", "and sample : ", table$sampleName, "; (filename) : ", table$fileName, "\n") warning(str) @@ -204,12 +206,9 @@ setup_analysis <- function(data, configuration, cc = TRUE, from_factors = FALSE #tmp <- prolfqua::tidy_to_wide_config(data, configuration) #message("nr rows and nr columns") #message(paste(dim(tmp$data),collapse = ", ")) - if (cc) { data <- complete_cases( data , configuration) } - - return( data ) } diff --git a/R/LFQDataAggregator.R b/R/LFQDataAggregator.R index 5277054fd..9b7fc91ed 100644 --- a/R/LFQDataAggregator.R +++ b/R/LFQDataAggregator.R @@ -39,6 +39,7 @@ #' \dontrun{ #' lfqAggregator$write_plots(tempdir()) #' } +#' LFQDataAggregator <- R6::R6Class( "LFQDataAggregator", public = list( @@ -75,8 +76,8 @@ LFQDataAggregator <- R6::R6Class( if (!self$lfq$is_transformed()) { warning("You did not transform the intensities.", "medpolish works best with already variance stabilized intensities.", - "Use LFQData$get_Transformer to transform the data.", - self$lfq$config$table$workIntensity,) + "Use LFQData$get_Transformer to transform the data :", + self$lfq$config$table$workIntensity) } res <- estimate_intensity(self$lfq$data, self$lfq$config, .func = medpolish_estimate_dfconfig) self$lfq_agg <- LFQData$new(res$data, res$config, prefix = self$prefix) @@ -95,6 +96,7 @@ LFQDataAggregator <- R6::R6Class( } res <- estimate_intensity(self$lfq$data, self$lfq$config, .func = rlm_estimate_dfconfig) + res <- self$lfq_agg <- LFQData$new(res$data, res$config, prefix = self$prefix) invisible(self$lfq_agg) }, diff --git a/R/simulate_LFQ_data.R b/R/simulate_LFQ_data.R index 36934705b..e785dc25a 100644 --- a/R/simulate_LFQ_data.R +++ b/R/simulate_LFQ_data.R @@ -125,7 +125,7 @@ sim_lfq_data <- function( #' @param x vector of intensities #' #' -add_missing <- function(x){ +which_missing <- function(x){ missing_prop <- pnorm(x, mean = mean(x), sd = sd(x)) # sample TRUE or FALSE with propability in missing_prop samplemiss <- function(missing_prop) { @@ -136,8 +136,8 @@ add_missing <- function(x){ missing_values <- sapply(missing_prop, samplemiss) # Introduce missing values into the vector x - x[missing_values] <- NA - return(x) + #x[missing_values] <- NA + return(missing_values) } @@ -147,6 +147,7 @@ add_missing <- function(x){ #' @param seed seed for reproducibility, if NULL no seed is set. #' @export #' @examples +#' undebug(sim_lfq_data_peptide_config) #' x <- sim_lfq_data_peptide_config() #' stopifnot("data.frame" %in% class(x$data)) #' stopifnot("AnalysisConfiguration" %in% class(x$config)) @@ -156,13 +157,15 @@ sim_lfq_data_peptide_config <- function(Nprot = 10, with_missing = TRUE, seed = } data <- sim_lfq_data(Nprot = Nprot, PEPTIDE = TRUE) if (with_missing) { - data$abundance <- add_missing(data$abundance) + not_missing <- !which_missing(data$abundance) + data <- data[not_missing,] } data$isotopeLabel <- "light" data$qValue <- 0 atable <- AnalysisTableAnnotation$new() - atable$sampleName = "sample" + atable$fileName = "sample" + atable$factors["group_"] = "group" atable$hierarchy[["protein_Id"]] = c("proteinID", "idtype2") atable$hierarchy[["peptide_Id"]] = "peptideID" @@ -188,13 +191,13 @@ sim_lfq_data_protein_config <- function(Nprot = 10, with_missing = TRUE, seed = } data <- sim_lfq_data(Nprot = Nprot, PEPTIDE = FALSE) if (with_missing) { - data$abundance <- add_missing(data$abundance) + data <- data[!which_missing(data$abundance),] } data$isotopeLabel <- "light" data$qValue <- 0 atable <- AnalysisTableAnnotation$new() - atable$sampleName = "sample" + atable$fileName = "sample" atable$nr_children = "nr_peptides" atable$factors["group_"] = "group" atable$hierarchy[["protein_Id"]] = c("proteinID", "idtype2") diff --git a/R/tidyMS_R6_TransitionCorrelations.R b/R/tidyMS_R6_TransitionCorrelations.R index 7eb296591..b761f6a44 100644 --- a/R/tidyMS_R6_TransitionCorrelations.R +++ b/R/tidyMS_R6_TransitionCorrelations.R @@ -219,20 +219,38 @@ tidy_to_wide <- function(data, #' @return list with data, rowdata, and annotation (colData) #' @examples #' -#' dd <- prolfqua_data('data_spectronautDIA250_A') -#' config <- dd$config_f() -#' analysis <- dd$analysis(dd$data,config) -#' res <- tidy_to_wide_config(analysis, config) +#' dd <- prolfqua::sim_lfq_data_peptide_config() +#' config <- dd$config +#' data <- dd$data +#' res <- tidy_to_wide_config(data, config) #' testthat::expect_equal(nrow(res$rowdata), nrow(res$data)) #' testthat::expect_equal(ncol(res$data) - ncol(res$rowdata) , nrow(res$annotation)) -#' res <- tidy_to_wide_config(analysis, config, as.matrix = TRUE) -#' dim(res$data) == c(823, 45) -#' dim(res$annotation) == c(45, 6) -#' dim(res$rowdata) == c(823, 4) +#' res <- tidy_to_wide_config(data, config, as.matrix = TRUE) +#' dim(res$data) == c(28, 12) +#' dim(res$annotation) == c(12, 3) +#' dim(res$rowdata) == c(28, 3) #' #' res <- scale(res$data) -#' -tidy_to_wide_config <- function(data, config, as.matrix = FALSE, fileName = FALSE, sep="~lfq~"){ +#' tidy_to_wide_config(data, config, value = config$table$nr_children) +#' +#' +#' xt <- prolfqua::LFQData$new(dd$data, dd$config) +#' xt$data$nr_children +#' #xt$config$table$is_response_transformed <- TRUE +#' res <- xt$get_Aggregator() +#' x <- res$medpolish() +#' dd <- prolfqua::sim_lfq_data_protein_config() +#' dd$config$table$nr_children +#' dd$data +#' xt <- tidy_to_wide_config(dd$data, dd$config, value = dd$config$table$nr_children) +#' xt$data +#' +tidy_to_wide_config <- function(data, config, + as.matrix = FALSE, + fileName = FALSE, + sep="~lfq~", + value = config$table$get_response() + ){ if (fileName) { newcolname <- config$table$fileName }else{ @@ -245,7 +263,7 @@ tidy_to_wide_config <- function(data, config, as.matrix = FALSE, fileName = FALS res <- tidy_to_wide( data, c(config$table$hierarchy_keys(),config$table$isotopeLabel) , newcolname, - value = config$table$get_response() ) + value = value ) rowdata <- res |> dplyr::select(all_of(c(config$table$hierarchy_keys(),config$table$isotopeLabel))) if (as.matrix) { resMat <- as.matrix(dplyr::select(res,-dplyr::one_of(c(config$table$hierarchy_keys(),config$table$isotopeLabel)))) @@ -268,14 +286,14 @@ tidy_to_wide_config <- function(data, config, as.matrix = FALSE, fileName = FALS #' #' @keywords internal #' @examples -#' dd <- prolfqua_data('data_spectronautDIA250_A') -#' conf <- dd$config_f() -#' analysis <- dd$analysis(dd$data,conf) -#' res <- tidy_to_wide_config(analysis, conf, as.matrix = TRUE) +#' dd <- prolfqua::sim_lfq_data_peptide_config() +#' data <- dd$data +#' conf <- dd$config +#' res <- tidy_to_wide_config(data, conf, as.matrix = TRUE) #' #' res <- scale(res$data) #' xx <- response_matrix_as_tibble(res,"srm_intensityScaled", conf) -#' xx <- response_matrix_as_tibble(res,"srm_intensityScaled", conf,analysis) +#' xx <- response_matrix_as_tibble(res,"srm_intensityScaled", conf, data) #' conf$table$get_response() == "srm_intensityScaled" #' response_matrix_as_tibble <- function(pdata, value, config, data = NULL, sep = "~lfq~"){ @@ -311,10 +329,8 @@ response_matrix_as_tibble <- function(pdata, value, config, data = NULL, sep = " #' @examples #' #' -#' bb <- prolfqua_data('data_ionstar')$filtered() -#' bb$config <- old2new(bb$config) -#' stopifnot(nrow(bb$data) == 25780) -#' conf <- bb$config$clone(deep=TRUE) +#' bb <- prolfqua::sim_lfq_data_peptide_config() +#' conf <- bb$config #' sample_analysis <- bb$data #' pepIntensityNormalized <- transform_work_intensity(sample_analysis, conf, log2) #' s1 <- get_robscales(pepIntensityNormalized, conf) diff --git a/R/tidyMS_aggregation.R b/R/tidyMS_aggregation.R index e76310102..9b45818bd 100644 --- a/R/tidyMS_aggregation.R +++ b/R/tidyMS_aggregation.R @@ -580,10 +580,8 @@ old2new <- function(config) { #' @examples #' #' dd <- prolfqua::sim_lfq_data_peptide_config() -#' #' config <- dd$config #' data <- dd$data -#' #' data <- prolfqua::transform_work_intensity(data, config, log2) #' bbMed <- estimate_intensity(data, config, .func = medpolish_estimate_dfconfig) #' bbRob <- estimate_intensity(data, config, .func = rlm_estimate_dfconfig) @@ -601,8 +599,6 @@ estimate_intensity <- function(data, config, .func) config <- config$clone(deep = TRUE) xnested <- data |> group_by_at(config$table$hierarchy_keys_depth()) |> nest() - nr_children <- data |> group_by_at(config$table$hierarchy_keys_depth()) |> - summarize(!!config$table$nr_children := n()) pb <- progress::progress_bar$new(total = nrow(xnested)) message("starting aggregation") @@ -622,10 +618,31 @@ estimate_intensity <- function(data, config, .func) dplyr::select_at(c(config$table$hierarchy_keys_depth(), makeName)) |> tidyr::unnest(cols = makeName) |> dplyr::ungroup() - unnested <- dplyr::inner_join(nr_children, unnested) + + new_child = paste0("nr_",config$table$hierarchy_keys_depth()) + res_nr_children <- nr_obs(data, config, new_child = new_child) + unnested <- inner_join(unnested, res_nr_children, by = c(config$table$hierarchy_keys_depth(), config$table$fileName)) + newconfig$table$nr_children = new_child return(list(data = unnested, config = newconfig)) } +#' Aggregates e.g. protein abundances from peptide abundances +#' +#' @export +#' @examples +#' dd <- prolfqua::sim_lfq_data_peptide_config() +#' dd$data <- na.omit(dd$data) +#' xd <- nr_obs(dd$data, dd$config) +#' +#' #xd |> head() +#' +#' xd$nr_children |> table() +nr_obs <- function(data, config, new_child = config$table$nr_children){ + nr_children <- data |> group_by(!!!rlang::syms(c(config$table$hierarchy_keys_depth(), config$table$fileName))) |> + summarize(!!new_child := sum(!!sym(config$table$nr_children), na.rm = TRUE)) + return(nr_children) +} + #' Plot feature data and result of aggregation #' #' @param data data.frame before aggregation @@ -696,8 +713,8 @@ plot_estimate <- function(data, config, data_aggr, config_reduced, show.legend= #' @keywords internal #' @examples #' -#' dd <- prolfqua_data('data_ionstar')$filtered() -#' config <- old2new(dd$config) +#' dd <- prolfqua::sim_lfq_data_peptide_config() +#' config <- dd$config #' res <- dd$data #' ranked <- rank_peptide_by_intensity(res,config) #' @@ -748,6 +765,13 @@ aggregate_intensity_topN <- function(pdata , config, .func, N = 3){ config, workIntensity = newcol, hierarchy = config$table$hierarchy[seq_len(config$table$hierarchyDepth)]) + + new_child_name <- paste0("nr_", config$table$hierarchy_keys_depth() ) + res_nr_children <- nr_obs(pdata, config, new_child = new_child_name) + sumTopInt <- inner_join( + sumTopInt, res_nr_children, + by = c(config$table$fileName, config$table$hierarchy_keys_depth())) + newconfig$table$nr_children = new_child_name return(list(data = sumTopInt, config = newconfig)) } diff --git a/man/LFQDataAggregator.Rd b/man/LFQDataAggregator.Rd index 8eb8410d0..ffd6c6fd2 100644 --- a/man/LFQDataAggregator.Rd +++ b/man/LFQDataAggregator.Rd @@ -44,6 +44,7 @@ protPlotter$heatmap() \dontrun{ lfqAggregator$write_plots(tempdir()) } + } \seealso{ Other LFQData: diff --git a/man/aggregate_intensity_topN.Rd b/man/aggregate_intensity_topN.Rd index 84562a349..b91b0b664 100644 --- a/man/aggregate_intensity_topN.Rd +++ b/man/aggregate_intensity_topN.Rd @@ -23,8 +23,8 @@ run \link{rank_peptide_by_intensity} first } \examples{ -dd <- prolfqua_data('data_ionstar')$filtered() -config <- old2new(dd$config) +dd <- prolfqua::sim_lfq_data_peptide_config() +config <- dd$config res <- dd$data ranked <- rank_peptide_by_intensity(res,config) diff --git a/man/estimate_intensity.Rd b/man/estimate_intensity.Rd index 3262d336d..758a2c78c 100644 --- a/man/estimate_intensity.Rd +++ b/man/estimate_intensity.Rd @@ -18,10 +18,8 @@ Aggregates e.g. protein abundances from peptide abundances \examples{ dd <- prolfqua::sim_lfq_data_peptide_config() - config <- dd$config data <- dd$data - data <- prolfqua::transform_work_intensity(data, config, log2) bbMed <- estimate_intensity(data, config, .func = medpolish_estimate_dfconfig) bbRob <- estimate_intensity(data, config, .func = rlm_estimate_dfconfig) diff --git a/man/get_robscales.Rd b/man/get_robscales.Rd index 3924dcc03..df9803cb3 100644 --- a/man/get_robscales.Rd +++ b/man/get_robscales.Rd @@ -12,10 +12,8 @@ compute median and standard deviation for each sample \examples{ -bb <- prolfqua_data('data_ionstar')$filtered() -bb$config <- old2new(bb$config) -stopifnot(nrow(bb$data) == 25780) -conf <- bb$config$clone(deep=TRUE) +bb <- prolfqua::sim_lfq_data_peptide_config() +conf <- bb$config sample_analysis <- bb$data pepIntensityNormalized <- transform_work_intensity(sample_analysis, conf, log2) s1 <- get_robscales(pepIntensityNormalized, conf) diff --git a/man/nr_obs.Rd b/man/nr_obs.Rd new file mode 100644 index 000000000..bb23a8248 --- /dev/null +++ b/man/nr_obs.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tidyMS_aggregation.R +\name{nr_obs} +\alias{nr_obs} +\title{Aggregates e.g. protein abundances from peptide abundances} +\usage{ +nr_obs(data, config, new_child = config$table$nr_children) +} +\description{ +Aggregates e.g. protein abundances from peptide abundances +} +\examples{ +dd <- prolfqua::sim_lfq_data_peptide_config() +dd$data <- na.omit(dd$data) +xd <- nr_obs(dd$data, dd$config) + +#xd |> head() + +xd$nr_children |> table() +} diff --git a/man/response_matrix_as_tibble.Rd b/man/response_matrix_as_tibble.Rd index 986dbd78a..2d7f2d68c 100644 --- a/man/response_matrix_as_tibble.Rd +++ b/man/response_matrix_as_tibble.Rd @@ -21,14 +21,14 @@ response_matrix_as_tibble(pdata, value, config, data = NULL, sep = "~lfq~") Takes matrix of responses and converts into tibble } \examples{ -dd <- prolfqua_data('data_spectronautDIA250_A') -conf <- dd$config_f() -analysis <- dd$analysis(dd$data,conf) -res <- tidy_to_wide_config(analysis, conf, as.matrix = TRUE) +dd <- prolfqua::sim_lfq_data_peptide_config() +data <- dd$data +conf <- dd$config +res <- tidy_to_wide_config(data, conf, as.matrix = TRUE) res <- scale(res$data) xx <- response_matrix_as_tibble(res,"srm_intensityScaled", conf) -xx <- response_matrix_as_tibble(res,"srm_intensityScaled", conf,analysis) +xx <- response_matrix_as_tibble(res,"srm_intensityScaled", conf, data) conf$table$get_response() == "srm_intensityScaled" } diff --git a/man/sim_lfq_data_peptide_config.Rd b/man/sim_lfq_data_peptide_config.Rd index 6e3ad0d7a..f9d8abf3e 100644 --- a/man/sim_lfq_data_peptide_config.Rd +++ b/man/sim_lfq_data_peptide_config.Rd @@ -17,6 +17,7 @@ sim_lfq_data_peptide_config(Nprot = 10, with_missing = TRUE, seed = 1234) Simulate data, protein and peptide, with config } \examples{ +undebug(sim_lfq_data_peptide_config) x <- sim_lfq_data_peptide_config() stopifnot("data.frame" \%in\% class(x$data)) stopifnot("AnalysisConfiguration" \%in\% class(x$config)) diff --git a/man/tidy_to_wide_config.Rd b/man/tidy_to_wide_config.Rd index 23c21e059..f534d687d 100644 --- a/man/tidy_to_wide_config.Rd +++ b/man/tidy_to_wide_config.Rd @@ -9,7 +9,8 @@ tidy_to_wide_config( config, as.matrix = FALSE, fileName = FALSE, - sep = "~lfq~" + sep = "~lfq~", + value = config$table$get_response() ) } \value{ @@ -20,18 +21,31 @@ transform long to wide } \examples{ -dd <- prolfqua_data('data_spectronautDIA250_A') -config <- dd$config_f() -analysis <- dd$analysis(dd$data,config) -res <- tidy_to_wide_config(analysis, config) +dd <- prolfqua::sim_lfq_data_peptide_config() +config <- dd$config +data <- dd$data +res <- tidy_to_wide_config(data, config) testthat::expect_equal(nrow(res$rowdata), nrow(res$data)) testthat::expect_equal(ncol(res$data) - ncol(res$rowdata) , nrow(res$annotation)) -res <- tidy_to_wide_config(analysis, config, as.matrix = TRUE) -dim(res$data) == c(823, 45) -dim(res$annotation) == c(45, 6) -dim(res$rowdata) == c(823, 4) +res <- tidy_to_wide_config(data, config, as.matrix = TRUE) +dim(res$data) == c(28, 12) +dim(res$annotation) == c(12, 3) +dim(res$rowdata) == c(28, 3) res <- scale(res$data) +tidy_to_wide_config(data, config, value = config$table$nr_children) + + +xt <- prolfqua::LFQData$new(dd$data, dd$config) +xt$data$nr_children +#xt$config$table$is_response_transformed <- TRUE +res <- xt$get_Aggregator() +x <- res$medpolish() +dd <- prolfqua::sim_lfq_data_protein_config() +dd$config$table$nr_children +dd$data +xt <- tidy_to_wide_config(dd$data, dd$config, value = dd$config$table$nr_children) +xt$data } \keyword{internal} diff --git a/man/add_missing.Rd b/man/which_missing.Rd similarity index 82% rename from man/add_missing.Rd rename to man/which_missing.Rd index 17bfef748..24b1053e6 100644 --- a/man/add_missing.Rd +++ b/man/which_missing.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/simulate_LFQ_data.R -\name{add_missing} -\alias{add_missing} +\name{which_missing} +\alias{which_missing} \title{add missing values to x vector based on the values of x} \usage{ -add_missing(x) +which_missing(x) } \arguments{ \item{x}{vector of intensities}