adding nr obs aggregation to aggregator

fgcz · Apr 5, 2024 · 7a7dfb3 · 7a7dfb3
1 parent 6177bc6
commit 7a7dfb3
Show file tree

Hide file tree

Showing 15 changed files with 143 additions and 66 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -30,7 +30,6 @@ export(ProteinAnnotation)
 export(R6_extract_values)
 export(UpSet_interaction_missing_stats)
 export(UpSet_missing_stats)
-export(add_missing)
 export(adjust_p_values)
 export(aggregate_contrast)
 export(aggregate_intensity_topN)
@@ -103,6 +102,7 @@ export(names_to_matrix)
 export(normalize_log2_robscale)
 export(nr_B_in_A)
 export(nr_B_in_A_per_sample)
+export(nr_obs)
 export(old2new)
 export(pairs_smooth)
 export(pairs_w_abline)
@@ -174,6 +174,7 @@ export(tidy_to_wide)
 export(tidy_to_wide_config)
 export(transform_work_intensity)
 export(volcano_plotly)
+export(which_missing)
 import(dplyr)
 import(ggplot2)
 importFrom(MASS,rlm)

diff --git a/R/AnalysisConfiguration.R b/R/AnalysisConfiguration.R
@@ -129,6 +129,8 @@ R6_extract_values <- function(r6class){
 setup_analysis <- function(data, configuration, cc = TRUE,  from_factors = FALSE){
   configuration <- configuration$clone(deep = TRUE)
   table <- configuration$table
+  if (is.null(table$fileName)) { stop("fileName column is not specified in configuration.")}
+  if (!table$fileName %in% colnames(data)) { stop("File name column :" , table$fileName , ", is missing in data.")}
 
   # extract hierarchy columns
   for (i in seq_along(table$hierarchy))
@@ -194,7 +196,7 @@ setup_analysis <- function(data, configuration, cc = TRUE,  from_factors = FALSE
 
   txd <- data |> group_by(!!!syms(c(table$fileName, table$hierarchy_keys(), table$isotopeLabel))) |>
     summarize(n = n())
-  if (length(table(txd$n)) > 1) {
+  if (any(txd$n > 1)) {
     str <- paste("There is more than ONE observations for each : ", paste( table$hierarchy_keys(), collapse = ", "), ",\n",
                  "and sample : ", table$sampleName, "; (filename) : ", table$fileName, "\n")
     warning(str)
@@ -204,12 +206,9 @@ setup_analysis <- function(data, configuration, cc = TRUE,  from_factors = FALSE
   #tmp <- prolfqua::tidy_to_wide_config(data, configuration)
   #message("nr rows and nr columns")
   #message(paste(dim(tmp$data),collapse = ", "))
-
   if (cc) {
     data <- complete_cases( data , configuration)
   }
-
-
   return( data )
 }
 

diff --git a/R/LFQDataAggregator.R b/R/LFQDataAggregator.R
@@ -39,6 +39,7 @@
 #' \dontrun{
 #' lfqAggregator$write_plots(tempdir())
 #' }
+#'
 LFQDataAggregator <- R6::R6Class(
   "LFQDataAggregator",
   public = list(
@@ -75,8 +76,8 @@ LFQDataAggregator <- R6::R6Class(
       if (!self$lfq$is_transformed()) {
         warning("You did not transform the intensities.",
                 "medpolish works best with already variance stabilized intensities.",
-                "Use LFQData$get_Transformer to transform the data.",
-                self$lfq$config$table$workIntensity,)
+                "Use LFQData$get_Transformer to transform the data :",
+                self$lfq$config$table$workIntensity)
       }
       res <- estimate_intensity(self$lfq$data, self$lfq$config, .func = medpolish_estimate_dfconfig)
       self$lfq_agg <- LFQData$new(res$data, res$config, prefix = self$prefix)
@@ -95,6 +96,7 @@ LFQDataAggregator <- R6::R6Class(
       }
 
       res <- estimate_intensity(self$lfq$data, self$lfq$config, .func = rlm_estimate_dfconfig)
+      res <-
       self$lfq_agg <- LFQData$new(res$data, res$config, prefix = self$prefix)
       invisible(self$lfq_agg)
     },

diff --git a/R/simulate_LFQ_data.R b/R/simulate_LFQ_data.R
@@ -125,7 +125,7 @@ sim_lfq_data <- function(
 #' @param x vector of intensities
 #'
 #'
-add_missing <- function(x){
+which_missing <- function(x){
   missing_prop <- pnorm(x, mean = mean(x), sd = sd(x))
   # sample TRUE or FALSE with probability in missing_prop
   samplemiss <- function(missing_prop) {
@@ -136,8 +136,8 @@ add_missing <- function(x){
 
   missing_values <- sapply(missing_prop, samplemiss)
   # Introduce missing values into the vector x
-  x[missing_values] <- NA
-  return(x)
+  #x[missing_values] <- NA
+  return(missing_values)
 }
 
 
@@ -147,6 +147,7 @@ add_missing <- function(x){
 #' @param seed seed for reproducibility, if NULL no seed is set.
 #' @export
 #' @examples
+#' undebug(sim_lfq_data_peptide_config)
 #' x <- sim_lfq_data_peptide_config()
 #' stopifnot("data.frame" %in% class(x$data))
 #' stopifnot("AnalysisConfiguration" %in% class(x$config))
@@ -156,13 +157,15 @@ sim_lfq_data_peptide_config <- function(Nprot = 10, with_missing = TRUE, seed =
   }
   data <- sim_lfq_data(Nprot = Nprot, PEPTIDE = TRUE)
   if (with_missing) {
-    data$abundance <- add_missing(data$abundance)
+    not_missing <- !which_missing(data$abundance)
+    data <- data[not_missing,]
   }
   data$isotopeLabel <- "light"
   data$qValue <- 0
 
   atable <- AnalysisTableAnnotation$new()
-  atable$sampleName = "sample"
+  atable$fileName = "sample"
+
   atable$factors["group_"] = "group"
   atable$hierarchy[["protein_Id"]] = c("proteinID", "idtype2")
   atable$hierarchy[["peptide_Id"]] = "peptideID"
@@ -188,13 +191,13 @@ sim_lfq_data_protein_config <- function(Nprot = 10, with_missing = TRUE, seed =
   }
   data <- sim_lfq_data(Nprot = Nprot, PEPTIDE = FALSE)
   if (with_missing) {
-    data$abundance <- add_missing(data$abundance)
+    data <- data[!which_missing(data$abundance),]
   }
   data$isotopeLabel <- "light"
   data$qValue <- 0
 
   atable <- AnalysisTableAnnotation$new()
-  atable$sampleName = "sample"
+  atable$fileName = "sample"
   atable$nr_children = "nr_peptides"
   atable$factors["group_"] = "group"
   atable$hierarchy[["protein_Id"]] = c("proteinID", "idtype2")

diff --git a/R/tidyMS_R6_TransitionCorrelations.R b/R/tidyMS_R6_TransitionCorrelations.R
@@ -219,20 +219,38 @@ tidy_to_wide <- function(data,
 #' @return list with data, rowdata, and annotation (colData)
 #' @examples
 #'
-#' dd <- prolfqua_data('data_spectronautDIA250_A')
-#' config <- dd$config_f()
-#' analysis <- dd$analysis(dd$data,config)
-#' res <- tidy_to_wide_config(analysis, config)
+#' dd <- prolfqua::sim_lfq_data_peptide_config()
+#' config <- dd$config
+#' data <- dd$data
+#' res <- tidy_to_wide_config(data, config)
 #' testthat::expect_equal(nrow(res$rowdata), nrow(res$data))
 #' testthat::expect_equal(ncol(res$data) - ncol(res$rowdata) , nrow(res$annotation))
-#' res <- tidy_to_wide_config(analysis, config, as.matrix = TRUE)
-#' dim(res$data) == c(823,  45)
-#' dim(res$annotation) == c(45,  6)
-#' dim(res$rowdata) == c(823, 4)
+#' res <- tidy_to_wide_config(data, config, as.matrix = TRUE)
+#' dim(res$data) == c(28,  12)
+#' dim(res$annotation) == c(12,  3)
+#' dim(res$rowdata) == c(28, 3)
 #'
 #' res <- scale(res$data)
-#'
-tidy_to_wide_config <- function(data, config, as.matrix = FALSE, fileName = FALSE, sep="~lfq~"){
+#' tidy_to_wide_config(data, config,  value = config$table$nr_children)
+#'
+#'
+#' xt <- prolfqua::LFQData$new(dd$data, dd$config)
+#' xt$data$nr_children
+#' #xt$config$table$is_response_transformed <- TRUE
+#' res <- xt$get_Aggregator()
+#' x <- res$medpolish()
+#' dd <- prolfqua::sim_lfq_data_protein_config()
+#' dd$config$table$nr_children
+#' dd$data
+#' xt <- tidy_to_wide_config(dd$data, dd$config,  value = dd$config$table$nr_children)
+#' xt$data
+#'
+tidy_to_wide_config <- function(data, config,
+                                as.matrix = FALSE,
+                                fileName = FALSE,
+                                sep="~lfq~",
+                                value = config$table$get_response()
+                                ){
   if (fileName) {
     newcolname <- config$table$fileName
   }else{
@@ -245,7 +263,7 @@ tidy_to_wide_config <- function(data, config, as.matrix = FALSE, fileName = FALS
 
   res <- tidy_to_wide( data, c(config$table$hierarchy_keys(),config$table$isotopeLabel) ,
                  newcolname,
-                 value = config$table$get_response() )
+                 value = value )
   rowdata <- res |> dplyr::select(all_of(c(config$table$hierarchy_keys(),config$table$isotopeLabel)))
   if (as.matrix) {
     resMat <- as.matrix(dplyr::select(res,-dplyr::one_of(c(config$table$hierarchy_keys(),config$table$isotopeLabel))))
@@ -268,14 +286,14 @@ tidy_to_wide_config <- function(data, config, as.matrix = FALSE, fileName = FALS
 #'
 #' @keywords internal
 #' @examples
-#' dd <- prolfqua_data('data_spectronautDIA250_A')
-#' conf <- dd$config_f()
-#' analysis <- dd$analysis(dd$data,conf)
-#' res <- tidy_to_wide_config(analysis, conf, as.matrix = TRUE)
+#' dd <- prolfqua::sim_lfq_data_peptide_config()
+#' data <- dd$data
+#' conf <- dd$config
+#' res <- tidy_to_wide_config(data, conf, as.matrix = TRUE)
 #'
 #' res <- scale(res$data)
 #' xx <- response_matrix_as_tibble(res,"srm_intensityScaled", conf)
-#' xx <- response_matrix_as_tibble(res,"srm_intensityScaled", conf,analysis)
+#' xx <- response_matrix_as_tibble(res,"srm_intensityScaled", conf, data)
 #' conf$table$get_response() == "srm_intensityScaled"
 #'
 response_matrix_as_tibble <- function(pdata, value, config, data = NULL, sep = "~lfq~"){
@@ -311,10 +329,8 @@ response_matrix_as_tibble <- function(pdata, value, config, data = NULL, sep = "
 #' @examples
 #'
 #'
-#' bb <- prolfqua_data('data_ionstar')$filtered()
-#' bb$config <- old2new(bb$config)
-#' stopifnot(nrow(bb$data) == 25780)
-#' conf <- bb$config$clone(deep=TRUE)
+#' bb <- prolfqua::sim_lfq_data_peptide_config()
+#' conf <- bb$config
 #' sample_analysis <- bb$data
 #' pepIntensityNormalized <- transform_work_intensity(sample_analysis, conf, log2)
 #' s1 <- get_robscales(pepIntensityNormalized, conf)

diff --git a/R/tidyMS_aggregation.R b/R/tidyMS_aggregation.R
@@ -580,10 +580,8 @@ old2new <- function(config) {
 #' @examples
 #'
 #' dd <- prolfqua::sim_lfq_data_peptide_config()
-#'
 #' config <- dd$config
 #' data <- dd$data
-#'
 #' data <- prolfqua::transform_work_intensity(data, config, log2)
 #' bbMed <- estimate_intensity(data, config, .func = medpolish_estimate_dfconfig)
 #' bbRob <- estimate_intensity(data, config, .func = rlm_estimate_dfconfig)
@@ -601,8 +599,6 @@ estimate_intensity <- function(data, config, .func)
   config <- config$clone(deep = TRUE)
 
   xnested <- data |> group_by_at(config$table$hierarchy_keys_depth()) |> nest()
-  nr_children <- data |> group_by_at(config$table$hierarchy_keys_depth()) |>
-    summarize(!!config$table$nr_children := n())
   pb <- progress::progress_bar$new(total = nrow(xnested))
   message("starting aggregation")
 
@@ -622,10 +618,31 @@ estimate_intensity <- function(data, config, .func)
     dplyr::select_at(c(config$table$hierarchy_keys_depth(), makeName)) |>
     tidyr::unnest(cols = makeName) |>
     dplyr::ungroup()
-  unnested <- dplyr::inner_join(nr_children, unnested)
+
+  new_child = paste0("nr_",config$table$hierarchy_keys_depth())
+  res_nr_children <- nr_obs(data, config, new_child = new_child)
+  unnested <- inner_join(unnested, res_nr_children, by = c(config$table$hierarchy_keys_depth(), config$table$fileName))
+  newconfig$table$nr_children = new_child
   return(list(data = unnested, config = newconfig))
 }
 
+#' Sums the number-of-children column for each hierarchy_keys_depth() group and file name
+#'
+#' @export
+#' @examples
+#' dd <- prolfqua::sim_lfq_data_peptide_config()
+#' dd$data <- na.omit(dd$data)
+#' xd <- nr_obs(dd$data, dd$config)
+#'
+#' #xd |> head()
+#'
+#' xd$nr_children |> table()
+nr_obs <- function(data, config, new_child = config$table$nr_children){
+  nr_children <- data |> group_by(!!!rlang::syms(c(config$table$hierarchy_keys_depth(), config$table$fileName))) |>
+    summarize(!!new_child := sum(!!sym(config$table$nr_children), na.rm = TRUE))
+  return(nr_children)
+}
+
 #' Plot feature data and result of aggregation
 #'
 #' @param data data.frame before aggregation
@@ -696,8 +713,8 @@ plot_estimate <- function(data, config, data_aggr, config_reduced, show.legend=
 #' @keywords internal
 #' @examples
 #'
-#' dd <- prolfqua_data('data_ionstar')$filtered()
-#' config <- old2new(dd$config)
+#' dd <- prolfqua::sim_lfq_data_peptide_config()
+#' config <- dd$config
 #' res <- dd$data
 #' ranked <- rank_peptide_by_intensity(res,config)
 #'
@@ -748,6 +765,13 @@ aggregate_intensity_topN <- function(pdata , config, .func, N = 3){
     config,
     workIntensity = newcol,
     hierarchy = config$table$hierarchy[seq_len(config$table$hierarchyDepth)])
+
+  new_child_name <- paste0("nr_", config$table$hierarchy_keys_depth() )
+  res_nr_children <- nr_obs(pdata, config, new_child = new_child_name)
+  sumTopInt <- inner_join(
+    sumTopInt, res_nr_children,
+    by = c(config$table$fileName, config$table$hierarchy_keys_depth()))
+  newconfig$table$nr_children = new_child_name
   return(list(data = sumTopInt, config = newconfig))
 }
 

diff --git a/man/LFQDataAggregator.Rd b/man/LFQDataAggregator.Rd
diff --git a/man/aggregate_intensity_topN.Rd b/man/aggregate_intensity_topN.Rd
diff --git a/man/estimate_intensity.Rd b/man/estimate_intensity.Rd
diff --git a/man/get_robscales.Rd b/man/get_robscales.Rd
diff --git a/man/nr_obs.Rd b/man/nr_obs.Rd
diff --git a/man/response_matrix_as_tibble.Rd b/man/response_matrix_as_tibble.Rd