simulate LFQ data peptide and protein

fgcz · Jan 18, 2024 · 9f9a807 · 9f9a807
1 parent 99dfce9
commit 9f9a807
Show file tree

Hide file tree

Showing 9 changed files with 142 additions and 95 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -165,7 +165,7 @@ export(separate_factors)
 export(separate_hierarchy)
 export(setup_analysis)
 export(sim_lfq_data)
-export(sim_lfq_data_config)
+export(sim_lfq_data_peptide_config)
 export(spread_response_by_IsotopeLabel)
 export(squeezeVarRob)
 export(strategy_glm)

diff --git a/R/simulate_LFQ_data.R b/R/simulate_LFQ_data.R
@@ -1,27 +1,22 @@
-#' simulate peptide level data with two groups
+#' simulate protein level data with two groups
 #' @export
 #' @param Nprot number of porteins
 #' @param N group size
 #' @param fc D - down regulation N - matrix,  U -  regulation
 #' @param prop proportion of down (D), up (U) and not regulated (N)
 #' @examples
-#' res <- sim_lfq_data()
-#' sim_lfq_data(Nprot = 10)
-#'
-#' dd <- sim_lfq_data()
-#' dd$abundance <- add_missing(dd$abundance)
 #'
-#' sim_lfq_data_config()
-#' sim_lfq_data_config(with_missing = FALSE)
-
+#' sim_lfq_data(Nprot = 10)
+#' res <- sim_lfq_data(Nprot = 10, PEPTIDE = TRUE)
 sim_lfq_data <- function(
     Nprot = 20,
     N = 4,
     fc = list(A = c(D = -2,  U = 2, N = 0), B = c(D = 1, U = -4)),
     prop = list(A = c(D = 10, U = 10), B = c(D = 5, U = 20)),
     mean_prot = 20,
-    sd = 1.2,
-    probability_of_success = 0.3
+    sdlog = log(1.2),
+    probability_of_success = 0.3,
+    PEPTIDE = FALSE
 ) {
 
   proteins <- stringi::stri_rand_strings(Nprot, 6)
@@ -32,7 +27,7 @@ sim_lfq_data <- function(
   prot <- data.frame(
     proteinID = proteins,
     nrPeptides = nrpeptides,
-    average_prot_abundance = rlnorm(Nprot,log(20),sdlog = log(sd)),
+    average_prot_abundance = rlnorm(Nprot,log(20),sdlog = sdlog),
     mean_Ctrl = 0,
     N_Ctrl = N,
     sd = 1
@@ -57,32 +52,46 @@ sim_lfq_data <- function(
     prot <- prot |> dplyr::mutate(!!groupMean := FC, !!groupSize := N)
   }
 
-  # add row for each protein
-  peptide_df <- prot |> tidyr::uncount( nrPeptides )
-  # create peptide ids
-  peptide_df$peptideID <- stringi::stri_rand_strings(sum(prot$nrPeptides), 8)
+  if (PEPTIDE) {
+
+    # add row for each protein
+    peptide_df <- prot |> tidyr::uncount( nrPeptides )
+    # create peptide ids
+    peptide_df$peptideID <- stringi::stri_rand_strings(sum(prot$nrPeptides), 8)
+  } else {
+    peptide_df <- prot
+  }
+
 
   # transform into long format
   peptide_df2 <- peptide_df |> tidyr::pivot_longer(cols = tidyselect::starts_with(c("mean", "N_")),
                                                    names_to = "group" , values_to = "mean")
   peptide_df2 <-  peptide_df2 |> tidyr::separate(group, c("what", "group"))
   peptide_df2 <- peptide_df2 |> tidyr::pivot_wider(names_from = "what", values_from = mean)
 
-  peptide_df2$avg_peptide_abd <-
-    with(peptide_df2,
-         rlnorm(nrow(peptide_df2),
-                meanlog = log(average_prot_abundance - mean),
-                sdlog = log(sd)))
-
   sample_from_normal <- function(mean, sd, n) {
     rnorm(n, mean, sd)
   }
-
   nrpep <- nrow(peptide_df2)
   sampled_data <- matrix(nrow = nrpep, ncol = N)
-  colnames(sampled_data) <- c("V1","V2","V3","V4")
-  for (i in seq_len(nrpep)) {
-    sampled_data[i,] <- sample_from_normal(peptide_df2$avg_peptide_abd[i], peptide_df2$sd[1], peptide_df2$N[i])
+  colnames(sampled_data) <- paste0("V", 1:ncol(sampled_data))
+
+  peptide_df2$average_prot_abundance <- peptide_df2$average_prot_abundance - peptide_df2$mean
+
+  if (PEPTIDE) {
+    peptide_df2$avg_peptide_abd <-
+      with(peptide_df2,
+           rlnorm(nrow(peptide_df2),
+                  meanlog = log(average_prot_abundance),
+                  sdlog = sdlog))
+    for (i in seq_len(nrpep)) {
+      sampled_data[i,] <- sample_from_normal(peptide_df2$avg_peptide_abd[i], peptide_df2$sd[1], peptide_df2$N[i])
+    }
+
+  } else {
+    for (i in seq_len(nrpep)) {
+      sampled_data[i,] <- sample_from_normal(peptide_df2$average_prot_abundance[i], peptide_df2$sd[1], peptide_df2$N[i])
+    }
   }
 
   x <- dplyr::bind_cols(peptide_df2,sampled_data)
@@ -97,6 +106,7 @@ sim_lfq_data <- function(
   return(peptideAbundances)
 }
 
+
 #' add missing values to x vector based on the values of x
 #' @export
 #' @param x vector of intensities
@@ -122,8 +132,8 @@ add_missing <- function(x){
 #' @export
 #' @examples
 #'
-sim_lfq_data_config <- function(Nprot = 10, with_missing = TRUE){
-  data <- sim_lfq_data(Nprot = Nprot)
+sim_lfq_data_peptide_config <- function(Nprot = 10, with_missing = TRUE){
+  data <- sim_lfq_data(Nprot = Nprot, PEPTIDE = TRUE)
   if (with_missing) {
     data$abundance <- add_missing(data$abundance)
   }
@@ -141,3 +151,23 @@ sim_lfq_data_config <- function(Nprot = 10, with_missing = TRUE){
   adata <- setup_analysis(data, config)
   return(list(data = adata, config = config))
 }
+
+sim_lfq_data_protein_config <- function(Nprot = 10, with_missing = TRUE){
+  data <- sim_lfq_data(Nprot = Nprot, PEPTIDE = FALSE)
+  if (with_missing) {
+    data$abundance <- add_missing(data$abundance)
+  }
+  data$isotopeLabel <- "light"
+  data$qValue <- 0
+
+  atable <- AnalysisTableAnnotation$new()
+  atable$sampleName = "sample"
+  atable$factors["group_"] = "group"
+  atable$hierarchy[["protein_Id"]] = "proteinID"
+  atable$set_response("abundance")
+
+  config <- AnalysisConfiguration$new(atable)
+  adata <- setup_analysis(data, config)
+  return(list(data = adata, config = config))
+}
+
diff --git a/R/tidyMS_plotting.R b/R/tidyMS_plotting.R
@@ -207,19 +207,25 @@ plot_hierarchies_boxplot <- function(pdata,
 #' @keywords internal
 #' @examples
 #'
-#'  iostar <- prolfqua_data('data_ionstar')$filtered()
-#'  iostar$config <- old2new(iostar$config)
-#'  iostar$data <- iostar$data |>
+#'  #iostar <- prolfqua_data('data_ionstar')$filtered()
+#'  #iostar$config <- old2new(iostar$config)
+#'  #iostar$data <- iostar$data |>
+#'  #  dplyr::filter(protein_Id %in% sample(protein_Id, 2))
+#'  #unique(iostar$data$protein_Id)
+#'
+#'  istar <- sim_lfq_data_config()
+#'  config <- istar$config
+#'  analysis <- istar$data
+#'  analysis <- analysis |>
 #'    dplyr::filter(protein_Id %in% sample(protein_Id, 2))
-#'  unique(iostar$data$protein_Id)
 #'
-#'  res <- plot_hierarchies_boxplot_df(iostar$data,iostar$config)
+#'  res <- plot_hierarchies_boxplot_df(analysis,config)
 #'  res$boxplot[[1]]
-#'  res <- plot_hierarchies_boxplot_df(iostar$data,iostar$config,iostar$config$table$hierarchy_keys()[1])
+#'  res <- plot_hierarchies_boxplot_df(analysis,config,config$table$hierarchy_keys()[1])
 #'  res$boxplot[[1]]
-#'  res <- plot_hierarchies_boxplot_df(iostar$data,iostar$config,
-#'                                     iostar$config$table$hierarchy_keys()[1],
-#'                                     facet_grid_on = iostar$config$table$hierarchy_keys()[2])
+#'  res <- plot_hierarchies_boxplot_df(analysis,config,
+#'                                     config$table$hierarchy_keys()[1],
+#'                                     facet_grid_on = config$table$hierarchy_keys()[2])
 #'  res$boxplot[[1]]
 #'
 #'  bb <- prolfqua_data('data_IonstarProtein_subsetNorm')
@@ -266,8 +272,8 @@ plot_hierarchies_boxplot_df <- function(pdata,
 #' @family plotting
 #' @examples
 #'
-#' istar <- prolfqua_data('data_ionstar')$filtered()
-#' config <- old2new(istar$config$clone(deep=TRUE))
+#' istar <- sim_lfq_data_config()
+#' config <- istar$config
 #' analysis <- istar$data
 #'
 #' pheat_map <- prolfqua::plot_heatmap_cor( analysis, config )
@@ -322,20 +328,21 @@ plot_heatmap_cor <- function(data,
 #' plot heatmap with annotations
 #'
 #' @export
+#' @param na_fraction fraction of NA values per row
+#' @param show_rownames if TRUE shows row names, default FALSE
 #' @keywords internal
 #' @family plotting
 #' @examples
 #'
-#' istar <- prolfqua_data('data_ionstar')$filtered()
-#' stopifnot(nrow(istar$data) == 25780)
-#' config <- old2new(istar$config$clone(deep=TRUE))
+#' istar <- sim_lfq_data_config()
+#' config <- istar$config
 #' analysis <- istar$data
 #'
-#' plot.new()
+#'
 #' p  <- plot_heatmap(analysis, config)
+#' stopifnot(class(p) == "pheatmap")
 #' p2 <- plot_heatmap(analysis, config, show_rownames = TRUE)
-#' plot.new()
-#' p2
+#' stopifnot(class(p) == "pheatmap")
 #'
 plot_heatmap <- function(data,
                          config,
@@ -394,17 +401,19 @@ plot_heatmap <- function(data,
 #' @family plotting
 #' @export
 #' @examples
-#'  bb <- prolfqua_data('data_IonstarProtein_subsetNorm')
-#'  new <- list(config = bb$config$clone( deep = TRUE), data = bb$data)
-#' istar <- LFQData$new(new$data, new$config)
-#' config <- old2new(istar$config$clone(deep=TRUE))
+#'
+#' istar <- sim_lfq_data_config()
+#' config <- istar$config
 #' analysis <- istar$data
-#' dev.off()
 #' rs <- plot_raster(analysis, config, show_rownames=FALSE)
-#' print(rs)
-#' plot_raster(analysis[1,], config)
-#' plot_raster(analysis, config, "var")
-#' plot_raster(analysis, config, show_rownames = TRUE)
+#' stopifnot(class(rs) == "pheatmap")
+#' rs <- plot_raster(analysis[1,], config)
+#' stopifnot(is.null(rs))
+#' rs <- plot_raster(analysis, config, "var")
+#' stopifnot(class(rs) == "pheatmap")
+#' rs <- plot_raster(analysis, config, show_rownames = TRUE)
+#' stopifnot(class(rs) == "pheatmap")
+#'
 plot_raster <- function(data,
                         config,
                         arrange = c("mean", "var"),

diff --git a/man/plot_heatmap.Rd b/man/plot_heatmap.Rd
diff --git a/man/plot_heatmap_cor.Rd b/man/plot_heatmap_cor.Rd
diff --git a/man/plot_hierarchies_boxplot_df.Rd b/man/plot_hierarchies_boxplot_df.Rd
diff --git a/man/plot_raster.Rd b/man/plot_raster.Rd