From 450e4f72fff1392343bd73d313aace3ffa40902f Mon Sep 17 00:00:00 2001 From: Thomas Rauter Date: Fri, 20 Sep 2024 16:29:51 +0200 Subject: [PATCH] voom object now passable to run_limma_splines(), updated documentation --- .Rbuildignore | 1 + NAMESPACE | 1 + R/cluster_hits.R | 7 +- R/preprocess_rna_seq_data.R | 124 ++++ R/run_limma_splines.R | 230 ++----- R/screen_limma_hyperparams.R | 92 ++- R/splineomics_object.R | 76 ++- R/utils_input_validation.R | 14 +- README.Rmd | 13 +- README.md | 22 +- dev/function_testing_ground.R | 106 ++-- inst/tutorial/tutorial.Rmd | 1053 ++++++++++++++----------------- man/InputControl.Rd | 28 +- man/Level2Functions.Rd | 27 +- man/Level3Functions.Rd | 28 +- man/between_level.Rd | 8 +- man/check_null_elements.Rd | 6 +- man/create_p_value_histogram.Rd | 4 +- man/create_splineomics.Rd | 52 +- man/get_limma_combos_results.Rd | 4 + man/preprocess_rna_seq_data.Rd | 40 +- man/process_combo.Rd | 4 + man/process_top_table.Rd | 5 +- man/process_within_level.Rd | 8 +- man/run_limma_splines.Rd | 49 +- man/screen_limma_hyperparams.Rd | 6 +- man/within_level.Rd | 14 +- renv.lock | 123 +++- vignettes/get-started.Rmd | 358 ++++++++--- 29 files changed, 1410 insertions(+), 1093 deletions(-) create mode 100644 R/preprocess_rna_seq_data.R diff --git a/.Rbuildignore b/.Rbuildignore index 9128547..882673f 100755 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -16,4 +16,5 @@ ^CODE_OF_CONDUCT\.md$ ^inst/CITATION\.cff$ ^pkgdown$ +_pkgdown.yml$ diff --git a/NAMESPACE b/NAMESPACE index 5833e89..00898af 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ export(explore_data) export(extract_data) export(open_template) export(open_tutorial) +export(preprocess_rna_seq_data) export(run_gsea) export(run_limma_splines) export(screen_limma_hyperparams) diff --git a/R/cluster_hits.R b/R/cluster_hits.R index 9172adb..c283242 100755 --- a/R/cluster_hits.R +++ b/R/cluster_hits.R @@ -90,7 +90,7 @@ cluster_hits <- function( analysis_type = "time_effect", report = TRUE ) { - + 
report_dir <- normalizePath( report_dir, mustWork = FALSE @@ -226,6 +226,9 @@ cluster_hits <- function( # Add gene column for the run_gsea() function. clustered_hits_levels <- lapply(clustered_hits_levels, function(df) { + if (is.character(df)) { + return(df) + } df$gene <- genes[df$feature] return(df) }) @@ -490,7 +493,7 @@ make_clustering_report <- function( analysis_type, feature_name_columns ) { - + # Optionally remove the batch-effect with the batch column and design matrix # For mode == "integrated", the batch-effect is removed from the whole data # For mode == "isolated", the batch-effect is removed for every level diff --git a/R/preprocess_rna_seq_data.R b/R/preprocess_rna_seq_data.R new file mode 100644 index 0000000..503689a --- /dev/null +++ b/R/preprocess_rna_seq_data.R @@ -0,0 +1,124 @@ +# Exported function: preprocess_rna_seq_data() --------------------------------- + + +#' Perform default preprocessing of raw RNA-seq counts +#' +#' @description +#' The `preprocess_rna_seq_data()` function performs essential preprocessing +#' steps for raw RNA-seq counts. This includes creating a `DGEList` object, +#' normalizing the counts using the default TMM (Trimmed Mean of M-values) +#' normalization via the `edgeR::calcNormFactors` function, and applying the +#' `voom` transformation from the `limma` package to obtain log-transformed +#' counts per million (logCPM) with associated precision weights. If you +#' require a different normalization method, you can supply your own +#' custom normalization function. +#' +#' @param raw_counts A matrix of raw RNA-seq counts (genes as rows, samples as +#' columns). +#' @param meta A dataframe containing the metadata for data. +#' @param spline_params Parameters for spline functions (optional). 
Must contain +#' the named elements spline_type, which must contain either the string "n" for +#' natural cubic splines, or "b", for B-splines, the named element degree in the +#' case of B-splines, that must contain only an integer, and the named element +#' dof, specifying the degree of freedom, containing an integer and required +#' both for natural and B-splines. +#' @param design A design formula for the limma analysis, such as +#' '~ 1 + Phase*X + Reactor'. +#' @param normalize_func An optional normalization function. If provided, this +#' function will be used to normalize the `DGEList` object. If not provided, +#' TMM normalization (via `edgeR::calcNormFactors`) will be used by default. +#' Must take as +#' input the y of: y <- edgeR::DGEList(counts = raw_counts) and output the y +#' with the normalized counts. +#' @return A `voom` object, which includes the log2-counts per million (logCPM) +#' matrix and observation-specific weights. +#' +#' @importFrom limma voom +#' +#' @export +#' +preprocess_rna_seq_data <- function( + raw_counts, + meta, + spline_params, + design, + normalize_func = NULL +) { + + message("Preprocessing RNA-seq data (normalization + voom)...") + + # Check if edgeR is installed; if not, prompt the user + if (!requireNamespace("edgeR", quietly = TRUE)) { + message("The 'edgeR' package is not installed.") + + # Prompt user for action + repeat { + user_input <- readline( + prompt = + "What would you like to do?\n + 1: Automatically install edgeR\n + 2: Manually install edgeR\n + 3: Cancel\n + Please enter 1, 2, or 3: " + ) + + if (user_input == "1") { + # Try to install edgeR automatically from Bioconductor + message("Attempting to install 'edgeR' automatically + from Bioconductor...") + if (!requireNamespace("BiocManager", quietly = TRUE)) { + utils::install.packages("BiocManager") + } + tryCatch( + { + BiocManager::install("edgeR", update = FALSE) + }, + error = function(e) { + stop( + "Automatic installation of 'edgeR' failed. 
+ Please install it manually and try again.", + call. = FALSE + ) + } + ) + break # Exit the loop if installation is successful + } else if (user_input == "2") { + stop( + "Please install 'edgeR' manually using + BiocManager::install('edgeR') and then re-run the function.", + call. = FALSE + ) + } else if (user_input == "3") { + stop("Operation canceled by the user.", call. = FALSE) + } else { + message("Invalid input. Please enter 1, 2, or 3.") + } + } + } + + design_matrix <- design2design_matrix( + meta = meta, + spline_params = spline_params, + level_index = 1, + design = design + ) + + # Step 1: Create DGEList object from raw counts + y <- edgeR::DGEList(counts = raw_counts) + + # Step 2: Apply the normalization function (either user-provided or default) + if (!is.null(normalize_func) && is.function(normalize_func)) { + y <- normalize_func(y) # user provided normalisation function + } else { + # Default: Normalize the counts using TMM normalization + y <- edgeR::calcNormFactors(y) + } + + # Step 3: Apply voom transformation to get logCPM values and weights + voom_obj <- limma::voom( + y, + design_matrix + ) + + return(voom_obj) +} \ No newline at end of file diff --git a/R/run_limma_splines.R b/R/run_limma_splines.R index 28ba398..9a1a0c8 100755 --- a/R/run_limma_splines.R +++ b/R/run_limma_splines.R @@ -13,8 +13,11 @@ #' @param splineomics An S3 object of class `SplineOmics` that contains the #' following elements: #' \itemize{ -#' \item \code{data}: The original expression dataset used for differential -#' expression analysis. +#' \item \code{data}: The matrix of the omics dataset, with the feature +#' names optionally as row headers. +#' \item \code{rna_seq_data}: An object containing the preprocessed +#' RNA-seq data, +#' such as the output from `limma::voom` or a similar preprocessing pipeline. 
#' \item \code{meta}: A dataframe containing metadata corresponding to the #' \code{data}, must include a 'Time' column and the column specified by #' \code{condition}. @@ -79,13 +82,13 @@ run_limma_splines <- function( input_control$auto_validate() data <- splineomics[["data"]] - preprocess_rna_seq <- splineomics[["preprocess_rna_seq"]] - normalization_fun <- splineomics[["normalization_fun"]] + rna_seq_data <- splineomics[["rna_seq_data"]] meta <- splineomics[["meta"]] spline_params <- splineomics[["spline_params"]] padjust_method <- splineomics[["padjust_method"]] feature_names <- rownames(data) + data_copy <- data rownames(data_copy) <- NULL # To just have numbers describing the rows @@ -97,8 +100,7 @@ run_limma_splines <- function( within_level, spline_params = spline_params, data = data_copy, - preprocess_rna_seq = preprocess_rna_seq, - normalization_fun = normalization_fun, + rna_seq_data = rna_seq_data, meta = meta, design = design, condition = condition, @@ -119,22 +121,6 @@ run_limma_splines <- function( purrr::map_chr(results_list, "name") ) - # For RNA-seq data, voom$E data matrices must be passed to cluster_hits() - voom_matrices <- lapply( - results_list, - function(x) x$voom_data_matrix_level - ) - - if (!any(sapply(voom_matrices, is.null))) { - if (args$mode == "isolated") { - data <- do.call(rbind, voom_matrices) # Combine from all levels - } else { # mode == "integrated" - # All levels contain the full data. Can just take the first one. - data <- voom_matrices[[1]] - } - rownames(data) <- feature_names # Readd the original row headers. 
- } - # Factor and Factor:Time comparisons between levels between_level_condition_only <- list() between_level_condition_time <- list() # Factor AND time @@ -144,8 +130,7 @@ run_limma_splines <- function( for (lev_combo in level_combinations) { result <- between_level( data = data_copy, - preprocess_rna_seq = preprocess_rna_seq, - normalization_fun = normalization_fun, + rna_seq_data = rna_seq_data, meta = meta, design = design, spline_params = spline_params, @@ -185,10 +170,10 @@ run_limma_splines <- function( avrg_diff_conditions = between_level_condition_only, interaction_condition_time = between_level_condition_time ) - + splineomics <- update_splineomics( splineomics = splineomics, - data = data, # In case voom_data_matrix has been generated. + data = data, limma_splines_result = limma_splines_result ) } @@ -205,8 +190,8 @@ run_limma_splines <- function( #' within a condition. #' #' @param data A matrix of data values. -#' @param preprocess_rna_seq Boolean specifying whether to preprocess RNA seq -#' @param normalization_fun Function for normalizing RNA-seq raw-counts. +#' @param rna_seq_data An object containing the preprocessed RNA-seq data, +#' such as the output from `limma::voom` or a similar preprocessing pipeline. #' @param meta A dataframe containing metadata, including a 'Time' column. #' @param design A design formula or matrix for the LIMMA analysis. #' @param spline_params A list of spline parameters for the analysis. 
@@ -233,8 +218,7 @@ run_limma_splines <- function( #' between_level <- function( data, - preprocess_rna_seq, - normalization_fun, + rna_seq_data, meta, design, spline_params, @@ -255,13 +239,9 @@ between_level <- function( level_index = 1, design = design ) - - if (preprocess_rna_seq) { - data <- preprocess_rna_seq_data( - raw_counts = data, - design_matrix = design_matrix, - normalization_fun - ) + + if (!is.null(rna_seq_data)) { + data <- rna_seq_data } fit <- limma::lmFit( @@ -336,11 +316,11 @@ between_level <- function( #' #' @param level The level within the condition to process. #' @param level_index The index of the level within the condition. -#' @param spline_params A list of spline parameters for the analysis. +#' @param spline_params A list of spline parameters for the analysis. #' @param data A matrix of data values. -#' @param preprocess_rna_seq Boolean specifying whether to preprocess RNA seq -#' @param normalization_fun Function to normalize RNA-seq raw counts. -#' @param meta A dataframe containing metadata. +#' @param rna_seq_data An object containing the preprocessed RNA-seq data, +#' such as the output from `limma::voom` or a similar preprocessing pipeline. +#' @param meta A dataframe containing the metadata for data. #' @param design A design formula or matrix for the limma analysis. #' @param condition A character string specifying the condition. #' @param feature_names A non-empty character vector of feature names. 
@@ -361,8 +341,7 @@ within_level <- function( level_index, spline_params, data, - preprocess_rna_seq, - normalization_fun, + rna_seq_data, meta, design, condition, @@ -388,8 +367,7 @@ within_level <- function( result <- process_within_level( data = data_copy, - preprocess_rna_seq = preprocess_rna_seq, - normalization_fun = normalization_fun, + rna_seq_data = rna_seq_data, meta = meta_copy, design = design, spline_params = spline_params, @@ -410,8 +388,7 @@ within_level <- function( list( name = results_name, - top_table = top_table, - voom_data_matrix_level = result$voom_data_matrix + top_table = top_table ) } @@ -419,105 +396,6 @@ within_level <- function( # Level 2 internal functions --------------------------------------------------- -#' Perform default preprocessing of raw RNA-seq counts -#' -#' @description -#' This function is called when `preprocess_rna_seq` is `TRUE`. It performs the -#' default preprocessing steps for raw RNA-seq counts, including creating a -#' `DGEList` object, normalizing the counts, and applying the `voom` -#' transformation. -#' -#' @param raw_counts A matrix of raw RNA-seq counts (genes as rows, samples as -#' columns). -#' @param design_matrix A design matrix used in the linear modeling, typically -#' specifying the experimental conditions. -#' @param normalize_func An optional normalization function. If provided, this -#' function will be used to normalize the `DGEList` object. If not provided, -#' TMM normalization (via `edgeR::calcNormFactors`) will be used by default. -#' -#' @return A `voom` object, which includes the log2-counts per million (logCPM) -#' matrix and observation-specific weights. 
-#' -#' @importFrom limma voom -#' -preprocess_rna_seq_data <- function( - raw_counts, - design_matrix, - normalize_func = NULL -) { - - message("Preprocessing RNA-seq data (normalization + voom)...") - - # Check if edgeR is installed; if not, prompt the user - if (!requireNamespace("edgeR", quietly = TRUE)) { - message("The 'edgeR' package is not installed.") - - # Prompt user for action - repeat { - user_input <- readline( - prompt = - "What would you like to do?\n - 1: Automatically install edgeR\n - 2: Manually install edgeR\n - 3: Cancel\n - Please enter 1, 2, or 3: " - ) - - if (user_input == "1") { - # Try to install edgeR automatically from Bioconductor - message("Attempting to install 'edgeR' automatically - from Bioconductor...") - if (!requireNamespace("BiocManager", quietly = TRUE)) { - utils::install.packages("BiocManager") - } - tryCatch( - { - BiocManager::install("edgeR", update = FALSE) - }, - error = function(e) { - stop( - "Automatic installation of 'edgeR' failed. - Please install it manually and try again.", - call. = FALSE - ) - } - ) - break # Exit the loop if installation is successful - } else if (user_input == "2") { - stop( - "Please install 'edgeR' manually using - BiocManager::install('edgeR') and then re-run the function.", - call. = FALSE - ) - } else if (user_input == "3") { - stop("Operation canceled by the user.", call. = FALSE) - } else { - message("Invalid input. 
Please enter 1, 2, or 3.") - } - } - } - - # Step 1: Create DGEList object from raw counts - y <- edgeR::DGEList(counts = raw_counts) - - # Step 2: Apply the normalization function (either user-provided or default) - if (!is.null(normalize_func) && is.function(normalize_func)) { - y <- normalize_func(y) # user provided normalisation function - } else { - # Default: Normalize the counts using TMM normalization - y <- edgeR::calcNormFactors(y) - } - - # Step 3: Apply voom transformation to get logCPM values and weights - voom_obj <- limma::voom( - y, - design_matrix - ) - - return(voom_obj) -} - - #' Process Top Table #' #' @description @@ -525,9 +403,8 @@ preprocess_rna_seq_data <- function( #' intercepts. #' #' @param process_within_level_result List of lists containing the limma -#' topTable, fit, and optionally the voom -#' object. All of this is from one specific -#' level. +#' topTable, and fit. All of this is from +#' one specific level. #' @param feature_names A non-empty character vector of feature names. #' #' @return A dataframe containing the processed top table with added intercepts. @@ -549,13 +426,11 @@ process_top_table <- function( top_table, feature_names ) - + intercepts <- as.data.frame(stats::coef(fit)[, "(Intercept)", drop = FALSE]) - intercepts_ordered <- intercepts[match(top_table$feature_nr, - rownames(intercepts)), , - drop = FALSE] + intercepts_ordered <- intercepts[top_table$feature_nr, , drop = FALSE] top_table$intercept <- intercepts_ordered[, 1] - + top_table } @@ -568,8 +443,8 @@ process_top_table <- function( #' analysis for a selected level of a factor #' #' @param data A matrix of data values. -#' @param preprocess_rna_seq Boolean specifying whether to preprocess RNA seq -#' @param normalization_fun Function for normalizing RNA-seq raw counts. +#' @param rna_seq_data An object containing the preprocessed RNA-seq data, +#' such as the output from `limma::voom` or a similar preprocessing pipeline. 
#' @param meta A dataframe containing metadata, including a 'Time' column. #' @param design A design formula or matrix for the limma analysis. #' @param spline_params A list of spline parameters for the analysis. @@ -589,8 +464,7 @@ process_top_table <- function( #' process_within_level <- function( data, - preprocess_rna_seq, - normalization_fun, + rna_seq_data, meta, design, spline_params, @@ -605,17 +479,10 @@ process_within_level <- function( design ) - if (preprocess_rna_seq) { - data <- preprocess_rna_seq_data( - raw_counts = data, - design_matrix = design_matrix, - normalize_func = normalization_fun - ) - voom_data_matrix <- data$E - } else { - voom_data_matrix = NULL + if (!is.null(rna_seq_data)) { + data <- rna_seq_data } - + fit <- limma::lmFit( data, design_matrix @@ -639,13 +506,11 @@ process_within_level <- function( list( top_table = top_table, - fit = fit, - voom_data_matrix = voom_data_matrix + fit = fit ) } - # Level 3 internal functions --------------------------------------------------- @@ -666,7 +531,26 @@ modify_limma_top_table <- function( top_table, feature_names ) { - + + is_integer_string <- function(x) { + return(grepl("^[0-9]+$", x)) + } + + # Because the row headers of a potential rna_seq_data object were not + # converted to ints (written as strings) beforehand. This is run only when + # the row headers are still "real" strings. 
+ if (!all(sapply(rownames(top_table), is_integer_string))) { + rownames(top_table) <- sapply( + rownames(top_table), + function(id) { + # Find the index of the current row name in feature_names + index <- which(feature_names == id) + # Return the index as a string + return(as.character(index)) + } + ) + } + top_table <- tidyr::as_tibble( top_table, rownames = "feature_nr" @@ -682,6 +566,6 @@ modify_limma_top_table <- function( # Sort and add feature names based on the feature_nr sorted_feature_names <- feature_names[top_table$feature_nr] top_table <- top_table |> dplyr::mutate(feature_names = sorted_feature_names) - + return(top_table) } diff --git a/R/screen_limma_hyperparams.R b/R/screen_limma_hyperparams.R index 91d5313..e82b4d7 100755 --- a/R/screen_limma_hyperparams.R +++ b/R/screen_limma_hyperparams.R @@ -38,7 +38,7 @@ #' removeBatchEffect supports a maximum of #' two batch columns.) #' } -#' @param datas A list of data frames containing the datasets to be analyzed. +#' @param datas A list of matrices containing the datasets to be analyzed. #' @param datas_descr A description object for the data. #' @param metas A list of data frames containing metadata for each dataset in #' `datas`. @@ -47,6 +47,8 @@ #' @param report_dir A non-empty string specifying the report directory. #' @param adj_pthresholds A numeric vector of p-value thresholds for #' significance determination. +#' @param rna_seq_datas A list of RNA-seq data objects, such as the voom object +#' derived from the limma::voom function. #' @param time_unit A character string specifying the time unit label for plots. #' @param padjust_method A character string specifying the method for p-value #' adjustment. @@ -68,10 +70,15 @@ screen_limma_hyperparams <- function( spline_test_configs, report_dir = here::here(), adj_pthresholds = c(0.05), + rna_seq_datas = NULL, time_unit = "min", # For the plot labels padjust_method = "BH" ) { + if (is.null(rna_seq_datas)) { # Set the default value. 
+ rna_seq_datas <- vector("list", length(datas)) + } + report_dir <- normalizePath( report_dir, mustWork = FALSE @@ -109,6 +116,7 @@ screen_limma_hyperparams <- function( top_tables_combos <- get_limma_combos_results( datas = datas, + rna_seq_datas = rna_seq_datas, metas = metas, designs = designs, modes = modes, @@ -175,6 +183,8 @@ screen_limma_hyperparams <- function( #' spline configurations using the LIMMA method. #' #' @param datas A list of matrices. +#' @param rna_seq_datas A list of RNA-seq data objects, such as the voom object +#' derived from the limma::voom function. #' @param metas A list of metadata corresponding to the data matrices. #' @param designs A list of design matrices. #' @param modes A character vector containing 'isolated' or 'integrated'. @@ -196,6 +206,7 @@ screen_limma_hyperparams <- function( #' get_limma_combos_results <- function( datas, + rna_seq_datas, metas, designs, modes, @@ -223,6 +234,7 @@ get_limma_combos_results <- function( combos, process_combo, datas = datas, + rna_seq_datas = rna_seq_datas, metas = metas, designs = designs, modes = modes, @@ -279,7 +291,6 @@ plot_limma_combos_results <- function( ) ) - combos_separated <- lapply(unique(names_extracted), function(id) { top_tables_combos[names_extracted == id] }) @@ -288,11 +299,14 @@ plot_limma_combos_results <- function( combos <- names(combos_separated) combo_pairs <- combn(combos, 2, simplify = FALSE) - + print("Generating the plots for all pairwise hyperparams-combo comparisons") progress_ticks <- length(combo_pairs) - pb <- progress::progress_bar$new(total = progress_ticks, - format = "[:bar] :percent") + pb <- progress::progress_bar$new( + total = progress_ticks, + format = "[:bar] :percent" + ) + pb$tick(0) time_unit_label <- paste0("[", time_unit, "]") @@ -314,7 +328,7 @@ plot_limma_combos_results <- function( purrr::map(combo_pairs, function(pair) { combo_pair <- combos_separated[pair] - + hitcomp <- gen_hitcomp_plots(combo_pair) composites <- 
purrr::map(combo_pair, function(combo) { @@ -519,6 +533,8 @@ generate_reports_meta <- function( #' spline_test_configs list. #' @param pthreshold The p-value threshold for significance. #' @param datas A list of data matrices +#' @param rna_seq_datas A list of RNA-seq data objects, such as the voom object +#' derived from the limma::voom function. #' @param metas A list of metadata corresponding to the data matrices. #' @param designs A list of design matrices. #' @param modes A character vector containing 'isolated' or 'integrated'. @@ -541,6 +557,7 @@ process_combo <- function( spline_config_index, pthreshold, datas, + rna_seq_datas, metas, designs, modes, @@ -552,6 +569,7 @@ process_combo <- function( ) { data <- datas[[data_index]] + rna_seq_data <- rna_seq_datas[[data_index]] meta <- metas[[data_index]] design <- designs[[design_index]] mode <- modes[[design_index]] @@ -575,12 +593,13 @@ process_combo <- function( splineomics <- create_splineomics( data = data, + rna_seq_data = rna_seq_data, meta = meta, design = design, spline_params = spline_params, condition = condition, ) - + # suppressMessages will not affect warnings and error messages! 
result <- suppressMessages(run_limma_splines(splineomics)) @@ -748,10 +767,50 @@ hc_add <- function( #' @importFrom purrr flatten_chr #' hc_vennheatmap <- function(hc_obj) { - + hits_1 <- store_hits(hc_obj$data[[1]]) hits_2 <- store_hits(hc_obj$data[[2]]) + + color_palette <- c("white", "blue", "yellow", "green") + breaks <- c(-0.5, 0.5, 1.5, 2.5, 3.5) + # Check if all elements in hits_1 and hits_2 are character(0) + no_hits_1 <- all(sapply(hits_1, function(x) length(x) == 0)) + no_hits_2 <- all(sapply(hits_2, function(x) length(x) == 0)) + + # If both have no hits, create a placeholder plot for no hits + if (no_hits_1 && no_hits_2) { + # Create a simple empty matrix for the plot + venn_matrix <- matrix( + 0, + nrow = 1, + ncol = 1, + dimnames = list("No Hits", "No Hits") + ) + + plot_title <- sprintf( + "No hits found for %s and %s", + hc_obj$condition_names[[1]], + hc_obj$condition_names[[2]] + ) + + # Continue with your plotting code + vennheatmap_plot <- pheatmap::pheatmap( + venn_matrix, color = color_palette, + breaks = breaks, + cluster_cols = FALSE, + cluster_rows = FALSE, + show_rownames = TRUE, + show_colnames = TRUE, + border_color = NA, + main = plot_title, + silent = TRUE, + fontsize = 6 + ) + + return(list(vennheatmap = vennheatmap_plot, nrhits = 0)) + } + df <- tidyr::expand_grid( features = union( flatten_chr(hits_1), @@ -786,12 +845,9 @@ hc_vennheatmap <- function(hc_obj) { values_from = !!rlang::sym("x")) |> tibble::column_to_rownames("features") |> as.matrix() - + venn_matrix <- venn_matrix[, order(colnames(venn_matrix))] - color_palette <- c("white", "blue", "yellow", "green") - breaks <- c(-0.5, 0.5, 1.5, 2.5, 3.5) - plot_title <- sprintf("0 -> none, 1 -> %s, 2 -> %s, 3 -> both", hc_obj$condition_names[[1]], hc_obj$condition_names[[2]]) @@ -1360,17 +1416,7 @@ plot_composite_splines <- function( args <- list(x = smooth_timepoints, intercept = FALSE) args$df <- spline_test_configs$dof[[config_index]] - - # if 
(!is.na(spline_test_configs$dof[[config_index]])) { - # args$df <- spline_test_configs$dof[[config_index]] - # } else { - # args$knots <- spline_test_configs$knots[[config_index]] - # } - # - # if (!is.na(spline_test_configs$bknots[[config_index]])) { - # args$Boundary.knots <- spline_test_configs$bknots[[config_index]] - # } - + if (spline_test_configs$spline_type[config_index] == "b") { args$degree <- spline_test_configs$degree[[config_index]] X <- do.call(splines::bs, args) diff --git a/R/splineomics_object.R b/R/splineomics_object.R index 0a69fcc..117df73 100755 --- a/R/splineomics_object.R +++ b/R/splineomics_object.R @@ -1,25 +1,25 @@ -#' Create and update the SplineOmics object -#' ===== -#' -#' Description -#' ----------- -#' Contains the functions to create and update a SplineOmics object. This object -#' is used to collect function arguments, that are equivalent for more than one -#' exported function of the SplineOmics package. Additionally -#' -#' Functions -#' --------- -#' - create_splineomics: Create a SplineOmics object -#' - update_splineomics: Add additional arguments to the SplineOmics -#' object or overwrite existing arguments. -#' -#' Classes -#' ------- -#' None -#' -#' Notes -#' ----- -#' None +# Create and update the SplineOmics object +# ===== +# +# Description +# ----------- +# Contains the functions to create and update a SplineOmics object. This object +# is used to collect function arguments, that are equivalent for more than one +# exported function of the SplineOmics package. Additionally +# +# Functions +# --------- +# - create_splineomics: Create a SplineOmics object +# - update_splineomics: Add additional arguments to the SplineOmics +# object or overwrite existing arguments. 
+# +# Classes +# ------- +# None +# +# Notes +# ----- +# None # Exported functions ----------------------------------------------------------- @@ -31,9 +31,17 @@ #' Creates a SplineOmics object containing variables that are commonly used #' across multiple functions in the package. #' -#' @param data The actual omics data. +#' @param data The actual omics data. In the case the rna_seq_data argument is +#' used, still provide this argument. In that case, input the data matrix in +#' here (for example the $E part of the voom object). Assign your feature names +#' as row headers (otherwise, just numbers will be your feature names). #' @param meta Metadata associated with the omics data. #' @param condition A condition variable. +#' @param rna_seq_data An object containing the preprocessed RNA-seq data, +#' such as the output from `limma::voom` or a similar preprocessing pipeline. +#' This argument is not controlled by any function of the `SplineOmics` package. +#' Rather, in that regard it relies on the input control from the `limma::lmFit` +#' function. #' @param annotation A dataframe with the feature descriptions of data #' (optional). #' @param report_info A list containing report information such as omics data @@ -50,11 +58,12 @@ #' created. Use the same vector that was used to #' create the row headers for the data matrix! #' @param design A design matrix or similar object (optional). -#' @param spline_params Parameters for spline functions (optional). -#' @param preprocess_rna_seq Boolean specifying whether to preprocess RNA seq -#' @param normalization_fun Function used for normalizing RNA-seq. Must take as -#' input the y of: y <- edgeR::DGEList(counts = raw_counts) and output the y -#' with the normalized counts. +#' @param spline_params Parameters for spline functions (optional).
Must contain +#' the named elements spline_type, which must contain either the string "n" for +#' natural cubic splines, or "b", for B-splines, the named element degree in the +#' case of B-splines, that must contain only an integer, and the named element +#' dof, specifying the degree of freedom, containing an integer and required +#' both for natural and B-splines. #' @param padjust_method Method for p-value adjustment, one of "none", "BH", #' "BY", "holm", "bonferroni", "hochberg", or "hommel". #' Defaults to "BH" (Benjamini-Hochberg). @@ -64,9 +73,10 @@ #' @export #' create_splineomics <- function( - data, + data, meta, condition, + rna_seq_data = NULL, annotation = NULL, report_info = NULL, meta_batch_column = NULL, @@ -74,14 +84,12 @@ create_splineomics <- function( feature_name_columns = NULL, design = NULL, spline_params = NULL, - preprocess_rna_seq = FALSE, - normalization_fun = NULL, padjust_method = "BH" ) { - + splineomics <- list( data = data, - preprocess_rna_seq = preprocess_rna_seq, + rna_seq_data = rna_seq_data, meta = meta, condition = condition, annotation = annotation, @@ -122,6 +130,7 @@ update_splineomics <- function( allowed_fields <- c( "data", + "rna_seq_data", "meta", "condition", "annotation", @@ -135,6 +144,7 @@ update_splineomics <- function( ) args <- list(...) + for (name in names(args)) { if (!(name %in% allowed_fields)) { stop(paste("Field", name, "is not allowed.")) diff --git a/R/utils_input_validation.R b/R/utils_input_validation.R index 0f05d34..b4b3eff 100755 --- a/R/utils_input_validation.R +++ b/R/utils_input_validation.R @@ -529,6 +529,18 @@ InputControl <- R6::R6Class("InputControl", call. 
= FALSE) } + # Ensure that the formula begins with an intercept (~ 1) + # Ignore whitespace, check the start of the string + if (!grepl("^\\s*~\\s*1", formula)) { + stop( + paste( + "The design formula must start with an intercept term '~ 1'.", + "This is because spline curves are plotted onto the data", + "which is not possible without an intercept" + ), + call. = FALSE) + } + # Ensure the formula contains the intercept term 'X' if (!grepl("\\bX\\b", formula)) { stop("The design formula must include the term 'X'.", @@ -2406,14 +2418,12 @@ check_splineomics_elements <- function( "report_info" ), "screen_limma_hyperparams" = c( - "preprocess_rna_seq", "condition", "report_info", "padjust_method" ), "run_limma_splines" = c( "data", - "preprocess_rna_seq", "meta", "design", "condition", diff --git a/README.Rmd b/README.Rmd index 754f54b..1499b2b 100755 --- a/README.Rmd +++ b/README.Rmd @@ -210,7 +210,18 @@ An explanation of the three different `limma` results is [here](https://csbg.git #### RNA-seq data -Transcriptomics data must be preprocessed for `limma`. This is done by setting the preprocess_rna_seq argument to TRUE (see [documentation of the create_splineomics function](https://csbg.github.io/SplineOmics/reference/create_splineomics.html)). Then, the raw RNA-seq counts provided in the data matrix will undergo normalization and transformation. The default normalization is performed using TMM (Trimmed Mean of M-values) normalization via the `edgeR`::calcNormFactors function, followed by the voom transformation from the `limma` package to obtain log-transformed counts per million (logCPM) with associated precision weights. If you require a different normalization method, you can supply your custom normalization function. +Transcriptomics data must be preprocessed for `limma`. 
You need to provide an +appropriate object, such as a `voom` object, in the `rna_seq_data` argument of +the `SplineOmics` object (see +[documentation](https://csbg.github.io/SplineOmics/reference/create_splineomics.html)). +Along with this, the normalized matrix +(e.g., the `$E` slot of the `voom` object) must be passed to the `data` +argument. This allows flexibility in preprocessing; you can use any method +you prefer as long as the final object and matrix are compatible with limma. +One way to preprocess your RNA-seq data is by using the `preprocess_rna_seq_data()` +function included in the `SplineOmics` package +(see [documentation](https://csbg.github.io/SplineOmics/reference/preprocess_rna_seq_data.html)). + #### Glycan fractional abundance data diff --git a/README.md b/README.md index 7b6de36..dcfcae2 100755 --- a/README.md +++ b/README.md @@ -269,17 +269,17 @@ An explanation of the three different `limma` results is #### RNA-seq data -Transcriptomics data must be preprocessed for `limma`. This is done by -setting the preprocess_rna_seq argument to TRUE (see [documentation of -the create_splineomics -function](https://csbg.github.io/SplineOmics/reference/create_splineomics.html)). -Then, the raw RNA-seq counts provided in the data matrix will undergo -normalization and transformation. The default normalization is performed -using TMM (Trimmed Mean of M-values) normalization via the -`edgeR`::calcNormFactors function, followed by the voom transformation -from the `limma` package to obtain log-transformed counts per million -(logCPM) with associated precision weights. If you require a different -normalization method, you can supply your custom normalization function. +Transcriptomics data must be preprocessed for `limma`. You need to +provide an appropriate object, such as a `voom` object, in the +`rna_seq_data` argument of the `SplineOmics` object (see +[documentation](https://csbg.github.io/SplineOmics/reference/create_splineomics.html)). 
+Along with this, the normalized matrix (e.g., the `$E` slot of the +`voom` object) must be passed to the `data` argument. This allows +flexibility in preprocessing; you can use any method you prefer as long +as the final object and matrix are compatible with limma. One way to +preprocess your RNA-seq data is by using the `preprocess_rna_seq_data()` +function included in the `SplineOmics` package (see +[documentation](https://csbg.github.io/SplineOmics/reference/preprocess_rna_seq_data.html)). #### Glycan fractional abundance data diff --git a/dev/function_testing_ground.R b/dev/function_testing_ground.R index f0e1777..b08ab0f 100755 --- a/dev/function_testing_ground.R +++ b/dev/function_testing_ground.R @@ -79,29 +79,39 @@ data <- extract_data( # Simulate RNA-seq data to test voom functionality ----------------------------- -# generate_rnaseq_data <- function(n_genes = 1000, n_samples = 36) { -# set.seed(123) # For reproducibility -# -# # Define sample and gene names -# gene_names <- paste0("Gene", 1:n_genes) -# sample_names <- paste0("Sample", 1:n_samples) -# -# # Generate random raw RNA-seq counts (Poisson distributed) -# # Base expression level with some variability -# base_expression <- rpois(n_genes, lambda = 20) # Baseline counts -# counts_matrix <- sapply(1:n_samples, function(x) rpois(n_genes, lambda = base_expression)) -# -# # Assign row and column names -# rownames(counts_matrix) <- gene_names -# colnames(counts_matrix) <- sample_names -# -# return(counts_matrix) -# } -# -# # Example usage: -# n_genes <- 7162 # Adjust the number of genes as needed +generate_rnaseq_data <- function(n_genes = 1000, n_samples = 36) { + set.seed(123) # For reproducibility + + # Define sample and gene names + gene_names <- paste0("Gene", 1:n_genes) + sample_names <- paste0("Sample", 1:n_samples) + + # Generate random raw RNA-seq counts (Poisson distributed) + # Base expression level with some variability + base_expression <- rpois(n_genes, lambda = 20) # Baseline counts + 
counts_matrix <- sapply(1:n_samples, function(x) rpois(n_genes, lambda = base_expression)) + + # Assign row and column names + rownames(counts_matrix) <- gene_names + colnames(counts_matrix) <- sample_names + + return(counts_matrix) +} + +# Example usage: +n_genes <- 4162 # Adjust the number of genes as needed # data <- generate_rnaseq_data(n_genes = n_genes) +voom_obj <- preprocess_rna_seq_data( + raw_counts = data, + meta = meta, + spline_params = list(spline_type = c("n"), # Chosen spline parameters + dof = c(2L)), + design = "~ 1 + Phase*X + Reactor" +) + +# data <- voom_obj$E + # Explore data ----------------------------------------------------------------- report_info <- list( @@ -115,23 +125,23 @@ report_info <- list( splineomics <- create_splineomics( data = data, + # rna_seq_data = voom_obj, meta = meta, annotation = annotation, feature_name_columns = feature_name_columns, report_info = report_info, condition = "Phase", meta_batch_column = "Reactor", - preprocess_rna_seq = FALSE ) report_dir <- here::here("results", "explore_data") # debug(explore_data) -plots <- explore_data( - splineomics, - report_dir = report_dir, - report = TRUE - ) +# plots <- explore_data( +# splineomics, +# report_dir = report_dir, +# report = TRUE +# ) # Prep input to hyperparams screen function ------------------------------------ @@ -140,15 +150,18 @@ meta1 <- meta data2 <- data[, -c(1, 2)] meta2 <- meta[-c(1, 2),] +# data2 <- data +# meta2 <- meta datas <- list(data1, data2) +# rna_seq_datas <- list(voom_obj, voom_obj) # Just to test it. datas_descr <- c("full_data", "outliers_removed") metas <- list(meta1, meta2) -designs <- c("~ 1 + Phase*X + Reactor", "~ 1 + X + Reactor") +designs <- c("~ 1 + Phase*X + Reactor", "~ 1 + Phase*X + Reactor") report_dir <- here::here("results", "hyperparams_screen_reports") -pthresholds <- c(0.05, 0.1) +pthresholds <- c(0.05, 0.01) # Every row a combo to test. 
spline_test_configs <- data.frame( @@ -160,16 +173,17 @@ spline_test_configs <- data.frame( # hyperparams screen limma ----------------------------------------------------- # debug(screen_limma_hyperparams) -screen_limma_hyperparams( - splineomics, - datas, - datas_descr, - metas, - designs, - spline_test_configs, - report_dir, - pthresholds - ) +# screen_limma_hyperparams( +# splineomics, +# datas, +# datas_descr, +# metas, +# designs, +# spline_test_configs, +# report_dir, +# pthresholds, +# rna_seq_datas, +# ) ## Run limma splines ----------------------------------------------------------- @@ -177,8 +191,8 @@ screen_limma_hyperparams( splineomics <- update_splineomics( splineomics = splineomics, design = "~ 1 + Phase*X + Reactor", - data = data1, - meta = meta1, + data = data2, + meta = meta2, spline_params = list(spline_type = c("n"), # Chosen spline parameters dof = c(2L)) ) @@ -191,11 +205,11 @@ splineomics <- run_limma_splines( report_dir <- here::here("results", "limma_reports") -plots <- create_limma_report( - splineomics, - adj_pthresh = 0.1, - report_dir = report_dir -) +# plots <- create_limma_report( +# splineomics, +# adj_pthresh = 0.1, +# report_dir = report_dir +# ) ## Cluster hits ---------------------------------------------------------------- diff --git a/inst/tutorial/tutorial.Rmd b/inst/tutorial/tutorial.Rmd index da01046..03ef107 100755 --- a/inst/tutorial/tutorial.Rmd +++ b/inst/tutorial/tutorial.Rmd @@ -1,7 +1,7 @@ --- title: "demo" author: "Thomas Rauter" -date: "15 July, 2024" +date: "20 September, 2024" output: html_document editor_options: markdown: @@ -17,24 +17,43 @@ knitr::opts_chunk$set( # About this tutorial -This tutorial demonstrates the capabilities of the SplineOmics package -through a comprehensive example: a time-series proteomics experiment -involving CHO cells cultivated in three bioreactors (biological -replicates). 
Samples were collected from each reactor in triplicates at -specific time points relative to cell feeding (60 min before, and 15, -60, 90, 120, and 240 min after feeding) during both exponential and -stationary growth phases. - -The objective is to identify which of the 7162 cellular proteins show -significant changes over time post-feeding. Proteins with significant -temporal changes are then clustered based on their patterns. A gene set -enrichment analysis is performed for each cluster to identify processes -that are up- or downregulated over time after feeding. - -Note: For a better understanding of the SplineOmics functions, the -required and optional arguments are documented here. These however are -only short forms of the full documentation of the arguments, which you -can find by selecting a function and pressing F2. +This tutorial intends to showcase and explain the capabilities of the +**SplineOmics** package by walking through a real and complete example, +from start to finish. + +### Example Overview + +The example involves a **time-series proteomics experiment**, where CHO +(Chinese hamster ovary) cells were cultivated in three bioreactors +(three biological replicates). The experiment includes the following +setup: + +- Samples were taken both during the **exponential** and **stationary + growth phases**. +- Samples were collected in triplicates from each reactor at defined + timepoints relative to cell feeding: + - 60 minutes before feeding + - 15, 60, 90, 120, and 240 minutes after feeding + +### Analysis Goals + +The main goals of this analysis are: + +- **Identify proteins with significant temporal changes**: Out of 7162 + cellular proteins, the objective is to detect which proteins show a + significant change over time after the CHO cells were fed (i.e., the + impact of the feeding). +- **Cluster hits based on temporal patterns**: The proteins (hits) + with significant temporal changes will be clustered according to + their time-based patterns. 
+- **Perform gene set enrichment analysis**: For each cluster, a gene + set enrichment analysis will be performed to determine if specific + biological processes are up- or downregulated after feeding. + +### Note + +The documentation of all the **SplineOmics** package functions can be viewed +[here](https://csbg.github.io/SplineOmics/reference) Further note: To run the code of a box, click on the respective ▶️ play button like symbol. @@ -43,7 +62,9 @@ like symbol. Make sure all the required packages for this analysis script are installed. Part of these packages are not dependencies of the -SplineOmics package, that is why they could be missing. +SplineOmics package, that is why they could be missing. If the code block +below does not work for you, manually install those packages and skip this +block. ```{r Conditionally install missing packages} install_if_missing <- function(packages) { @@ -71,9 +92,9 @@ install_if_missing(packages_to_install) library(SplineOmics) # Functions are marked with SplineOmics:: # Additional packages needed to prepare SplineOmics function inputs -library(readxl) -library(here) -library(readr) +library(readxl) # for loading Excel files +library(here) # For managing filepaths +library(readr) # For reading the database TSV files ``` To avoid conflicts between functions from the dplyr package and base R @@ -82,7 +103,7 @@ the conflicted package. This ensures that the intended function is used, preventing potential errors and improving code clarity. ```{r Load dplyr package} -library(dplyr) +library(dplyr) # For data manipulation library(conflicted) # Explicitly state preference of functions @@ -96,25 +117,38 @@ conflicted::conflict_prefer("union", "base") # Load the files -In this example, the data.xlsx file contains numeric values -(intensities) and feature descriptions, such as gene and protein names -(annotation part). The meta.xlsx file contains meta information, which -describes the columns of the numeric values in data.xlsx. 
- -These example files are included in the package and do not need to be -present on your system. For your analysis, create file paths using the -here library instead of system.file. - -```{r Load the files} -data_excel <- readxl::read_excel( - system.file( - "extdata", - "proteomics_data.xlsx", - package = "SplineOmics" - ) - ) +In this example, the proteomics_data.rds file contains the numeric values (the +intensities) and also the feature descriptions, such as gene and protein +name (= annotation part). Usually, you would load the data from for example an +Excel file, but the .rds file is more compressed, which is the reason this +format was chosen here to limit the size of the SplineOmics package. + +The file meta.xlsx contains the meta information, which are the +descriptions of the columns of the numeric values of data. -meta <- readxl::read_excel( +(These example files are part of the package and don't have to be +present on your system). + +Please note that this dataset is an actual experimental dataset, but the +annotation information, such as gene names, has been removed since it was +not yet published at the time of making the SplineOmics package public. Instead, +the dataset includes randomly generated gene +symbols and gene names corresponding to Cricetulus griseus (Chinese +Hamster) for each row. This is intended to demonstrate the functionality +of the package. + +The left part of data contains the numeric values, and the right part the +annotation info, which can be copied in a separate dataframe, as shown below. + +```{r load the files} +data <- readRDS(system.file( + "extdata", + "proteomics_data.rds", + package = "SplineOmics" + )) + + +meta <- read_excel( system.file( "extdata", "proteomics_meta.xlsx", @@ -123,88 +157,130 @@ meta <- readxl::read_excel( ) # Extract the annotation part from the dataframe. 
-first_na_col <- which(is.na(data_excel[1,]))[1] -annotation <- data_excel |> - dplyr::select((first_na_col + 1):ncol(data_excel)) |> +first_na_col <- which(is.na(data[1,]))[1] +annotation <- data |> + dplyr::select((first_na_col + 1):ncol(data)) |> dplyr::slice(-c(1:3)) + +print(data) +print(meta) +print(annotation) ``` -Note that for this experiment, just a single treatment is present, which -is the growth phase (exponential or stationary) of the cells. This is -encoded in "condition" column of meta, here called "Phase". If there is -more than one treatment, they can be combined in the single condition -column. For example, if additionally, there would also be a temperature -shift, from 37 to 32 °C, this could for example be written in the -condition column in following way: exp_37, exp_32, stat_37, stat_32 -(Both treatments combined in one string and all placed in the single -condition column). - -# Bring the inputs into the standardized (required) format - -Since `data_excel` is not in the format required by the SplineOmics -package, it needs processing. This can be done with a few R commands, -but if your file looks like the one here, with the data matrix on the -left and annotation info on the right, separated by an empty column -(which is required!), the `extract_data()` function can handle this -automatically. - -The function identifies the data matrix and converts it into a -dataframe. Column headers are created from the information in the cells -above each data matrix column. If no annotation columns are specified, -row headers are simply increasing numbers. In this example, the -annotation columns "First.Protein.Description" and "ID" are specified to -form the row headers (feature names). These names will be used to label -any plots where a feature is shown individually, such as spline plots -with datapoints from an individual feature. - -## Required Arguments `extract_data()` - -- **data**: A dataframe loaded from a tabular file. 
- -## Optional Arguments `extract_data()` - -- **feature_name_columns**: A character vector specifying the columns - of the dataframe `data` that should be used to construct the feature - names. If omitted, the feature names are just numbers (stored as - characters) starting from 1 (1, 2, 3, etc.). - -(When you want to have meaningful feature descriptions, add feature-name -columns as arguments to the function below. They are used for the row -headers of the matrix. If you don't use this function, make sure you -have row headers to your matrix if you want feature descriptions for -your plots.) - -```{r Process the inputs} +## Bring the Inputs into the Standardized Format + +Since `data` is not in the format required by the **SplineOmics** +package, it needs some processing. The SplineOmics package requires data to be +a numeric matrix, so no element is allowed to be anything else than a number. +This can be done with a few commands +in R, but if your file has a specific structure, the function +`extract_data()` can handle this automatically. + +### File Structure Requirements + +If your file looks like the one used here, where: + +- The **data matrix field** is on the left +- The **annotation info** is on the right +- These fields are separated by one empty column + +### Usage of the extract_data() function + +Then, `extract_data()` can: + +- **Identify the data matrix field** and return it as a numeric matrix. +- **Create column headers** from the information written in the cells + above the respective columns of the data matrix field. +- **Assign rowheaders**: + - If no annotation columns are specified, rowheaders will be + increasing numbers. + - If annotation columns are specified (like + `"First.Protein.Description"` and `"ID"` in this example), these + will be combined to form the rowheaders (feature names). 
+ +### Usage in Plotting + +The generated rowheaders will be used to label any plots where a feature +is shown individually, such as: + +- **Spline plots** with the datapoints from an individual feature. + +```{r process inputs, eval = TRUE} data <- SplineOmics::extract_data( - data = data_excel, - feature_name_columns = c( # Feature names will be a combo out of these col - "First.Protein.Description", # Prodivde the row headers of the matrix - "ID" - ) + # The dataframe with the numbers on the left and info on the right. + data = data, + # Use this annotation column for the feature names. + feature_name_columns = c("Gene_name"), + # When TRUE, you must confirm that data is in the required format. + user_prompt = FALSE ) ``` # Perform EDA (exploratory data analysis) -The first step in analyzing data is usually EDA. EDA involves -summarizing the main characteristics of the data, often using plots such -as density distributions, boxplots, PCA, and correlation heatmaps. This -process can be carried out using the package function `explore_data()`. +Now that we have the data in the required format (numeric matrix) we can go on. + +The first step in analyzing data is typically **Exploratory Data +Analysis (EDA)**. EDA involves summarizing the main characteristics of +the data, often through visualizations. + +### Common EDA Plots + +Some common types of EDA plots include: + +- **Density distributions** +- **Boxplots** +- **PCA (Principal Component Analysis)** +- **Correlation heatmaps** + +Again, you can generate those plots yourself with a few lines of R code. +However, if you prefer, for convenience, the `explore_data()` function can +handle this for you. + +### Using `explore_data()` for EDA + +The **SplineOmics** package provides the function `explore_data()` to +perform EDA. This function requires the following arguments: + +- **data**: The numeric data matrix. +- **meta**: The metadata table. 
+- **condition**: The name of the column in the metadata that contains + the levels of the experiment (e.g., "Exponential" and "Stationary"). +- **report_info**: A list that contains general information about the + analysis, such as the name of the analyst and the datatype (e.g. proteomics) + +### Optional Arguments -These batch columns are used to run the `removeBatchEffect` function of -limma to remove the batch effect from the data for plotting. When at -least one batch column is provided, the function generates two EDA HTML -reports: one for the uncorrected data and one for the batch corrected -data. +In addition to the required arguments, `explore_data()` offers several +optional arguments: -### Report Generation +- **meta_batch_column**: The name of the column that contains the + first batch effect. -The reports are written to the current working directory by default or -to a specified location using the optional argument `report_dir`. The -function also returns all generated plots. If no report should be -generated, set the optional argument `report` to `FALSE`. +- **meta_batch2_column**: The name of the column that contains the + second batch effect. -```{r Define info that is written in all the HTML reports} + If at least one batch column is provided, the function will: + + - Use the `removeBatchEffect()` function from **limma** to remove + the batch effect from the data before plotting. + - Generate two EDA HTML reports: one for the **uncorrected data** + and one for the **batch-corrected data**. + +### Output and Report Options + +- By default, the reports are saved in the **current working + directory**, but this location can be changed using the `report_dir` + argument. +- The function also **returns all plots** generated during the + analysis, so that you can modify them according to your own needs. 
+- If you do not want a report to be generated, you can set the + `report` argument to `FALSE` (when you for example just want the figures + in the R environment) + +```{r Load EDA arguments, eval = TRUE} +# Those fields are mandatory, because we believe that when such a report is +# opened after half a year, those infos can be very helpful. report_info <- list( omics_data_type = "PTX", data_description = "Proteomics data of CHO cells", @@ -213,13 +289,23 @@ report_info <- list( contact_info = "thomas.rauter@plus.ac.at", project_name = "DGTX" ) + +report_dir <- here::here( + "demo_results", + "explore_data" + ) ``` ## SplineOmics Object -The SplineOmics object is used because multiple functions in the package -take the same inputs, and some functions generate "intermediate" output -that is used by subsequent functions in the workflow. +In the SplineOmics package, multiple functions take the same arguments as input. +To make this easier and to avoid errors, we decided that those arguments are not +provided individually to the functions, but are all stored in an R6 object +(which is of type 'SplineOmics') and then this object is passed to the +functions. Additionally, some functions generate intermediate output, which is +just necessary for the next function in the workflow, which is then also just +passed along by updating the SplineOmics object. But you don't have to worry +about this. ### Functionality @@ -229,7 +315,13 @@ from the object and potentially adds new data or results back into it. 
### Documentation -The documentation for each function specifies which arguments must be +The documentation of the function that creates the SplineOmics object can be +found [here](https://csbg.github.io/SplineOmics/reference/create_splineomics.html) +and the documentation of the function that updates it +[here](https://csbg.github.io/SplineOmics/reference/update_splineomics.html). + +The documentation for each function that takes the SplineOmics object as input +specifies which arguments must be present in the SplineOmics object when it is passed to the respective function. @@ -242,6 +334,8 @@ function. ## Optional Arguments `create_splineomics()` +- **rna_seq_data**: An object containing the preprocessed RNA-seq data, + such as the output from `limma::voom` function. - **annotation**: A dataframe with the feature descriptions of data. - **report_info**: A list containing general information about the analysis. @@ -250,522 +344,339 @@ function. - **design**: A limma design formula - **spline_params**: Parameters for the spline functions. -```{r Create the SplineOmics object} +```{r Create the SplineOmics object, eval = TRUE} +# splineomics now contains the SplineOmics object. splineomics <- SplineOmics::create_splineomics( data = data, meta = meta, annotation = annotation, report_info = report_info, - condition = "Phase", - meta_batch_column = "Reactor" + condition = "Phase", # Column of meta that contains the levels. + meta_batch_column = "Reactor" # For batch effect removal ) ``` -## Required Arguments `explore_data()` - -- **splineomics**: A SplineOmics object containing the following - fields: - - **data**: A matrix with the data. - - **meta**: Metadata associated with the data. - - **condition**: Column name of the levels (e.g., Exponential and - Stationary). - - **report_info**: A list containing general information about the - analysis. - - **meta_batch_column**: Column for meta batch information. 
- - **meta_batch2_column**: Column for secondary meta batch - information. - -## Optional Arguments `explore_data()` - -- **report_dir**: The path to the output directory. Default is current - work dir. -- **report**: A Boolean TRUE or FALSE value specifying if a report - should be generated. Default is TRUE. - -```{r Run the EDA function} -report_dir <- here::here( - "demo_results", - "explore_data" - ) +Now that we have the SplineOmics object defined, we can perform our exploratory +data analysis. +```{r Run EDA function, eval = TRUE} plots <- SplineOmics::explore_data( - splineomics = splineomics, + splineomics = splineomics, # SplineOmics object report_dir = report_dir ) ``` -The EDA plots can reveal a range of insights. In the HTML report, the -plots are grouped into three categories: distribution and variability -analysis, time series analysis, and dimensional reduction and -clustering. +The EDA plots can tell you a range of things. The plots in the HTML +report are grouped into three categories: Distribution and Variability +Analysis, Time Series Analysis, and Dimensionality Reduction and +Clustering. -### Correlation Heatmaps +If you look at the correlation heatmaps in the HTML report, you can see +that the samples E12_TP05_Exponential and E10_TP10_Stationary stick out. +Seeing this, you might want to remove them from the data. You can test +out what happens when you do this, along with testing how other +hyperparameter choices influence the results, with the package function +screen_limma_hyperparams(). -If you examine the correlation heatmaps in the HTML report, you might -notice that the samples `E12_TP05_Exponential` and `E10_TP10_Stationary` -stand out. Based on this observation, you might decide to remove these -samples from the data. The impact of such decicions can be explored with -the screen_limma_hyperparams() function. 
+## Finding the Best Hyperparameters -# Find the best hyperparameters +Before running the **limma spline analysis**, it is important to find +the best "hyperparameters". In this context, hyperparameters include: -## Determining the Best Hyperparameters +- **Degree of freedom (DoF)** +- **Different versions of the data** (e.g., outlier removed vs. not + removed) +- **Different limma design formulas** -Before running the limma spline analysis, we need to determine the best -"hyperparameters." Hyperparameters in this context include the degree of -freedom, different versions of the data (e.g., outlier removed vs. not -removed), different limma design formulas, etc. Rationally choosing the -best combination of hyperparameters is challenging, so it is often -better to try out multiple combinations and select the best one. +### Challenge of Hyperparameter Selection + +Rationally determining the best combination of hyperparameters can be +very challenging. By rationally, I mean deciding upon the final hyperparameters +without ever testing any, just by scientific reasoning. It is much easier just +testing a few and seeing how they actually behave. However, manually selecting +combinations can be tedious, and you have to work very systematically, which +can be challenging. To solve this problem, the `screen_limma_hyperparams()` +function was written. ### Using `screen_limma_hyperparams()` -The function `screen_limma_hyperparams()` automates this process. For -each hyperparameter, you specify the values you want to try, and the -function runs the limma spline analysis with various combinations of -these hyperparameters. Not every single combination is generated. -Instead, there are "inner" and "outer" hyperparameters. All combinations -are generated for "outer" hyperparameters, while specific combinations -are generated for "inner" hyperparameters. - -"Inner" hyperparameters include the adjusted p-value thresholds and -spline parameters. 
For example, if you have two versions of a dataset -(one with potential outliers removed and one without), these are -considered "outer" hyperparameters. The function generates all possible -comparisons for the "outer" hyperparameters, resulting in a single -comparison. Then, for each version of the data, it generates every -combination of the "inner" hyperparameters. +The function `screen_limma_hyperparams()` automates the process of +testing different combinations of hyperparameters. Here's how it works: + +- **Specify values**: For each hyperparameter, you can specify all the + values you want to test. +- **Run combinations**: The function runs the **limma spline + analysis** with combinations formed from the hyperparameters you've + provided in a semi combinatorial way. + +### Inner vs. Outer Hyperparameters + +Semi combinatorial here means that not every possible combination is generated. +Instead, there are **inner** and **outer** hyperparameters: + +- **Outer hyperparameters**: These include things like **different + versions of the dataset** (e.g., full dataset vs. dataset with + outliers removed). + - All possible combinations of outer hyperparameters are + generated. +- **Inner hyperparameters**: These include **adjusted p-value + thresholds** and **spline parameters** (e.g., degree of freedom). + - For each version of the data (outer hyperparameter), all + combinations of inner hyperparameters are tested. + +This approach is necessary, because otherwise the number of combinations would +explode. ### Example -For example, if you specify natural cubic splines with a degree of -freedom of 2 or 3, and adjusted p-value thresholds of 0.05 or 0.1, the -function will test all combinations: +For example, if you have two versions of a dataset (one full dataset, +and one with some outliers removed), these versions are considered outer +hyperparameters. Additionally, let's say you want to test two different limma +design formulas, formula 1 and 2. 
The function will test out all combinations +of those outer hyperparameters and compare them with each other, which results +in a total of 6 combinations here: + +- **Full Dataset Formula 1** vs **Full Dataset Formula 2** +- **Full Dataset Formula 1** vs **Outliers Removed Dataset Formula 1** +- **Full Dataset Formula 1** vs **Outliers Removed Dataset Formula 2** + +- **Full Dataset Formula 2** vs **Outliers Removed Dataset Formula 1** +- **Full Dataset Formula 2** vs **Outliers Removed Dataset Formula 2** + +- **Outliers Removed Dataset Formula 1** vs **Outliers Removed Dataset Formula 2** -- DoF = 2, threshold = 0.05 -- DoF = 3, threshold = 0.05 -- DoF = 2, threshold = 0.1 -- DoF = 3, threshold = 0.1 +Let's say you specified the following inner +hyperparameters: -This systematic approach helps in identifying the best hyperparameters -for the analysis. +- **Spline parameters**: Natural cubic splines with a degree of + freedom of either 2 or 3. +- **Adjusted p-value threshold**: 0.05 or 0.1. -```{r Load hyperparameter-screening args} +The function will generate and test all combinations of the spline +parameters and p-value thresholds for all 4 combos: + +Combo 1: +- **DoF = 2, threshold = 0.05** +- **DoF = 3, threshold = 0.05** +- **DoF = 2, threshold = 0.1** +- **DoF = 3, threshold = 0.1** + +Combo 2: +- **DoF = 2, threshold = 0.05** +- **DoF = 3, threshold = 0.05** +- **DoF = 2, threshold = 0.1** +- **DoF = 3, threshold = 0.1** + +Combo 3: +... + +This allows you to systematically explore different combinations and +select the optimal hyperparameters for your analysis. 
+ +Below is an example for our proteomics data: + +```{r Load hyperparameter-screening args, eval = TRUE} data1 <- data meta1 <- meta -data2 <- data[, !(colnames(data) %in% c( # Remove potential outliers - "E12_TP05_Exponential", +# Remove the "outliers" +data2 <- data[, !(colnames(data) %in% c( + "E12_TP05_Exponential", "E10_TP10_Stationary" ) )] +# Adjust meta so that it matches data2 meta2 <- meta[!meta$`Sample.ID` %in% c( "E12_TP05_Exponential", "E10_TP10_Stationary" ), ] +# As mentioned above, all the values of one hyperparameter are stored +# and provided as a list. datas <- list(data1, data2) + +# This will be used to describe the versions of the data. datas_descr <- c( "full_data", "outliers_removed" ) -metas <- list( - meta1, - meta2 - ) +metas <- list(meta1, meta2) +# Test two different limma designs designs <- c( "~ 1 + Phase*X + Reactor", "~ 1 + X + Reactor" ) +# Specify the meta "level" column +condition <- "Phase" + +report_dir <- here::here( + "demo_results", + "hyperparams_screen_reports" + ) + +# To remove the batch effect +meta_batch_column = "Reactor" + +# Test out two different p-value thresholds (inner hyperparameter) pthresholds <- c( 0.05, 0.1 ) -``` - -### Spline Configuration Parameters - -The `spline_test_configs` dataframe (see box below) is used to specify -the parameters for different runs of spline analysis. Each row in the -dataframe corresponds to one set of spline settings. The supported -spline types are natural cubic splines (denoted by "n") and B-splines -(denoted by "b"). - -#### Parameters - -- **spline_type**: The type of spline to use. Options are: - - `"n"`: Natural cubic splines - - `"b"`: B-splines -- **degree**: The degree of the spline. This is only required for - B-splines (`spline_type = "b"`). For natural cubic splines - (`spline_type = "n"`), this should be set to `NA`. -- **dof**: Degrees of Freedom (DoF) for the spline. This parameter - controls the flexibility of the spline. Higher values allow more - flexibility. 
-- **knots**: A list specifying the positions of the knots. If set to - `NA`, the knots are placed automatically in a central fashion. Knots - are only needed when you want to manually specify their positions. - Each element of the list should correspond to a vector of knot - positions for the respective spline. - -#### Understanding the Relationship Between Degree, DoF, and Knots - -The relationship between the degree of the spline, the degrees of -freedom (DoF), and the number of internal knots (k) varies between -B-splines and natural cubic splines. Here's a breakdown: - -- **B-splines**: - - - **degree**: The degree of the spline. - - **DoF**: Degrees of freedom. - - **k**: Number of internal knots. - - The relationships are given by the following formulas: - - - `DoF = k + degree` - - `k = DoF - degree` - -- **Natural cubic splines**: - - **degree**: Always 3 for cubic splines. - - **DoF**: Degrees of freedom. - - **k**: Number of internal knots. - - The relationships are given by the following formulas: - - - `DoF = k + 1` - - `k = DoF - 1` - -#### Specifying Parameters - -You either specify the degrees of freedom (DoF) or the knots, not both. -The choice depends on how you want to control the flexibility of the -spline: - -- **Specifying DoF**: The number of internal knots (k) will be - determined automatically based on the DoF. -- **Specifying Knots**: The degrees of freedom (DoF) will be - calculated based on the number of knots and the degree of the - spline. - -```{r spline_test_configs definition} -# Every row a combo to test. +# Create a dataframe with combinations of spline parameters to test +# (every row a combo to test) spline_test_configs <- data.frame( - spline_type = c("n", "n", "n", "n"), # All should use natural splines (n) - degree = c(NA, NA, NA, NA), # only needed for B-splines (spline_type = b) - dof = c(2L, 3L, 4L, 5L), # Test these variations of the DoF. - # Per default, knots are placed automatically in a central fashion. 
- knots = I(list(c(NA), c(NA), c(NA), c(NA))), - bknots = I(list(c(NA), c(NA), c(NA), c(NA))) - ) -``` - -## Required Arguments `screen_limma_hyperparams()` - -- **splineomics**: A SplineOmics object containing the following - fields: - - **condition**: Column name of the levels (e.g., Exponential and - Stationary). - - **report_info**: A list containing general information about the - analysis. - - **meta_batch_column**: Column for meta batch information. - - **meta_batch2_column**: Column for secondary meta batch - information. -- **datas**: A list of data frames containing the datasets to be - analyzed. -- **datas_descr**: A description object for the data. -- **metas**: A list of data frames containing metadata for each - dataset in `datas`. -- **designs**: A character vector of design formulas for the limma - analysis. -- **spline_test_configs**: A configuration object for spline tests. + # 'n' stands for natural cubic splines, b for B-splines. + spline_type = c("n", "n", "b", "b"), + # Degree is not applicable (NA) for natural splines. + degree = c(NA, NA, 2L, 4L), + # Degrees of freedom (DoF) to test. + # Higher dof means spline can fit more complex patterns. + dof = c(2L, 3L, 3L, 4L) +) -## Optional Arguments `screen_limma_hyperparams()` +print(spline_test_configs) +``` -- **report_dir**: The path to the output directory. Default is current - work dir. -- **adj_pthresholds**: A numeric vector of p-value thresholds for - significance determination. -- **time_unit**: A character string specifying the time unit label for - plots. -- **padjust_method**: A character string specifying the method for - p-value adjustment. Default is "BH" (Benjamini-Hochberg). - -```{r Perform hyperparameter-screening} -report_dir <- here::here( - "demo_results", - "hyperparams_screen_reports" - ) +Now that we specified all the values for each hyperparameter that we want to +test, we can run the `screen_limma_hyperparams()` function. 
+```{r Perform hyperparameter-screening, eval = TRUE} SplineOmics::screen_limma_hyperparams( - splineomics = splineomics, - datas = datas, - datas_descr = datas_descr, - metas = metas, - designs = designs, - spline_test_configs = spline_test_configs, - report_dir = report_dir, - adj_pthresholds = pthresholds + splineomics, + datas, + datas_descr, + metas, + designs, + spline_test_configs, + report_dir, + pthresholds, ) -``` -The last HTML generated by `screen_limma_hyperparams()` describes the -meaning of all short words used in the reports. For example it states -that Design_1 = "\~ 1 + Phase\*X + Reactor", and Design_2 = "\~ 1 + X + -Reactor". +``` # Run limma spline analysis -Once we identify the hyperparameters that are likely the best, we can -run the limma spline analysis with them to obtain the results. For this, -the SplineOmics object must be updated based on our findings from the -hyperparameter screening. - -For example, we figured out that natural cubic splines with a DoF of 2 -perform the best in terms of avoiding under- and overfitting. - -### Spline Parameters List - -The `spline_params` list is used to specify the final spline parameters -that you want to use for your analysis. It takes the same arguments as -the `spline_test_configs` dataframe, but here you define the actual -parameters for the spline analysis. - -#### Parameters - -- **spline_type**: The type of spline to use. Options are: - - `"n"`: Natural cubic splines - - `"b"`: B-splines -- **degree**: The degree of the spline. This is only required for - B-splines (`spline_type = "b"`). For natural cubic splines - (`spline_type = "n"`), this should be set to `NA`. -- **dof**: Degrees of Freedom (DoF) for the spline. This parameter - controls the flexibility of the spline. Higher values allow more - flexibility. -- **knots**: A list specifying the positions of the knots. If set to - `NA`, the knots are placed automatically in a central fashion. 
Knots - are only needed when you want to manually specify their positions. - Each element of the list should correspond to a vector of knot - positions for the respective spline. - -#### Usage with Limma Design Formula - -- **Interaction Effects**: If the limma design formula contains an - interaction effect, you must specify a single option for each - parameter. This single option will be applied to all levels. This - approach ensures consistency across all levels when interactions are - present. -- **No Interaction Effects**: If the limma design formula contains no - interaction effects, you must specify one element in the vector for - each level. Each element corresponds to the respective level in the - order they appear in the metadata. - -```{r best spline params} -spline_params = list( - spline_type = c("n"), # Natural splines for all levels - dof = c(2L) # Degree of freedom of 2 for all levels - ) -``` +Once we identified the hyperparameters that are likely the best ones, we +can run the limma spline analysis with them and get the results. -The spline_params, among others, can be loaded in the SplineOmics object -with the `update_splineomics()` function. - -## Required Arguments `update_splineomics()` - -- **splineomics**: A SplineOmics object to be updated. - -## Optional Arguments `update_splineomics()` - -- **...**: Named arguments with new values for fields to be updated or - added. Allowed fields include: - - **data**: A matrix with the data. - - **meta**: Metadata associated with the data. - - **condition**: Column name of the levels (e.g., Exponential and - Stationary). - - **annotation**: A dataframe with the feature descriptions of - data. - - **report_info**: A list containing report information such as - omics data type, data description, data collection date, analyst - name, contact info, and project name. - - **meta_batch_column**: Column for meta batch information. - - **meta_batch2_column**: Column for secondary meta batch - information. 
- - **feature_name_columns**: Columns used to construct feature - names. - - **design**: A limma design formula - - **spline_params**: Parameters for spline functions. - - **limma_splines_result**: Results from the limma splines - analysis. - -```{r Update the SplineOmics object} +Lets just assume for now that the new parameters, with which the +SplineOmics object is updated, are the best for this analysis. The +choice depends on the analysis. For example, for this analysis, natural +cubic splines (n) with a dof of two seemed to fit the data best (not +overfitting, but also not underfitting), which was the reason those +spline parameters were chosen. + +```{r Update the SplineOmics object, eval = TRUE} splineomics <- SplineOmics::update_splineomics( splineomics = splineomics, - data = data2, # Currently data1 (data is loaded) - meta = meta2, # Currently meta1 (meta is loaded) - design = "~ 1 + Phase*X + Reactor", - spline_params = spline_params + design = "~ 1 + Phase*X + Reactor", # best design formula + data = data2, # data without "outliers" was better + meta = meta2, + spline_params = list( + spline_type = c("n"), # natural cubic splines + dof = c(2L) + ) ) ``` -## Required Arguments `run_limma_splines()` - -- **splineomics**: A SplineOmics object containing the following - fields: - - **data**: A matrix with the data. - - **meta**: Metadata associated with the data. - - **condition**: Column name of the levels (e.g., Exponential and - Stationary). - - **design**: A limma design formula - - **spline_params**: Parameters for spline functions. - -## Optional Arguments `run_limma_splines()` - -- **padjust_method**: A character string specifying the method for - p-value adjustment. Default is "BH" (Benjamini-Hochberg). 
+Run the `run_limma_splines()` function with the updated SplineOmics object: -```{r Run the limma spline analysis} -# Run the limma spline analysis +```{r limma spline analysis, eval = TRUE} splineomics <- SplineOmics::run_limma_splines( - splineomics = splineomics + splineomics ) ``` -The function run_limma_splines() adds a named list to the returned -SplineOmics object. Each element in this list represents a specific -"category" of results. These elements are lists containing the -respective limma topTables, either for each level or for comparisons -between two levels. +The output of the function run_limma_splines() is a named list, where +each element is a specific "category" of results. Refer to [this +document](https://csbg.github.io/SplineOmics/articles/limma_result_categories.html) +for an explanation of the different result categories. Each of those +elements is a list, containing as elements the respective limma +topTables, either for each level or each comparison between two levels. -The element "time_effect" is a list where each element is a topTable -reporting the p-values for each feature for the respective level. +The element "time_effect" is a list, where each element is the topTable +where the p-value for each feature for the respective level are +reported. -The element "avrg_diff_conditions" is a list containing topTables that -represent the comparison of the average differences between the levels. +The element "avrg_diff_conditions" is a list that contains as elements +the topTables, that represent the comparison of the average differences +of the levels. -The element "interaction_condition_time" is a list containing topTables -that represent the interaction between the levels, which includes both -time and average differences. 
+The element "interaction_condition_time" is a list that contains as +elements the topTables, that represent the interaction between the +levels (which includes both time and the average differences). # Build limma report The topTables of all three categories can be used to generate p-value histograms and volcano plots. -## Required Arguments `create_limma_report()` - -- **splineomics**: A SplineOmics object containing the following - fields: - - **limma_splines_result**: A list containing the results of the - limma analysis with splines. It should have three components: - `time_effect`, `avrg_diff_conditions`, and - `interaction_condition_time`. - - **condition**: Column name of the levels (e.g., Exponential and - Stationary). - - **annotation**: A dataframe with the feature descriptions of - data. - - **report_info**: A list containing metadata and other - information to be included in the report. - -## Optional Arguments `create_limma_report()` - -- **adj_pthresh**: A numeric value specifying the adjusted p-value - threshold for significance. Default is 0.05. -- **report_dir**: The path to the output directory. Default is current - work dir. - -```{r Build limma report} +```{r build limma report, eval = TRUE} report_dir <- here::here( "demo_results", "create_limma_reports" ) plots <- SplineOmics::create_limma_report( - splineomics = splineomics, + splineomics, report_dir = report_dir ) ``` # Cluster the hits (significant features) -After obtaining the limma spline results, we can cluster the hits based -on their temporal patterns (spline shapes). A hit is defined by setting -an adjusted p-value threshold for each level. Hierarchical clustering is -then used to assign each hit to one of the specified number of clusters -for that level. 
- -```{r Prepare inputs for the cluster_hits function} -adj_pthresholds <- c( - 0.05, # threshold for the exponential phase - 0.05 # threshold for the stationary phase +After we obtained the limma spline results, we can cluster the hits +based on their temporal pattern (their spline shape). We define what a +hit is by setting an adj. p-value threshold for every level. Hits are features +(e.g. proteins) that have an adj. p-value below the threshold. +Hierarchical clustering is used to place every hit in one of as many +clusters as we have specified for that specific level. + +```{r cluster the hits, eval = TRUE} +adj_pthresholds <- c( # 0.05 for both levels + 0.05, # exponential + 0.05 # stationary ) -clusters <- list( - 6L, # 6 clusters for the exponential phase - 3L # 3 clusters for the stationary phase +clusters <- c( + 6L, # 6 clusters for the exponential phase level + 3L # 3 clusters for the stationary phase level ) -plot_info = list( # For the spline plots - y_axis_label = "log2 intensity", # Unit of the values in the data matrix. - time_unit = "min", - treatment_labels = c("Feeding"), - treatment_timepoints = c(0) # The feeding occurred at 0 minutes. -) - -gene_column_name <- "Genes" -genes <- data_excel[[gene_column_name]][4:nrow(data_excel)] -``` - -## Required Arguments `cluster_hits()` - -- **splineomics**: An S3 object of class `SplineOmics` that contains - all the necessary data and parameters for the analysis, including: - - **data**: A matrix with the data. - - **meta**: Metadata associated with the data. - - **design**: A limma design formula. - - **condition**: Column name of the levels (e.g., Exponential and - Stationary). - - **spline_params**: Parameters for spline functions. - - **meta_batch_column**: Column for meta batch information. - - **meta_batch2_column**: Column for secondary meta batch - information. 
- - **limma_splines_result**: A list of data frames, each - representing a top table from differential expression analysis, - containing at least 'adj.P.Val' and expression data columns. - - **genes**: A character vector containing the gene names of the - features to be analyzed. - -## Optional Arguments `cluster_hits()` - -- **adj_pthresholds**: Numeric vector of p-value thresholds for - filtering hits in each top table. Default is 0.05. -- **clusters**: Character or integer vector specifying the number of - clusters or 'auto' for automatic estimation. -- **report_info**: A character string to be printed at the top of the - report. -- **time_unit**: A character string specifying the time unit label for - plots (e.g., 'min' for minutes). -- **report_dir**: The path to the output directory. Default is current - work dir. -- **report**: Boolean TRUE or FALSE value specifying if a report - should be generated. Default is TRUE. - -```{r Cluster the hits} report_dir <- here::here( "demo_results", "clustering_reports" ) +plot_info = list( # For the spline plots + y_axis_label = "log2 intensity", + time_unit = "min", # our measurements were in minutes + treatment_labels = c("Feeding"), + treatment_timepoints = c(0) # Feeding was at 0 minutes. +) + +# Get all the gene names. They are used for generating files +# which contents can be directly used as the input for the Enrichr webtool, +# if you prefer to manually perform the enrichment. Those files are +# embedded in the output HTML report and can be downloaded from there. +gene_column_name <- "Gene_symbol" +genes <- data_excel[[gene_column_name]][4:nrow(data_excel)] + clustering_results <- SplineOmics::cluster_hits( splineomics = splineomics, - genes = genes, - analysis_type = "time_effect", # effect of time on features in level. + # Cluster the hits from the time_effect results. You can also cluster + # the hits from the other two limma result categories by specifying + # it here with this argument. 
+ analysis_type = "time_effect", adj_pthresholds = adj_pthresholds, clusters = clusters, + genes = genes, plot_info = plot_info, report_dir = report_dir, ) @@ -773,11 +684,13 @@ clustering_results <- SplineOmics::cluster_hits( # Perform gene set enrichment analysis (GSEA) -To each clustered hit, the respective gene can be assigned and GSEA +Usually, the final step in such a bioinformatics analysis is GSEA. To each +clustered hit, the respective gene can be assigned and GSEA performed. For this, the Enrichr databases of choice have to be downloaded: -```{r Define which Enrichr databases to download} +```{r download Enrichr databases, eval = TRUE} +# Specify which databases you want to download from Enrichr gene_set_lib <- c( "WikiPathways_2019_Human", "NCI-Nature_2016", @@ -792,19 +705,7 @@ gene_set_lib <- c( "GO_Molecular_Function_2018", "Human_Gene_Atlas" ) -``` - -## Required Arguments `download_enrichr_databases()` - -- **gene_set_lib**: A char vector of database names to download from - Enrichr. - -## Optional Arguments `download_enrichr_databases()` - -- **output_dir**: The path to the output directory where the .tsv file - will be saved. Defaults to the current working directory. -```{r Download the Enrichr databases} output_dir <- here::here( "demo_results", "downloaded_databases" @@ -816,24 +717,16 @@ SplineOmics::download_enrichr_databases( ) ``` -To run GSEA, a genes vector must be created, containing all the -underlying genes of the features. The downloaded database file should be -loaded as a dataframe. Additionally, the clusterProfiler parameters and -the report directory can optionally be specified. The function -create_gsea_report() runs GSEA using clusterProfiler, generates an HTML -report, and returns the GSEA dot plots in R. - -```{r Prepare GSEA inputs} -# Get gene vector. Every gene must be in the standardized format expected by -# the enrichment tools. The subsequent code extracts this part. 
-# For your analysis, this code needs to be customized based on the format. -gene_column_name <- "Genes" -genes <- annotation[[gene_column_name]][1:nrow(annotation)] -genes <- sub(" .*", "", genes) -genes <- sub(";.*", "", genes) -genes <- sub("_.*", "", genes) -genes <- sub("-.*", "", genes) +Per default the file is placed in the current working directory, which +is the root dir of the R project. +To run GSEA, the downloaded database file has to be loaded as a +dataframe. Further, optionally, the clusterProfiler parameters and the +report dir can be specified. The function create_gsea_report() runs GSEA +using clusterProfiler, generates an HTML report and returns the GSEA +dotplots in R. + +```{r run GSEA, eval = TRUE} # The file has a timestamp, but this code takes it irrespective of it. downloaded_dbs_filepath <- list.files( path = output_dir, @@ -846,6 +739,7 @@ databases <- readr::read_tsv( col_types = readr::cols() ) +# Specify the clusterProfiler parameters clusterProfiler_params <- list( adj_p_value = 0.05, pAdjustMethod = "BH", @@ -853,50 +747,43 @@ clusterProfiler_params <- list( maxGSSize = 500, qvalueCutoff = 0.2 ) -``` - -## Required Arguments `create_gsea_report()` - -- **levels_clustered_hits**: A list of clustered hits for the - different levels. -- **genes**: A vector of genes of all features of the dataset -- **databases**: A list of databases to be used in the analysis. -- **report_info**: A list containing information for the report - generation. - -## Optional Arguments `create_gsea_report()` -- **params**: Additional parameters for the GSEA analysis. Default is - NA. -- **plot_titles**: Titles for the plots. Default is NA. -- **background**: Background data. Default is NULL. -- **report_dir**: The path to the output directory where the report - will be saved. Default is the current working directory. 
- -```{r Run GSEA} report_dir <- here::here( "demo_results", "gsea_reports" ) +``` -result <- SplineOmics::create_gsea_report( +The function below runs the clusterProfiler for all clusters and all levels, +and generates the HTML report: + +```{r run GSEA report, eval = TRUE} +result <- SplineOmics::run_gsea( + # A dataframe with three columns: feature, cluster, and gene. Feature contains + # the integer index of the feature, cluster the integer specifying the cluster + # number, and gene the string of the gene, such as "CLSTN2". levels_clustered_hits = clustering_results$clustered_hits_levels, - genes = genes, databases = databases, - params = clusterProfiler_params, + clusterProfiler_params = clusterProfiler_params, report_info = report_info, report_dir = report_dir ) ``` -In the output HTML, each row in the dot plots represents a term from a -specific database, and the columns correspond to the respective -clusters. The color scale indicates the odds ratio, while the size -represents the -log10 adjusted p-value. Only terms with support from -more than 2 genes are included in the plot. For each cluster, a maximum -of 5 terms with the highest odds ratios are shown. - -Note that if, for example, cluster 1 already has 5 terms and cluster 2 -receives a term that was also found for cluster 1, this term will be -included as the sixth term for cluster 1. This is how the maximum of 5 -can be exceeded. +This report first shows all enrichment results, where more than 2 genes +supported a term, in a tabular format. The table with all the terms with +\< 2 genes supporting it can be downloaded by clicking on a button below +that table. + +For the dotplots below that, every row is a term from a specific +database, and the columns are the respective clusters. The color scale +contains the info about the odds ratio and the size the -log10 adj. +p-value. Only terms that have \> 2 genes as support are included in the +plot. 
Further, for each cluster, just maximally 5 terms are shown (the +terms with the highest odds ratios). Note that when for example cluster +1 already has 5 terms, and cluster 2 does not, and gets a term which was +also found for cluster 1, then this term would be included as the sixth +term for cluster 1, so this is a way the maximum of 5 can be exceeded. + +If a phase, like stationary here, does not lead to any enrichment +results, that is stated with a red message. diff --git a/man/InputControl.Rd b/man/InputControl.Rd index 8d51d9b..723af99 100755 --- a/man/InputControl.Rd +++ b/man/InputControl.Rd @@ -73,9 +73,9 @@ If any of these checks fail, an informative error message is returned. This function performs the following checks: 1. Ensures `feature_name_columns` and `annotation` are not `NULL`. 2. Verifies that each element in `feature_name_columns` is a character - with + with a length of 1. -3. Checks that all elements of `feature_name_columns` are valid column +3. Checks that all elements of `feature_name_columns` are valid column names in the `annotation` data frame. Check Report @@ -84,7 +84,7 @@ Check Report The function performs the following checks: - Whether the `report` argument is present. -- If `report` is not a Boolean value (`TRUE` or `FALSE`), it throws +- If `report` is not a Boolean value (`TRUE` or `FALSE`), it throws an error. } \section{Functions}{ @@ -751,18 +751,18 @@ Check Analysis Mode \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-InputControl-check_analysis_type}{}}} \subsection{Method \code{check_analysis_type()}}{ -This method checks the validity of the `analysis_mode` argument. -It ensures that +This method checks the validity of the `analysis_mode` argument. 
+It ensures that `analysis_mode` is a character vector of length 1 and that it matches - one of the -allowed analysis modes: "time_effect", "avrg_diff_conditions", or + one of the +allowed analysis modes: "time_effect", "avrg_diff_conditions", or "interaction_condition_time". \subsection{Usage}{ \if{html}{\out{
}}\preformatted{InputControl$check_analysis_type()}\if{html}{\out{
}} } \subsection{Returns}{ -NULL if `analysis_mode` is not provided or invalid. Otherwise, +NULL if `analysis_mode` is not provided or invalid. Otherwise, performs checks and potentially raises errors if checks fail. @@ -773,17 +773,17 @@ Check Feature Name Columns \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-InputControl-check_feature_name_columns}{}}} \subsection{Method \code{check_feature_name_columns()}}{ -This function checks whether all elements of `feature_name_columns` are -characters of length 1 and whether they are valid column names in the +This function checks whether all elements of `feature_name_columns` are +characters of length 1 and whether they are valid column names in the `annotation` data frame. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{InputControl$check_feature_name_columns()}\if{html}{\out{
}} } \subsection{Returns}{ -Returns `NULL` if any required arguments are missing. Throws +Returns `NULL` if any required arguments are missing. Throws an error -if any element of `feature_name_columns` is not a character of length 1 +if any element of `feature_name_columns` is not a character of length 1 or if any element is not a column name in `annotation`. Returns `TRUE` if all checks @@ -794,9 +794,9 @@ pass. \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-InputControl-check_report}{}}} \subsection{Method \code{check_report()}}{ -This function verifies the `report` argument within the object's +This function verifies the `report` argument within the object's arguments. -It checks if the `report` argument is present and validates its +It checks if the `report` argument is present and validates its Boolean value. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{InputControl$check_report()}\if{html}{\out{
}} diff --git a/man/Level2Functions.Rd b/man/Level2Functions.Rd index bcea1c9..f81341f 100755 --- a/man/Level2Functions.Rd +++ b/man/Level2Functions.Rd @@ -80,7 +80,7 @@ Check Data Matrix \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Level2Functions-check_data}{}}} \subsection{Method \code{check_data()}}{ -This function checks the validity of the data matrix, ensuring that it +This function checks the validity of the data matrix, ensuring that it is a matrix, contains only numeric values, has no missing values, and all elements are non-negative. Additionally, it @@ -102,7 +102,7 @@ for error messages. Default is NA.} \if{html}{\out{}} } \subsection{Returns}{ -Returns TRUE if all checks pass. Stops execution and returns an +Returns TRUE if all checks pass. Stops execution and returns an error message if any check fails. @@ -133,14 +133,14 @@ messages regarding its use. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{meta}}{A dataframe containing the metadata, including the 'Time' +\item{\code{meta}}{A dataframe containing the metadata, including the 'Time' column.} -\item{\code{condition}}{A single character string specifying the column name +\item{\code{condition}}{A single character string specifying the column name in the meta dataframe to be checked.} -\item{\code{meta_batch_column}}{An optional parameter specifying the column +\item{\code{meta_batch_column}}{An optional parameter specifying the column name in the meta dataframe used to remove the batch effect. Default is NA.} @@ -164,7 +164,7 @@ Check Dataframe \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Level2Functions-check_dataframe}{}}} \subsection{Method \code{check_dataframe()}}{ -Validates that the dataframe contains all required columns with the +Validates that the dataframe contains all required columns with the correct data types. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Level2Functions$check_dataframe(df)}\if{html}{\out{
}} @@ -236,7 +236,6 @@ Validates the spline parameters depending on the specified mode. \subsection{Returns}{ No return value, called for side effects. - Check Columns in Spline Test Configurations } } @@ -244,7 +243,7 @@ Check Columns in Spline Test Configurations \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Level2Functions-check_columns_spline_test_configs}{}}} \subsection{Method \code{check_columns_spline_test_configs()}}{ -Validates that the spline test configurations contain the required columns +Validates that the spline test configurations contain the required columns in the correct order. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Level2Functions$check_columns_spline_test_configs(spline_test_configs)}\if{html}{\out{
}} @@ -266,7 +265,7 @@ No return value, called for side effects. \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Level2Functions-check_spline_type_column}{}}} \subsection{Method \code{check_spline_type_column()}}{ -Validates that the 'spline_type' column in the spline test configurations +Validates that the 'spline_type' column in the spline test configurations contains only 'n' or 'b'. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Level2Functions$check_spline_type_column(spline_test_configs)}\if{html}{\out{
}} @@ -275,7 +274,7 @@ contains only 'n' or 'b'. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{spline_test_configs}}{A dataframe containing spline test +\item{\code{spline_test_configs}}{A dataframe containing spline test configurations.} } \if{html}{\out{
}} @@ -288,7 +287,7 @@ No return value, called for side effects. \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Level2Functions-check_spline_type_params}{}}} \subsection{Method \code{check_spline_type_params()}}{ -Validates the parameters for each row in the spline test configurations +Validates the parameters for each row in the spline test configurations based on the spline type. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Level2Functions$check_spline_type_params(spline_test_configs)}\if{html}{\out{
}} @@ -297,7 +296,7 @@ based on the spline type. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{spline_test_configs}}{A dataframe containing spline test +\item{\code{spline_test_configs}}{A dataframe containing spline test configurations.} } \if{html}{\out{
}} @@ -310,7 +309,7 @@ TRUE if all checks pass, otherwise an error is thrown. \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Level2Functions-check_max_and_min_dof}{}}} \subsection{Method \code{check_max_and_min_dof()}}{ -Validates the degrees of freedom (DoF) for each row in the spline test +Validates the degrees of freedom (DoF) for each row in the spline test configurations based on the metadata. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Level2Functions$check_max_and_min_dof(spline_test_configs, metas)}\if{html}{\out{
}} @@ -319,7 +318,7 @@ configurations based on the metadata. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{spline_test_configs}}{A dataframe containing spline test +\item{\code{spline_test_configs}}{A dataframe containing spline test configurations.} \item{\code{metas}}{A list of metadata corresponding to the data matrices.} diff --git a/man/Level3Functions.Rd b/man/Level3Functions.Rd index 7a03fbb..ce6cd5c 100755 --- a/man/Level3Functions.Rd +++ b/man/Level3Functions.Rd @@ -13,13 +13,13 @@ This class provides methods for creating error messages and checking batch columns. -The function verifies that the `voom` object contains the following +The function verifies that the `voom` object contains the following components: - `E`: A matrix of log2-counts per million (logCPM) values. -- `weights`: A matrix of observation-specific weights that matches the +- `weights`: A matrix of observation-specific weights that matches the dimensions of `E`. -- `design`: A matrix representing the design matrix used in the linear -modeling, +- `design`: A matrix representing the design matrix used in the linear +modeling, with the same number of rows as there are columns in `E`. The function also checks for optional components such as: @@ -59,10 +59,10 @@ Check the structure of a voom object \if{latex}{\out{\hypertarget{method-Level3Functions-check_voom_structure}{}}} \subsection{Method \code{check_voom_structure()}}{ This function checks the structure of a `voom` object to ensure that it -contains -all the expected components and that these components have the correct -types -and dimensions. The function does not check the actual data within the +contains +all the expected components and that these components have the correct +types +and dimensions. The function does not check the actual data within the matrices. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Level3Functions$check_voom_structure(voom_obj)}\if{html}{\out{
}} @@ -71,8 +71,8 @@ matrices. \subsection{Arguments}{ \if{html}{\out{
}} \describe{ -\item{\code{voom_obj}}{A list representing a `voom` object, typically created -by the +\item{\code{voom_obj}}{A list representing a `voom` object, typically created +by the `voom` function from the `limma` package.} } \if{html}{\out{
}} @@ -88,7 +88,7 @@ Check Batch Column \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-Level3Functions-check_batch_column}{}}} \subsection{Method \code{check_batch_column()}}{ -This method checks the batch column in the metadata and provides +This method checks the batch column in the metadata and provides appropriate messages. \subsection{Usage}{ \if{html}{\out{
}}\preformatted{Level3Functions$check_batch_column(meta, meta_batch_column, data_meta_index)}\if{html}{\out{
}} @@ -108,7 +108,7 @@ data/meta pair. Default is NA.} \if{html}{\out{
}} } \subsection{Returns}{ -NULL. The method is used for its side effects of throwing errors +NULL. The method is used for its side effects of throwing errors or printing messages. Check Condition Time Consistency @@ -119,9 +119,9 @@ Check Condition Time Consistency \if{latex}{\out{\hypertarget{method-Level3Functions-check_condition_time_consistency}{}}} \subsection{Method \code{check_condition_time_consistency()}}{ This function checks whether the values in the `condition` column -have unique values for each block of identical `Time` values in the +have unique values for each block of identical `Time` values in the `meta` dataframe. -Additionally, it ensures that every new block of a given time has a +Additionally, it ensures that every new block of a given time has a new value in the `condition` column. \subsection{Usage}{ diff --git a/man/between_level.Rd b/man/between_level.Rd index 6fb4d31..4088ddf 100755 --- a/man/between_level.Rd +++ b/man/between_level.Rd @@ -6,8 +6,7 @@ \usage{ between_level( data, - preprocess_rna_seq, - normalization_fun, + rna_seq_data, meta, design, spline_params, @@ -20,9 +19,8 @@ between_level( \arguments{ \item{data}{A matrix of data values.} -\item{preprocess_rna_seq}{Boolean specifying whether to preprocess RNA seq} - -\item{normalization_fun}{Function for normalizing RNA-seq raw-counts.} +\item{rna_seq_data}{An object containing the preprocessed RNA-seq data, +such as the output from `limma::voom` or a similar preprocessing pipeline.} \item{meta}{A dataframe containing metadata, including a 'Time' column.} diff --git a/man/check_null_elements.Rd b/man/check_null_elements.Rd index ef348b8..fe11075 100755 --- a/man/check_null_elements.Rd +++ b/man/check_null_elements.Rd @@ -10,12 +10,12 @@ check_null_elements(args) \item{args}{A list of arguments to check for `NULL` elements.} } \value{ -This function does not return a value. It stops execution if +This function does not return a value. 
It stops execution if any `NULL` elements are found in the input list. } \description{ -This function checks if any elements in the provided list of arguments +This function checks if any elements in the provided list of arguments are `NULL`. -If any `NULL` elements are found, it stops the execution and returns +If any `NULL` elements are found, it stops the execution and returns an informative error message. } diff --git a/man/create_p_value_histogram.Rd b/man/create_p_value_histogram.Rd index 95c6d61..c45e4a7 100755 --- a/man/create_p_value_histogram.Rd +++ b/man/create_p_value_histogram.Rd @@ -6,7 +6,7 @@ \usage{ create_p_value_histogram( top_table, - adj_pthresh = 0.05, + pthresh = 0.05, title = "P-Value Histogram" ) } @@ -14,7 +14,7 @@ create_p_value_histogram( \item{top_table}{A data frame containing the limma top_table with a column named `P.Value` for unadjusted p-values.} -\item{adj_pthresh}{A numeric value for the adjusted p-value threshold +\item{pthresh}{A numeric value for the adjusted p-value threshold (not used in this function, included for consistency).} \item{title}{A character string for the title of the histogram.} diff --git a/man/create_splineomics.Rd b/man/create_splineomics.Rd index 13f3566..de86a3f 100755 --- a/man/create_splineomics.Rd +++ b/man/create_splineomics.Rd @@ -2,13 +2,13 @@ % Please edit documentation in R/splineomics_object.R \name{create_splineomics} \alias{create_splineomics} -\title{Create and update the SplineOmics object -=====} +\title{Create a SplineOmics object} \usage{ create_splineomics( data, meta, condition, + rna_seq_data = NULL, annotation = NULL, report_info = NULL, meta_batch_column = NULL, @@ -16,18 +16,25 @@ create_splineomics( feature_name_columns = NULL, design = NULL, spline_params = NULL, - preprocess_rna_seq = FALSE, - normalization_fun = NULL, padjust_method = "BH" ) } \arguments{ -\item{data}{The actual omics data.} +\item{data}{The actual omics data. 
In the case the rna_seq_data argument is +used, still provide this argument. In that case, input the data matrix in +here (for example the $E part of the voom object). Assign your feature names +as row headers (otherwise, just numbers will be your feature names).} \item{meta}{Metadata associated with the omics data.} \item{condition}{A condition variable.} +\item{rna_seq_data}{An object containing the preprocessed RNA-seq data, +such as the output from `limma::voom` or a similar preprocessing pipeline. +This argument is not controlled by any function of the `SplineOmics` package. +Rather, in that regard it relies on the input control from the `limma::lmFit` +function.} + \item{annotation}{A dataframe with the feature descriptions of data (optional).} @@ -50,13 +57,12 @@ create the row headers for the data matrix!} \item{design}{A design matrix or similar object (optional).} -\item{spline_params}{Parameters for spline functions (optional).} - -\item{preprocess_rna_seq}{Boolean specifying whether to preprocess RNA seq} - -\item{normalization_fun}{Function used for normalizing RNA-seq. Must take as -input the y of: y <- edgeR::DGEList(counts = raw_counts) and output the y -with the normalized counts.} +\item{spline_params}{Parameters for spline functions (optional). Must contain +the named elements spline_type, which must contain either the string "n" for +natural cubic splines, or "b", for B-splines, the named element degree in the +case of B-splines, that must contain only an integer, and the named element +dof, specifying the degree of freedom, containing an integer and required +both for natural and B-splines.} \item{padjust_method}{Method for p-value adjustment, one of "none", "BH", "BY", "holm", "bonferroni", "hochberg", or "hommel". @@ -69,25 +75,3 @@ A SplineOmics object. Creates a SplineOmics object containing variables that are commonly used across multiple functions in the package.
} -\details{ -Description ------------ -Contains the functions to create and update a SplineOmics object. This object -is used to collect function arguments, that are equivalent for more than one -exported function of the SplineOmics package. Additionally - -Functions ---------- -- create_splineomics: Create a SplineOmics object -- update_splineomics: Add additional arguments to the SplineOmics -object or overwrite existing arguments. - -Classes -------- -None - -Notes ------ -None -Create a SplineOmics object -} diff --git a/man/get_limma_combos_results.Rd b/man/get_limma_combos_results.Rd index be06470..f31e22d 100755 --- a/man/get_limma_combos_results.Rd +++ b/man/get_limma_combos_results.Rd @@ -6,6 +6,7 @@ \usage{ get_limma_combos_results( datas, + rna_seq_datas, metas, designs, modes, @@ -19,6 +20,9 @@ get_limma_combos_results( \arguments{ \item{datas}{A list of matrices.} +\item{rna_seq_datas}{A list of RNA-seq data objects, such as the voom object +derived from the limma::voom function.} + \item{metas}{A list of metadata corresponding to the data matrices.} \item{designs}{A list of design matrices.} diff --git a/man/preprocess_rna_seq_data.Rd b/man/preprocess_rna_seq_data.Rd index 293c81c..3f7ace9 100755 --- a/man/preprocess_rna_seq_data.Rd +++ b/man/preprocess_rna_seq_data.Rd @@ -1,29 +1,51 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/run_limma_splines.R +% Please edit documentation in R/preprocess_rna_seq_data.R \name{preprocess_rna_seq_data} \alias{preprocess_rna_seq_data} \title{Perform default preprocessing of raw RNA-seq counts} \usage{ -preprocess_rna_seq_data(raw_counts, design_matrix, normalize_func = NULL) +preprocess_rna_seq_data( + raw_counts, + meta, + spline_params, + design, + normalize_func = NULL +) } \arguments{ \item{raw_counts}{A matrix of raw RNA-seq counts (genes as rows, samples as columns).} -\item{design_matrix}{A design matrix used in the linear modeling, typically -specifying the experimental 
conditions.} +\item{meta}{A dataframe containing the metadata for data.} + +\item{spline_params}{Parameters for spline functions (optional). Must contain +the named elements spline_type, which must contain either the string "n" for +natural cubic splines, or "b", for B-splines, the named element degree in the +case of B-splines, that must contain only an integer, and the named element +dof, specifying the degree of freedom, containing an integer and required +both for natural and B-splines.} + +\item{design}{A design formula for the limma analysis, such as +'~ 1 + Phase*X + Reactor'.} \item{normalize_func}{An optional normalization function. If provided, this function will be used to normalize the `DGEList` object. If not provided, - TMM normalization (via `edgeR::calcNormFactors`) will be used by default.} +TMM normalization (via `edgeR::calcNormFactors`) will be used by default. +Must take as +input the y of: y <- edgeR::DGEList(counts = raw_counts) and output the y +with the normalized counts.} } \value{ A `voom` object, which includes the log2-counts per million (logCPM) matrix and observation-specific weights. } \description{ -This function is called when `preprocess_rna_seq` is `TRUE`. It performs the -default preprocessing steps for raw RNA-seq counts, including creating a -`DGEList` object, normalizing the counts, and applying the `voom` -transformation. +The `preprocess_rna_seq_data()` function performs essential preprocessing +steps for raw RNA-seq counts. This includes creating a `DGEList` object, +normalizing the counts using the default TMM (Trimmed Mean of M-values) +normalization via the `edgeR::calcNormFactors` function, and applying the +`voom` transformation from the `limma` package to obtain log-transformed +counts per million (logCPM) with associated precision weights. If you +require a different normalization method, you can supply your own +custom normalization function. 
} diff --git a/man/process_combo.Rd b/man/process_combo.Rd index 0fd0f7e..3139f97 100755 --- a/man/process_combo.Rd +++ b/man/process_combo.Rd @@ -10,6 +10,7 @@ process_combo( spline_config_index, pthreshold, datas, + rna_seq_datas, metas, designs, modes, @@ -32,6 +33,9 @@ spline_test_configs list.} \item{datas}{A list of data matrices} +\item{rna_seq_datas}{A list of RNA-seq data objects, such as the voom object +derived from the limma::voom function.} + \item{metas}{A list of metadata corresponding to the data matrices.} \item{designs}{A list of design matrices.} diff --git a/man/process_top_table.Rd b/man/process_top_table.Rd index 050e922..749eec0 100755 --- a/man/process_top_table.Rd +++ b/man/process_top_table.Rd @@ -8,9 +8,8 @@ process_top_table(process_within_level_result, feature_names) } \arguments{ \item{process_within_level_result}{List of lists containing the limma -topTable, fit, and optionally the voom -object. All of this is from one specific -level.} +topTable, and fit. 
All of this is from +one specific level.} \item{feature_names}{A non-empty character vector of feature names.} } diff --git a/man/process_within_level.Rd b/man/process_within_level.Rd index 120f81a..0d862c5 100755 --- a/man/process_within_level.Rd +++ b/man/process_within_level.Rd @@ -6,8 +6,7 @@ \usage{ process_within_level( data, - preprocess_rna_seq, - normalization_fun, + rna_seq_data, meta, design, spline_params, @@ -18,9 +17,8 @@ process_within_level( \arguments{ \item{data}{A matrix of data values.} -\item{preprocess_rna_seq}{Boolean specifying whether to preprocess RNA seq} - -\item{normalization_fun}{Function for normalizing RNA-seq raw counts.} +\item{rna_seq_data}{An object containing the preprocessed RNA-seq data, +such as the output from `limma::voom` or a similar preprocessing pipeline.} \item{meta}{A dataframe containing metadata, including a 'Time' column.} diff --git a/man/run_limma_splines.Rd b/man/run_limma_splines.Rd index 3b2276e..bf8f1d4 100755 --- a/man/run_limma_splines.Rd +++ b/man/run_limma_splines.Rd @@ -2,18 +2,37 @@ % Please edit documentation in R/run_limma_splines.R \name{run_limma_splines} \alias{run_limma_splines} -\title{run_limma_splines.R contains the exported package function run_limma_splines -and all the functions that make up the functionality of run_limma_splines. -run_limma_splines performs a limma analysis, using splines, to assign a -p-value to every feature of a time series omics dataset, to find out which -features are significantly changed over the time course. 
-Run Limma Analysis with Spline Interpolation for Hyperparameter Screening} +\title{Run limma analysis with splines} \usage{ run_limma_splines(splineomics) } \arguments{ -\item{splineomics}{A SplineOmics object, containing data, meta, design, -condition, and spline_params.} +\item{splineomics}{An S3 object of class `SplineOmics` that contains the +following elements: +\itemize{ + \item \code{data}: The matrix of the omics dataset, with the feature + names optionally as row headers. + \item \code{rna_seq_data}: An object containing the preprocessed + RNA-seq data, + such as the output from `limma::voom` or a similar preprocessing pipeline. + \item \code{meta}: A dataframe containing metadata corresponding to the + \code{data}, must include a 'Time' column and the column specified by + \code{condition}. + \item \code{design}: A character string representing the limma design + formula. + \item \code{condition}: A character string specifying the column name + in \code{meta} used to define groups for analysis. + \item \code{spline_params}: A list of spline parameters used in the + analysis, including: + \itemize{ + \item \code{spline_type}: The type of spline (e.g., "n" for natural + splines or "b" for B-splines). + \item \code{dof}: Degrees of freedom for the spline. + \item \code{knots}: Positions of the internal knots (for B-splines). + \item \code{bknots}: Boundary knots (for B-splines). + \item \code{degree}: Degree of the spline (for B-splines only). + } +}} } \value{ The SplineOmics object, updated with a list with three elements: @@ -28,13 +47,9 @@ The SplineOmics object, updated with a list with three elements: the condition and the time. } \description{ -This function conducts differential expression analysis using the Limma -package, -incorporating spline interpolation to model the effect of various -experimental -factors across different levels. 
It supports both isolated and integrated -modes -for within-level analysis and between-level comparison, adjusting for -multiple -degrees of freedom corresponding to the factors under investigation. +This function performs a limma spline analysis to identify significant +time-dependent changes in features (e.g., proteins) within an omics +time-series dataset. It evaluates features within each condition level +and between levels by comparing average differences and interactions +between time and condition. } diff --git a/man/screen_limma_hyperparams.Rd b/man/screen_limma_hyperparams.Rd index b7d18bd..30e4112 100755 --- a/man/screen_limma_hyperparams.Rd +++ b/man/screen_limma_hyperparams.Rd @@ -22,6 +22,7 @@ screen_limma_hyperparams( spline_test_configs, report_dir = here::here(), adj_pthresholds = c(0.05), + rna_seq_datas = NULL, time_unit = "min", padjust_method = "BH" ) @@ -44,7 +45,7 @@ necessary data and parameters for the analysis, including: two batch columns.) }} -\item{datas}{A list of data frames containing the datasets to be analyzed.} +\item{datas}{A list of matrices containing the datasets to be analyzed.} \item{datas_descr}{A description object for the data.} @@ -60,6 +61,9 @@ necessary data and parameters for the analysis, including: \item{adj_pthresholds}{A numeric vector of p-value thresholds for significance determination.} +\item{rna_seq_datas}{A list of RNA-seq data objects, such as the voom object +derived from the limma::voom function.} + \item{time_unit}{A character string specifying the time unit label for plots.} \item{padjust_method}{A character string specifying the method for p-value diff --git a/man/within_level.Rd b/man/within_level.Rd index 23da6c7..267ebdb 100755 --- a/man/within_level.Rd +++ b/man/within_level.Rd @@ -9,8 +9,7 @@ within_level( level_index, spline_params, data, - preprocess_rna_seq, - normalization_fun, + rna_seq_data, meta, design, condition, @@ -28,13 +27,12 @@ within_level( \item{data}{A matrix of data values.} 
-\item{preprocess_rna_seq}{Boolean specifying whether to preprocess RNA seq} +\item{rna_seq_data}{An object containing the preprocessed RNA-seq data, +such as the output from `limma::voom` or a similar preprocessing pipeline.} -\item{normalization_fun}{Function to normalize RNA-seq raw counts.} +\item{meta}{A dataframe containing the metadata for data.} -\item{meta}{A dataframe containing metadata.} - -\item{design}{A design formula or matrix for the LIMMA analysis.} +\item{design}{A design formula or matrix for the limma analysis.} \item{condition}{A character string specifying the condition.} @@ -50,7 +48,7 @@ A list containing the name of the results and the top table of results. } \description{ -Processes a single level within a condition, performing LIMMA analysis +Processes a single level within a condition, performing limma analysis and generating the top table of results. } \seealso{ diff --git a/renv.lock b/renv.lock index 1f97e81..d1d9dfa 100755 --- a/renv.lock +++ b/renv.lock @@ -818,13 +818,13 @@ }, "curl": { "Package": "curl", - "Version": "5.2.1", + "Version": "5.2.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R" ], - "Hash": "411ca2c03b1ce5f548345d2fc2685f7a" + "Hash": "8f27335f2bcff4d6035edcc82d7d46de" }, "data.table": { "Package": "data.table", @@ -906,6 +906,26 @@ ], "Hash": "451e5edf411987991ab6a5410c45011f" }, + "downlit": { + "Package": "downlit", + "Version": "0.4.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "brio", + "desc", + "digest", + "evaluate", + "fansi", + "memoise", + "rlang", + "vctrs", + "withr", + "yaml" + ], + "Hash": "45a6a596bf0108ee1ff16a040a2df897" + }, "downloader": { "Package": "downloader", "Version": "0.4", @@ -1453,6 +1473,27 @@ ], "Hash": "ac107251d9d9fd72f0ca8049988f1d7f" }, + "httr2": { + "Package": "httr2", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "cli", + "curl", + "glue", + "lifecycle", + 
"magrittr", + "openssl", + "rappdirs", + "rlang", + "vctrs", + "withr" + ], + "Hash": "836e9564fbeca3bb390bb429a53cd401" + }, "igraph": { "Package": "igraph", "Version": "2.0.3", @@ -1819,6 +1860,36 @@ ], "Hash": "01f28d4278f15c76cddbea05899c5d6f" }, + "pkgdown": { + "Package": "pkgdown", + "Version": "2.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "bslib", + "callr", + "cli", + "desc", + "digest", + "downlit", + "fontawesome", + "fs", + "httr2", + "jsonlite", + "openssl", + "purrr", + "ragg", + "rlang", + "rmarkdown", + "tibble", + "whisker", + "withr", + "xml2", + "yaml" + ], + "Hash": "df2912d5873422b55a13002510f02c9f" + }, "pkgload": { "Package": "pkgload", "Version": "1.4.0", @@ -1962,6 +2033,17 @@ ], "Hash": "76fc42834c44b69e4021f6ade524d299" }, + "ragg": { + "Package": "ragg", + "Version": "1.3.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "systemfonts", + "textshaping" + ], + "Hash": "0595fe5e47357111f29ad19101c7d271" + }, "rappdirs": { "Package": "rappdirs", "Version": "0.3.3", @@ -2073,7 +2155,7 @@ }, "rmarkdown": { "Package": "rmarkdown", - "Version": "2.27", + "Version": "2.28", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -2092,7 +2174,7 @@ "xfun", "yaml" ], - "Hash": "27f9502e1cdbfa195f94e03b0f517484" + "Hash": "062470668513dcda416927085ee9bdc7" }, "rprojroot": { "Package": "rprojroot", @@ -2302,6 +2384,19 @@ ], "Hash": "3f6e7e5e2220856ff865e4834766bf2b" }, + "textshaping": { + "Package": "textshaping", + "Version": "0.4.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cpp11", + "lifecycle", + "systemfonts" + ], + "Hash": "5142f8bc78ed3d819d26461b641627ce" + }, "tibble": { "Package": "tibble", "Version": "3.2.1", @@ -2558,6 +2653,13 @@ ], "Hash": "c7d3fd6d29ab077cbac8f0e2751449e6" }, + "whisker": { + "Package": "whisker", + "Version": "0.4.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": 
"c6abfa47a46d281a7d5159d0a8891e88" + }, "withr": { "Package": "withr", "Version": "3.0.1", @@ -2582,6 +2684,19 @@ ], "Hash": "00ce32f398db0415dde61abfef11300c" }, + "xml2": { + "Package": "xml2", + "Version": "1.3.6", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "methods", + "rlang" + ], + "Hash": "1d0336142f4cd25d8d23cd3ba7a8fb61" + }, "yaml": { "Package": "yaml", "Version": "2.3.10", diff --git a/vignettes/get-started.Rmd b/vignettes/get-started.Rmd index a433e19..e9d7f0e 100755 --- a/vignettes/get-started.Rmd +++ b/vignettes/get-started.Rmd @@ -54,18 +54,27 @@ The main goals of this analysis are: set enrichment analysis will be performed to determine if specific biological processes are up- or downregulated after feeding. +### Note + +The documentation of all the **SplineOmics** package functions can be viewed +[here](https://csbg.github.io/SplineOmics/reference) + # Load the packages -```{r setup} +```{r setup, eval = TRUE} library(SplineOmics) -library(readxl) +library(readxl) # for loading Excel files +library(here) # For managing filepaths +library(dplyr) # For data manipulation ``` # Load the files -In this example, the data.xlsx file contains the numeric values (the +In this example, the proteomics_data.rds file contains the numeric values (the intensities) and also the feature descriptions, such as gene and protein -name (= annotation part) +name (= annotation part). Usually, you would load the data from for example an +Excel file, but the .rds file is more compressed, which is the reason this +format was chosen here to limit the size of the SplineOmics package. The file meta.xlsx contains the meta information, which are the descriptions of the columns of the numeric values of data. @@ -74,15 +83,19 @@ descriptions of the columns of the numeric values of data. present on your system). 
Please note that this dataset is an actual experimental dataset, but the -annotation information, such as gene names, has been removed since it is -not yet published. Instead, the dataset includes randomly generated gene +annotation information, such as gene names, has been removed since it was +not yet published at the time of making the SplineOmics package public. Instead, +the dataset includes randomly generated gene symbols and gene names corresponding to Cricetulus griseus (Chinese Hamster) for each row. This is intended to demonstrate the functionality of the package. +The left part of data contains the numeric values, and the right part the +annotation info, which can be copied in a separate dataframe, as shown below. + ```{r load the files} -data_excel <- readRDS(system.file( +data <- readRDS(system.file( "extdata", "proteomics_data.rds", package = "SplineOmics" @@ -98,20 +111,22 @@ meta <- read_excel( ) # Extract the annotation part from the dataframe. -first_na_col <- which(is.na(data_excel[1,]))[1] -annotation <- data_excel |> - dplyr::select((first_na_col + 1):ncol(data_excel)) |> +first_na_col <- which(is.na(data[1,]))[1] +annotation <- data |> + dplyr::select((first_na_col + 1):ncol(data)) |> dplyr::slice(-c(1:3)) -print(data_excel) -print(annotation) +print(data) print(meta) +print(annotation) ``` ## Bring the Inputs into the Standardized Format -Since `data_excel` is not in the format required by the **SplineOmics** -package, it needs some processing. This can be done with a few commands +Since `data` is not in the format required by the **SplineOmics** +package, it needs some processing. The SplineOmics package requires data to be +a numeric matrix, so no element is allowed to be anything else than a number. +This can be done with a few commands in R, but if your file has a specific structure, the function `extract_data()` can handle this automatically. 
@@ -123,9 +138,11 @@ If your file looks like the one used here, where: - The **annotation info** is on the right - These fields are separated by one empty column +### Usage of the extract_data() function + Then, `extract_data()` can: -- **Identify the data matrix field** and convert it into a dataframe. +- **Identify the data matrix field** and return it as a numeric matrix. - **Create column headers** from the information written in the cells above the respective columns of the data matrix field. - **Assign rowheaders**: @@ -143,15 +160,20 @@ is shown individually, such as: - **Spline plots** with the datapoints from an individual feature. ```{r process inputs, eval = TRUE} -data <- extract_data( - data = data_excel, - feature_name_columns = c("Gene_name"), - user_prompt = FALSE +data <- SplineOmics::extract_data( + # The dataframe with the numbers on the left and info on the right. + data = data, + # Use this annotation column for the feature names. + feature_name_columns = c("Gene_name"), + # When TRUE, you must confirm that data is in the required format. + user_prompt = FALSE ) ``` # Perform EDA (exploratory data analysis) +Now that we have the data in the required format (numeric matrix) we can go on. + The first step in analyzing data is typically **Exploratory Data Analysis (EDA)**. EDA involves summarizing the main characteristics of the data, often through visualizations. @@ -165,17 +187,21 @@ Some common types of EDA plots include: - **PCA (Principal Component Analysis)** - **Correlation heatmaps** +Again, you can generate those plots yourself with a few lines of R code. +However, if you prefer, for convenience, the `explore_data()` function can +handle this for you. + ### Using `explore_data()` for EDA The **SplineOmics** package provides the function `explore_data()` to perform EDA. This function requires the following arguments: -- **data**: The data matrix. +- **data**: The numeric data matrix. - **meta**: The metadata table. 
- **condition**: The name of the column in the metadata that contains the levels of the experiment (e.g., "Exponential" and "Stationary"). - **report_info**: A list that contains general information about the - analysis. + analysis, such as the name of the analyst and the datatype (e.g. proteomics) ### Optional Arguments @@ -201,11 +227,14 @@ optional arguments: directory**, but this location can be changed using the `report_dir` argument. - The function also **returns all plots** generated during the - analysis. + analysis, so that you can modify them according to your own needs. - If you do not want a report to be generated, you can set the - `report` argument to `FALSE`. + `report` argument to `FALSE` (when you for example just want the figures + in the R environment) ```{r Load EDA arguments, eval = TRUE} +# Those fields are mandatory, because we believe that when such a report is +# opened after half a year, those infos can be very helpful. report_info <- list( omics_data_type = "PTX", data_description = "Proteomics data of CHO cells", @@ -221,26 +250,82 @@ report_dir <- here::here( ) ``` +## SplineOmics Object + +In the SplineOmics package, multiple functions take the same arguments as input. +To make this easier and to avoid errors, we decided that those arguments are not +provided individually to the functions, but are all stored in an R6 object +(which is of type 'SplineOmics') and then this object is passed to the +functions. Additionally, some functions generate intermediate output, which is +just necessary for the next function in the workflow, which is then also just +passed along by updating the SplineOmics object. But you don't have to worry +about this. + +### Functionality + +The SplineOmics object can be seen as a container where all necessary +arguments are stored. Each function retrieves the required arguments +from the object and potentially adds new data or results back into it. 
+ +### Documentation + +The documentation of the function that creates the SplineOmics object can be +found [here](https://csbg.github.io/SplineOmics/reference/create_splineomics.html) +and the documentation of the function that updates it +[here](https://csbg.github.io/SplineOmics/reference/update_splineomics.html). + +The documentation for each function that takes the SplineOmics object as input +specifies which arguments must be +present in the SplineOmics object when it is passed to the respective +function. + +## Required Arguments `create_splineomics()` + +- **data**: A matrix with the data +- **meta**: Metadata associated with the data. +- **condition**: Meta column name of the levels (e.g., Exponential and + Stationary). + +## Optional Arguments `create_splineomics()` + +- **rna_seq_data**: An object containing the preprocessed RNA-seq data, + such as the output from `limma::voom` function. +- **annotation**: A dataframe with the feature descriptions of data. +- **report_info**: A list containing general information about the + analysis. +- **meta_batch_column**: Column for meta batch information. +- **meta_batch2_column**: Column for secondary meta batch information. +- **design**: A limma design formula +- **spline_params**: Parameters for the spline functions. + ```{r Create the SplineOmics object, eval = TRUE} -splineomics <- create_splineomics( +# splineomics now contains the SplineOmics object. +splineomics <- SplineOmics::create_splineomics( data = data, meta = meta, annotation = annotation, report_info = report_info, - condition = "Phase", - meta_batch_column = "Reactor" + condition = "Phase", # Column of meta that contains the levels. + meta_batch_column = "Reactor" # For batch effect removal ) ``` +Now that we have the SplineOmics object defined, we can perform our exploratory +data analysis.
+ ```{r Run EDA function, eval = FALSE} -plots <- explore_data( - splineomics = splineomics, +plots <- SplineOmics::explore_data( + splineomics = splineomics, # SplineOmics object report_dir = report_dir ) ``` -[Here](https://csbg.github.io/SplineOmics_html_reports/explore_data_PTX_19_09_2024-13_43_21.html) you can see the HTML report of the explore_data() function with the NOT batch-corrected data, and [here](https://csbg.github.io/SplineOmics_html_reports/explore_batch_corrected_data_PTX_19_09_2024-13_43_21.html) the report for the batch-corrected data. +[Here](https://csbg.github.io/SplineOmics_html_reports/explore_data_PTX_19_09_2024-13_43_21.html) +you can see the HTML report of the explore_data() function with the NOT +batch-corrected data, and +[here](https://csbg.github.io/SplineOmics_html_reports/explore_batch_corrected_data_PTX_19_09_2024-13_43_21.html) +the report for the batch-corrected data. The EDA plots can tell you a range of things. The plots in the HTML report are grouped into three categories: Distribution and Variability @@ -267,9 +352,12 @@ the best "hyperparameters". In this context, hyperparameters include: ### Challenge of Hyperparameter Selection Rationally determining the best combination of hyperparameters can be -very challenging. Instead of manually selecting combinations, it is -often more effective to try out multiple combinations and choose the -best-performing one. +very challenging. By rationally, I mean deciding upon the final hyperparameters +without ever testing any, just by scientific reasoning. It is much easier just +testing a few and seeing how they actually behave. However, manually selecting +combinations can be tedious, and you have to work very systematically, which +can be challenging. To solve this problem, the `screen_limma_hyperparams()` +function was written. ### Using `screen_limma_hyperparams()` @@ -280,12 +368,12 @@ testing different combinations of hyperparameters. Here's how it works: values you want to test. 
- **Run combinations**: The function runs the **limma spline analysis** with combinations formed from the hyperparameters you've - provided. + provided in a semi combinatorial way. ### Inner vs. Outer Hyperparameters -Not every possible combination is generated. Instead, there are -**inner** and **outer** hyperparameters: +Semi combinatorial here means that not every possible combination is generated. +Instead, there are **inner** and **outer** hyperparameters: - **Outer hyperparameters**: These include things like **different versions of the dataset** (e.g., full dataset vs. dataset with @@ -297,14 +385,28 @@ Not every possible combination is generated. Instead, there are - For each version of the data (outer hyperparameter), all combinations of inner hyperparameters are tested. +This approach is necessary, because otherwise the amount of combos would +explode. + ### Example For example, if you have two versions of a dataset (one full dataset, and one with some outliers removed), these versions are considered outer -hyperparameters. The function will generate comparisons for both -versions of the dataset. +hyperparameters. Additionally, let's say you want to test two different limma +design formulas, formula 1 and 2.
The function will test out all combinations +of those outer hyperparameters and compare them with each other, which results +in a total of 6 combinations here: + +- **Full Dataset Formula 1** vs **Full Dataset Formula 2** +- **Full Dataset Formula 1** vs **Outliers Removed Dataset Formula 1** +- **Full Dataset Formula 1** vs **Outliers Removed Dataset Formula 2** -For each version, let's say you specify the following inner +- **Full Dataset Formula 2** vs **Outliers Removed Dataset Formula 1** +- **Full Dataset Formula 2** vs **Outliers Removed Dataset Formula 2** + +- **Outliers Removed Dataset Formula 1** vs **Outliers Removed Dataset Formula 2** + +Let's say you specified the following inner hyperparameters: - **Spline parameters**: Natural cubic splines with a degree of @@ -312,47 +414,75 @@ hyperparameters: - **Adjusted p-value threshold**: 0.05 or 0.1. The function will generate and test all combinations of the spline -parameters and p-value thresholds for both versions of the data: +parameters and p-value thresholds for all 4 combos: + +Combo 1: +- **DoF = 2, threshold = 0.05** +- **DoF = 3, threshold = 0.05** +- **DoF = 2, threshold = 0.1** +- **DoF = 3, threshold = 0.1** +Combo 2: - **DoF = 2, threshold = 0.05** - **DoF = 3, threshold = 0.05** - **DoF = 2, threshold = 0.1** - **DoF = 3, threshold = 0.1** +Combo 3: +... + This allows you to systematically explore different combinations and select the optimal hyperparameters for your analysis. +Below is an example for our proteomics data: + ```{r Load hyperparameter-screening args, eval = TRUE} data1 <- data meta1 <- meta +# Remove the "outliers" data2 <- data[, !(colnames(data) %in% c( "E12_TP05_Exponential", "E10_TP10_Stationary" ) )] + +# Adjust meta so that it matches data2 meta2 <- meta[!meta$`Sample.ID` %in% c( "E12_TP05_Exponential", "E10_TP10_Stationary" ), ] +# As mentioned above, all the values of one hyperparameter are stored +# and provided as a list. 
datas <- list(data1, data2) + +# This will be used to describe the versions of the data. datas_descr <- c( "full_data", "outliers_removed" ) metas <- list(meta1, meta2) + +# Test two different limma designs designs <- c( "~ 1 + Phase*X + Reactor", "~ 1 + X + Reactor" ) + +# Specify the meta "level" column condition <- "Phase" + report_dir <- here::here( "results", "hyperparams_screen_reports" ) + +# To remove the batch effect meta_batch_column = "Reactor" + +# Test out two different p-value thresholds (inner hyperparameter) pthresholds <- c( 0.05, 0.1 @@ -373,8 +503,11 @@ spline_test_configs <- data.frame( print(spline_test_configs) ``` +Now that we specified all the values for each hyperparameter that we want to +test, we can run the `screen_limma_hyperparams()` function. + ```{r Perform hyperparameter-screening, eval = FALSE} -screen_limma_hyperparams( +SplineOmics::screen_limma_hyperparams( splineomics, datas, datas_descr, @@ -387,16 +520,17 @@ screen_limma_hyperparams( ``` -You can view an example report +As mentioned, this function generates a report for each comparison of the outer +hyperparameters, which are too many to show here. You can view an example report [here](https://csbg.github.io/SplineOmics_html_reports/Data_1_Design_1_vs_Data_1_Design_2_PTX_19_09_2024-13_44_10.html) This report contains the results for the comparison of the "outer" -hyperparameters data 1 and design (formula) 1 against data 1 and design +hyperparameters data 1 and design (formula) 1 compared against data 1 and design 2. For both of those, all combinations of the "inner" hyperparameters are generated (every possible combination of all specified adj. p-value thresholds and spline configs). -The encoding of this is +The encoding used in the reports and the titles is [here](https://csbg.github.io/SplineOmics_html_reports/hyperparams_screen_meta_table_19_09_2024-13_44_10.html) (This is part of the output of the screen_limma_hyperparams function). 
@@ -413,32 +547,32 @@ overfitting, but also not underfitting), which was the reason those spline parameters were chosen. ```{r Update the SplineOmics object, eval = TRUE} -splineomics <- update_splineomics( +splineomics <- SplineOmics::update_splineomics( splineomics = splineomics, - design = "~ 1 + Phase*X + Reactor", - data = data2, + design = "~ 1 + Phase*X + Reactor", # best design formula + data = data2, # data without "outliers" was better meta = meta2, spline_params = list( - spline_type = c("n"), + spline_type = c("n"), # natural cubic splines dof = c(2L) ) ) - ``` -```{r limma spline analysis, eval = TRUE} +Run the `run_limma_splines()` function with the updated SplineOmics object: -# Run the limma spline analysis -splineomics <- run_limma_splines( +```{r limma spline analysis, eval = TRUE} +splineomics <- SplineOmics::run_limma_splines( splineomics ) ``` The output of the function run_limma_splines() is a named list, where -each element is a specific "category" of results. Refer to [this document](https://csbg.github.io/SplineOmics/articles/limma_result_categories.html) for an -explanation of the different result categories. Each of those elements -is a list, containing as elements the respective limma topTables, either -for each level or each comparison between two levels. +each element is a specific "category" of results. Refer to [this +document](https://csbg.github.io/SplineOmics/articles/limma_result_categories.html) +for an explanation of the different result categories. Each of those +elements is a list, containing as elements the respective limma +topTables, either for each level or each comparison between two levels. 
The element "time_effect" is a list, where each element is the topTable where the p-value for each feature for the respective level are @@ -463,7 +597,7 @@ report_dir <- here::here( "create_limma_reports" ) -plots <- create_limma_report( +plots <- SplineOmics::create_limma_report( splineomics, report_dir = report_dir ) @@ -477,19 +611,20 @@ function After we obtained the limma spline results, we can cluster the hits based on their temporal pattern (their spline shape). We define what a -hit is by setting an adj. p-value threshold for every level. Then, -hierarchical clustering is used to place every hit in one of as many +hit is by setting an adj. p-value threshold for every level. Hits are features +(e.g. proteins) that have an adj. p-value below the threshold. +Hierarchical clustering is used to place every hit in one of as many clusters as we have specified for that specific level. ```{r cluster the hits, eval = FALSE} -adj_pthresholds <- c( - 0.05, - 0.05 +adj_pthresholds <- c( # 0.05 for both levels + 0.05, # exponential + 0.05 # stationary ) clusters <- c( - 6L, - 3L + 6L, # 6 clusters for the exponential phase level + 3L # 3 clusters for the stationary phase level ) report_dir <- here::here( @@ -497,19 +632,26 @@ report_dir <- here::here( "clustering_reports" ) -plot_info = list( +plot_info = list( # For the spline plots y_axis_label = "log2 intensity", - time_unit = "min", - treatment_labels = c("Feeding"), - treatment_timepoints = c(0) + time_unit = "min", # our measurements were in minutes + treatment_labels = c("Feeding"), + treatment_timepoints = c(0) # Feeding was at 0 minutes. ) +# Get all the gene names. They are used for generating files +# which contents can be directly used as the input for the Enrichr webtool, +# if you prefer to manually perform the enrichment. Those files are +# embedded in the output HTML report and can be downloaded from there. 
gene_column_name <- "Gene_symbol" genes <- data_excel[[gene_column_name]][4:nrow(data_excel)] -clustering_results <- cluster_hits( +clustering_results <- SplineOmics::cluster_hits( splineomics = splineomics, - analysis_type = "time_effect", + # Cluster the hits from the time_effect results. You can also cluster + # the hits from the other two limma result categories by specifying + # it here with this argument. + analysis_type = "time_effect", adj_pthresholds = adj_pthresholds, clusters = clusters, genes = genes, @@ -523,11 +665,13 @@ You can view the generated analysis report of the cluster_hits function # Perform gene set enrichment analysis (GSEA) -To each clustered hit, the respective gene can be assigned and GSEA +Usually, the final step in such a bioinformatics analysis is GSEA. To each +clustered hit, the respective gene can be assigned and GSEA performed. For this, the Enrichr databases of choice have to be downloaded: ```{r download Enrichr databases, eval = FALSE} +# Specify which databases you want to download from Enrichr gene_set_lib <- c( "WikiPathways_2019_Human", "NCI-Nature_2016", @@ -543,7 +687,7 @@ gene_set_lib <- c( "Human_Gene_Atlas" ) -download_enrichr_databases(gene_set_lib) +SplineOmics::download_enrichr_databases(gene_set_lib) ``` Per default the file is placed in the current working directory, which @@ -555,15 +699,19 @@ report dir can be specified. The function create_gsea_report() runs GSEA using clusterProfiler, generates an HTML report and returns the GSEA dotplots in R. 
-```{r run GSEA, eval = FALSE} +```{r prepare arguments for GSEA, eval = FALSE} +# Specify the filepath of the TSV file with the database info downloaded_dbs_filepath <- here::here("all_databases_08_04_2024-12_41_50.tsv") -databases <- readr::read_tsv( +# Load the file +databases <- read.delim( downloaded_dbs_filepath, - col_types = readr::cols() - ) + sep = "\t", + stringsAsFactors = FALSE +) +# Specify the clusterProfiler parameters clusterProfiler_params <- list( adj_p_value = 0.05, pAdjustMethod = "BH", @@ -576,11 +724,19 @@ report_dir <- here::here( "results", "gsea_reports" ) +``` + +The function below runs the clusterProfiler for all clusters and all levels, +and generates the HTML report: -result <- create_gsea_report( +```{r run GSEA, eval = FALSE} +result <- SplineOmics::run_gsea( + # A dataframe with three columns: feature, cluster, and gene. Feature contains + # the integer index of the feature, cluster the integer specifying the cluster + # number, and gene the string of the gene, such as "CLSTN2". levels_clustered_hits = clustering_results$clustered_hits_levels, databases = databases, - params = clusterProfiler_params, + clusterProfiler_params = clusterProfiler_params, report_info = report_info, report_dir = report_dir ) @@ -589,12 +745,42 @@ result <- create_gsea_report( You can view the generated analysis report of the cluster_hits function [here](https://csbg.github.io/SplineOmics_html_reports/create_gsea_report_PTX_19_09_2024-13_47_33.html). -Every row in the dotplots is a term from a specific database, and the -columns are the respective clusters. The color scale contains the info -about the odds ratio and the size the -log10 adj. p-value. Only terms -that have \> 2 genes as support are included in the plot. Further, for -each cluster, just maximally 5 terms are shown (the terms with the -highest odds ratios). 
Note that when for example cluster 1 already has 5 -terms, and cluster 2 does not, and gets a term which was also found for -cluster 1, than this term would be included as the sixth term for -cluster 1, so this is a way the maximum of 5 can be exceeded. +This report first shows all enrichment results, where more than 2 genes +supported a term, in a tabular format. The table with all the terms with +\< 2 genes supporting it can be downloaded by clicking on a button below +that table. + +For the dotplots below that, every row is a term from a specific +database, and the columns are the respective clusters. The color scale +contains the info about the odds ratio and the size the -log10 adj. +p-value. Only terms that have \> 2 genes as support are included in the +plot. Further, for each cluster, just maximally 5 terms are shown (the +terms with the highest odds ratios). Note that when for example cluster +1 already has 5 terms, and cluster 2 does not, and gets a term which was +also found for cluster 1, then this term would be included as the sixth +term for cluster 1, so this is a way the maximum of 5 can be exceeded. + +If a phase, like stationary here, does not lead to any enrichment +results, that is stated with a red message. + +# Conclusion + +This example showed most functionalities of the SplineOmics package. You can +also run other datatypes with it, including timeseries RNA-seq and glycan data +(for those, refer to the documentation in the README file on the GitHub page +under Usage/RNA-seq and Glycan Data). + +To get an interactive version of this +example, download the SplineOmics package and run the function `open_tutorial()` +which opens an R Markdown file, where you can run the different code blocks and +if you are working in R Studio (which is recommended) you can easily check out +the values of the individual variables and generate the output reports yourself.
+ +When you run the function `open_template()` you get a minimal R Markdown file, +where the code is written so that you can use it as a skeleton to plug in your +own data and run it. + +We hope that the SplineOmics package makes your scientific data analysis easier. +If you face any problems (bugs in the code) or are not satisfied with the +documentation, open an issue on GitHub or check out the other options under the +Feedback section of the README on GitHub. Thank you! \ No newline at end of file