diff --git a/.Rbuildignore b/.Rbuildignore index 9128547..882673f 100755 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -16,4 +16,5 @@ ^CODE_OF_CONDUCT\.md$ ^inst/CITATION\.cff$ ^pkgdown$ +_pkgdown.yml$ diff --git a/NAMESPACE b/NAMESPACE index 5833e89..00898af 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ export(explore_data) export(extract_data) export(open_template) export(open_tutorial) +export(preprocess_rna_seq_data) export(run_gsea) export(run_limma_splines) export(screen_limma_hyperparams) diff --git a/R/cluster_hits.R b/R/cluster_hits.R index 9172adb..c283242 100755 --- a/R/cluster_hits.R +++ b/R/cluster_hits.R @@ -90,7 +90,7 @@ cluster_hits <- function( analysis_type = "time_effect", report = TRUE ) { - + report_dir <- normalizePath( report_dir, mustWork = FALSE @@ -226,6 +226,9 @@ cluster_hits <- function( # Add gene column for the run_gsea() function. clustered_hits_levels <- lapply(clustered_hits_levels, function(df) { + if (is.character(df)) { + return(df) + } df$gene <- genes[df$feature] return(df) }) @@ -490,7 +493,7 @@ make_clustering_report <- function( analysis_type, feature_name_columns ) { - + # Optionally remove the batch-effect with the batch column and design matrix # For mode == "integrated", the batch-effect is removed from the whole data # For mode == "isolated", the batch-effect is removed for every level diff --git a/R/preprocess_rna_seq_data.R b/R/preprocess_rna_seq_data.R new file mode 100644 index 0000000..503689a --- /dev/null +++ b/R/preprocess_rna_seq_data.R @@ -0,0 +1,124 @@ +# Exported function: preprocess_rna_seq_data() --------------------------------- + + +#' Perform default preprocessing of raw RNA-seq counts +#' +#' @description +#' The `preprocess_rna_seq_data()` function performs essential preprocessing +#' steps for raw RNA-seq counts. 
This includes creating a `DGEList` object, +#' normalizing the counts using the default TMM (Trimmed Mean of M-values) +#' normalization via the `edgeR::calcNormFactors` function, and applying the +#' `voom` transformation from the `limma` package to obtain log-transformed +#' counts per million (logCPM) with associated precision weights. If you +#' require a different normalization method, you can supply your own +#' custom normalization function. +#' +#' @param raw_counts A matrix of raw RNA-seq counts (genes as rows, samples as +#' columns). +#' @param meta A dataframe containing the metadata for `raw_counts`. +#' @param spline_params Parameters for the spline functions. Must contain +#' the named elements spline_type, which must contain either the string "n" for +#' natural cubic splines, or "b", for B-splines, the named element degree in the +#' case of B-splines, that must contain only an integer, and the named element +#' dof, specifying the degrees of freedom, containing an integer and required +#' both for natural and B-splines. +#' @param design A design formula for the limma analysis, such as +#' '~ 1 + Phase*X + Reactor'. +#' @param normalize_func An optional normalization function. If provided, this +#' function will be used to normalize the `DGEList` object. If not provided, +#' TMM normalization (via `edgeR::calcNormFactors`) will be used by default. +#' Must take as +#' input the y of: y <- edgeR::DGEList(counts = raw_counts) and output the y +#' with the normalized counts. +#' @return A `voom` object, which includes the log2-counts per million (logCPM) +#' matrix and observation-specific weights. 
+#' @details If `edgeR` is not installed, interactive sessions are offered an
+#' installation prompt; non-interactive sessions stop with an error instead.
+#' @importFrom limma voom
+#' @export
+#'
+preprocess_rna_seq_data <- function(
+  raw_counts,
+  meta,
+  spline_params,
+  design,
+  normalize_func = NULL
+) {
+  message("Preprocessing RNA-seq data (normalization + voom)...")
+
+  # edgeR is optional. Only prompt when a user is present: readline() returns
+  # "" at once in non-interactive sessions, so the loop would never terminate.
+  if (!requireNamespace("edgeR", quietly = TRUE)) {
+    if (!interactive()) {
+      stop(
+        "The 'edgeR' package is required but not installed. Install it ",
+        "with BiocManager::install('edgeR') and re-run the function.",
+        call. = FALSE
+      )
+    }
+    message("The 'edgeR' package is not installed.")
+
+    # Single-line pieces keep source indentation out of the console output.
+    prompt <- paste0(
+      "What would you like to do?\n",
+      "1: Automatically install edgeR\n",
+      "2: Manually install edgeR\n",
+      "3: Cancel\n",
+      "Please enter 1, 2, or 3: "
+    )
+
+    repeat {
+      user_input <- trimws(readline(prompt = prompt))
+      if (user_input == "1") {
+        message("Attempting to install 'edgeR' automatically ",
+                "from Bioconductor...")
+        # Failures may surface only as warnings, so verify by loading edgeR.
+        installed <- tryCatch(
+          {
+            if (!requireNamespace("BiocManager", quietly = TRUE)) {
+              utils::install.packages("BiocManager")
+            }
+            BiocManager::install("edgeR", update = FALSE)
+            requireNamespace("edgeR", quietly = TRUE)
+          },
+          error = function(e) FALSE
+        )
+        if (!installed) {
+          stop(
+            "Automatic installation of 'edgeR' failed. ",
+            "Please install it manually and try again.",
+            call. = FALSE
+          )
+        }
+        break
+      } else if (user_input == "2") {
+        stop(
+          "Please install 'edgeR' manually using ",
+          "BiocManager::install('edgeR') and then re-run the function.",
+          call. = FALSE
+        )
+      } else if (user_input == "3") {
+        stop("Operation canceled by the user.", call. = FALSE)
+      } else {
+        message("Invalid input. Please enter 1, 2, or 3.")
+      }
+    }
+  }
+
+  design_matrix <- design2design_matrix(
+    meta = meta, spline_params = spline_params, level_index = 1, design = design
+  )
+
+  # Step 1: Create DGEList object from raw counts
+  y <- edgeR::DGEList(counts = raw_counts)
+
+  # Step 2: Normalize; is.function(NULL) is FALSE, so NULL falls back to TMM
+  if (is.function(normalize_func)) {
+    y <- normalize_func(y)
+  } else {
+    y <- edgeR::calcNormFactors(y)
+  }
+
+  # Step 3: Apply voom transformation to get logCPM values and weights
+  limma::voom(y, design_matrix)
+}
\ No newline at end of file
diff --git a/R/run_limma_splines.R b/R/run_limma_splines.R
index 28ba398..9a1a0c8 100755
--- a/R/run_limma_splines.R
+++ b/R/run_limma_splines.R
@@ -13,8 +13,11 @@
 #' @param splineomics An S3 object of class `SplineOmics` that contains the
 #' following elements:
 #' \itemize{
-#' \item \code{data}: The original expression dataset used for differential
-#' expression analysis.
+#' \item \code{data}: The matrix of the omics dataset, with the feature
+#' names optionally as row headers.
+#' \item \code{rna_seq_data}: An object containing the preprocessed
+#' RNA-seq data,
+#' such as the output from `limma::voom` or a similar preprocessing pipeline.
 #' \item \code{meta}: A dataframe containing metadata corresponding to the
 #' \code{data}, must include a 'Time' column and the column specified by
 #' \code{condition}.
@@ -79,13 +82,13 @@ run_limma_splines <- function( input_control$auto_validate() data <- splineomics[["data"]] - preprocess_rna_seq <- splineomics[["preprocess_rna_seq"]] - normalization_fun <- splineomics[["normalization_fun"]] + rna_seq_data <- splineomics[["rna_seq_data"]] meta <- splineomics[["meta"]] spline_params <- splineomics[["spline_params"]] padjust_method <- splineomics[["padjust_method"]] feature_names <- rownames(data) + data_copy <- data rownames(data_copy) <- NULL # To just have numbers describing the rows @@ -97,8 +100,7 @@ run_limma_splines <- function( within_level, spline_params = spline_params, data = data_copy, - preprocess_rna_seq = preprocess_rna_seq, - normalization_fun = normalization_fun, + rna_seq_data = rna_seq_data, meta = meta, design = design, condition = condition, @@ -119,22 +121,6 @@ run_limma_splines <- function( purrr::map_chr(results_list, "name") ) - # For RNA-seq data, voom$E data matrices must be passed to cluster_hits() - voom_matrices <- lapply( - results_list, - function(x) x$voom_data_matrix_level - ) - - if (!any(sapply(voom_matrices, is.null))) { - if (args$mode == "isolated") { - data <- do.call(rbind, voom_matrices) # Combine from all levels - } else { # mode == "integrated" - # All levels contain the full data. Can just take the first one. - data <- voom_matrices[[1]] - } - rownames(data) <- feature_names # Readd the original row headers. 
- } - # Factor and Factor:Time comparisons between levels between_level_condition_only <- list() between_level_condition_time <- list() # Factor AND time @@ -144,8 +130,7 @@ run_limma_splines <- function( for (lev_combo in level_combinations) { result <- between_level( data = data_copy, - preprocess_rna_seq = preprocess_rna_seq, - normalization_fun = normalization_fun, + rna_seq_data = rna_seq_data, meta = meta, design = design, spline_params = spline_params, @@ -185,10 +170,10 @@ run_limma_splines <- function( avrg_diff_conditions = between_level_condition_only, interaction_condition_time = between_level_condition_time ) - + splineomics <- update_splineomics( splineomics = splineomics, - data = data, # In case voom_data_matrix has been generated. + data = data, limma_splines_result = limma_splines_result ) } @@ -205,8 +190,8 @@ run_limma_splines <- function( #' within a condition. #' #' @param data A matrix of data values. -#' @param preprocess_rna_seq Boolean specifying whether to preprocess RNA seq -#' @param normalization_fun Function for normalizing RNA-seq raw-counts. +#' @param rna_seq_data An object containing the preprocessed RNA-seq data, +#' such as the output from `limma::voom` or a similar preprocessing pipeline. #' @param meta A dataframe containing metadata, including a 'Time' column. #' @param design A design formula or matrix for the LIMMA analysis. #' @param spline_params A list of spline parameters for the analysis. 
@@ -233,8 +218,7 @@ run_limma_splines <- function( #' between_level <- function( data, - preprocess_rna_seq, - normalization_fun, + rna_seq_data, meta, design, spline_params, @@ -255,13 +239,9 @@ between_level <- function( level_index = 1, design = design ) - - if (preprocess_rna_seq) { - data <- preprocess_rna_seq_data( - raw_counts = data, - design_matrix = design_matrix, - normalization_fun - ) + + if (!is.null(rna_seq_data)) { + data <- rna_seq_data } fit <- limma::lmFit( @@ -336,11 +316,11 @@ between_level <- function( #' #' @param level The level within the condition to process. #' @param level_index The index of the level within the condition. -#' @param spline_params A list of spline parameters for the analysis. +#' @param spline_params A list of spline parameters for the analysis. #' @param data A matrix of data values. -#' @param preprocess_rna_seq Boolean specifying whether to preprocess RNA seq -#' @param normalization_fun Function to normalize RNA-seq raw counts. -#' @param meta A dataframe containing metadata. +#' @param rna_seq_data An object containing the preprocessed RNA-seq data, +#' such as the output from `limma::voom` or a similar preprocessing pipeline. +#' @param meta A dataframe containing the metadata for data. #' @param design A design formula or matrix for the limma analysis. #' @param condition A character string specifying the condition. #' @param feature_names A non-empty character vector of feature names. 
@@ -361,8 +341,7 @@ within_level <- function( level_index, spline_params, data, - preprocess_rna_seq, - normalization_fun, + rna_seq_data, meta, design, condition, @@ -388,8 +367,7 @@ within_level <- function( result <- process_within_level( data = data_copy, - preprocess_rna_seq = preprocess_rna_seq, - normalization_fun = normalization_fun, + rna_seq_data = rna_seq_data, meta = meta_copy, design = design, spline_params = spline_params, @@ -410,8 +388,7 @@ within_level <- function( list( name = results_name, - top_table = top_table, - voom_data_matrix_level = result$voom_data_matrix + top_table = top_table ) } @@ -419,105 +396,6 @@ within_level <- function( # Level 2 internal functions --------------------------------------------------- -#' Perform default preprocessing of raw RNA-seq counts -#' -#' @description -#' This function is called when `preprocess_rna_seq` is `TRUE`. It performs the -#' default preprocessing steps for raw RNA-seq counts, including creating a -#' `DGEList` object, normalizing the counts, and applying the `voom` -#' transformation. -#' -#' @param raw_counts A matrix of raw RNA-seq counts (genes as rows, samples as -#' columns). -#' @param design_matrix A design matrix used in the linear modeling, typically -#' specifying the experimental conditions. -#' @param normalize_func An optional normalization function. If provided, this -#' function will be used to normalize the `DGEList` object. If not provided, -#' TMM normalization (via `edgeR::calcNormFactors`) will be used by default. -#' -#' @return A `voom` object, which includes the log2-counts per million (logCPM) -#' matrix and observation-specific weights. 
-#' -#' @importFrom limma voom -#' -preprocess_rna_seq_data <- function( - raw_counts, - design_matrix, - normalize_func = NULL -) { - - message("Preprocessing RNA-seq data (normalization + voom)...") - - # Check if edgeR is installed; if not, prompt the user - if (!requireNamespace("edgeR", quietly = TRUE)) { - message("The 'edgeR' package is not installed.") - - # Prompt user for action - repeat { - user_input <- readline( - prompt = - "What would you like to do?\n - 1: Automatically install edgeR\n - 2: Manually install edgeR\n - 3: Cancel\n - Please enter 1, 2, or 3: " - ) - - if (user_input == "1") { - # Try to install edgeR automatically from Bioconductor - message("Attempting to install 'edgeR' automatically - from Bioconductor...") - if (!requireNamespace("BiocManager", quietly = TRUE)) { - utils::install.packages("BiocManager") - } - tryCatch( - { - BiocManager::install("edgeR", update = FALSE) - }, - error = function(e) { - stop( - "Automatic installation of 'edgeR' failed. - Please install it manually and try again.", - call. = FALSE - ) - } - ) - break # Exit the loop if installation is successful - } else if (user_input == "2") { - stop( - "Please install 'edgeR' manually using - BiocManager::install('edgeR') and then re-run the function.", - call. = FALSE - ) - } else if (user_input == "3") { - stop("Operation canceled by the user.", call. = FALSE) - } else { - message("Invalid input. 
Please enter 1, 2, or 3.") - } - } - } - - # Step 1: Create DGEList object from raw counts - y <- edgeR::DGEList(counts = raw_counts) - - # Step 2: Apply the normalization function (either user-provided or default) - if (!is.null(normalize_func) && is.function(normalize_func)) { - y <- normalize_func(y) # user provided normalisation function - } else { - # Default: Normalize the counts using TMM normalization - y <- edgeR::calcNormFactors(y) - } - - # Step 3: Apply voom transformation to get logCPM values and weights - voom_obj <- limma::voom( - y, - design_matrix - ) - - return(voom_obj) -} - - #' Process Top Table #' #' @description @@ -525,9 +403,8 @@ preprocess_rna_seq_data <- function( #' intercepts. #' #' @param process_within_level_result List of lists containing the limma -#' topTable, fit, and optionally the voom -#' object. All of this is from one specific -#' level. +#' topTable, and fit. All of this is from +#' one specific level. #' @param feature_names A non-empty character vector of feature names. #' #' @return A dataframe containing the processed top table with added intercepts. @@ -549,13 +426,11 @@ process_top_table <- function( top_table, feature_names ) - + intercepts <- as.data.frame(stats::coef(fit)[, "(Intercept)", drop = FALSE]) - intercepts_ordered <- intercepts[match(top_table$feature_nr, - rownames(intercepts)), , - drop = FALSE] + intercepts_ordered <- intercepts[top_table$feature_nr, , drop = FALSE] top_table$intercept <- intercepts_ordered[, 1] - + top_table } @@ -568,8 +443,8 @@ process_top_table <- function( #' analysis for a selected level of a factor #' #' @param data A matrix of data values. -#' @param preprocess_rna_seq Boolean specifying whether to preprocess RNA seq -#' @param normalization_fun Function for normalizing RNA-seq raw counts. +#' @param rna_seq_data An object containing the preprocessed RNA-seq data, +#' such as the output from `limma::voom` or a similar preprocessing pipeline. 
#' @param meta A dataframe containing metadata, including a 'Time' column. #' @param design A design formula or matrix for the limma analysis. #' @param spline_params A list of spline parameters for the analysis. @@ -589,8 +464,7 @@ process_top_table <- function( #' process_within_level <- function( data, - preprocess_rna_seq, - normalization_fun, + rna_seq_data, meta, design, spline_params, @@ -605,17 +479,10 @@ process_within_level <- function( design ) - if (preprocess_rna_seq) { - data <- preprocess_rna_seq_data( - raw_counts = data, - design_matrix = design_matrix, - normalize_func = normalization_fun - ) - voom_data_matrix <- data$E - } else { - voom_data_matrix = NULL + if (!is.null(rna_seq_data)) { + data <- rna_seq_data } - + fit <- limma::lmFit( data, design_matrix @@ -639,13 +506,11 @@ process_within_level <- function( list( top_table = top_table, - fit = fit, - voom_data_matrix = voom_data_matrix + fit = fit ) } - # Level 3 internal functions --------------------------------------------------- @@ -666,7 +531,26 @@ modify_limma_top_table <- function( top_table, feature_names ) { - + + is_integer_string <- function(x) { + return(grepl("^[0-9]+$", x)) + } + + # Because the row headers of a potential rna_seq_data object were not + # converted to ints (written as strings) beforehand. This is run only when + # the row headers are still "real" strings. 
+ if (!all(sapply(rownames(top_table), is_integer_string))) { + rownames(top_table) <- sapply( + rownames(top_table), + function(id) { + # Find the index of the current row name in feature_names + index <- which(feature_names == id) + # Return the index as a string + return(as.character(index)) + } + ) + } + top_table <- tidyr::as_tibble( top_table, rownames = "feature_nr" @@ -682,6 +566,6 @@ modify_limma_top_table <- function( # Sort and add feature names based on the feature_nr sorted_feature_names <- feature_names[top_table$feature_nr] top_table <- top_table |> dplyr::mutate(feature_names = sorted_feature_names) - + return(top_table) } diff --git a/R/screen_limma_hyperparams.R b/R/screen_limma_hyperparams.R index 91d5313..e82b4d7 100755 --- a/R/screen_limma_hyperparams.R +++ b/R/screen_limma_hyperparams.R @@ -38,7 +38,7 @@ #' removeBatchEffect supports a maximum of #' two batch columns.) #' } -#' @param datas A list of data frames containing the datasets to be analyzed. +#' @param datas A list of matrices containing the datasets to be analyzed. #' @param datas_descr A description object for the data. #' @param metas A list of data frames containing metadata for each dataset in #' `datas`. @@ -47,6 +47,8 @@ #' @param report_dir A non-empty string specifying the report directory. #' @param adj_pthresholds A numeric vector of p-value thresholds for #' significance determination. +#' @param rna_seq_datas A list of RNA-seq data objects, such as the voom object +#' derived from the limma::voom function. #' @param time_unit A character string specifying the time unit label for plots. #' @param padjust_method A character string specifying the method for p-value #' adjustment. @@ -68,10 +70,15 @@ screen_limma_hyperparams <- function( spline_test_configs, report_dir = here::here(), adj_pthresholds = c(0.05), + rna_seq_datas = NULL, time_unit = "min", # For the plot labels padjust_method = "BH" ) { + if (is.null(rna_seq_datas)) { # Set the default value. 
+ rna_seq_datas <- vector("list", length(datas)) + } + report_dir <- normalizePath( report_dir, mustWork = FALSE @@ -109,6 +116,7 @@ screen_limma_hyperparams <- function( top_tables_combos <- get_limma_combos_results( datas = datas, + rna_seq_datas = rna_seq_datas, metas = metas, designs = designs, modes = modes, @@ -175,6 +183,8 @@ screen_limma_hyperparams <- function( #' spline configurations using the LIMMA method. #' #' @param datas A list of matrices. +#' @param rna_seq_datas A list of RNA-seq data objects, such as the voom object +#' derived from the limma::voom function. #' @param metas A list of metadata corresponding to the data matrices. #' @param designs A list of design matrices. #' @param modes A character vector containing 'isolated' or 'integrated'. @@ -196,6 +206,7 @@ screen_limma_hyperparams <- function( #' get_limma_combos_results <- function( datas, + rna_seq_datas, metas, designs, modes, @@ -223,6 +234,7 @@ get_limma_combos_results <- function( combos, process_combo, datas = datas, + rna_seq_datas = rna_seq_datas, metas = metas, designs = designs, modes = modes, @@ -279,7 +291,6 @@ plot_limma_combos_results <- function( ) ) - combos_separated <- lapply(unique(names_extracted), function(id) { top_tables_combos[names_extracted == id] }) @@ -288,11 +299,14 @@ plot_limma_combos_results <- function( combos <- names(combos_separated) combo_pairs <- combn(combos, 2, simplify = FALSE) - + print("Generating the plots for all pairwise hyperparams-combo comparisons") progress_ticks <- length(combo_pairs) - pb <- progress::progress_bar$new(total = progress_ticks, - format = "[:bar] :percent") + pb <- progress::progress_bar$new( + total = progress_ticks, + format = "[:bar] :percent" + ) + pb$tick(0) time_unit_label <- paste0("[", time_unit, "]") @@ -314,7 +328,7 @@ plot_limma_combos_results <- function( purrr::map(combo_pairs, function(pair) { combo_pair <- combos_separated[pair] - + hitcomp <- gen_hitcomp_plots(combo_pair) composites <- 
purrr::map(combo_pair, function(combo) { @@ -519,6 +533,8 @@ generate_reports_meta <- function( #' spline_test_configs list. #' @param pthreshold The p-value threshold for significance. #' @param datas A list of data matrices +#' @param rna_seq_datas A list of RNA-seq data objects, such as the voom object +#' derived from the limma::voom function. #' @param metas A list of metadata corresponding to the data matrices. #' @param designs A list of design matrices. #' @param modes A character vector containing 'isolated' or 'integrated'. @@ -541,6 +557,7 @@ process_combo <- function( spline_config_index, pthreshold, datas, + rna_seq_datas, metas, designs, modes, @@ -552,6 +569,7 @@ process_combo <- function( ) { data <- datas[[data_index]] + rna_seq_data <- rna_seq_datas[[data_index]] meta <- metas[[data_index]] design <- designs[[design_index]] mode <- modes[[design_index]] @@ -575,12 +593,13 @@ process_combo <- function( splineomics <- create_splineomics( data = data, + rna_seq_data = rna_seq_data, meta = meta, design = design, spline_params = spline_params, condition = condition, ) - + # suppressMessages will not affect warnings and error messages! 
result <- suppressMessages(run_limma_splines(splineomics)) @@ -748,10 +767,50 @@ hc_add <- function( #' @importFrom purrr flatten_chr #' hc_vennheatmap <- function(hc_obj) { - + hits_1 <- store_hits(hc_obj$data[[1]]) hits_2 <- store_hits(hc_obj$data[[2]]) + + color_palette <- c("white", "blue", "yellow", "green") + breaks <- c(-0.5, 0.5, 1.5, 2.5, 3.5) + # Check if all elements in hits_1 and hits_2 are character(0) + no_hits_1 <- all(sapply(hits_1, function(x) length(x) == 0)) + no_hits_2 <- all(sapply(hits_2, function(x) length(x) == 0)) + + # If both have no hits, create a placeholder plot for no hits + if (no_hits_1 && no_hits_2) { + # Create a simple empty matrix for the plot + venn_matrix <- matrix( + 0, + nrow = 1, + ncol = 1, + dimnames = list("No Hits", "No Hits") + ) + + plot_title <- sprintf( + "No hits found for %s and %s", + hc_obj$condition_names[[1]], + hc_obj$condition_names[[2]] + ) + + # Continue with your plotting code + vennheatmap_plot <- pheatmap::pheatmap( + venn_matrix, color = color_palette, + breaks = breaks, + cluster_cols = FALSE, + cluster_rows = FALSE, + show_rownames = TRUE, + show_colnames = TRUE, + border_color = NA, + main = plot_title, + silent = TRUE, + fontsize = 6 + ) + + return(list(vennheatmap = vennheatmap_plot, nrhits = 0)) + } + df <- tidyr::expand_grid( features = union( flatten_chr(hits_1), @@ -786,12 +845,9 @@ hc_vennheatmap <- function(hc_obj) { values_from = !!rlang::sym("x")) |> tibble::column_to_rownames("features") |> as.matrix() - + venn_matrix <- venn_matrix[, order(colnames(venn_matrix))] - color_palette <- c("white", "blue", "yellow", "green") - breaks <- c(-0.5, 0.5, 1.5, 2.5, 3.5) - plot_title <- sprintf("0 -> none, 1 -> %s, 2 -> %s, 3 -> both", hc_obj$condition_names[[1]], hc_obj$condition_names[[2]]) @@ -1360,17 +1416,7 @@ plot_composite_splines <- function( args <- list(x = smooth_timepoints, intercept = FALSE) args$df <- spline_test_configs$dof[[config_index]] - - # if 
(!is.na(spline_test_configs$dof[[config_index]])) { - # args$df <- spline_test_configs$dof[[config_index]] - # } else { - # args$knots <- spline_test_configs$knots[[config_index]] - # } - # - # if (!is.na(spline_test_configs$bknots[[config_index]])) { - # args$Boundary.knots <- spline_test_configs$bknots[[config_index]] - # } - + if (spline_test_configs$spline_type[config_index] == "b") { args$degree <- spline_test_configs$degree[[config_index]] X <- do.call(splines::bs, args) diff --git a/R/splineomics_object.R b/R/splineomics_object.R index 0a69fcc..117df73 100755 --- a/R/splineomics_object.R +++ b/R/splineomics_object.R @@ -1,25 +1,25 @@ -#' Create and update the SplineOmics object -#' ===== -#' -#' Description -#' ----------- -#' Contains the functions to create and update a SplineOmics object. This object -#' is used to collect function arguments, that are equivalent for more than one -#' exported function of the SplineOmics package. Additionally -#' -#' Functions -#' --------- -#' - create_splineomics: Create a SplineOmics object -#' - update_splineomics: Add additional arguments to the SplineOmics -#' object or overwrite existing arguments. -#' -#' Classes -#' ------- -#' None -#' -#' Notes -#' ----- -#' None +# Create and update the SplineOmics object +# ===== +# +# Description +# ----------- +# Contains the functions to create and update a SplineOmics object. This object +# is used to collect function arguments, that are equivalent for more than one +# exported function of the SplineOmics package. Additionally +# +# Functions +# --------- +# - create_splineomics: Create a SplineOmics object +# - update_splineomics: Add additional arguments to the SplineOmics +# object or overwrite existing arguments. 
+# +# Classes +# ------- +# None +# +# Notes +# ----- +# None # Exported functions ----------------------------------------------------------- @@ -31,9 +31,17 @@ #' Creates a SplineOmics object containing variables that are commonly used #' across multiple functions in the package. #' -#' @param data The actual omics data. +#' @param data The actual omics data. In the case the rna_seq_data argument is +#' used, still provide this argument. In that case, input the data matrix in +#' here (for example the $E part of the voom object). Assign your feature names +#' as row headers (otherwise, just numbers will be your feature names). #' @param meta Metadata associated with the omics data. #' @param condition A condition variable. +#' @param rna_seq_data An object containing the preprocessed RNA-seq data, +#' such as the output from `limma::voom` or a similar preprocessing pipeline. +#' This argument is not controlled by any function of the `SplineOmics` package. +#' Rather, in that regard it relies on the input control from the `limma::lmFit` +#' function. #' @param annotation A dataframe with the feature descriptions of data #' (optional). #' @param report_info A list containing report information such as omics data @@ -50,11 +58,12 @@ #' created. Use the same vector that was used to #' create the row headers for the data matrix! #' @param design A design matrix or similar object (optional). -#' @param spline_params Parameters for spline functions (optional). -#' @param preprocess_rna_seq Boolean specifying whether to preprocess RNA seq -#' @param normalization_fun Function used for normalizing RNA-seq. Must take as -#' input the y of: y <- edgeR::DGEList(counts = raw_counts) and output the y -#' with the normalized counts. +#' @param spline_params Parameters for spline functions (optional). 
Must contain +#' the named elements spline_type, which must contain either the string "n" for +#' natural cubic splines, or "b", for B-splines, the named element degree in the +#' case of B-splines, that must contain only an integer, and the named element +#' dof, specifying the degree of freedom, containing an integer and required +#' both for natural and B-splines. #' @param padjust_method Method for p-value adjustment, one of "none", "BH", #' "BY", "holm", "bonferroni", "hochberg", or "hommel". #' Defaults to "BH" (Benjamini-Hochberg). @@ -64,9 +73,10 @@ #' @export #' create_splineomics <- function( - data, + data, meta, condition, + rna_seq_data = NULL, annotation = NULL, report_info = NULL, meta_batch_column = NULL, @@ -74,14 +84,12 @@ create_splineomics <- function( feature_name_columns = NULL, design = NULL, spline_params = NULL, - preprocess_rna_seq = FALSE, - normalization_fun = NULL, padjust_method = "BH" ) { - + splineomics <- list( data = data, - preprocess_rna_seq = preprocess_rna_seq, + rna_seq_data = rna_seq_data, meta = meta, condition = condition, annotation = annotation, @@ -122,6 +130,7 @@ update_splineomics <- function( allowed_fields <- c( "data", + "rna_seq_data", "meta", "condition", "annotation", @@ -135,6 +144,7 @@ update_splineomics <- function( ) args <- list(...) + for (name in names(args)) { if (!(name %in% allowed_fields)) { stop(paste("Field", name, "is not allowed.")) diff --git a/R/utils_input_validation.R b/R/utils_input_validation.R index 0f05d34..b4b3eff 100755 --- a/R/utils_input_validation.R +++ b/R/utils_input_validation.R @@ -529,6 +529,18 @@ InputControl <- R6::R6Class("InputControl", call. 
= FALSE) } + # Ensure that the formula begins with an intercept (~ 1) + # Ignore whitespace, check the start of the string + if (!grepl("^\\s*~\\s*1", formula)) { + stop( + paste( + "The design formula must start with an intercept term '~ 1'.", + "This is because spline curves are plotted onto the data", + "which is not possible without an intercept" + ), + call. = FALSE) + } + # Ensure the formula contains the intercept term 'X' if (!grepl("\\bX\\b", formula)) { stop("The design formula must include the term 'X'.", @@ -2406,14 +2418,12 @@ check_splineomics_elements <- function( "report_info" ), "screen_limma_hyperparams" = c( - "preprocess_rna_seq", "condition", "report_info", "padjust_method" ), "run_limma_splines" = c( "data", - "preprocess_rna_seq", "meta", "design", "condition", diff --git a/README.Rmd b/README.Rmd index 754f54b..1499b2b 100755 --- a/README.Rmd +++ b/README.Rmd @@ -210,7 +210,18 @@ An explanation of the three different `limma` results is [here](https://csbg.git #### RNA-seq data -Transcriptomics data must be preprocessed for `limma`. This is done by setting the preprocess_rna_seq argument to TRUE (see [documentation of the create_splineomics function](https://csbg.github.io/SplineOmics/reference/create_splineomics.html)). Then, the raw RNA-seq counts provided in the data matrix will undergo normalization and transformation. The default normalization is performed using TMM (Trimmed Mean of M-values) normalization via the `edgeR`::calcNormFactors function, followed by the voom transformation from the `limma` package to obtain log-transformed counts per million (logCPM) with associated precision weights. If you require a different normalization method, you can supply your custom normalization function. +Transcriptomics data must be preprocessed for `limma`. 
You need to provide an +appropriate object, such as a `voom` object, in the `rna_seq_data` argument of +the `SplineOmics` object (see +[documentation](https://csbg.github.io/SplineOmics/reference/create_splineomics.html)). +Along with this, the normalized matrix +(e.g., the `$E` slot of the `voom` object) must be passed to the `data` +argument. This allows flexibility in preprocessing; you can use any method +you prefer as long as the final object and matrix are compatible with limma. +One way to preprocess your RNA-seq data is by using the `preprocess_rna_seq_data()` +function included in the `SplineOmics` package +(see [documentation](https://csbg.github.io/SplineOmics/reference/preprocess_rna_seq_data.html)). + #### Glycan fractional abundance data diff --git a/README.md b/README.md index 7b6de36..dcfcae2 100755 --- a/README.md +++ b/README.md @@ -269,17 +269,17 @@ An explanation of the three different `limma` results is #### RNA-seq data -Transcriptomics data must be preprocessed for `limma`. This is done by -setting the preprocess_rna_seq argument to TRUE (see [documentation of -the create_splineomics -function](https://csbg.github.io/SplineOmics/reference/create_splineomics.html)). -Then, the raw RNA-seq counts provided in the data matrix will undergo -normalization and transformation. The default normalization is performed -using TMM (Trimmed Mean of M-values) normalization via the -`edgeR`::calcNormFactors function, followed by the voom transformation -from the `limma` package to obtain log-transformed counts per million -(logCPM) with associated precision weights. If you require a different -normalization method, you can supply your custom normalization function. +Transcriptomics data must be preprocessed for `limma`. You need to +provide an appropriate object, such as a `voom` object, in the +`rna_seq_data` argument of the `SplineOmics` object (see +[documentation](https://csbg.github.io/SplineOmics/reference/create_splineomics.html)). 
+Along with this, the normalized matrix (e.g., the `$E` slot of the +`voom` object) must be passed to the `data` argument. This allows +flexibility in preprocessing; you can use any method you prefer as long +as the final object and matrix are compatible with limma. One way to +preprocess your RNA-seq data is by using the `preprocess_rna_seq_data()` +function included in the `SplineOmics` package (see +[documentation](https://csbg.github.io/SplineOmics/reference/preprocess_rna_seq_data.html)). #### Glycan fractional abundance data diff --git a/dev/function_testing_ground.R b/dev/function_testing_ground.R index f0e1777..b08ab0f 100755 --- a/dev/function_testing_ground.R +++ b/dev/function_testing_ground.R @@ -79,29 +79,39 @@ data <- extract_data( # Simulate RNA-seq data to test voom functionality ----------------------------- -# generate_rnaseq_data <- function(n_genes = 1000, n_samples = 36) { -# set.seed(123) # For reproducibility -# -# # Define sample and gene names -# gene_names <- paste0("Gene", 1:n_genes) -# sample_names <- paste0("Sample", 1:n_samples) -# -# # Generate random raw RNA-seq counts (Poisson distributed) -# # Base expression level with some variability -# base_expression <- rpois(n_genes, lambda = 20) # Baseline counts -# counts_matrix <- sapply(1:n_samples, function(x) rpois(n_genes, lambda = base_expression)) -# -# # Assign row and column names -# rownames(counts_matrix) <- gene_names -# colnames(counts_matrix) <- sample_names -# -# return(counts_matrix) -# } -# -# # Example usage: -# n_genes <- 7162 # Adjust the number of genes as needed +generate_rnaseq_data <- function(n_genes = 1000, n_samples = 36) { + set.seed(123) # For reproducibility + + # Define sample and gene names + gene_names <- paste0("Gene", 1:n_genes) + sample_names <- paste0("Sample", 1:n_samples) + + # Generate random raw RNA-seq counts (Poisson distributed) + # Base expression level with some variability + base_expression <- rpois(n_genes, lambda = 20) # Baseline counts + 
counts_matrix <- sapply(1:n_samples, function(x) rpois(n_genes, lambda = base_expression)) + + # Assign row and column names + rownames(counts_matrix) <- gene_names + colnames(counts_matrix) <- sample_names + + return(counts_matrix) +} + +# Example usage: +n_genes <- 4162 # Adjust the number of genes as needed # data <- generate_rnaseq_data(n_genes = n_genes) +voom_obj <- preprocess_rna_seq_data( + raw_counts = data, + meta = meta, + spline_params = list(spline_type = c("n"), # Chosen spline parameters + dof = c(2L)), + design = "~ 1 + Phase*X + Reactor" +) + +# data <- voom_obj$E + # Explore data ----------------------------------------------------------------- report_info <- list( @@ -115,23 +125,23 @@ report_info <- list( splineomics <- create_splineomics( data = data, + # rna_seq_data = voom_obj, meta = meta, annotation = annotation, feature_name_columns = feature_name_columns, report_info = report_info, condition = "Phase", meta_batch_column = "Reactor", - preprocess_rna_seq = FALSE ) report_dir <- here::here("results", "explore_data") # debug(explore_data) -plots <- explore_data( - splineomics, - report_dir = report_dir, - report = TRUE - ) +# plots <- explore_data( +# splineomics, +# report_dir = report_dir, +# report = TRUE +# ) # Prep input to hyperparams screen function ------------------------------------ @@ -140,15 +150,18 @@ meta1 <- meta data2 <- data[, -c(1, 2)] meta2 <- meta[-c(1, 2),] +# data2 <- data +# meta2 <- meta datas <- list(data1, data2) +# rna_seq_datas <- list(voom_obj, voom_obj) # Just to test it. datas_descr <- c("full_data", "outliers_removed") metas <- list(meta1, meta2) -designs <- c("~ 1 + Phase*X + Reactor", "~ 1 + X + Reactor") +designs <- c("~ 1 + Phase*X + Reactor", "~ 1 + Phase*X + Reactor") report_dir <- here::here("results", "hyperparams_screen_reports") -pthresholds <- c(0.05, 0.1) +pthresholds <- c(0.05, 0.01) # Every row a combo to test. 
spline_test_configs <- data.frame( @@ -160,16 +173,17 @@ spline_test_configs <- data.frame( # hyperparams screen limma ----------------------------------------------------- # debug(screen_limma_hyperparams) -screen_limma_hyperparams( - splineomics, - datas, - datas_descr, - metas, - designs, - spline_test_configs, - report_dir, - pthresholds - ) +# screen_limma_hyperparams( +# splineomics, +# datas, +# datas_descr, +# metas, +# designs, +# spline_test_configs, +# report_dir, +# pthresholds, +# rna_seq_datas, +# ) ## Run limma splines ----------------------------------------------------------- @@ -177,8 +191,8 @@ screen_limma_hyperparams( splineomics <- update_splineomics( splineomics = splineomics, design = "~ 1 + Phase*X + Reactor", - data = data1, - meta = meta1, + data = data2, + meta = meta2, spline_params = list(spline_type = c("n"), # Chosen spline parameters dof = c(2L)) ) @@ -191,11 +205,11 @@ splineomics <- run_limma_splines( report_dir <- here::here("results", "limma_reports") -plots <- create_limma_report( - splineomics, - adj_pthresh = 0.1, - report_dir = report_dir -) +# plots <- create_limma_report( +# splineomics, +# adj_pthresh = 0.1, +# report_dir = report_dir +# ) ## Cluster hits ---------------------------------------------------------------- diff --git a/inst/tutorial/tutorial.Rmd b/inst/tutorial/tutorial.Rmd index da01046..03ef107 100755 --- a/inst/tutorial/tutorial.Rmd +++ b/inst/tutorial/tutorial.Rmd @@ -1,7 +1,7 @@ --- title: "demo" author: "Thomas Rauter" -date: "15 July, 2024" +date: "20 September, 2024" output: html_document editor_options: markdown: @@ -17,24 +17,43 @@ knitr::opts_chunk$set( # About this tutorial -This tutorial demonstrates the capabilities of the SplineOmics package -through a comprehensive example: a time-series proteomics experiment -involving CHO cells cultivated in three bioreactors (biological -replicates). 
Samples were collected from each reactor in triplicates at -specific time points relative to cell feeding (60 min before, and 15, -60, 90, 120, and 240 min after feeding) during both exponential and -stationary growth phases. - -The objective is to identify which of the 7162 cellular proteins show -significant changes over time post-feeding. Proteins with significant -temporal changes are then clustered based on their patterns. A gene set -enrichment analysis is performed for each cluster to identify processes -that are up- or downregulated over time after feeding. - -Note: For a better understanding of the SplineOmics functions, the -required and optional arguments are documented here. These however are -only short forms of the full documentation of the arguments, which you -can find by selecting a function and pressing F2. +This tutorial intends to showcase and explain the capabilities of the +**SplineOmics** package by walking through a real and complete example, +from start to finish. + +### Example Overview + +The example involves a **time-series proteomics experiment**, where CHO +(Chinese hamster ovary) cells were cultivated in three bioreactors +(three biological replicates). The experiment includes the following +setup: + +- Samples were taken both during the **exponential** and **stationary + growth phases**. +- Samples were collected in triplicates from each reactor at defined + timepoints relative to cell feeding: + - 60 minutes before feeding + - 15, 60, 90, 120, and 240 minutes after feeding + +### Analysis Goals + +The main goals of this analysis are: + +- **Identify proteins with significant temporal changes**: Out of 7162 + cellular proteins, the objective is to detect which proteins show a + significant change over time after the CHO cells were fed (i.e., the + impact of the feeding). +- **Cluster hits based on temporal patterns**: The proteins (hits) + with significant temporal changes will be clustered according to + their time-based patterns. 
+- **Perform gene set enrichment analysis**: For each cluster, a gene + set enrichment analysis will be performed to determine if specific + biological processes are up- or downregulated after feeding. + +### Note + +The documentation of all the **SplineOmics** package functions can be viewed +[here](https://csbg.github.io/SplineOmics/reference) Further note: To run the code of a box, click on the respective ▶️ play button like symbol. @@ -43,7 +62,9 @@ like symbol. Make sure all the required packages for this analysis script are installed. Part of these packages are not dependencies of the -SplineOmics package, that is why they could be missing. +SplineOmics package, that is why they could be missing. If the code block +below does not work for you, manually install those packages and skip this +block. ```{r Conditionally install missing packages} install_if_missing <- function(packages) { @@ -71,9 +92,9 @@ install_if_missing(packages_to_install) library(SplineOmics) # Functions are marked with SplineOmics:: # Additional packages needed to prepare SplineOmics function inputs -library(readxl) -library(here) -library(readr) +library(readxl) # for loading Excel files +library(here) # For managing filepaths +library(readr) # For reading the database TSV files ``` To avoid conflicts between functions from the dplyr package and base R @@ -82,7 +103,7 @@ the conflicted package. This ensures that the intended function is used, preventing potential errors and improving code clarity. ```{r Load dplyr package} -library(dplyr) +library(dplyr) # For data manipulation library(conflicted) # Explicitly state preference of functions @@ -96,25 +117,38 @@ conflicted::conflict_prefer("union", "base") # Load the files -In this example, the data.xlsx file contains numeric values -(intensities) and feature descriptions, such as gene and protein names -(annotation part). The meta.xlsx file contains meta information, which -describes the columns of the numeric values in data.xlsx. 
- -These example files are included in the package and do not need to be -present on your system. For your analysis, create file paths using the -here library instead of system.file. - -```{r Load the files} -data_excel <- readxl::read_excel( - system.file( - "extdata", - "proteomics_data.xlsx", - package = "SplineOmics" - ) - ) +In this example, the proteomics_data.rds file contains the numeric values (the +intensities) and also the feature descriptions, such as gene and protein +name (= annotation part). Usually, you would load the data from for example an +Excel file, but the .rds file is more compressed, which is the reason this +format was chosen here to limit the size of the SplineOmics package. + +The file meta.xlsx contains the meta information, which are the +descriptions of the columns of the numeric values of data. -meta <- readxl::read_excel( +(These example files are part of the package and don't have to be +present on your system). + +Please note that this dataset is an actual experimental dataset, but the +annotation information, such as gene names, has been removed since it was +not yet published at the time of making the SplineOmics package public. Instead, +the dataset includes randomly generated gene +symbols and gene names corresponding to Cricetulus griseus (Chinese +Hamster) for each row. This is intended to demonstrate the functionality +of the package. + +The left part of data contains the numeric values, and the right part the +annotation info, which can be copied in a separate dataframe, as shown below. + +```{r load the files} +data <- readRDS(system.file( + "extdata", + "proteomics_data.rds", + package = "SplineOmics" + )) + + +meta <- read_excel( system.file( "extdata", "proteomics_meta.xlsx", @@ -123,88 +157,130 @@ meta <- readxl::read_excel( ) # Extract the annotation part from the dataframe. 
-first_na_col <- which(is.na(data_excel[1,]))[1] -annotation <- data_excel |> - dplyr::select((first_na_col + 1):ncol(data_excel)) |> +first_na_col <- which(is.na(data[1,]))[1] +annotation <- data |> + dplyr::select((first_na_col + 1):ncol(data)) |> dplyr::slice(-c(1:3)) + +print(data) +print(meta) +print(annotation) ``` -Note that for this experiment, just a single treatment is present, which -is the growth phase (exponential or stationary) of the cells. This is -encoded in "condition" column of meta, here called "Phase". If there is -more than one treatment, they can be combined in the single condition -column. For example, if additionally, there would also be a temperature -shift, from 37 to 32 °C, this could for example be written in the -condition column in following way: exp_37, exp_32, stat_37, stat_32 -(Both treatments combined in one string and all placed in the single -condition column). - -# Bring the inputs into the standardized (required) format - -Since `data_excel` is not in the format required by the SplineOmics -package, it needs processing. This can be done with a few R commands, -but if your file looks like the one here, with the data matrix on the -left and annotation info on the right, separated by an empty column -(which is required!), the `extract_data()` function can handle this -automatically. - -The function identifies the data matrix and converts it into a -dataframe. Column headers are created from the information in the cells -above each data matrix column. If no annotation columns are specified, -row headers are simply increasing numbers. In this example, the -annotation columns "First.Protein.Description" and "ID" are specified to -form the row headers (feature names). These names will be used to label -any plots where a feature is shown individually, such as spline plots -with datapoints from an individual feature. - -## Required Arguments `extract_data()` - -- **data**: A dataframe loaded from a tabular file. 
- -## Optional Arguments `extract_data()` - -- **feature_name_columns**: A character vector specifying the columns - of the dataframe `data` that should be used to construct the feature - names. If omitted, the feature names are just numbers (stored as - characters) starting from 1 (1, 2, 3, etc.). - -(When you want to have meaningful feature descriptions, add feature-name -columns as arguments to the function below. They are used for the row -headers of the matrix. If you don't use this function, make sure you -have row headers to your matrix if you want feature descriptions for -your plots.) - -```{r Process the inputs} +## Bring the Inputs into the Standardized Format + +Since `data` is not in the format required by the **SplineOmics** +package, it needs some processing. The SplineOmics package requires data to be +a numeric matrix, so no element is allowed to be anything else than a number. +This can be done with a few commands +in R, but if your file has a specific structure, the function +`extract_data()` can handle this automatically. + +### File Structure Requirements + +If your file looks like the one used here, where: + +- The **data matrix field** is on the left +- The **annotation info** is on the right +- These fields are separated by one empty column + +### Usage of the extract_data() function + +Then, `extract_data()` can: + +- **Identify the data matrix field** and return it as a numeric matrix. +- **Create column headers** from the information written in the cells + above the respective columns of the data matrix field. +- **Assign rowheaders**: + - If no annotation columns are specified, rowheaders will be + increasing numbers. + - If annotation columns are specified (like + `"First.Protein.Description"` and `"ID"` in this example), these + will be combined to form the rowheaders (feature names). 
+ +### Usage in Plotting + +The generated rowheaders will be used to label any plots where a feature +is shown individually, such as: + +- **Spline plots** with the datapoints from an individual feature. + +```{r process inputs, eval = TRUE} data <- SplineOmics::extract_data( - data = data_excel, - feature_name_columns = c( # Feature names will be a combo out of these col - "First.Protein.Description", # Prodivde the row headers of the matrix - "ID" - ) + # The dataframe with the numbers on the left and info on the right. + data = data, + # Use this annotation column for the feature names. + feature_name_columns = c("Gene_name"), + # When TRUE, you must confirm that data is in the required format. + user_prompt = FALSE ) ``` # Perform EDA (exploratory data analysis) -The first step in analyzing data is usually EDA. EDA involves -summarizing the main characteristics of the data, often using plots such -as density distributions, boxplots, PCA, and correlation heatmaps. This -process can be carried out using the package function `explore_data()`. +Now that we have the data in the required format (numeric matrix) we can go on. + +The first step in analyzing data is typically **Exploratory Data +Analysis (EDA)**. EDA involves summarizing the main characteristics of +the data, often through visualizations. + +### Common EDA Plots + +Some common types of EDA plots include: + +- **Density distributions** +- **Boxplots** +- **PCA (Principal Component Analysis)** +- **Correlation heatmaps** + +Again, you can generate those plots yourself with a few lines of R code. +However, if you prefer, for convenience, the `explore_data()` function can +handle this for you. + +### Using `explore_data()` for EDA + +The **SplineOmics** package provides the function `explore_data()` to +perform EDA. This function requires the following arguments: + +- **data**: The numeric data matrix. +- **meta**: The metadata table. 
+- **condition**: The name of the column in the metadata that contains + the levels of the experiment (e.g., "Exponential" and "Stationary"). +- **report_info**: A list that contains general information about the + analysis, such as the name of the analyst and the datatype (e.g. proteomics) + +### Optional Arguments -These batch columns are used to run the `removeBatchEffect` function of -limma to remove the batch effect from the data for plotting. When at -least one batch column is provided, the function generates two EDA HTML -reports: one for the uncorrected data and one for the batch corrected -data. +In addition to the required arguments, `explore_data()` offers several +optional arguments: -### Report Generation +- **meta_batch_column**: The name of the column that contains the + first batch effect. -The reports are written to the current working directory by default or -to a specified location using the optional argument `report_dir`. The -function also returns all generated plots. If no report should be -generated, set the optional argument `report` to `FALSE`. +- **meta_batch2_column**: The name of the column that contains the + second batch effect. -```{r Define info that is written in all the HTML reports} + If at least one batch column is provided, the function will: + + - Use the `removeBatchEffect()` function from **limma** to remove + the batch effect from the data before plotting. + - Generate two EDA HTML reports: one for the **uncorrected data** + and one for the **batch-corrected data**. + +### Output and Report Options + +- By default, the reports are saved in the **current working + directory**, but this location can be changed using the `report_dir` + argument. +- The function also **returns all plots** generated during the + analysis, so that you can modify them according to your own needs. 
+- If you do not want a report to be generated, you can set the + `report` argument to `FALSE` (when you for example just want the figures + in the R environment) + +```{r Load EDA arguments, eval = TRUE} +# Those fields are mandatory, because we believe that when such a report is +# opened after half a year, those infos can be very helpful. report_info <- list( omics_data_type = "PTX", data_description = "Proteomics data of CHO cells", @@ -213,13 +289,23 @@ report_info <- list( contact_info = "thomas.rauter@plus.ac.at", project_name = "DGTX" ) + +report_dir <- here::here( + "demo_results", + "explore_data" + ) ``` ## SplineOmics Object -The SplineOmics object is used because multiple functions in the package -take the same inputs, and some functions generate "intermediate" output -that is used by subsequent functions in the workflow. +In the SplineOmics package, multiple functions take the same arguments as input. +To make this easier and to avoid errors, we decided that those arguments are not +provided individually to the functions, but are all stored in an R6 object +(which is of type 'SplineOmics') and then this object is passed to the +functions. Additionally, some functions generate intermediate output, which is +just necessary for the next function in the workflow, which is then also just +passed along by updating the SplineOmics object. But you don't have to worry +about this. ### Functionality @@ -229,7 +315,13 @@ from the object and potentially adds new data or results back into it. 
### Documentation -The documentation for each function specifies which arguments must be +The documentation of the function that creates the SplineOmics object can be +found [here](https://csbg.github.io/SplineOmics/reference/create_splineomics.html) +and the documentation of the function that updates it +[here](https://csbg.github.io/SplineOmics/reference/update_splineomics.html). + +The documentation for each function that takes the SplineOmics object as input +specifies which arguments must be present in the SplineOmics object when it is passed to the respective function. @@ -242,6 +334,8 @@ function. ## Optional Arguments `create_splineomics()` +- **rna_seq_data**: An object containing the preprocessed RNA-seq data, + such as the output from the `limma::voom` function. - **annotation**: A dataframe with the feature descriptions of data. - **report_info**: A list containing general information about the analysis. @@ -250,522 +344,339 @@ function. - **design**: A limma design formula - **spline_params**: Parameters for the spline functions. -```{r Create the SplineOmics object} +```{r Create the SplineOmics object, eval = TRUE} +# splineomics now contains the SplineOmics object. splineomics <- SplineOmics::create_splineomics( data = data, meta = meta, annotation = annotation, report_info = report_info, - condition = "Phase", - meta_batch_column = "Reactor" + condition = "Phase", # Column of meta that contains the levels. + meta_batch_column = "Reactor" # For batch effect removal ) ``` -## Required Arguments `explore_data()` - -- **splineomics**: A SplineOmics object containing the following - fields: - - **data**: A matrix with the data. - - **meta**: Metadata associated with the data. - - **condition**: Column name of the levels (e.g., Exponential and - Stationary). - - **report_info**: A list containing general information about the - analysis. - - **meta_batch_column**: Column for meta batch information. 
- - **meta_batch2_column**: Column for secondary meta batch - information. - -## Optional Arguments `explore_data()` - -- **report_dir**: The path to the output directory. Default is current - work dir. -- **report**: A Boolean TRUE or FALSE value specifying if a report - should be generated. Default is TRUE. - -```{r Run the EDA function} -report_dir <- here::here( - "demo_results", - "explore_data" - ) +Now that we have the SplineOmics object defined, we can perform our exploratory +data analysis. +```{r Run EDA function, eval = TRUE} plots <- SplineOmics::explore_data( - splineomics = splineomics, + splineomics = splineomics, # SplineOmics object report_dir = report_dir ) ``` -The EDA plots can reveal a range of insights. In the HTML report, the -plots are grouped into three categories: distribution and variability -analysis, time series analysis, and dimensional reduction and -clustering. +The EDA plots can tell you a range of things. The plots in the HTML +report are grouped into three categories: Distribution and Variability +Analysis, Time Series Analysis, and Dimensionality Reduction and +Clustering. -### Correlation Heatmaps +If you look at the correlation heatmaps in the HTML report, you can see +that the samples E12_TP05_Exponential and E10_TP10_Stationary stick out. +Seeing this, you might want to remove them from the data. You can test +out what happens when you do this, along with testing how other +hyperparameter choices influence the results, with the package function +screen_limma_hyperparams(). -If you examine the correlation heatmaps in the HTML report, you might -notice that the samples `E12_TP05_Exponential` and `E10_TP10_Stationary` -stand out. Based on this observation, you might decide to remove these -samples from the data. The impact of such decicions can be explored with -the screen_limma_hyperparams() function. 
+## Finding the Best Hyperparameters -# Find the best hyperparameters +Before running the **limma spline analysis**, it is important to find +the best "hyperparameters". In this context, hyperparameters include: -## Determining the Best Hyperparameters +- **Degree of freedom (DoF)** +- **Different versions of the data** (e.g., outlier removed vs. not + removed) +- **Different limma design formulas** -Before running the limma spline analysis, we need to determine the best -"hyperparameters." Hyperparameters in this context include the degree of -freedom, different versions of the data (e.g., outlier removed vs. not -removed), different limma design formulas, etc. Rationally choosing the -best combination of hyperparameters is challenging, so it is often -better to try out multiple combinations and select the best one. +### Challenge of Hyperparameter Selection + +Rationally determining the best combination of hyperparameters can be +very challenging. By rationally, I mean deciding upon the final hyperparameters +without ever testing any, just by scientific reasoning. It is much easier just +testing a few and seeing how they actually behave. However, manually selecting +combinations can be tedious, and you have to work very systematically, which +can be challenging. To solve this problem, the `screen_limma_hyperparams()` +function was written. ### Using `screen_limma_hyperparams()` -The function `screen_limma_hyperparams()` automates this process. For -each hyperparameter, you specify the values you want to try, and the -function runs the limma spline analysis with various combinations of -these hyperparameters. Not every single combination is generated. -Instead, there are "inner" and "outer" hyperparameters. All combinations -are generated for "outer" hyperparameters, while specific combinations -are generated for "inner" hyperparameters. - -"Inner" hyperparameters include the adjusted p-value thresholds and -spline parameters. 
For example, if you have two versions of a dataset -(one with potential outliers removed and one without), these are -considered "outer" hyperparameters. The function generates all possible -comparisons for the "outer" hyperparameters, resulting in a single -comparison. Then, for each version of the data, it generates every -combination of the "inner" hyperparameters. +The function `screen_limma_hyperparams()` automates the process of +testing different combinations of hyperparameters. Here's how it works: + +- **Specify values**: For each hyperparameter, you can specify all the + values you want to test. +- **Run combinations**: The function runs the **limma spline + analysis** with combinations formed from the hyperparameters you've + provided in a semi-combinatorial way. + +### Inner vs. Outer Hyperparameters + +Semi-combinatorial here means that not every possible combination is generated. +Instead, there are **inner** and **outer** hyperparameters: + +- **Outer hyperparameters**: These include things like **different + versions of the dataset** (e.g., full dataset vs. dataset with + outliers removed). + - All possible combinations of outer hyperparameters are + generated. +- **Inner hyperparameters**: These include **adjusted p-value + thresholds** and **spline parameters** (e.g., degree of freedom). + - For each version of the data (outer hyperparameter), all + combinations of inner hyperparameters are tested. + +This approach is necessary because otherwise the number of combinations would +explode. ### Example -For example, if you specify natural cubic splines with a degree of -freedom of 2 or 3, and adjusted p-value thresholds of 0.05 or 0.1, the -function will test all combinations: +For example, if you have two versions of a dataset (one full dataset, +and one with some outliers removed), these versions are considered outer +hyperparameters. Additionally, let's say you want to test two different limma +design formulas, formula 1 and 2. 
The function will test out all combinations +of those outer hyperparameters and compare them with each other, which results +in a total of 6 pairwise comparisons here: + +- **Full Dataset Formula 1** vs **Full Dataset Formula 2** +- **Full Dataset Formula 1** vs **Outliers Removed Dataset Formula 1** +- **Full Dataset Formula 1** vs **Outliers Removed Dataset Formula 2** + +- **Full Dataset Formula 2** vs **Outliers Removed Dataset Formula 1** +- **Full Dataset Formula 2** vs **Outliers Removed Dataset Formula 2** + +- **Outliers Removed Dataset Formula 1** vs **Outliers Removed Dataset Formula 2** -- DoF = 2, threshold = 0.05 -- DoF = 3, threshold = 0.05 -- DoF = 2, threshold = 0.1 -- DoF = 3, threshold = 0.1 +Let's say you specified the following inner +hyperparameters: -This systematic approach helps in identifying the best hyperparameters -for the analysis. +- **Spline parameters**: Natural cubic splines with a degree of + freedom of either 2 or 3. +- **Adjusted p-value threshold**: 0.05 or 0.1. -```{r Load hyperparameter-screening args} +The function will generate and test all combinations of the spline +parameters and p-value thresholds for each of the 4 outer combos (2 datasets x 2 formulas): + +Combo 1: +- **DoF = 2, threshold = 0.05** +- **DoF = 3, threshold = 0.05** +- **DoF = 2, threshold = 0.1** +- **DoF = 3, threshold = 0.1** + +Combo 2: +- **DoF = 2, threshold = 0.05** +- **DoF = 3, threshold = 0.05** +- **DoF = 2, threshold = 0.1** +- **DoF = 3, threshold = 0.1** + +Combo 3: +... + +This allows you to systematically explore different combinations and +select the optimal hyperparameters for your analysis. 
+ +Below is an example for our proteomics data: + +```{r Load hyperparameter-screening args, eval = TRUE} data1 <- data meta1 <- meta -data2 <- data[, !(colnames(data) %in% c( # Remove potential outliers - "E12_TP05_Exponential", +# Remove the "outliers" +data2 <- data[, !(colnames(data) %in% c( + "E12_TP05_Exponential", "E10_TP10_Stationary" ) )] +# Adjust meta so that it matches data2 meta2 <- meta[!meta$`Sample.ID` %in% c( "E12_TP05_Exponential", "E10_TP10_Stationary" ), ] +# As mentioned above, all the values of one hyperparameter are stored +# and provided as a list. datas <- list(data1, data2) + +# This will be used to describe the versions of the data. datas_descr <- c( "full_data", "outliers_removed" ) -metas <- list( - meta1, - meta2 - ) +metas <- list(meta1, meta2) +# Test two different limma designs designs <- c( "~ 1 + Phase*X + Reactor", "~ 1 + X + Reactor" ) +# Specify the meta "level" column +condition <- "Phase" + +report_dir <- here::here( + "demo_results", + "hyperparams_screen_reports" + ) + +# To remove the batch effect +meta_batch_column = "Reactor" + +# Test out two different p-value thresholds (inner hyperparameter) pthresholds <- c( 0.05, 0.1 ) -``` - -### Spline Configuration Parameters - -The `spline_test_configs` dataframe (see box below) is used to specify -the parameters for different runs of spline analysis. Each row in the -dataframe corresponds to one set of spline settings. The supported -spline types are natural cubic splines (denoted by "n") and B-splines -(denoted by "b"). - -#### Parameters - -- **spline_type**: The type of spline to use. Options are: - - `"n"`: Natural cubic splines - - `"b"`: B-splines -- **degree**: The degree of the spline. This is only required for - B-splines (`spline_type = "b"`). For natural cubic splines - (`spline_type = "n"`), this should be set to `NA`. -- **dof**: Degrees of Freedom (DoF) for the spline. This parameter - controls the flexibility of the spline. Higher values allow more - flexibility. 
-- **knots**: A list specifying the positions of the knots. If set to - `NA`, the knots are placed automatically in a central fashion. Knots - are only needed when you want to manually specify their positions. - Each element of the list should correspond to a vector of knot - positions for the respective spline. - -#### Understanding the Relationship Between Degree, DoF, and Knots - -The relationship between the degree of the spline, the degrees of -freedom (DoF), and the number of internal knots (k) varies between -B-splines and natural cubic splines. Here's a breakdown: - -- **B-splines**: - - - **degree**: The degree of the spline. - - **DoF**: Degrees of freedom. - - **k**: Number of internal knots. - - The relationships are given by the following formulas: - - - `DoF = k + degree` - - `k = DoF - degree` - -- **Natural cubic splines**: - - **degree**: Always 3 for cubic splines. - - **DoF**: Degrees of freedom. - - **k**: Number of internal knots. - - The relationships are given by the following formulas: - - - `DoF = k + 1` - - `k = DoF - 1` - -#### Specifying Parameters - -You either specify the degrees of freedom (DoF) or the knots, not both. -The choice depends on how you want to control the flexibility of the -spline: - -- **Specifying DoF**: The number of internal knots (k) will be - determined automatically based on the DoF. -- **Specifying Knots**: The degrees of freedom (DoF) will be - calculated based on the number of knots and the degree of the - spline. - -```{r spline_test_configs definition} -# Every row a combo to test. +# Create a dataframe with combinations of spline parameters to test +# (every row a combo to test) spline_test_configs <- data.frame( - spline_type = c("n", "n", "n", "n"), # All should use natural splines (n) - degree = c(NA, NA, NA, NA), # only needed for B-splines (spline_type = b) - dof = c(2L, 3L, 4L, 5L), # Test these variations of the DoF. - # Per default, knots are placed automatically in a central fashion. 
- knots = I(list(c(NA), c(NA), c(NA), c(NA))), - bknots = I(list(c(NA), c(NA), c(NA), c(NA))) - ) -``` - -## Required Arguments `screen_limma_hyperparams()` - -- **splineomics**: A SplineOmics object containing the following - fields: - - **condition**: Column name of the levels (e.g., Exponential and - Stationary). - - **report_info**: A list containing general information about the - analysis. - - **meta_batch_column**: Column for meta batch information. - - **meta_batch2_column**: Column for secondary meta batch - information. -- **datas**: A list of data frames containing the datasets to be - analyzed. -- **datas_descr**: A description object for the data. -- **metas**: A list of data frames containing metadata for each - dataset in `datas`. -- **designs**: A character vector of design formulas for the limma - analysis. -- **spline_test_configs**: A configuration object for spline tests. + # 'n' stands for natural cubic splines, b for B-splines. + spline_type = c("n", "n", "b", "b"), + # Degree is not applicable (NA) for natural splines. + degree = c(NA, NA, 2L, 4L), + # Degrees of freedom (DoF) to test. + # Higher dof means spline can fit more complex patterns. + dof = c(2L, 3L, 3L, 4L) +) -## Optional Arguments `screen_limma_hyperparams()` +print(spline_test_configs) +``` -- **report_dir**: The path to the output directory. Default is current - work dir. -- **adj_pthresholds**: A numeric vector of p-value thresholds for - significance determination. -- **time_unit**: A character string specifying the time unit label for - plots. -- **padjust_method**: A character string specifying the method for - p-value adjustment. Default is "BH" (Benjamini-Hochberg). - -```{r Perform hyperparameter-screening} -report_dir <- here::here( - "demo_results", - "hyperparams_screen_reports" - ) +Now that we specified all the values for each hyperparameter that we want to +test, we can run the `screen_limma_hyperparams()` function. 
+```{r Perform hyperparameter-screening, eval = TRUE} SplineOmics::screen_limma_hyperparams( - splineomics = splineomics, - datas = datas, - datas_descr = datas_descr, - metas = metas, - designs = designs, - spline_test_configs = spline_test_configs, - report_dir = report_dir, - adj_pthresholds = pthresholds + splineomics, + datas, + datas_descr, + metas, + designs, + spline_test_configs, + report_dir, + pthresholds, ) -``` -The last HTML generated by `screen_limma_hyperparams()` describes the -meaning of all short words used in the reports. For example it states -that Design_1 = "\~ 1 + Phase\*X + Reactor", and Design_2 = "\~ 1 + X + -Reactor". +``` # Run limma spline analysis -Once we identify the hyperparameters that are likely the best, we can -run the limma spline analysis with them to obtain the results. For this, -the SplineOmics object must be updated based on our findings from the -hyperparameter screening. - -For example, we figured out that natural cubic splines with a DoF of 2 -perform the best in terms of avoiding under- and overfitting. - -### Spline Parameters List - -The `spline_params` list is used to specify the final spline parameters -that you want to use for your analysis. It takes the same arguments as -the `spline_test_configs` dataframe, but here you define the actual -parameters for the spline analysis. - -#### Parameters - -- **spline_type**: The type of spline to use. Options are: - - `"n"`: Natural cubic splines - - `"b"`: B-splines -- **degree**: The degree of the spline. This is only required for - B-splines (`spline_type = "b"`). For natural cubic splines - (`spline_type = "n"`), this should be set to `NA`. -- **dof**: Degrees of Freedom (DoF) for the spline. This parameter - controls the flexibility of the spline. Higher values allow more - flexibility. -- **knots**: A list specifying the positions of the knots. If set to - `NA`, the knots are placed automatically in a central fashion. 
Knots - are only needed when you want to manually specify their positions. - Each element of the list should correspond to a vector of knot - positions for the respective spline. - -#### Usage with Limma Design Formula - -- **Interaction Effects**: If the limma design formula contains an - interaction effect, you must specify a single option for each - parameter. This single option will be applied to all levels. This - approach ensures consistency across all levels when interactions are - present. -- **No Interaction Effects**: If the limma design formula contains no - interaction effects, you must specify one element in the vector for - each level. Each element corresponds to the respective level in the - order they appear in the metadata. - -```{r best spline params} -spline_params = list( - spline_type = c("n"), # Natural splines for all levels - dof = c(2L) # Degree of freedom of 2 for all levels - ) -``` +Once we identified the hyperparameters that are likely the best ones, we +can run the limma spline analysis with them and get the results. -The spline_params, among others, can be loaded in the SplineOmics object -with the `update_splineomics()` function. - -## Required Arguments `update_splineomics()` - -- **splineomics**: A SplineOmics object to be updated. - -## Optional Arguments `update_splineomics()` - -- **...**: Named arguments with new values for fields to be updated or - added. Allowed fields include: - - **data**: A matrix with the data. - - **meta**: Metadata associated with the data. - - **condition**: Column name of the levels (e.g., Exponential and - Stationary). - - **annotation**: A dataframe with the feature descriptions of - data. - - **report_info**: A list containing report information such as - omics data type, data description, data collection date, analyst - name, contact info, and project name. - - **meta_batch_column**: Column for meta batch information. - - **meta_batch2_column**: Column for secondary meta batch - information. 
- - **feature_name_columns**: Columns used to construct feature - names. - - **design**: A limma design formula - - **spline_params**: Parameters for spline functions. - - **limma_splines_result**: Results from the limma splines - analysis. - -```{r Update the SplineOmics object} +Let's just assume for now that the new parameters, with which the +SplineOmics object is updated, are the best for this analysis. The +choice depends on the analysis. For example, for this analysis, natural +cubic splines (n) with a dof of two seemed to fit the data best (not +overfitting, but also not underfitting), which was the reason those +spline parameters were chosen. + +```{r Update the SplineOmics object, eval = TRUE} splineomics <- SplineOmics::update_splineomics( splineomics = splineomics, - data = data2, # Currently data1 (data is loaded) - meta = meta2, # Currently meta1 (meta is loaded) - design = "~ 1 + Phase*X + Reactor", - spline_params = spline_params + design = "~ 1 + Phase*X + Reactor", # best design formula + data = data2, # data without "outliers" was better + meta = meta2, + spline_params = list( + spline_type = c("n"), # natural cubic splines + dof = c(2L) + ) ) ``` -## Required Arguments `run_limma_splines()` - -- **splineomics**: A SplineOmics object containing the following - fields: - - **data**: A matrix with the data. - - **meta**: Metadata associated with the data. - - **condition**: Column name of the levels (e.g., Exponential and - Stationary). - - **design**: A limma design formula - - **spline_params**: Parameters for spline functions. - -## Optional Arguments `run_limma_splines()` - -- **padjust_method**: A character string specifying the method for - p-value adjustment. Default is "BH" (Benjamini-Hochberg).
+Run the `run_limma_splines()` function with the updated SplineOmics object: -```{r Run the limma spline analysis} -# Run the limma spline analysis +```{r limma spline analysis, eval = TRUE} splineomics <- SplineOmics::run_limma_splines( - splineomics = splineomics + splineomics ) ``` -The function run_limma_splines() adds a named list to the returned -SplineOmics object. Each element in this list represents a specific -"category" of results. These elements are lists containing the -respective limma topTables, either for each level or for comparisons -between two levels. +The output of the function run_limma_splines() is a named list, where +each element is a specific "category" of results. Refer to [this +document](https://csbg.github.io/SplineOmics/articles/limma_result_categories.html) +for an explanation of the different result categories. Each of those +elements is a list, containing as elements the respective limma +topTables, either for each level or each comparison between two levels. -The element "time_effect" is a list where each element is a topTable -reporting the p-values for each feature for the respective level. +The element "time_effect" is a list, where each element is the topTable +in which the p-values for each feature for the respective level are +reported. -The element "avrg_diff_conditions" is a list containing topTables that -represent the comparison of the average differences between the levels. +The element "avrg_diff_conditions" is a list that contains as elements +the topTables, that represent the comparison of the average differences +of the levels. -The element "interaction_condition_time" is a list containing topTables -that represent the interaction between the levels, which includes both -time and average differences.
+The element "interaction_condition_time" is a list that contains as +elements the topTables, that represent the interaction between the +levels (which includes both time and the average differences) # Build limma report The topTables of all three categories can be used to generate p-value histograms an volcano plots. -## Required Arguments `create_limma_report()` - -- **splineomics**: A SplineOmics object containing the following - fields: - - **limma_splines_result**: A list containing the results of the - limma analysis with splines. It should have three components: - `time_effect`, `avrg_diff_conditions`, and - `interaction_condition_time`. - - **condition**: Column name of the levels (e.g., Exponential and - Stationary). - - **annotation**: A dataframe with the feature descriptions of - data. - - **report_info**: A list containing metadata and other - information to be included in the report. - -## Optional Arguments `create_limma_report()` - -- **adj_pthresh**: A numeric value specifying the adjusted p-value - threshold for significance. Default is 0.05. -- **report_dir**: The path to the output directory. Default is current - work dir. - -```{r Build limma report} +```{r build limma report, eval = TRUE} report_dir <- here::here( "demo_results", "create_limma_reports" ) plots <- SplineOmics::create_limma_report( - splineomics = splineomics, + splineomics, report_dir = report_dir ) ``` # Cluster the hits (significant features) -After obtaining the limma spline results, we can cluster the hits based -on their temporal patterns (spline shapes). A hit is defined by setting -an adjusted p-value threshold for each level. Hierarchical clustering is -then used to assign each hit to one of the specified number of clusters -for that level. 
- -```{r Prepare inputs for the cluster_hits function} -adj_pthresholds <- c( - 0.05, # threshold for the exponential phase - 0.05 # threshold for the stationary phase +After we obtained the limma spline results, we can cluster the hits +based on their temporal pattern (their spline shape). We define what a +hit is by setting an adj. p-value threshold for every level. Hits are features +(e.g. proteins) that have an adj. p-value below the threshold. +Hierarchical clustering is used to place every hit in one of as many +clusters as we have specified for that specific level. + +```{r cluster the hits, eval = TRUE} +adj_pthresholds <- c( # 0.05 for both levels + 0.05, # exponential + 0.05 # stationary ) -clusters <- list( - 6L, # 6 clusters for the exponential phase - 3L # 3 clusters for the stationary phase +clusters <- c( + 6L, # 6 clusters for the exponential phase level + 3L # 3 clusters for the stationary phase level ) -plot_info = list( # For the spline plots - y_axis_label = "log2 intensity", # Unit of the values in the data matrix. - time_unit = "min", - treatment_labels = c("Feeding"), - treatment_timepoints = c(0) # The feeding occurred at 0 minutes. -) - -gene_column_name <- "Genes" -genes <- data_excel[[gene_column_name]][4:nrow(data_excel)] -``` - -## Required Arguments `cluster_hits()` - -- **splineomics**: An S3 object of class `SplineOmics` that contains - all the necessary data and parameters for the analysis, including: - - **data**: A matrix with the data. - - **meta**: Metadata associated with the data. - - **design**: A limma design formula. - - **condition**: Column name of the levels (e.g., Exponential and - Stationary). - - **spline_params**: Parameters for spline functions. - - **meta_batch_column**: Column for meta batch information. - - **meta_batch2_column**: Column for secondary meta batch - information. 
- - **limma_splines_result**: A list of data frames, each - representing a top table from differential expression analysis, - containing at least 'adj.P.Val' and expression data columns. - - **genes**: A character vector containing the gene names of the - features to be analyzed. - -## Optional Arguments `cluster_hits()` - -- **adj_pthresholds**: Numeric vector of p-value thresholds for - filtering hits in each top table. Default is 0.05. -- **clusters**: Character or integer vector specifying the number of - clusters or 'auto' for automatic estimation. -- **report_info**: A character string to be printed at the top of the - report. -- **time_unit**: A character string specifying the time unit label for - plots (e.g., 'min' for minutes). -- **report_dir**: The path to the output directory. Default is current - work dir. -- **report**: Boolean TRUE or FALSE value specifying if a report - should be generated. Default is TRUE. - -```{r Cluster the hits} report_dir <- here::here( "demo_results", "clustering_reports" ) +plot_info = list( # For the spline plots + y_axis_label = "log2 intensity", + time_unit = "min", # our measurements were in minutes + treatment_labels = c("Feeding"), + treatment_timepoints = c(0) # Feeding was at 0 minutes. +) + +# Get all the gene names. They are used for generating files +# which contents can be directly used as the input for the Enrichr webtool, +# if you prefer to manually perform the enrichment. Those files are +# embedded in the output HTML report and can be downloaded from there. +gene_column_name <- "Gene_symbol" +genes <- data_excel[[gene_column_name]][4:nrow(data_excel)] + clustering_results <- SplineOmics::cluster_hits( splineomics = splineomics, - genes = genes, - analysis_type = "time_effect", # effect of time on features in level. + # Cluster the hits from the time_effect results. You can also cluster + # the hits from the other two limma result categories by specifying + # it here with this argument. 
+ analysis_type = "time_effect", adj_pthresholds = adj_pthresholds, clusters = clusters, + genes = genes, plot_info = plot_info, report_dir = report_dir, ) @@ -773,11 +684,13 @@ clustering_results <- SplineOmics::cluster_hits( # Perform gene set enrichment analysis (GSEA) -To each clustered hit, the respective gene can be assigned and GSEA +Usually, the final step in such a bioinformatics analysis is GSEA. To each +clustered hit, the respective gene can be assigned and GSEA performed. For this, the Enrichr databases of choice have to be downloaded: -```{r Define which Enrichr databases to download} +```{r download Enrichr databases, eval = TRUE} +# Specify which databases you want to download from Enrichr gene_set_lib <- c( "WikiPathways_2019_Human", "NCI-Nature_2016", @@ -792,19 +705,7 @@ gene_set_lib <- c( "GO_Molecular_Function_2018", "Human_Gene_Atlas" ) -``` - -## Required Arguments `download_enrichr_databases()` - -- **gene_set_lib**: A char vector of database names to download from - Enrichr. - -## Optional Arguments `download_enrichr_databases()` - -- **output_dir**: The path to the output directory where the .tsv file - will be saved. Defaults to the current working directory. -```{r Download the Enrichr databases} output_dir <- here::here( "demo_results", "downloaded_databases" @@ -816,24 +717,16 @@ SplineOmics::download_enrichr_databases( ) ``` -To run GSEA, a genes vector must be created, containing all the -underlying genes of the features. The downloaded database file should be -loaded as a dataframe. Additionally, the clusterProfiler parameters and -the report directory can optionally be specified. The function -create_gsea_report() runs GSEA using clusterProfiler, generates an HTML -report, and returns the GSEA dot plots in R. - -```{r Prepare GSEA inputs} -# Get gene vector. Every gene must be in the standardized format expected by -# the enrichment tools. The subsequent code extracts this part. 
-# For your analysis, this code needs to be customized based on the format. -gene_column_name <- "Genes" -genes <- annotation[[gene_column_name]][1:nrow(annotation)] -genes <- sub(" .*", "", genes) -genes <- sub(";.*", "", genes) -genes <- sub("_.*", "", genes) -genes <- sub("-.*", "", genes) +Per default the file is placed in the current working directory, which +is the root dir of the R project. +To run GSEA, the downloaded database file has to be loaded as a +dataframe. Further, optionally, the clusterProfiler parameters and the +report dir can be specified. The function create_gsea_report() runs GSEA +using clusterProfiler, generates an HTML report and returns the GSEA +dotplots in R. + +```{r run GSEA, eval = TRUE} # The file has a timestamp, but this code takes it irrespective of it. downloaded_dbs_filepath <- list.files( path = output_dir, @@ -846,6 +739,7 @@ databases <- readr::read_tsv( col_types = readr::cols() ) +# Specify the clusterProfiler parameters clusterProfiler_params <- list( adj_p_value = 0.05, pAdjustMethod = "BH", @@ -853,50 +747,43 @@ clusterProfiler_params <- list( maxGSSize = 500, qvalueCutoff = 0.2 ) -``` - -## Required Arguments `create_gsea_report()` - -- **levels_clustered_hits**: A list of clustered hits for the - different levels. -- **genes**: A vector of genes of all features of the dataset -- **databases**: A list of databases to be used in the analysis. -- **report_info**: A list containing information for the report - generation. - -## Optional Arguments `create_gsea_report()` -- **params**: Additional parameters for the GSEA analysis. Default is - NA. -- **plot_titles**: Titles for the plots. Default is NA. -- **background**: Background data. Default is NULL. -- **report_dir**: The path to the output directory where the report - will be saved. Default is the current working directory. 
- -```{r Run GSEA} report_dir <- here::here( "demo_results", "gsea_reports" ) +``` -result <- SplineOmics::create_gsea_report( +The function below runs the clusterProfiler for all clusters and all levels, +and generates the HTML report: + +```{r run GSEA report, eval = TRUE} +result <- SplineOmics::run_gsea( + # A dataframe with three columns: feature, cluster, and gene. Feature contains + # the integer index of the feature, cluster the integer specifying the cluster + # number, and gene the string of the gene, such as "CLSTN2". levels_clustered_hits = clustering_results$clustered_hits_levels, - genes = genes, databases = databases, - params = clusterProfiler_params, + clusterProfiler_params = clusterProfiler_params, report_info = report_info, report_dir = report_dir ) ``` -In the output HTML, each row in the dot plots represents a term from a -specific database, and the columns correspond to the respective -clusters. The color scale indicates the odds ratio, while the size -represents the -log10 adjusted p-value. Only terms with support from -more than 2 genes are included in the plot. For each cluster, a maximum -of 5 terms with the highest odds ratios are shown. - -Note that if, for example, cluster 1 already has 5 terms and cluster 2 -receives a term that was also found for cluster 1, this term will be -included as the sixth term for cluster 1. This is how the maximum of 5 -can be exceeded. +This report first shows all enrichment results, where more than 2 genes +supported a term, in a tabular format. The table with all the terms with +\< 2 genes supporting it can be downloaded by clicking on a button below +that table. + +For the dotplots below that, every row is a term from a specific +database, and the columns are the respective clusters. The color scale +contains the info about the odds ratio and the size the -log10 adj. +p-value. Only terms that have \> 2 genes as support are included in the +plot.
Further, for each cluster, just maximally 5 terms are shown (the +terms with the highest odds ratios). Note that when for example cluster +1 already has 5 terms, and cluster 2 does not, and gets a term which was +also found for cluster 1, then this term would be included as the sixth +term for cluster 1, so this is a way the maximum of 5 can be exceeded. + +If a phase, like stationary here, does not lead to any enrichment +results, that is stated with a red message. diff --git a/man/InputControl.Rd b/man/InputControl.Rd index 8d51d9b..723af99 100755 --- a/man/InputControl.Rd +++ b/man/InputControl.Rd @@ -73,9 +73,9 @@ If any of these checks fail, an informative error message is returned. This function performs the following checks: 1. Ensures `feature_name_columns` and `annotation` are not `NULL`. 2. Verifies that each element in `feature_name_columns` is a character - with + with a length of 1. -3. Checks that all elements of `feature_name_columns` are valid column +3. Checks that all elements of `feature_name_columns` are valid column names in the `annotation` data frame. Check Report @@ -84,7 +84,7 @@ Check Report The function performs the following checks: - Whether the `report` argument is present. -- If `report` is not a Boolean value (`TRUE` or `FALSE`), it throws +- If `report` is not a Boolean value (`TRUE` or `FALSE`), it throws an error. } \section{Functions}{ @@ -751,18 +751,18 @@ Check Analysis Mode \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-InputControl-check_analysis_type}{}}} \subsection{Method \code{check_analysis_type()}}{ -This method checks the validity of the `analysis_mode` argument. -It ensures that +This method checks the validity of the `analysis_mode` argument.
+It ensures that `analysis_mode` is a character vector of length 1 and that it matches - one of the -allowed analysis modes: "time_effect", "avrg_diff_conditions", or + one of the +allowed analysis modes: "time_effect", "avrg_diff_conditions", or "interaction_condition_time". \subsection{Usage}{ \if{html}{\out{