diff --git a/R/merge_sce_list.R b/R/merge_sce_list.R index f6bb60c4..463dc10a 100644 --- a/R/merge_sce_list.R +++ b/R/merge_sce_list.R @@ -83,6 +83,9 @@ merge_sce_list <- function( Please check that `retain_coldata_cols` was correctly specified.") } + # Check metadata - library_id and sample_id must be present + check_metadata(sce_list) + # Check altExp compatibility, if we are including them # altExps for a given name must all have the same features and the same assays if (include_altexp) { @@ -102,6 +105,17 @@ merge_sce_list <- function( # Update retain_coldata_cols retain_coldata_cols <- c(retain_coldata_cols, altexp_columns) + + # Check each altExp metadata, for SCEs with the altExp + names(altexp_attributes) |> + purrr::walk( + \(altexp_name) { + sce_list |> + purrr::keep(\(sce) altexp_name %in% altExpNames(sce)) |> + purrr::map(altExp, altexp_name) |> + check_metadata() + } + ) } else { # Remove altexps if we are not including them sce_list <- sce_list |> @@ -136,17 +150,6 @@ merge_sce_list <- function( warning("The provided `retain_coldata_cols` are not present in any SCEs.") } - # check that library id and sample id are present in metadata - id_checks <- sce_list |> - purrr::map(\(sce) { - all(c("library_id", "sample_id") %in% names(metadata(sce))) - }) |> - unlist() - - if (!all(id_checks)) { - stop("The metadata for each SCE object must contain `library_id` and `sample_id`.") - } - ## Prepare main experiment of SCEs for merging -------------------- sce_list <- sce_list |> purrr::imap( @@ -159,45 +162,7 @@ merge_sce_list <- function( ) - ## Handle metadata --------------------------------------------- - # get a list of metadata from the list of sce objects - # each library becomes an element within the metadata components - metadata_list <- sce_list |> - purrr::map(metadata) |> - purrr::transpose() - - # flatten and reduce the library ids and sample ids - metadata_list$library_id <- metadata_list$library_id |> - unlist() |> - unique() - - metadata_list$sample_id <- metadata_list$sample_id |> - unlist() |> - unique() - - # if object has sample metadata then combine into a single data frame - if ("sample_metadata" %in% names(metadata_list)) { - sample_metadata <- metadata_list$sample_metadata |> - purrr::map(\(df) { - # make sure all column types are compatible first - df |> - dplyr::mutate(dplyr::across(dplyr::everything(), as.character)) - }) |> - dplyr::bind_rows() |> - unique() - - # check that all sample ids are found in the new sample metadata and warn if not - if (!all(metadata_list$sample_id %in% sample_metadata$sample_id)) { - warning("Not all sample ids are present in metadata(merged_sce)$sample_metadata.") - } - - # replace sample metadata in metadata list - metadata_list$sample_metadata <- sample_metadata - } - - ## Handle altExps ------------------------------------------------------ - # If we are including altExps, process them and save to list to add to merged SCE if (include_altexp) { for (altexp_name in names(altexp_attributes)) { @@ -219,8 +184,35 @@ merge_sce_list <- function( # Create the merged SCE from the processed list merged_sce <- do.call(combineCols, unname(sce_list)) - # Replace existing metadata list with merged metadata - metadata(merged_sce) <- metadata_list + # Update metadata in merged objects, using the unmerged sce_list + metadata(merged_sce) <- sce_list |> + purrr::map(metadata) |> + prepare_merged_metadata() + + if (include_altexp) { + for (altexp_name in names(altexp_attributes)) { + has_altexp_name <- sce_list |> + purrr::map_lgl(\(sce) (altexp_name %in% altExpNames(sce))) + + # For any SCEs without this altExp, create the library_id and sample_id metadata + additional_metadata <- sce_list |> + purrr::discard(has_altexp_name) |> + purrr::map(extract_metadata_for_altexp) + + # Update metadata in altExps that were originally present + altexp_metadata_list <- sce_list |> + purrr::keep(has_altexp_name) |> + purrr::map(altExp, altexp_name) |> + purrr::map(metadata) |> + # Tack on the metadata we created for libraries without altExps + c(additional_metadata) + + # Ensure correct order + altexp_metadata_list <- altexp_metadata_list[names(sce_list)] + + metadata(altExp(merged_sce, altexp_name)) <- prepare_merged_metadata(altexp_metadata_list) + } + } return(merged_sce) } @@ -304,9 +296,6 @@ prepare_sce_for_merge <- function( colnames(sce) <- glue::glue("{sce_name}-{colnames(sce)}") } - # Update metadata list - metadata(sce) <- update_sce_metadata(metadata(sce)) - # return the processed SCE return(sce) } @@ -358,39 +347,6 @@ prepare_altexp_for_merge <- function( } - -#' Helper function to update SCE metadata for merging -#' -#' @param metadata_list Current SCE metadata before modification -#' -#' @return Updated metadata list to store in the SCE -update_sce_metadata <- function(metadata_list) { - # first check that this library hasn't already been merged - if ("library_metadata" %in% names(metadata_list)) { - stop(paste( - "This SCE object appears to be a merged object", - "We do not support merging objects with objects that have already been merged." - )) - } - - # create library and sample metadata. - # library metadata will hold all the previous metadata fields, to avoid conflicts - library_metadata <- metadata_list[names(metadata_list) != "sample_metadata"] - sample_metadata <- metadata_list$sample_metadata - - # combine into one list - metadata_list <- list( - library_id = metadata_list[["library_id"]], - sample_id = metadata_list[["sample_id"]], - library_metadata = library_metadata, # this will be all previous metadata for the given library - sample_metadata = sample_metadata # this will be the same as the previous sample_metadata - ) - - # return updated metadata - return(metadata_list) -} - - #' Helper function to check altExp compatibility #' #' @param sce_list List of SCEs with altExps to check @@ -445,3 +401,118 @@ get_altexp_attributes <- function(sce_list) { } return(altexp_attributes) } + + + + + +#' Prepare an updated list of metadata for the merged SCE +#' +#' The metadata will be reformatted to contain the following fields: +#' - `library_id`: vector of all library ids in the merged object +#' - `sample_id`: vector of all sample ids in the merged object +#' - `library_metadata`: pre-merge metadata list for each library, named by SCE name +#' - `sample_metadata`: a data frame with all distinct rows from the pre-merge sample_metadata data frames, +#' with all values coerced to character +#' +#' @param metadata_list List of metadata to update +#' +#' @return Updated metadata list to include in the merged SCE +prepare_merged_metadata <- function(metadata_list) { + # Define vectors of library and sample ids + metadata_library_ids <- metadata_list |> + purrr::map_chr("library_id") + + metadata_sample_ids <- metadata_list |> + purrr::map_chr("sample_id") + + # Grab names to check contents + transposed_names <- metadata_list |> + purrr::map(names) |> + purrr::reduce(union) + + # first check that this library hasn't already been merged + if ("library_metadata" %in% transposed_names) { + stop(paste( + "This SCE object appears to be a merged object", + "We do not support merging objects with objects that have already been merged." + )) + } + + # Create new version of metadata_list without any sample_metadata fields + # Note that this will work even if `sample_metadata` is not present + library_metadata <- metadata_list |> + purrr::map( + \(meta) meta[which(names(meta) != "sample_metadata")] + ) + + # combine into final metadata list + final_metadata_list <- list( + # vector of all library ids + library_id = metadata_library_ids, + # vector of all sample ids + sample_id = metadata_sample_ids, + # list of existing metadata for each library, with sample_metadata removed + library_metadata = library_metadata + ) + + # if object has sample metadata then combine into a single data frame, + # and add this to the final_metadata_list + if ("sample_metadata" %in% transposed_names) { + sample_metadata <- metadata_list |> + purrr::map("sample_metadata") |> + purrr::map(\(df) { + # make sure all column types are compatible first + df |> + dplyr::mutate(dplyr::across(dplyr::everything(), as.character)) + }) |> + dplyr::bind_rows() |> + dplyr::distinct() + + # check that all sample ids are found in the new sample metadata and warn if not + if (!all(metadata_sample_ids %in% sample_metadata$sample_id)) { + warning("Not all sample ids are present in metadata(merged_sce)$sample_metadata.") + } + + # add to final_metadata_list + final_metadata_list$sample_metadata <- sample_metadata + } + + return(final_metadata_list) +} + + + +#' Helper function to check that expected metadata fields are present in a given +#' list of SCEs. +#' +#' The default expected fields are `library_id` and `sample_id`. If either is missing from +#' any SCE, an error is thrown. This function does not return anything. +#' +#' @param sce_list List of SCEs to check +#' @param expected_fields a vector of metadata fields that should be present +check_metadata <- function(sce_list, expected_fields = c("library_id", "sample_id")) { + metadata_checks <- sce_list |> + purrr::map_lgl(\(sce) { + all(expected_fields %in% names(metadata(sce))) + }) + + if (!all(metadata_checks)) { + stop(glue::glue(" + The metadata for each SCE object must contain {stringr::str_flatten_comma(expected_fields, ', and ')}. + If `include_altexp` is TRUE, these fields must also be present in all altExps. + ")) + } +} + +#' Helper function to extract main SCE metadata for inclusion in an altExp +#' +#' @param sce SCE object to extract metadata from +#' +#' @return List with fields `library_id` and `sample_id` +extract_metadata_for_altexp <- function(sce) { + list( + library_id = metadata(sce)$library_id, + sample_id = metadata(sce)$sample_id + ) +} diff --git a/inst/WORDLIST b/inst/WORDLIST index eb418099..3439328a 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,3 +1,5 @@ + +altExp Alevin CMD commenter @@ -5,6 +7,8 @@ Dockerfile Kallisto nf pre +SCE +SCEs PRs RStudio ScPCA diff --git a/man/check_metadata.Rd b/man/check_metadata.Rd new file mode 100644 index 00000000..0b9fa0a8 --- /dev/null +++ b/man/check_metadata.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/merge_sce_list.R +\name{check_metadata} +\alias{check_metadata} +\title{Helper function to check that expected metadata fields are present in a given +list of SCEs.} +\usage{ +check_metadata(sce_list, expected_fields = c("library_id", "sample_id")) +} +\arguments{ +\item{sce_list}{List of SCEs to check} + +\item{expected_fields}{a vector of metadata fields that should be present} +} +\description{ +The fields `library_id` and `sample_id` must be present. If either is missing from + any SCE, an error is thrown. This function does not return anything. +} diff --git a/man/extract_metadata_for_altexp.Rd b/man/extract_metadata_for_altexp.Rd new file mode 100644 index 00000000..f7243854 --- /dev/null +++ b/man/extract_metadata_for_altexp.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/merge_sce_list.R +\name{extract_metadata_for_altexp} +\alias{extract_metadata_for_altexp} +\title{Helper function to extract main SCE metadata for inclusion in an altExp} +\usage{ +extract_metadata_for_altexp(sce) +} +\arguments{ +\item{sce}{SCE object to extract metadata from} +} +\value{ +List with fields `library_id` and `sample_id` +} +\description{ +Helper function to extract main SCE metadata for inclusion in an altExp +} diff --git a/man/prepare_merged_metadata.Rd b/man/prepare_merged_metadata.Rd new file mode 100644 index 00000000..3e1905c6 --- /dev/null +++ b/man/prepare_merged_metadata.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/merge_sce_list.R +\name{prepare_merged_metadata} +\alias{prepare_merged_metadata} +\title{Prepare an updated list of metadata for the merged SCE} +\usage{ +prepare_merged_metadata(metadata_list) +} +\arguments{ +\item{metadata_list}{List of metadata to update} +} +\value{ +Updated metadata list to include in the merged SCE +} +\description{ +The metadata will be reformatted to contain the following fields: +- `library_id`: vector of all library ids in the merged object +- `sample_id`: vector of all sample ids in the merged object +- `library_metadata`: pre-merge metadata list for each library, named by SCE name +- `sample_metadata`: a data frame with all distinct rows from the pre-merge sample_metadata data frames, + with all values coerced to character +} diff --git a/man/update_sce_metadata.Rd b/man/update_sce_metadata.Rd deleted file mode 100644 index 1cc58743..00000000 --- a/man/update_sce_metadata.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/merge_sce_list.R -\name{update_sce_metadata} -\alias{update_sce_metadata} -\title{Helper function to update SCE metadata for merging} -\usage{ -update_sce_metadata(metadata_list) -} -\arguments{ -\item{metadata_list}{Current SCE metadata before modification} -} -\value{ -Updated metadata list to store in the SCE -} -\description{ -Helper function to update SCE metadata for merging -} diff --git a/tests/testthat/test-merge_sce_list.R b/tests/testthat/test-merge_sce_list.R index 991909e6..fa5c5ea6 100644 --- a/tests/testthat/test-merge_sce_list.R +++ b/tests/testthat/test-merge_sce_list.R @@ -59,23 +59,6 @@ sce_list <- list( # Tests without altexps ---------------------------------------------- -test_that("`update_sce_metadata()` returns the expected list", { - metadata_list <- metadata(sce_list[[1]]) - - new_metadata <- update_sce_metadata(metadata_list) - - expect_equal( - names(new_metadata), - c("library_id", "sample_id", "library_metadata", "sample_metadata") - ) - - expect_equal( - names(new_metadata$library_metadata), - c("library_id", "sample_id", "total_reads") - ) -}) - - test_that("`prepare_sce_for_merge` works as expected when all columns are present, no altexps", { result_sce <- prepare_sce_for_merge( sce, @@ -114,26 +97,6 @@ test_that("`prepare_sce_for_merge` works as expected when all columns are presen names(rowData(result_sce)), c("gene_names", paste(sce_name, "other_column", sep = "-")) ) - - # metadata names check - expect_contains( - names(metadata(result_sce)), - c("library_id", "sample_id", "library_metadata", "sample_metadata") - ) - - # check that sample metadata is a data frame - expect_s3_class(metadata(result_sce)$sample_metadata, "data.frame") - - # check that contents of library id and sample id are correct - expect_equal( - metadata(result_sce)$library_id, - "library-1" - ) - - expect_equal( - metadata(result_sce)$sample_id, - "sample-1" - ) }) @@ -511,7 +474,18 @@ test_that("merging SCEs with 1 altexp but different features fails as expected, test_that("merging SCEs where 1 altExp is missing works as expected, with altexps", { - sce_list_with_altexp[["sce4"]] <- sce_list[[1]] + sce_list_with_altexp$sce4 <- sce_list[[1]] + + # update the metdata list with sce4 name + metadata(sce_list_with_altexp$sce4) <- list( + library_id = "library-sce4", + sample_id = "sample-sce4", + total_reads = 100, + sample_metadata = data.frame( + sample_id = "sample-sce4", + library_id = "library-sce4" + ) + ) merged_sce <- merge_sce_list( sce_list_with_altexp, @@ -526,6 +500,57 @@ test_that("merging SCEs where 1 altExp is missing works as expected, with altexp merged_altexp <- altExp(merged_sce, "adt") + # check merged_altexp metadata + altexp_metadata <- metadata(merged_altexp) + expect_setequal( + names(altexp_metadata), + c("library_id", "sample_id", "library_metadata") + ) + expect_setequal( + altexp_metadata$library_id, + glue::glue("library-{names(sce_list_with_altexp)}") + ) + expect_setequal( + altexp_metadata$sample_id, + glue::glue("sample-{names(sce_list_with_altexp)}") + ) + + + expect_setequal( + # all but sce4 contain all metadata components + altexp_metadata$library_metadata[-4] |> + purrr::map(names) |> + purrr::reduce(intersect), + c("library_id", "sample_id", "mapped_reads", "ambient_profile") + ) + + expect_setequal( + # sce4 has only library id and sample id, as it was missing the altExp + names(altexp_metadata$library_metadata$sce4), + c("library_id", "sample_id") + ) + + + expect_true( + is.null(altexp_metadata$library_metadata$sce4$ambient_profile) & + is.null(altexp_metadata$library_metadata$sce4$mapped_reads) + ) + expect_true( + all( + is.numeric(altexp_metadata$library_metadata$sce1$ambient_profile), + is.numeric(altexp_metadata$library_metadata$sce2$ambient_profile), + is.numeric(altexp_metadata$library_metadata$sce3$ambient_profile) + ) + ) + expect_true( + all( + altexp_metadata$library_metadata$sce1$library_id == "library-sce1", + altexp_metadata$library_metadata$sce2$library_id == "library-sce2", + altexp_metadata$library_metadata$sce3$library_id == "library-sce3", + altexp_metadata$library_metadata$sce4$library_id == "library-sce4" + ) + ) + # check dimensions expect_equal( dim(merged_sce), @@ -533,8 +558,9 @@ test_that("merging SCEs where 1 altExp is missing works as expected, with altexp ) expect_equal( dim(merged_altexp), - c(num_altexp_features, total_cells * 4 / 3 ) + c(num_altexp_features, total_cells * 4 / 3) ) + # check colData names are as expected expected_coldata <- c( "sum", @@ -580,3 +606,86 @@ test_that("get_altexp_attributes throws an error as expected when features do no altExp(sce_list_with_altexp[[1]]) <- altExp(sce_list_with_altexp[[1]])[1:3, ] expect_error(get_altexp_attributes(sce_list_with_altexp)) }) + + +test_that("check_metadata throws an error when a field is missing", { + metadata(sce_list[[1]])$library_id <- NULL + expect_error(check_metadata(sce_list)) + expect_no_error(check_metadata(sce_list, expected_fields = "sample_id")) +}) + +test_that("extract_metadata_for_altexp returns the correct values", { + expected_list <- list( + "library_id" = "library-sce1", + "sample_id" = "sample-sce1" + ) + observed_list <- extract_metadata_for_altexp(sce_list[[1]]) + expect_equal(observed_list, expected_list) +}) + +test_that("prepare_merged_metadata works as expected, with sample_metadata present", { + observed_metadata <- sce_list |> + purrr::map(metadata) |> + prepare_merged_metadata() + + expect_setequal( + names(observed_metadata), + c("library_id", "sample_id", "library_metadata", "sample_metadata") + ) + + expected_library_ids <- glue::glue("library-{names(sce_list)}") + expected_sample_ids <- glue::glue("sample-{names(sce_list)}") + expect_setequal( + observed_metadata$library_id, + expected_library_ids + ) + expect_setequal( + observed_metadata$sample_id, + expected_sample_ids + ) + expect_setequal( + names(observed_metadata$library_metadata), + names(sce_list) + ) + expect_setequal( + observed_metadata$library_metadata |> + purrr::map(names) |> + purrr::reduce(intersect), + c("library_id", "sample_id", "total_reads") + ) + expect_equal( + data.frame( + sample_id = expected_sample_ids, + library_id = expected_library_ids + ), + observed_metadata$sample_metadata + ) +}) + + +test_that("prepare_merged_metadata works as expected from altExps metadata (aka, sample_metadata NOT present)", { + observed_metadata <- sce_list_with_altexp |> + purrr::map(altExp) |> + purrr::map(metadata) |> + prepare_merged_metadata() + + expect_setequal( + names(observed_metadata), + c("library_id", "sample_id", "library_metadata") + ) + + expect_setequal( + observed_metadata$library_id, + glue::glue("library-{names(sce_list)}") + ) + expect_setequal( + observed_metadata$sample_id, + glue::glue("sample-{names(sce_list)}") + ) + expect_setequal( + observed_metadata$library_metadata |> + purrr::map(names) |> + purrr::reduce(intersect), + c("library_id", "sample_id", "mapped_reads", "ambient_profile") + ) +})