Merge pull request #346 from jalavery/add_summarize_by_patient

Add summarize by patient
MSKCC-Epi-Bio · Oct 15, 2024 · cca101f · cca101f
2 parents 9d6073e + 9ea6e51
commit cca101f
Show file tree

Hide file tree

Showing 11 changed files with 522 additions and 6 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -32,6 +32,7 @@ export(specify_impact_panels)
 export(subset_by_frequency)
 export(subset_by_panel)
 export(summarize_by_gene)
+export(summarize_by_patient)
 export(tbl_genomic)
 export(which_impact_panel)
 import(ComplexHeatmap)

diff --git a/R/summarize-by-patient.R b/R/summarize-by-patient.R
@@ -0,0 +1,169 @@
+#' Simplify binary matrix to one row per patient that counts any alteration
+#' type across all samples as 1
+#'
+#' This will reduce the number of columns in your binary matrix, and the
+#' resulting data frame will have only 1 col per gene, as opposed to separate
+#' columns for mutation/cna/fusion. Samples belonging to the same patient are
+#' collapsed into a single row.
+#'
+#' Note that if samples from the same patient were sequenced on different
+#' panels, any indication of an alteration is counted as an alteration, but the
+#' absence of an alteration is only defined when all sequencing panels included
+#' the gene and indicated that it was not altered (otherwise the value is `NA`).
+#'
+#' @param gene_binary a 0/1 data frame of gene alterations. Must contain a
+#' `patient_id` column; a `sample_id` column is optional.
+#' @param other_vars One or more column names (quoted or unquoted) in data to be retained
+#' in resulting data frame. Default is NULL.
+#'
+#' @return a data frame with a row for each patient and one column per gene
+#' @export
+#'
+#' @examples
+#' samples <- unique(gnomeR::mutations$sampleId)[1:10]
+#' gene_binary <- create_gene_binary(
+#'   samples = samples, mutation = mutations, cna = cna,
+#'   mut_type = "somatic_only",
+#'   include_silent = FALSE,
+#'   specify_panel = "IMPACT341")
+#'
+#' gene_binary$patient_id <- extract_patient_id(gene_binary$sample_id)
+#'
+#' summarize_by_patient(gene_binary)
+#'
+summarize_by_patient <- function(gene_binary, other_vars = NULL) {
+
+
+  # Checks ------------------------------------------------------------------
+
+  if (!is.data.frame(gene_binary)) {
+    cli::cli_abort("{.code gene_binary} must be a data.frame with sample ids")
+  }
+
+  # `sample_id` is accepted in the input but intentionally not required;
+  # only `patient_id` is needed to collapse samples.
+  .check_required_cols(
+    gene_binary,
+    c("patient_id"),
+    add_to_message = c(i = "To extract patient IDs from IMPACT sample IDs (e.g. `P-XXXXXX-TXX-IMX`), use {.code gnomeR::extract_patient_id(data$sample_id)}"))
+
+  # Other Vars - Capture Other Columns to Retain -----------------------------------
+
+  other_vars <-
+    .select_to_varnames({{ other_vars }},
+                        data = gene_binary,
+                        arg_name = "other_vars"
+    )
+
+
+  # Create Sample Index -----------------------------------------------------
+
+  # seq_len() (not 1:nrow()) so a zero-row input does not produce c(1, 0)
+  sample_index <- gene_binary %>%
+    select("patient_id") %>%
+    mutate(sample_index = paste0("samp", seq_len(nrow(gene_binary))))
+
+  # data frame of only alterations
+  alt_only <- as.data.frame(
+    select(gene_binary, -"patient_id", -any_of("sample_id"), -any_of(other_vars)))
+
+  row.names(alt_only) <- sample_index$sample_index
+
+  # check numeric class ---------
+  .abort_if_not_numeric(alt_only)
+
+  # Transpose ---------------------------------------------------------------
+
+  transp_alt_only <- as.data.frame(t(alt_only))
+
+  # remove alteration-type endings of gene names; the dot is escaped and the
+  # pattern anchored so only a literal trailing suffix (e.g. "TP53.Amp") is
+  # stripped, never characters inside a gene name
+  transp_alt_only <- transp_alt_only %>%
+    mutate(gene = str_remove_all(row.names(.),
+                                 "\\.(Amp|fus|Del|cna)$"))
+
+  # check for genes that have more than one alt type
+  gene_tab <- table(transp_alt_only$gene)
+
+  genes_multiple <- names(gene_tab[which(gene_tab > 1)])
+  genes_single <- names(gene_tab[which(gene_tab == 1)])
+
+  # genes with one type of event
+  all_bin_once <- transp_alt_only %>%
+    filter(.data$gene %in% genes_single)
+
+  # genes with more than one type of event
+  all_bin_more <- transp_alt_only %>%
+    filter(.data$gene %in% genes_multiple)
+
+  # collapse alteration types per gene by taking the max across types
+  if (length(genes_multiple) > 0) {
+    all_bin_more <- all_bin_more %>%
+      group_by(.data$gene) %>%
+      summarize(across(everything(), max))
+  }
+
+  # bind together and transpose
+  all_bin <- rbind(all_bin_once, all_bin_more, make.row.names = FALSE) %>%
+    tibble::column_to_rownames("gene")
+
+  all_bin <- as.data.frame(t(all_bin)) %>%
+    tibble::rownames_to_column("sample_index")
+
+  # join back to patient ID and determine number of samples per patient
+  simp_gene_binary <- sample_index %>%
+    left_join(all_bin, by = "sample_index") %>%
+    select(-c("sample_index")) %>%
+    group_by(.data$patient_id) %>%
+    mutate(n_samples = n()) %>%
+    ungroup()
+
+  # summarize genomic information across patients
+  # separate patients w/ only 1 sample vs multiple samples to improve run time
+  simp_gene_binary_pt <- simp_gene_binary %>%
+    filter(.data$n_samples == 1)
+
+  multi_sample_rows <- simp_gene_binary %>%
+    filter(.data$n_samples > 1)
+
+  if (nrow(multi_sample_rows) > 0) {
+
+    # across() also hits n_samples (yielding NA); that column is dropped below
+    simp_gene_binary_pt_multiple <- multi_sample_rows %>%
+      group_by(.data$patient_id) %>%
+      summarize(across(.cols = c(everything()),
+                       .fns = ~case_when(
+                         # if any alteration, indicate altered
+                         max(c(.x, 0), na.rm = TRUE) == 1 ~ 1,
+                         # no alteration only if no NAs (no na.rm)
+                         max(.x) == 0 ~ 0
+                       )),
+                .groups = "drop")
+
+    simp_gene_binary_pt <- bind_rows(simp_gene_binary_pt,
+                                     simp_gene_binary_pt_multiple)
+  }
+
+  simp_gene_binary_pt <- simp_gene_binary_pt %>%
+    select(-"n_samples")
+
+  # Join data
+  simp_gene_binary <- left_join(simp_gene_binary_pt,
+                                gene_binary %>%
+                                  select(any_of(c("patient_id", other_vars))) %>%
+                                  distinct(),
+                                by = "patient_id") %>%
+    select("patient_id", everything())
+
+  # warn if not unique resulting data
+  all_ids <- simp_gene_binary$patient_id
+  dupe_samp_in_result <- unique(all_ids[duplicated(all_ids)])
+
+  if (length(dupe_samp_in_result) > 0) {
+    cli::cli_alert_warning("Returned data is not unique (1 row per patient) due to non-distinct values in data passed to {.code other_vars} argument. See {head(dupe_samp_in_result, 3)} ...")
+  }
+
+  return(simp_gene_binary)
+
+}
diff --git a/R/utils-gene-binary.R b/R/utils-gene-binary.R
@@ -15,6 +15,8 @@
 
 }
 
+#' Check for silent mutations
+#'
 #' @param mutation Raw maf dataframe containing alteration data
 #' @param include_silent Silent mutations will be removed if FALSE (default). Variant classification column is needed.
 #' @return a corrected maf file or an error if problems with maf
@@ -34,7 +36,7 @@
 
 #' Check for fusions in maf file
 #'
-#' @param mutation a mutation maf data frame
+#' @param mutation data frame of mutations (e.g. MAF)
 #' @return a data frame if no fusions found
 #' @keywords internal
 .check_for_fus_in_mut <- function(mutation) {

diff --git a/R/utils.R b/R/utils.R
@@ -8,11 +8,12 @@
 #'
 #' @param data A data frame to check
 #' @param required_cols A character specifying names of columns to check
+#' @param add_to_message a vector (preferably named) of text to add to the error message for specific cases
 #' @return If data set doesn't have required columns it will return an error message.
 #' If it does have required columns, nothing will be returned
 #' @keywords internal
 
-.check_required_cols <- function(data, required_cols) {
+.check_required_cols <- function(data, required_cols, add_to_message = NULL) {
 
   # Get the name of the data object
   data_name <- deparse(substitute(data))
@@ -21,7 +22,12 @@
   which_missing <- required_cols[which(!(required_cols %in% column_names))]
 
   if(length(which_missing) > 0) {
-    cli::cli_abort("The following required columns are missing in your {.field {data_name}} data: {.var {which_missing}}")
+    message <-
+      c("Can't find required columns:", set_names(c(which_missing), "x"))
+
+    add_to_message <- add_to_message %||% ""
+    message <- c(message, add_to_message)
+    cli::cli_abort(message)
   }
 }
 

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -12,6 +12,8 @@ template:
 authors:
   Karissa Whiting:
     href: "https://www.karissawhiting.com/"
+  Jessica Lavery:
+    href: "https://github.com/jalavery/"
   Mike Curry:
     href: "https://github.com/michaelcurry1123/"
   Hannah Fuchs:
@@ -28,6 +30,7 @@ reference:
   - contents:
     - create_gene_binary
     - summarize_by_gene
+    - summarize_by_patient
     - pivot_cna_wider
     - pivot_cna_longer
     - add_pathways

diff --git a/man/dot-check_for_silent.Rd b/man/dot-check_for_silent.Rd
diff --git a/man/dot-check_required_cols.Rd b/man/dot-check_required_cols.Rd
diff --git a/man/summarize_by_patient.Rd b/man/summarize_by_patient.Rd
diff --git a/tests/testthat/test-recode-alias.R b/tests/testthat/test-recode-alias.R
@@ -12,7 +12,7 @@ test_that("missing columns of interest", {
                                 "MYC",
                                 "BCL1")
 
-  expect_error(recode_alias(genomic_df, alias_table = alias_table), "The following *")
+  expect_error(recode_alias(genomic_df, alias_table = alias_table), "Can't find required*")
 
 })
 

diff --git a/tests/testthat/test-reformat_fusion.R b/tests/testthat/test-reformat_fusion.R
@@ -1,7 +1,7 @@
 
 # data checks ---------------------------
 test_that("required columns are included & is data.frame", {
-  expect_error(reformat_fusion(gnomeR::sv_long %>% select(-fusion)), "The following*")
+  expect_error(reformat_fusion(gnomeR::sv_long %>% select(-fusion)), "Can't find*")
   expect_error(reformat_fusion(gnomeR::sv_long$hugo_symbol), "`fusion`*")
 })
-Original file line number
+Diff line change
@@ Expand Up / @@ -12,7 +12,7 @@ test_that("missing columns of interest", { @@
                                     "MYC",
                                     "BCL1")
-      expect_error(recode_alias(genomic_df, alias_table = alias_table), "The following *")
+      expect_error(recode_alias(genomic_df, alias_table = alias_table), "Can't find required*")
     })
@@ Expand Down @@