updates to summarize_by_patient, tests, and general error messages for the package
MSKCC-Epi-Bio · Aug 16, 2024 · ce5ba7e · ce5ba7e
1 parent c4cee52
commit ce5ba7e
Show file tree

Hide file tree

Showing 12 changed files with 129 additions and 62 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -30,7 +30,7 @@ LazyData: true
 URL: https://github.com/MSKCC-Epi-Bio/gnomeR,
     https://mskcc-epi-bio.github.io/gnomeR/
 BugReports: https://github.com/MSKCC-Epi-Bio/gnomeR/issues
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Depends: R (>= 3.6)
 biocViews: 
     ComplexHeatmap,

diff --git a/R/summarize-by-patient.R b/R/summarize-by-patient.R
@@ -23,9 +23,11 @@
 #'   samples = samples, mutation = mutations, cna = cna,
 #'   mut_type = "somatic_only",
 #'   include_silent = FALSE,
-#'   specify_panel = "IMPACT341"
-#' ) %>%
-#'   summarize_by_patient()
+#'   specify_panel = "IMPACT341")
+#'
+#' gene_binary$patient_id = extract_patient_id(gene_binary$sample_id)
+#'
+#' summarize_by_patient(gene_binary)
 #'
 summarize_by_patient <- function(gene_binary, other_vars = NULL) {
 
@@ -36,7 +38,15 @@ summarize_by_patient <- function(gene_binary, other_vars = NULL) {
     cli::cli_abort("{.code gene_binary} must be a data.frame with sample ids")
   }
 
-  .check_required_cols(gene_binary, "sample_id")
+  # !!! I think we should allow sample ID as input but not require it
+  # .check_required_cols(
+  #   gene_binary,
+  #   c("sample_id"))
+
+  .check_required_cols(
+    gene_binary,
+    c("patient_id"),
+    add_to_message = c(i = "To extract patient IDs from IMPACT sample IDs (e.g. `P-XXXXXX-TXX-IMX`), use {.code gnomeR::extract_patient_id(data$sample_id)}"))
 
   # Other Vars - Capture Other Columns to Retain -----------------------------------
 
@@ -49,12 +59,14 @@ summarize_by_patient <- function(gene_binary, other_vars = NULL) {
 
   # Create Sample Index -----------------------------------------------------
 
+
   sample_index <- gene_binary %>%
-    select("sample_id") %>%
+    select("patient_id") %>%
     mutate(sample_index = paste0("samp", 1:nrow(gene_binary)))
 
   # data frame of only alterations
-  alt_only <- as.data.frame(select(gene_binary, -"sample_id", -any_of(other_vars)))
+
+  alt_only <- as.data.frame(select(gene_binary, -"patient_id", -any_of("sample_id"), -any_of(other_vars)))
 
   row.names(alt_only) <- sample_index$sample_index
 
@@ -102,12 +114,10 @@ summarize_by_patient <- function(gene_binary, other_vars = NULL) {
     left_join(sample_index, ., by = "sample_index") %>%
     select(-c("sample_index")) %>%
     # identify patients
-    mutate(patient_id = gnomeR::extract_patient_id(.data$sample_id)) %>%
     # determine number of samples per patient
     group_by(.data$patient_id) %>%
     mutate(n_samples = n()) %>%
-    ungroup() %>%
-    select(-.data$sample_id)
+    ungroup()
 
   # summarize genomic information across patients
   # separate patients w/ only 1 sample vs multiple samples to improve run time
@@ -131,20 +141,19 @@ summarize_by_patient <- function(gene_binary, other_vars = NULL) {
 
     simp_gene_binary_pt <- bind_rows(simp_gene_binary_pt_single,
                                      simp_gene_binary_pt_multiple) %>%
-      select(-.data$n_samples)
+      select(-"n_samples")
     } else {
     simp_gene_binary_pt <- simp_gene_binary_pt_single %>%
-      select(-.data$n_samples)
+      select(-"n_samples")
     }
 
-
+  # !!!! Discuss this
   simp_gene_binary <- left_join(simp_gene_binary_pt,
                                 gene_binary %>%
-                                  mutate(patient_id = gnomeR::extract_patient_id(.data$sample_id)) %>%
                                   select(any_of(c("patient_id", other_vars))) %>%
                                   distinct(),
                                 by = "patient_id") %>%
-    select(.data$patient_id, everything())
+    select("patient_id", everything())
 
   return(simp_gene_binary)
 

diff --git a/R/utils-gene-binary.R b/R/utils-gene-binary.R
@@ -15,6 +15,8 @@
 
 }
 
+#' Check for silent mutations
+#'
 #' @param mutation Raw maf dataframe containing alteration data
 #' @param include_silent Silent mutations will be removed if FALSE (default). Variant classification column is needed.
 #' @return a corrected maf file or an error if problems with maf
@@ -34,7 +36,7 @@
 
 #' Check for fusions in maf file
 #'
-#' @param mutation
+#' @param mutation data frame of mutations (e.g. MAF)
 #' @return a data frame if no fusions found
 #' @keywords internal
 .check_for_fus_in_mut <- function(mutation) {

diff --git a/R/utils.R b/R/utils.R
@@ -8,11 +8,12 @@
 #'
 #' @param data A data frame to check
 #' @param required_cols A character specifying names of columns to check
+#' @param add_to_message a vector (preferably named) of text to add to the error message for specific cases
 #' @return If data set doesn't have required columns it will return an error message.
 #' If it does have required columns, nothing will be returned
 #' @keywords internal
 
-.check_required_cols <- function(data, required_cols) {
+.check_required_cols <- function(data, required_cols, add_to_message = NULL) {
 
   # Get the name of the data object
   data_name <- deparse(substitute(data))
@@ -21,7 +22,12 @@
   which_missing <- required_cols[which(!(required_cols %in% column_names))]
 
   if(length(which_missing) > 0) {
-    cli::cli_abort("The following required columns are missing in your {.field {data_name}} data: {.var {which_missing}}")
+    message <-
+      c("Can't find required columns:", set_names(c(which_missing), "x"))
+
+    add_to_message <- add_to_message %||% ""
+    message <- c(message, add_to_message)
+    cli::cli_abort(message)
   }
 }
 

diff --git a/man/dot-check_for_fus_in_mut.Rd b/man/dot-check_for_fus_in_mut.Rd
diff --git a/man/dot-check_for_silent.Rd b/man/dot-check_for_silent.Rd
diff --git a/man/dot-check_required_cols.Rd b/man/dot-check_required_cols.Rd
diff --git a/man/summarize_by_patient.Rd b/man/summarize_by_patient.Rd
diff --git a/tests/testthat/Rplots.pdf b/tests/testthat/Rplots.pdf
diff --git a/tests/testthat/test-recode-alias.R b/tests/testthat/test-recode-alias.R
@@ -12,7 +12,7 @@ test_that("missing columns of interest", {
                                 "MYC",
                                 "BCL1")
 
-  expect_error(recode_alias(genomic_df, alias_table = alias_table), "The following *")
+  expect_error(recode_alias(genomic_df, alias_table = alias_table), "Can't find required*")
 
 })
 

diff --git a/tests/testthat/test-reformat_fusion.R b/tests/testthat/test-reformat_fusion.R
@@ -1,7 +1,7 @@
 
 # data checks ---------------------------
 test_that("required columns are included & is data.frame", {
-  expect_error(reformat_fusion(gnomeR::sv_long %>% select(-fusion)), "The following*")
+  expect_error(reformat_fusion(gnomeR::sv_long %>% select(-fusion)), "Can't find*")
   expect_error(reformat_fusion(gnomeR::sv_long$hugo_symbol), "`fusion`*")
 })
-Original file line number
+Diff line change
@@ Expand Up / @@ -12,7 +12,7 @@ test_that("missing columns of interest", { @@
                                     "MYC",
                                     "BCL1")
-      expect_error(recode_alias(genomic_df, alias_table = alias_table), "The following *")
+      expect_error(recode_alias(genomic_df, alias_table = alias_table), "Can't find required*")
     })
@@ Expand Down @@