Merge branch 'main' into mutationStatus_repeated_message

MSKCC-Epi-Bio · Dec 21, 2023 · 854ac13 · 854ac13
2 parents 0238995 + 5759088
commit 854ac13
Show file tree

Hide file tree

Showing 15 changed files with 210 additions and 81 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: gnomeR
 Title: Wrangle and analyze IMPACT and TCGA mutation data 
-Version: 1.3.0
+Version: 1.2.0.9004
 Authors@R: 
     c(person(given = "Karissa",
              family = "Whiting",

diff --git a/NAMESPACE b/NAMESPACE
@@ -7,6 +7,7 @@ export(add_pathways)
 export(annotate_any_panel)
 export(annotate_specific_panel)
 export(create_gene_binary)
+export(extract_patient_id)
 export(ggcomut)
 export(gggenecor)
 export(ggsamplevar)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,6 @@
 # gnomeR (development version)
 
+- Added `extract_patient_id()` function to get IMPACT patient ID from sample ID
 - Deprecated `freq_cutoff`, `freq_cutoff_by_gene`, and `gene_subset` arguments in `tbl_genomic()`. It is now recommended that users use `subset_by_frequency()` instead before passing data to `tbl_genomic()`.
 - Added `other_vars` argument to `subset_by_frequency()`, `subset_by_panel()`, `summarize_by_gene()` and `add_pathways()` to allow retention of other clinical vars when using functions within pipeline.
 - Deprecated `count_pathways_by` argument of `add_pathways()` function. Now, user must specify which specific alteration to count towards the pathway via the `.mut`, `.Amp`, `.Del`, `.fus` suffix (e.g. `custom_pathways = c('TP53.mut', 'APC.Del')`).
@@ -10,6 +11,9 @@
 - Fixed bug in `add_pathways()` where `custom_pathways` wasn't catching all types of alterations when `GENE.all` was used due to `paste0()` vectorization.
 - Changed some arguments to strict matching (`rlang::arg_match()`) instead of partial matching (`match.arg()`) (e.g. `mut_type = "s"` doesn't work anymore and must be fully specified `mut_type = "somatic_only"`).
 - Added unit tests for gnomeR plots/visuals (#144).
+- A dictionary of old to new names for `rename_columns()` output is now an attribute of the returned object. Now messages can reference the original names of data columns (ex: `TumorAllele2` not `tumor_allele_2`) to make it more intuitive to users (#302).
+- Fixed bug where `mut_type = "omit_germline"` in `create_gene_binary()` wasn't consistently filtering out germline mutations.
+- Enhanced `subset_by_frequency()` to allow users to select hugo_symbols if they reach a threshold in any level of a variable (ex: high risk vs low risk) (#305).
 
 
 # gnomeR 1.2.0

diff --git a/R/create-gene-binary.R b/R/create-gene-binary.R
@@ -181,6 +181,9 @@ create_gene_binary <- function(samples = NULL,
   samples_final <- samples %||%
     samples_in_data
 
+  samples_final <- unique(samples_final)
+
+
   # Sanitize Data and Filter to Final Samples List  --------
 
   mutation <- switch(!is.null(mutation),
@@ -205,8 +208,6 @@ create_gene_binary <- function(samples = NULL,
     }
   )
 
-
-
   # Recode Aliases -----------------------------------------------------------
 
   # Fusions - create long version with event split by two involved genes
@@ -270,7 +271,8 @@ create_gene_binary <- function(samples = NULL,
       mut_type = mut_type,
       snp_only = snp_only,
       include_silent = include_silent,
-      specify_panel = specify_panel
+      specify_panel = specify_panel,
+      names_mut_dict = names_mut_dict
     )
   )
 
@@ -398,7 +400,8 @@ create_gene_binary <- function(samples = NULL,
                                    mut_type,
                                    snp_only,
                                    include_silent,
-                                   specify_panel) {
+                                   specify_panel,
+                                   names_mut_dict) {
 
   # apply filters --------------
 
@@ -421,8 +424,10 @@ create_gene_binary <- function(samples = NULL,
     },
     "omit_germline" = {
       mutation <- mutation %>%
-        filter(.data$mutation_status != "GERMLINE" |
-          .data$mutation_status != "germline" | is.na(.data$mutation_status))
+        filter((.data$mutation_status != "GERMLINE" &
+                  .data$mutation_status != "germline" &
+                  .data$mutation_status != "Germline") |
+                 is.na(.data$mutation_status))
 
       blank_muts <- mutation %>%
         filter(is.na(.data$mutation_status) |
@@ -432,17 +437,19 @@ create_gene_binary <- function(samples = NULL,
 
       if ((blank_muts > 0)) {
         cli::cli_alert_warning(
-          "{(blank_muts)} mutations have {.code NA} or blank in mutation status column instead of 'SOMATIC' or 'GERMLINE'. These were assumed to be 'SOMATIC' and were retained in the resulting binary matrix.")
+          "{(blank_muts)} mutations have {.code NA} or blank in the {.field {dplyr::first(c(names_mut_dict['mutation_status'], 'mutation_status'), na_rm = TRUE)}} column instead of 'SOMATIC' or 'GERMLINE'. These were assumed to be 'SOMATIC' and were retained in the resulting binary matrix.")
       }
     },
     "somatic_only" = {
       mutation <- mutation %>%
         filter(.data$mutation_status == "SOMATIC" |
+                 .data$mutation_status == "Somatic" |
           .data$mutation_status == "somatic")
     },
     "germline_only" = {
       mutation <- mutation %>% filter(.data$mutation_status == "GERMLINE" |
-        .data$mutation_status == "germline")
+                                        .data$mutation_status == "Germline" |
+                                        .data$mutation_status == "germline")
     }
   )
 

diff --git a/R/sanitize-data.R b/R/sanitize-data.R
@@ -71,9 +71,11 @@
           )
         )
 
-      cli::cli_warn("Column {.field variant_type} is missing from your data. We inferred variant types using {.field reference_allele} and {.field tumor_seq_allele2} columns")
-    } else {
-      cli::cli_abort("Column {.field variant_type} is missing from your data and {.field reference_allele} and {.field tumor_seq_allele2}
+
+        cli::cli_warn(c("Column {.field variant_type} is missing from your data. We inferred variant types using ",
+        "{.field {dplyr::first(c(names_dict['reference_allele'], 'reference_allele'), na_rm = TRUE)}} and {.field {dplyr::first(c(names_dict['tumor_seq_allele_2'], 'tumor_seq_allele_2'), na_rm = TRUE)}} columns"))
+      } else {
+        cli::cli_abort("Column {.field variant_type} is missing from your data and {.field reference_allele} and {.field tumor_seq_allele_2}
                               columns were not available from which to infer variant type.
                               To proceed, add a column specifying {.field variant_type} (e.g. {.code mutate(<your-mutation-df>, variant_type = 'SNP')}")
     }

diff --git a/R/utils.R b/R/utils.R
@@ -4,15 +4,16 @@
 
 #' Rename columns from API results to work with gnomeR functions
 #'
-#' @param df_to_check a data frame to check and recode names as needed
-#'
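+#' The returned data frame carries a `names_dict` attribute (`attr(x, "names_dict")`):
+#' a named vector whose names are the internal column names and whose values are the
+#' original data set column names. The attribute is only set when at least one column matched.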
+#' @param df_to_check A data frame to check and recode names as needed
 #' @return a renamed data frame
 #' @export
 #' @examples
 #'
 #' rename_columns(df_to_check = gnomeR::mutations)
-#' rename_columns(df_to_check = gnomeR::sv)
-#'
+#' x <- rename_columns(df_to_check = gnomeR::sv)
+#' attr(x, "names_dict")
 rename_columns <- function(df_to_check) {
 
   names_df_long <- gnomeR::names_df %>%
@@ -22,28 +23,32 @@ rename_columns <- function(df_to_check) {
 
   which_to_replace <- intersect(names(df_to_check), unique(names_df_long$value))
 
-  # create a temporary dictionary as a named vector
-  temp_dict <- names_df_long %>%
+  # create a dictionary as a named vector (names = internal column names, values = original names); this should have all relevant values, including those unchanged
+  names_dict <- names_df_long %>%
     dplyr::filter(.data$value %in% which_to_replace) %>%
     select("internal_column_name",  "value") %>%
     dplyr::distinct() %>%
     tibble::deframe()
 
 
-  if(length(temp_dict) > 0) {
+  if(length(names_dict) > 0) {
 
     # store details on what has been changed.
-    message <- purrr::map2_chr(names(temp_dict),
-                               temp_dict,
+    message <- purrr::map2_chr(names(names_dict),
+                               names_dict,
                                ~paste0(.y, " renamed ", .x))
 
     names(message) <- rep("!", times = length(message))
 
 
     # rename those variables only
-    df_to_check %>%
-      dplyr::rename(!!temp_dict)
+    df_to_check <- df_to_check %>%
+      dplyr::rename(!!names_dict)
+
+    attr(df_to_check, "names_dict") <- names_dict
   }
+
+    return(df_to_check)
 }
 
 
@@ -169,8 +174,8 @@ recode_cna <- function(alteration_vector){
     return(recoded_alterations)
   }
 
-# Binary Matrix Processing  -----------------------------------------------------
 
+# Binary Matrix Processing  -----------------------------------------------------
 
 #' Create binary data.frames depending on type of mutation data
 #'
@@ -255,4 +260,27 @@ recode_cna <- function(alteration_vector){
   }
 }
 
+#' Extract IMPACT Patient ID From Sample ID
+#'
+#' Drops everything from the first `-T` onward in each sample ID. Aborts if any
+#' value does not start with `P-<digits>-T`.
+#'
+#' @param sample_id A character vector of IMPACT Tumor sample IDs
+#' @return Returns a vector of patient IDs
+#' @export
+#' @examples
+#' sample_id <- c("P-0000071-T01-IM3", "P-0000072-T02-IM4", "P-0000073-T03-IM5")
+#' extract_patient_id(sample_id)
+extract_patient_id <- function(sample_id) {
+
+  # Checks: every ID must look like "P-<digits>-T..." ----------------------
+  is_impact_id <- stringr::str_detect(sample_id, "^P-\\d{1,}-T.*")
+  # an `NA` detection result is treated the same as a failed match
+  if (any(is.na(is_impact_id) | !is_impact_id)) {
+    cli::cli_abort("Some {.code sample_id} values do not match the expected IMPACT sample format (e.g `P-0000XX-T01-IM3`)")
+  }
+
+  # the patient ID is whatever precedes the first "-T"
+  stringr::str_replace(sample_id, "-T.*", "")
+}
 
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -51,6 +51,7 @@ reference:
     - recode_cna
     - rename_columns
     - resolve_alias
+    - extract_patient_id
   - subtitle: Color Palette
   - contents:
     - gnomer_colors

diff --git a/codemeta.json b/codemeta.json
@@ -8,13 +8,13 @@
   "codeRepository": "https://github.com/MSKCC-Epi-Bio/gnomeR",
   "issueTracker": "https://github.com/MSKCC-Epi-Bio/gnomeR/issues",
   "license": "https://spdx.org/licenses/MIT",
-  "version": "1.3.0",
+  "version": "1.2.0.9004",
   "programmingLanguage": {
     "@type": "ComputerLanguage",
     "name": "R",
     "url": "https://r-project.org"
   },
-  "runtimePlatform": "R version 4.2.2 (2022-10-31 ucrt)",
+  "runtimePlatform": "R version 4.2.3 (2023-03-15)",
   "author": [
     {
       "@type": "Person",
@@ -371,7 +371,7 @@
     },
     "SystemRequirements": null
   },
-  "fileSize": "2591.928KB",
+  "fileSize": "2349.143KB",
   "releaseNotes": "https://github.com/MSKCC-Epi-Bio/gnomeR/blob/master/NEWS.md",
   "readme": "https://github.com/MSKCC-Epi-Bio/gnomeR/blob/main/README.md",
   "contIntegration": ["https://github.com/MSKCC-Epi-Bio/gnomeR/actions", "https://app.codecov.io/gh/MSKCC-Epi-Bio/gnomeR?branch=main"],

diff --git a/man/dot-mutations_gene_binary.Rd b/man/dot-mutations_gene_binary.Rd
diff --git a/man/extract_patient_id.Rd b/man/extract_patient_id.Rd
diff --git a/man/rename_columns.Rd b/man/rename_columns.Rd
diff --git a/tests/testthat/test-binary-matrix.R b/tests/testthat/test-binary-matrix.R
@@ -122,22 +122,19 @@ test_that("Check func works fine when we enter mix of impact samples IM3 and IM5
 })
 
 
-# NON UNIQUE SAMPLES in samples ARGUMENT?
-# test_that("Check when sample ids with duplicate values are entered", {
-#
-#   mut_valid_sample_ids<- (gnomeR::mutations$sampleId)[1:10]
-#
-#   #should get unique rows after transposing
-#   sub <- create_gene_binary(sample=mut_valid_sample_ids, mutation=gnomeR::mutations)
-#   expect_equal(nrow(sub), length(unique(mut_valid_sample_ids)))
-#
-#   sub_fs <- create_gene_binary(sample=mut_valid_sample_ids, fusion =gnomeR::sv)
-#   expect_equal(nrow(sub_fs), length(unique(mut_valid_sample_ids)))
-#
-#   sub_cna <- create_gene_binary(sample=mut_valid_sample_ids, cna =gnomeR::cna)
-#   expect_equal(nrow(sub_cna), length(unique(mut_valid_sample_ids)))
-#
-# })
+# Repeated IDs in the `samples` argument must give the same result as unique IDs
+test_that("Check when non unique sample ids are entered", {
+
+  mut_subset <- gnomeR::mutations[1:10, ]
+  sample_ids <- mut_subset$sampleId # taken as-is, so repeats are kept
+
+  from_raw_ids <- create_gene_binary(samples = sample_ids, mut_subset)
+  from_unique_ids <- create_gene_binary(
+    samples = unique(sample_ids), mut_subset
+  )
+
+  expect_equal(from_raw_ids, from_unique_ids)
+})
 
 
 
@@ -254,6 +251,35 @@ test_that("test inclusion of NAs in mut_type ", {
 
 })
 
+test_that("test removal of germline samples in mut_type ", {
+  # Fixture status: rows 1, 4, 5 'GERMLINE'; row 2 'SOMATIC'; row 3 'germline'; rows 6-10 blank
+  mut_status <- gnomeR::mutations[1:10, ]
+  mut_status$mutationStatus[1:5] <- "GERMLINE"
+  mut_status$mutationStatus[6:10] <- ""
+  mut_status$mutationStatus[2] <- "SOMATIC"
+  mut_status$mutationStatus[3] <- "germline"
+
+  # mut_type = "all": the PARP1 alteration is still reported for this sample
+  res_all <- create_gene_binary(mutation = mut_status, specify_panel = "no", mut_type = "all")
+  expect_equal(res_all$PARP1[which(res_all$sample_id == "P-0001128-T01-IM3")], 1)
+
+  # somatic_only: a single sample is expected in the result
+  res_somatic <- create_gene_binary(mutation = mut_status, mut_type = "somatic_only", specify_panel = "no")
+  expect_equal(res_somatic$PARP1[which(res_somatic$sample_id == "P-0001859-T01-IM3")], 1)
+  expect_equal(nrow(res_somatic), 1)
+
+  # germline_only: upper- and lower-case germline rows are kept (3 samples expected)
+  res_germline <- create_gene_binary(mutation = mut_status, mut_type = "germline_only",
+                                     specify_panel = "no")
+  expect_equal(res_germline$PARP1[which(res_germline$sample_id == "P-0001128-T01-IM3")], 1)
+  expect_equal(nrow(res_germline), 3)
+
+  # omit_germline: samples are passed explicitly; AKT1 is expected to be 0 for this sample
+  res_omit <- create_gene_binary(samples = mut_status$sampleId, mutation = mut_status,
+                                 mut_type = "omit_germline",
+                                 specify_panel = "no")
+  expect_equal(res_omit$AKT1[which(res_omit$sample_id == "P-0001128-T01-IM3")], 0)
+})
 
 
 # Test high_level_cna_only argument --------------------------------------------