Skip to content

Commit

Permalink
Merge branch 'main' into mutationStatus_repeated_message
Browse files Browse the repository at this point in the history
  • Loading branch information
karissawhiting committed Dec 21, 2023
2 parents 0238995 + 5759088 commit 854ac13
Show file tree
Hide file tree
Showing 15 changed files with 210 additions and 81 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: gnomeR
Title: Wrangle and analyze IMPACT and TCGA mutation data
Version: 1.3.0
Version: 1.2.0.9004
Authors@R:
c(person(given = "Karissa",
family = "Whiting",
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ export(add_pathways)
export(annotate_any_panel)
export(annotate_specific_panel)
export(create_gene_binary)
export(extract_patient_id)
export(ggcomut)
export(gggenecor)
export(ggsamplevar)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# gnomeR (development version)

- Added `extract_patient_id()` function to get IMPACT patient ID from sample ID
- Deprecated `freq_cutoff`, `freq_cutoff_by_gene`, and `gene_subset` arguments in `tbl_genomic()`. It is now recommended that users use `subset_by_frequency()` instead before passing data to `tbl_genomic()`.
- Added `other_vars` argument to `subset_by_frequency()`, `subset_by_panel()`, `summarize_by_gene()` and `add_pathways()` to allow retention of other clinical vars when using functions within pipeline.
- Deprecated `count_pathways_by` argument of `add_pathways()` function. Now, user must specify which specific alteration to count towards the pathway via the `.mut`, `.Amp`, `.Del`, `.fus` suffix (e.g. `custom_pathways = c('TP53.mut', 'APC.Del')`).
Expand All @@ -10,6 +11,9 @@
- Fixed bug in `add_pathways()` where `custom_pathways` wasn't catching all types of alterations when `GENE.all` was used due to `paste0()` vectorization.
- Changed some arguments to strict matching (`rlang::arg_match()`) instead of partial matching (`match.arg()`) (e.g. `mut_type = "s"` doesn't work anymore and must be fully specified `mut_type = "somatic_only"`).
- Added unit tests for gnomeR plots/visuals (#144).
- A dictionary of old to new names for `rename_columns()` output is now an attribute of the returned object. Now messages can reference the original names of data columns (ex: `TumorAllele2` not `tumor_allele_2`) to make it more intuitive to users (#302).
- Fixed bug in `create_gene_binary()` where germline mutations weren't consistently filtered out when `mut_type = "omit_germline"`
- Enhanced `subset_by_frequency()` to allow users to select hugo_symbols if they reach a threshold in any level of a variable (ex: high risk vs low risk) (#305)


# gnomeR 1.2.0
Expand Down
23 changes: 15 additions & 8 deletions R/create-gene-binary.R
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,9 @@ create_gene_binary <- function(samples = NULL,
samples_final <- samples %||%
samples_in_data

samples_final <- unique(samples_final)


# Sanitize Data and Filter to Final Samples List --------

mutation <- switch(!is.null(mutation),
Expand All @@ -205,8 +208,6 @@ create_gene_binary <- function(samples = NULL,
}
)



# Recode Aliases -----------------------------------------------------------

# Fusions - create long version with event split by two involved genes
Expand Down Expand Up @@ -270,7 +271,8 @@ create_gene_binary <- function(samples = NULL,
mut_type = mut_type,
snp_only = snp_only,
include_silent = include_silent,
specify_panel = specify_panel
specify_panel = specify_panel,
names_mut_dict = names_mut_dict
)
)

Expand Down Expand Up @@ -398,7 +400,8 @@ create_gene_binary <- function(samples = NULL,
mut_type,
snp_only,
include_silent,
specify_panel) {
specify_panel,
names_mut_dict) {

# apply filters --------------

Expand All @@ -421,8 +424,10 @@ create_gene_binary <- function(samples = NULL,
},
"omit_germline" = {
mutation <- mutation %>%
filter(.data$mutation_status != "GERMLINE" |
.data$mutation_status != "germline" | is.na(.data$mutation_status))
filter((.data$mutation_status != "GERMLINE" &
.data$mutation_status != "germline" &
.data$mutation_status != "Germline") |
is.na(.data$mutation_status))

blank_muts <- mutation %>%
filter(is.na(.data$mutation_status) |
Expand All @@ -432,17 +437,19 @@ create_gene_binary <- function(samples = NULL,

if ((blank_muts > 0)) {
cli::cli_alert_warning(
"{(blank_muts)} mutations have {.code NA} or blank in mutation status column instead of 'SOMATIC' or 'GERMLINE'. These were assumed to be 'SOMATIC' and were retained in the resulting binary matrix.")
"{(blank_muts)} mutations have {.code NA} or blank in the {.field {dplyr::first(c(names_mut_dict['mutation_status'], 'mutation_status'), na_rm = TRUE)}} column instead of 'SOMATIC' or 'GERMLINE'. These were assumed to be 'SOMATIC' and were retained in the resulting binary matrix.")
}
},
"somatic_only" = {
mutation <- mutation %>%
filter(.data$mutation_status == "SOMATIC" |
.data$mutation_status == "Somatic" |
.data$mutation_status == "somatic")
},
"germline_only" = {
mutation <- mutation %>% filter(.data$mutation_status == "GERMLINE" |
.data$mutation_status == "germline")
.data$mutation_status == "Germline" |
.data$mutation_status == "germline")
}
)

Expand Down
8 changes: 5 additions & 3 deletions R/sanitize-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,11 @@
)
)

cli::cli_warn("Column {.field variant_type} is missing from your data. We inferred variant types using {.field reference_allele} and {.field tumor_seq_allele2} columns")
} else {
cli::cli_abort("Column {.field variant_type} is missing from your data and {.field reference_allele} and {.field tumor_seq_allele2}

cli::cli_warn(c("Column {.field variant_type} is missing from your data. We inferred variant types using ",
"{.field {dplyr::first(c(names_dict['reference_allele'], 'reference_allele'), na_rm = TRUE)}} and {.field {dplyr::first(c(names_dict['tumor_seq_allele_2'], 'tumor_seq_allele_2'), na_rm = TRUE)}} columns"))
} else {
cli::cli_abort("Column {.field variant_type} is missing from your data and {.field reference_allele} and {.field tumor_seq_allele_2}
columns were not available from which to infer variant type.
To proceed, add a column specifying {.field variant_type} (e.g. {.code mutate(<your-mutation-df>, variant_type = 'SNP')}")
}
Expand Down
52 changes: 40 additions & 12 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@

#' Rename columns from API results to work with gnomeR functions
#'
#' @param df_to_check a data frame to check and recode names as needed
#'
#' Will return a named vector of internal column names as values and original data set names
#' as names as an attribute (`attr(x, "names_dict")`)
#' @param df_to_check A data frame to check and recode names as needed
#' @return a renamed data frame
#' @export
#' @examples
#'
#' rename_columns(df_to_check = gnomeR::mutations)
#' rename_columns(df_to_check = gnomeR::sv)
#'
#' x <- rename_columns(df_to_check = gnomeR::sv)
#' attr(x, "names_dict")
rename_columns <- function(df_to_check) {

names_df_long <- gnomeR::names_df %>%
Expand All @@ -22,28 +23,32 @@ rename_columns <- function(df_to_check) {

which_to_replace <- intersect(names(df_to_check), unique(names_df_long$value))

# create a temporary dictionary as a named vector
temp_dict <- names_df_long %>%
# create a temporary dictionary as a named vector- this should have all relevant values, including those unchanged
names_dict <- names_df_long %>%
dplyr::filter(.data$value %in% which_to_replace) %>%
select("internal_column_name", "value") %>%
dplyr::distinct() %>%
tibble::deframe()


if(length(temp_dict) > 0) {
if(length(names_dict) > 0) {

# store details on what has been changed.
message <- purrr::map2_chr(names(temp_dict),
temp_dict,
message <- purrr::map2_chr(names(names_dict),
names_dict,
~paste0(.y, " renamed ", .x))

names(message) <- rep("!", times = length(message))


# rename those variables only
df_to_check %>%
dplyr::rename(!!temp_dict)
df_to_check <- df_to_check %>%
dplyr::rename(!!names_dict)

attr(df_to_check, "names_dict") <- names_dict
}

return(df_to_check)
}


Expand Down Expand Up @@ -169,8 +174,8 @@ recode_cna <- function(alteration_vector){
return(recoded_alterations)
}

# Binary Matrix Processing -----------------------------------------------------

# Binary Matrix Processing -----------------------------------------------------

#' Create binary data.frames depending on type of mutation data
#'
Expand Down Expand Up @@ -255,4 +260,27 @@ recode_cna <- function(alteration_vector){
}
}

#' Extract IMPACT Patient ID From Sample ID
#'
#' Validates that every value looks like an IMPACT tumor sample ID
#' (`P-<digits>-T...`) and then strips everything from the first `-T`
#' onwards, leaving the patient ID (e.g. `"P-0000071-T01-IM3"` becomes
#' `"P-0000071"`).
#'
#' @param sample_id A character vector of IMPACT Tumor sample IDs
#'
#' @return Returns a character vector of patient IDs, the same length as
#'   `sample_id`. An error is raised if any value (including `NA`) does not
#'   match the expected IMPACT sample ID format.
#' @export
#'
#' @examples
#' sample_id <- c("P-0000071-T01-IM3", "P-0000072-T02-IM4", "P-0000073-T03-IM5")
#' extract_patient_id(sample_id)
#'
extract_patient_id <- function(sample_id) {

  # Checks ----------------------------------------------------------------
  # `str_detect()` returns NA for missing IDs; flag those as invalid
  # explicitly instead of relying on NA propagation through `[`.
  matches_format <- stringr::str_detect(sample_id, "^P-\\d{1,}-T.*")
  wrong_format <- unique(sample_id[is.na(matches_format) | !matches_format])

  if (length(wrong_format) > 0) {
    cli::cli_abort(c(
      "Some {.code sample_id} values do not match the expected IMPACT sample format (e.g `P-0000XX-T01-IM3`)",
      "x" = "Offending value(s): {.val {wrong_format}}"
    ))
  }

  # Validation guarantees the first "-T" directly follows the patient
  # number, so dropping it and everything after leaves the patient ID.
  stringr::str_replace(sample_id, "-T.*", "")
}

1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ reference:
- recode_cna
- rename_columns
- resolve_alias
- extract_patient_id
- subtitle: Color Palette
- contents:
- gnomer_colors
Expand Down
6 changes: 3 additions & 3 deletions codemeta.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
"codeRepository": "https://github.com/MSKCC-Epi-Bio/gnomeR",
"issueTracker": "https://github.com/MSKCC-Epi-Bio/gnomeR/issues",
"license": "https://spdx.org/licenses/MIT",
"version": "1.3.0",
"version": "1.2.0.9004",
"programmingLanguage": {
"@type": "ComputerLanguage",
"name": "R",
"url": "https://r-project.org"
},
"runtimePlatform": "R version 4.2.2 (2022-10-31 ucrt)",
"runtimePlatform": "R version 4.2.3 (2023-03-15)",
"author": [
{
"@type": "Person",
Expand Down Expand Up @@ -371,7 +371,7 @@
},
"SystemRequirements": null
},
"fileSize": "2591.928KB",
"fileSize": "2349.143KB",
"releaseNotes": "https://github.com/MSKCC-Epi-Bio/gnomeR/blob/master/NEWS.md",
"readme": "https://github.com/MSKCC-Epi-Bio/gnomeR/blob/main/README.md",
"contIntegration": ["https://github.com/MSKCC-Epi-Bio/gnomeR/actions", "https://app.codecov.io/gh/MSKCC-Epi-Bio/gnomeR?branch=main"],
Expand Down
3 changes: 2 additions & 1 deletion man/dot-mutations_gene_binary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions man/extract_patient_id.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 5 additions & 4 deletions man/rename_columns.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

58 changes: 42 additions & 16 deletions tests/testthat/test-binary-matrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -122,22 +122,19 @@ test_that("Check func works fine when we enter mix of impact samples IM3 and IM5
})


# NON UNIQUE SAMPLES in samples ARGUMENT?
# test_that("Check when sample ids with duplicate values are entered", {
#
# mut_valid_sample_ids<- (gnomeR::mutations$sampleId)[1:10]
#
# #should get unique rows after transposing
# sub <- create_gene_binary(sample=mut_valid_sample_ids, mutation=gnomeR::mutations)
# expect_equal(nrow(sub), length(unique(mut_valid_sample_ids)))
#
# sub_fs <- create_gene_binary(sample=mut_valid_sample_ids, fusion =gnomeR::sv)
# expect_equal(nrow(sub_fs), length(unique(mut_valid_sample_ids)))
#
# sub_cna <- create_gene_binary(sample=mut_valid_sample_ids, cna =gnomeR::cna)
# expect_equal(nrow(sub_cna), length(unique(mut_valid_sample_ids)))
#
# })
# NON UNIQUE SAMPLES in samples ARGUMENT
test_that("Check when non unique sample ids are entered", {

  # The result for a `samples` vector taken straight from the data should be
  # the same as the result for its de-duplicated version.
  first_ten <- gnomeR::mutations[1:10, ]

  with_repeats <- create_gene_binary(
    samples = first_ten$sampleId,
    first_ten
  )
  without_repeats <- create_gene_binary(
    samples = unique(first_ten$sampleId),
    first_ten
  )

  expect_equal(with_repeats, without_repeats)

})



Expand Down Expand Up @@ -254,6 +251,35 @@ test_that("test inclusion of NAs in mut_type ", {

})

test_that("test removal of germline samples in mut_type ", {
  # Fixture: first ten mutation rows with hand-set statuses.
  # After the assignments below: rows 1, 4, 5 are "GERMLINE", row 2 is
  # "SOMATIC", row 3 is "germline", and rows 6-10 are blank.
  status_mut <- gnomeR::mutations[1:10, ]
  status_mut$mutationStatus[1:5] <- "GERMLINE"
  status_mut$mutationStatus[6:10] <- ""
  status_mut$mutationStatus[2] <- "SOMATIC"
  status_mut$mutationStatus[3] <- "germline"

  # mut_type = "all": the PARP1 alteration for this sample is expected
  res_all <- create_gene_binary(
    mutation = status_mut,
    specify_panel = "no",
    mut_type = "all"
  )
  expect_equal(
    res_all$PARP1[which(res_all$sample_id == "P-0001128-T01-IM3")],
    1
  )

  # mut_type = "somatic_only": a single row is expected in the result
  res_somatic <- create_gene_binary(
    mutation = status_mut,
    mut_type = "somatic_only",
    specify_panel = "no"
  )
  expect_equal(
    res_somatic$PARP1[which(res_somatic$sample_id == "P-0001859-T01-IM3")],
    1
  )
  expect_equal(nrow(res_somatic), 1)

  # mut_type = "germline_only": three rows are expected, covering both the
  # "GERMLINE" and "germline" spellings set above
  res_germline <- create_gene_binary(
    mutation = status_mut,
    mut_type = "germline_only",
    specify_panel = "no"
  )
  expect_equal(
    res_germline$PARP1[which(res_germline$sample_id == "P-0001128-T01-IM3")],
    1
  )
  expect_equal(nrow(res_germline), 3)

  # mut_type = "omit_germline": with all samples requested explicitly, the
  # sample is still present but its AKT1 value is expected to be 0
  res_omit <- create_gene_binary(
    samples = status_mut$sampleId,
    mutation = status_mut,
    mut_type = "omit_germline",
    specify_panel = "no"
  )
  expect_equal(
    res_omit$AKT1[which(res_omit$sample_id == "P-0001128-T01-IM3")],
    0
  )

})


# Test high_level_cna_only argument --------------------------------------------
Expand Down
Loading

0 comments on commit 854ac13

Please sign in to comment.