From d0f902f6aa34ab13cf55459c55e8058e159e6096 Mon Sep 17 00:00:00 2001 From: toumban1 Date: Thu, 2 May 2024 15:09:07 -0400 Subject: [PATCH 01/12] Small change --- README.Rmd | 1 + 1 file changed, 1 insertion(+) diff --git a/README.Rmd b/README.Rmd index 14c283dc..dc5718b8 100644 --- a/README.Rmd +++ b/README.Rmd @@ -18,6 +18,7 @@ knitr::opts_chunk$set( library(dplyr) library(gtsummary) library(gnomeR) + ``` # gnomeR From 35555eedea088109098dc1fc7260bc8bed553156 Mon Sep 17 00:00:00 2001 From: toumban Date: Tue, 7 May 2024 12:37:56 -0400 Subject: [PATCH 02/12] Add by option to subset by frequency. Function needs to be updated. --- R/subset-by-frequency.R | 58 +++++++++++++++++++-- tests/testthat/Rplots.pdf | Bin 4490 -> 4490 bytes tests/testthat/test-plots.R | 1 - tests/testthat/test-subset_by_freq.R | 74 +++++++++++++++++++++++++++ 4 files changed, 129 insertions(+), 4 deletions(-) diff --git a/R/subset-by-frequency.R b/R/subset-by-frequency.R index ffcd5860..797b364f 100644 --- a/R/subset-by-frequency.R +++ b/R/subset-by-frequency.R @@ -6,6 +6,7 @@ #' @param t Threshold value between 0 and 1 to subset by. Default is 10% (.1). #' @param other_vars One or more column names (quoted or unquoted) in data to be retained #' in resulting data frame. Default is NULL. +#' @param by Variable used to subset the data. Default is NULL. #' @return a data frame with a `sample_id` column and columns for #' alterations over the given prevalence threshold of `t`. 
#' @@ -22,7 +23,7 @@ #'gene_binary %>% #' subset_by_frequency() #' -subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) { +subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL, by = NULL) { # Checks ------------------------------------------------------------------ @@ -46,11 +47,19 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) { arg_name = "other_vars" ) + # Define by variable + + by <- + .select_to_varnames({{ by }}, + data = gene_binary, + arg_name = "by" + ) + # data frame of only alterations alt_only <- select(gene_binary, -"sample_id", -any_of(other_vars)) # Remove all NA columns ---------------------------------------------- - all_na_alt <- apply(alt_only, 2, function(x) { + all_na_alt <- apply(alt_only, 2, function(x) { sum(is.na(x)) == nrow(alt_only) }) @@ -59,10 +68,19 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) { # Check Numeric Class ----------------------------------------------------- - .abort_if_not_numeric(alt_only) + + if (is.null(by)) { + .abort_if_not_numeric(alt_only) + } + else { + .abort_if_not_numeric(select(alt_only, -any_of(by))) + } # Calc Frequency ---------------------------------------------------------- + + if(is.null(by)){ + counts <- apply(alt_only, 2, function(x) {sum(x, na.rm = TRUE)}) num_non_na <- apply(alt_only, 2, function(x) sum(!is.na(x))) @@ -74,5 +92,39 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) { all_of(alts_over_thresh)) return(subset_binary) + } + else{ + + alt_data <- + alt_only |> + group_by(by) |> + summarise(across(-c(by), sum = ~ sum(.), total = ~ sum(!is.na(.))), na.rm = T) + + alt_group_data <- + alt_data |> + pivot_longer(-any_of(by), + names_to = c("gene")) |> + separate(gene, into = c("gene", "measure"), sep = "_") |> + pivot_wider(names_from = measure, + values_from = value) |> + mutate(prop = sum/total) |> + arrange(desc(prop)) + + alts_over_thresh_grp <- + alt_group_data |> + 
filter(prop > t) |> + group_by(gene) |> + select(gene) |> + unique() |> + unlist() |> + as.vector() + + subset_binary <- select(gene_binary, "sample_id", + any_of(by), + any_of(other_vars), + all_of(alts_over_thresh_grp)) + + return(subset_binary) + } } diff --git a/tests/testthat/Rplots.pdf b/tests/testthat/Rplots.pdf index 0d231f7a41e366659e127be3017822d7ee45043c..ed5bcafa9bd02151eefe0176d0cf1ba51e0ae688 100644 GIT binary patch delta 56 zcmeBD?oys$qHbbfYG7t)WMFP!smZ19o1fy6Sdyxs;bLWEV1y($F% + mutate(sample_id = as.character(1:nrow(.))) + + expect_error(bm |> + subset_by_frequency(t = .1, other_vars = c(sex, stage), by = grade), + "Error in `by=` argument input. Select from") + +}) + + +test_that("Check `by` variable works", { + + bm <- bind_rows( + "gen50" = c(rep(0, 5), rep(1, 5)), + "gene20" = c(rep(0, 8), rep(1, 2)), + "gene0" = c(rep(0, 10), rep(1, 0)), + "sex" = rep(c("F", "M"), 5), + "stage" = rep(c("I", "II"), 5)) %>% + mutate(sample_id = as.character(1:nrow(.))) + + sub <- bm %>% + subset_by_frequency(t = .1, other_vars = stage, by = sex) + + expect_equal(setdiff(names(bm), names(sub)), c("gene0")) + + bm1 <- bind_rows( + "gen50" = c(rep(0, 5), rep(1, 5)), + "gene20" = c(rep(0, 8), rep(1, 2)), + "gene0" = c(rep(0, 10), rep(1, 0)), + "sex" = c(rep("F", 4), rep("M", 4), "F", "M"), + "stage" = rep(c("I", "II"), 5)) %>% + mutate(sample_id = as.character(1:nrow(.))) + + sub1 <- bm1 %>% + subset_by_frequency(t = .25, other_vars = stage, by = sex) + + sub2 <- bm1 %>% + subset_by_frequency(t = .85, other_vars = stage, by = sex) + + expect_equal(setdiff(names(bm1), names(sub1)), c("gene20", "gene0")) + + expect_equal(setdiff(names(bm1), names(sub2)), c("gen50", "gene20", "gene0")) +}) + + +test_that("Check categorical `by` variable works", { + + bm <- bind_rows( + "gen50" = c(rep(0, 5), rep(1, 5)), + "gene20" = c(rep(0, 8), rep(1, 2)), + "gene10" = c(rep(0, 9), rep(1, 1)), + "gene0" = c(rep(0, 10), rep(1, 0)), + "sex" = c(rep("F", 4), rep("M", 4), "F", "M"), + 
"stage" = rep(c("I", "II"), 5), + "grade" = c(rep(1:4, 2), 1, 2)) %>% + mutate(sample_id = as.character(1:nrow(.))) + + sub <- subset_by_frequency(bm, other_vars = c("sex", "stage"), by = grade) + + sub1 <- subset_by_frequency(bm, t = 0.35, other_vars = c("sex", "stage"), by = grade) + + expect_equal(setdiff(names(bm), names(sub)), c("gene0")) + + expect_equal(setdiff(names(bm), names(sub1)), c("gene20", "gene10", "gene0")) +}) + From 407da1260496c7bdd00c6b183e50671a71b24e6c Mon Sep 17 00:00:00 2001 From: toumban1 Date: Wed, 14 Aug 2024 13:34:29 -0400 Subject: [PATCH 03/12] Update to include "by" variable from Hackathon Spring 2024 --- R/subset-by-frequency.R | 10 +++++----- tests/testthat/Rplots.pdf | Bin 4490 -> 4490 bytes 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/subset-by-frequency.R b/R/subset-by-frequency.R index 797b364f..569635e4 100644 --- a/R/subset-by-frequency.R +++ b/R/subset-by-frequency.R @@ -97,8 +97,8 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL, by = NUL alt_data <- alt_only |> - group_by(by) |> - summarise(across(-c(by), sum = ~ sum(.), total = ~ sum(!is.na(.))), na.rm = T) + group_by(across(any_of(by))) |> + summarise_all(list(sum = ~ sum(.), total = ~ sum(!is.na(.))), na.rm = T) alt_group_data <- alt_data |> @@ -107,12 +107,12 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL, by = NUL separate(gene, into = c("gene", "measure"), sep = "_") |> pivot_wider(names_from = measure, values_from = value) |> - mutate(prop = sum/total) |> - arrange(desc(prop)) + mutate(propo = sum/total) |> + arrange(desc(propo)) alts_over_thresh_grp <- alt_group_data |> - filter(prop > t) |> + filter(propo > t) |> group_by(gene) |> select(gene) |> unique() |> diff --git a/tests/testthat/Rplots.pdf b/tests/testthat/Rplots.pdf index ed5bcafa9bd02151eefe0176d0cf1ba51e0ae688..f12961476aa3e0da1376c7b62dffbe3bf7c03638 100644 GIT binary patch delta 29 ecmeBD?oys$!ERw_W@ut$G0|QW#@raKF9-l})CWEQ delta 29 
ecmeBD?oys$!ER<~WMFP!IniDe#@raKF9-l}H3u{R From ec26164d4ba31dff87b0a58e779d02d381eeda41 Mon Sep 17 00:00:00 2001 From: toumban1 Date: Mon, 26 Aug 2024 13:52:20 -0400 Subject: [PATCH 04/12] Updated documentation so commit is hopefully successful --- DESCRIPTION | 2 +- man/subset_by_frequency.Rd | 4 +++- tests/testthat/Rplots.pdf | Bin 4490 -> 4490 bytes 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e913897b..fde45585 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,7 +30,7 @@ LazyData: true URL: https://github.com/MSKCC-Epi-Bio/gnomeR, https://mskcc-epi-bio.github.io/gnomeR/ BugReports: https://github.com/MSKCC-Epi-Bio/gnomeR/issues -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 Depends: R (>= 3.6) biocViews: ComplexHeatmap, diff --git a/man/subset_by_frequency.Rd b/man/subset_by_frequency.Rd index ddf692f2..2249ba81 100644 --- a/man/subset_by_frequency.Rd +++ b/man/subset_by_frequency.Rd @@ -4,7 +4,7 @@ \alias{subset_by_frequency} \title{Subset a Binary Matrix By Alteration Frequency Threshold} \usage{ -subset_by_frequency(gene_binary, t = 0.1, other_vars = NULL) +subset_by_frequency(gene_binary, t = 0.1, other_vars = NULL, by = NULL) } \arguments{ \item{gene_binary}{A data frame with a row for each sample and column for each @@ -15,6 +15,8 @@ with values of 0, 1 or NA.} \item{other_vars}{One or more column names (quoted or unquoted) in data to be retained in resulting data frame. Default is NULL.} + +\item{by}{Variable used to subset the data. Default is NULL.} } \value{ a data frame with a \code{sample_id} column and columns for diff --git a/tests/testthat/Rplots.pdf b/tests/testthat/Rplots.pdf index f12961476aa3e0da1376c7b62dffbe3bf7c03638..7c5c2e20b56749c72a1fa611b8ff772341496d23 100644 GIT binary patch delta 48 wcmeBD?oys$rfOkiW@v0`XkettrSF@c;*waBs-WRwWn^GvVgQ%h7_Bb|04-7r=l}o! 
delta 48 wcmeBD?oys$rfO Date: Thu, 3 Oct 2024 12:43:58 -0400 Subject: [PATCH 05/12] small updates to subset_by_frequency() --- R/subset-by-frequency.R | 49 +++++++++++++++------------- R/utils-gene-binary.R | 6 ++-- man/dot-check_for_fus_in_mut.Rd | 3 ++ man/dot-infer_mutation_status.Rd | 2 +- man/dot-infer_variant_type.Rd | 2 +- tests/testthat/test-subset_by_freq.R | 36 ++++++++++++++++++++ 6 files changed, 71 insertions(+), 27 deletions(-) diff --git a/R/subset-by-frequency.R b/R/subset-by-frequency.R index 569635e4..3adfdbbd 100644 --- a/R/subset-by-frequency.R +++ b/R/subset-by-frequency.R @@ -28,7 +28,7 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL, by = NUL # Checks ------------------------------------------------------------------ - # check threshold argument + # check threshold `t` argument if(!(is.numeric(t) & (t >= 0 & t <= 1))) { cli::cli_abort("{.field t} must be a number between 0 and 1") } @@ -52,9 +52,14 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL, by = NUL by <- .select_to_varnames({{ by }}, data = gene_binary, - arg_name = "by" + arg_name = "by", select_single = TRUE ) + # Check if 'by' is in 'other_vars' (only if both are non-NULL) + if (!is.null(by) && !is.null(other_vars) && by %in% other_vars) { + cli::cli_abort("{.code other_vars} cannot overlap with {.code by}.") + } + # data frame of only alterations alt_only <- select(gene_binary, -"sample_id", -any_of(other_vars)) @@ -81,43 +86,43 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL, by = NUL if(is.null(by)){ - counts <- apply(alt_only, 2, function(x) {sum(x, na.rm = TRUE)}) - num_non_na <- apply(alt_only, 2, function(x) sum(!is.na(x))) + counts <- apply(alt_only, 2, function(x) {sum(x, na.rm = TRUE)}) + num_non_na <- apply(alt_only, 2, function(x) sum(!is.na(x))) - alt_freq <- counts/num_non_na - alts_over_thresh <- names(sort(alt_freq[alt_freq >= t], decreasing = TRUE)) + alt_freq <- counts/num_non_na + 
alts_over_thresh <- names(sort(alt_freq[alt_freq >= t], decreasing = TRUE)) - subset_binary <- select(gene_binary, "sample_id", - any_of(other_vars), - all_of(alts_over_thresh)) + subset_binary <- select(gene_binary, "sample_id", + any_of(other_vars), + all_of(alts_over_thresh)) return(subset_binary) + } else{ alt_data <- alt_only |> - group_by(across(any_of(by))) |> - summarise_all(list(sum = ~ sum(.), total = ~ sum(!is.na(.))), na.rm = T) + group_by(across(all_of(by))) |> + summarise(across(everything(), + list(sum = ~ sum(., na.rm = TRUE), + total = ~ sum(!is.na(.), na.rm = T)))) alt_group_data <- alt_data |> pivot_longer(-any_of(by), names_to = c("gene")) |> - separate(gene, into = c("gene", "measure"), sep = "_") |> - pivot_wider(names_from = measure, - values_from = value) |> - mutate(propo = sum/total) |> - arrange(desc(propo)) + separate("gene", into = c("gene", "measure"), sep = "_") |> + pivot_wider(names_from = "measure", + values_from = "value") |> + mutate(propo = .data$sum/.data$total) |> + arrange(desc(.data$propo)) alts_over_thresh_grp <- alt_group_data |> - filter(propo > t) |> - group_by(gene) |> - select(gene) |> - unique() |> - unlist() |> - as.vector() + filter(.data$propo > t) |> + pull("gene") |> + unique() subset_binary <- select(gene_binary, "sample_id", any_of(by), diff --git a/R/utils-gene-binary.R b/R/utils-gene-binary.R index ee90b848..0cf4080e 100644 --- a/R/utils-gene-binary.R +++ b/R/utils-gene-binary.R @@ -34,7 +34,7 @@ #' Check for fusions in maf file #' -#' @param mutation +#' @param mutation a mutation maf data frame #' @return a data frame if no fusions found #' @keywords internal .check_for_fus_in_mut <- function(mutation) { @@ -56,7 +56,7 @@ #' Infer mutation status and assume somatic if none #' -#' @param mutation a mutation data frame +#' @param mutation a mutation maf data frame #' @return a mutations data frame with a mutation status column #' @keywords internal .infer_mutation_status <- function(mutation) { @@ -76,7 +76,7 
@@ #' #' Infers variant_type from reference_allele or tumor_seq_allele data #' -#' @param mutation data frame +#' @param mutation mutation maf file data frame #' @return a mutation data frame with a variant type column #' @keywords internal .infer_variant_type <- function(mutation, names_mut_dict = names_mut_dict) { diff --git a/man/dot-check_for_fus_in_mut.Rd b/man/dot-check_for_fus_in_mut.Rd index 9a9b9ec9..5d477ae4 100644 --- a/man/dot-check_for_fus_in_mut.Rd +++ b/man/dot-check_for_fus_in_mut.Rd @@ -6,6 +6,9 @@ \usage{ .check_for_fus_in_mut(mutation) } +\arguments{ +\item{mutation}{a mutation maf data frame} +} \value{ a data frame if no fusions found } diff --git a/man/dot-infer_mutation_status.Rd b/man/dot-infer_mutation_status.Rd index 1fff6fdf..57f0649e 100644 --- a/man/dot-infer_mutation_status.Rd +++ b/man/dot-infer_mutation_status.Rd @@ -7,7 +7,7 @@ .infer_mutation_status(mutation) } \arguments{ -\item{mutation}{a mutation data frame} +\item{mutation}{a mutation maf data frame} } \value{ a mutations data frame with a mutation status column diff --git a/man/dot-infer_variant_type.Rd b/man/dot-infer_variant_type.Rd index 5f96298b..48ccb566 100644 --- a/man/dot-infer_variant_type.Rd +++ b/man/dot-infer_variant_type.Rd @@ -7,7 +7,7 @@ .infer_variant_type(mutation, names_mut_dict = names_mut_dict) } \arguments{ -\item{mutation}{data frame} +\item{mutation}{mutation maf file data frame} } \value{ a mutation data frame with a variant type column diff --git a/tests/testthat/test-subset_by_freq.R b/tests/testthat/test-subset_by_freq.R index 1bc2aa8b..1b8f9622 100644 --- a/tests/testthat/test-subset_by_freq.R +++ b/tests/testthat/test-subset_by_freq.R @@ -248,3 +248,39 @@ test_that("Check categorical `by` variable works", { expect_equal(setdiff(names(bm), names(sub1)), c("gene20", "gene10", "gene0")) }) +test_that("Check only one `by` variable passed", { + + bm <- bind_rows( + "gen50" = c(rep(0, 5), rep(1, 5)), + "gene20" = c(rep(0, 8), rep(1, 2)), + "gene0" = 
c(rep(0, 10), rep(1, 0)), + "sex" = rep(c("F", "M"), 5), + "stage" = rep(c("I", "II"), 5)) %>% + mutate(sample_id = as.character(1:nrow(.))) + + expect_error(bm |> + subset_by_frequency(t = .1, + other_vars = NULL, + by = c(sex, stage)), + "Error in `by=` argument--select only") + +}) + +test_that("Check that `other_vars` and `by` don't overlap", { + + bm <- bind_rows( + "gen50" = c(rep(0, 5), rep(1, 5)), + "gene20" = c(rep(0, 8), rep(1, 2)), + "gene0" = c(rep(0, 10), rep(1, 0)), + "sex" = rep(c("F", "M"), 5), + "stage" = rep(c("I", "II"), 5)) %>% + mutate(sample_id = as.character(1:nrow(.))) + + expect_error(subset_by_frequency(bm, t = .1, + other_vars = c(sex, stage), + by = sex), + "`other_vars` cannot") + +}) + + From 5c339f15716bd32b9019b92c0530250fecc0adba Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Thu, 3 Oct 2024 12:45:48 -0400 Subject: [PATCH 06/12] remove PDF test plot for now --- tests/testthat/Rplots.pdf | Bin 4490 -> 4653 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/tests/testthat/Rplots.pdf b/tests/testthat/Rplots.pdf index 7c5c2e20b56749c72a1fa611b8ff772341496d23..5a79a06868b750e2424511698de66c593eef5f88 100644 GIT binary patch delta 245 zcmYL>J5B>J6h(#7xrx|;B@z@Qls9%}2p|fI30=ejs9x+hgN4Tie-IsgDiG_SK04lS|(>KgA`nBvnDf#mdOQ$OIubFW&=}`$@c|axC~8=xKveL{oS|#K?f8D From a53944cfa85c8fb1eae5c65f75d459f9007a007b Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Thu, 3 Oct 2024 14:37:30 -0400 Subject: [PATCH 07/12] update vignette to be in line with latest genie updates --- vignettes/genie-bpc-vignette.Rmd | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vignettes/genie-bpc-vignette.Rmd b/vignettes/genie-bpc-vignette.Rmd index 98afdd63..548d72c2 100644 --- a/vignettes/genie-bpc-vignette.Rmd +++ b/vignettes/genie-bpc-vignette.Rmd @@ -16,6 +16,9 @@ knitr::opts_chunk$set( set.seed(20230302) # exit if user doesn't have synapser, a log in, or access to data. 
+ +genieBPC::set_synapse_credentials() + if (genieBPC:::.is_connected_to_genie() == FALSE){ knitr::knit_exit() } From 30336c3149643c27ce9495211501c2cb62dbff4b Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Thu, 3 Oct 2024 15:47:42 -0400 Subject: [PATCH 08/12] temporarily move gtsummary to remotes until glue issue gets fixed for gtsummary on CRAN --- DESCRIPTION | 5 +++-- vignettes/data-processing-vignette.Rmd | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fde45585..6886671a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,7 +46,6 @@ Imports: tidyr (>= 1.3.0), cli, GGally, - gtsummary, broom.helpers, janitor, withr, @@ -61,7 +60,9 @@ Suggests: cbioportalR, genieBPC, vdiffr -Remotes: karissawhiting/cbioportalR +Remotes: + karissawhiting/cbioportalR, + ddsjoberg/gtsummary VignetteBuilder: knitr Language: en-US Roxygen: list(markdown = TRUE) diff --git a/vignettes/data-processing-vignette.Rmd b/vignettes/data-processing-vignette.Rmd index d0dfeb37..a686e0fc 100644 --- a/vignettes/data-processing-vignette.Rmd +++ b/vignettes/data-processing-vignette.Rmd @@ -231,7 +231,8 @@ gene_binary <- create_gene_binary( specify_panel = "no" ) -tbl1 <- gene_binary %>% subset_by_frequency(t = .15) %>% +tbl1 <- gene_binary %>% + subset_by_frequency(t = .15) %>% tbl_genomic() ``` From a8f8e7b557f3638586014e05a3e9eb9dd0a23deb Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Thu, 3 Oct 2024 16:19:51 -0400 Subject: [PATCH 09/12] fix dependency issues with gtsummary --- DESCRIPTION | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6886671a..74ec3ca3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,20 +50,21 @@ Imports: janitor, withr, scales, - lifecycle + lifecycle, + gtsummary (>= 2.0.2.9009) Suggests: knitr, rmarkdown, testthat (>= 3.0.0), spelling, - covr, - cbioportalR, + covr, + cbioportalR (>= 1.1.0), genieBPC, vdiffr -Remotes: - karissawhiting/cbioportalR, - 
ddsjoberg/gtsummary VignetteBuilder: knitr Language: en-US Roxygen: list(markdown = TRUE) Config/testthat/edition: 3 +Remotes: + ddsjoberg/gtsummary, + karissawhiting/cbioportalR From 38d624984fbc0bc2bed656f223428776a2d04655 Mon Sep 17 00:00:00 2001 From: lins5 Date: Wed, 9 Oct 2024 11:55:59 -0400 Subject: [PATCH 10/12] deleting genieBPC and IMPACT QA vignettes moving the genieBPC and IMPACT QA vignettes from gnomeR and moving it over to gnomeverse --- vignettes/genie-bpc-vignette.Rmd | 298 ------------------------------- vignettes/qa-impact-data.Rmd | 206 --------------------- 2 files changed, 504 deletions(-) delete mode 100644 vignettes/genie-bpc-vignette.Rmd delete mode 100644 vignettes/qa-impact-data.Rmd diff --git a/vignettes/genie-bpc-vignette.Rmd b/vignettes/genie-bpc-vignette.Rmd deleted file mode 100644 index 548d72c2..00000000 --- a/vignettes/genie-bpc-vignette.Rmd +++ /dev/null @@ -1,298 +0,0 @@ ---- -title: "Analyzing GENIE BPC Data Using {gnomeR}" -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Analyzing GENIE BPC Data Using {gnomeR}} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) - -set.seed(20230302) - -# exit if user doesn't have synapser, a log in, or access to data. - -genieBPC::set_synapse_credentials() - -if (genieBPC:::.is_connected_to_genie() == FALSE){ - knitr::knit_exit() -} -``` - -```{r setup, include = FALSE, message = FALSE, results = FALSE, eval=genieBPC:::.is_connected_to_genie()} -library(gnomeR) -library(dplyr) -library(genieBPC) -library(cbioportalR) -``` - -## Introduction - -This vignette will walk through how to apply {gnomeR} functions to data from AACR Project Genomics Evidence Neoplasia Information Exchange BioPharma Collaborative (GENIE BPC). 
A broad overview of AACR Project GENIE BPC can be found [here](https://www.aacr.org/about-the-aacr/newsroom/news-releases/aacr-project-genie-begins-five-year-collaborative-research-project-with-36-million-in-new-funding/), with details on the clinical data structure available on the [{genieBPC} package website](https://genie-bpc.github.io/genieBPC/articles/clinical_data_structure_vignette.html). - -For the purposes of this vignette, we will use the first publicly available GENIE BPC data release of non-small cell lung cancer patients, NSCLC v2.0-public. - -Note that the GENIE BPC genomic data are unique in a few particular ways: - -* These data are heterogeneous such that they come from multiple genomic panels from multiple institutions. Gene aliases across panels may vary. -* The GENIE BPC fusions and copy number alterations data may differ from common data formats. Differences are described in the Data Formats subsection below. - -## Data Access - -To gain access to the GENIE BPC data, please follow the instructions on the [{genieBPC} `pull_data_synapse()` vignette](https://genie-bpc.github.io/genieBPC/articles/pull_data_synapse_vignette.html) to register for a Synapse account. 
Once your Synapse account is created and you authenticate yourself using `genieBPC::set_synapse_credentials()`, you'll be ready to pull the GENIE BPC clinical and genomic data from Synapse into your local environment: - -```{r, eval=FALSE} -library(genieBPC) - -# if credentials are not stored in your R environment -set_synapse_credentials(username = "username", password = "password") -``` - -```{r, eval=genieBPC:::.is_connected_to_genie()} - -# if credentials are stored in your R environment -set_synapse_credentials() -``` - -## Obtain GENIE BPC Data - -```{r, eval=genieBPC:::.is_connected_to_genie()} -# pull NSCLC v2.0-public data from Synapse into the R environment -nsclc_2_0 = pull_data_synapse(cohort = "NSCLC", - version = "v2.0-public") -``` - -The resulting `nsclc_2_0` object is a nested list of datasets, including the mutations, fusions, and copy number data. - -- `nsclc_2_0$NSCLC_v2.0$mutations_extended` -- `nsclc_2_0$NSCLC_v2.0$fusions` -- `nsclc_2_0$NSCLC_v2.0$cna` - -Note that while the GENIE BPC clinical data are only available via Synapse, the genomic data can be accessed via both Synapse and cBioPortal. Using the {cbioportalR} package, users can pull the GENIE BPC genomic data directly from cBioPortal: - -```{r, eval=genieBPC:::.is_connected_to_genie()} -library(cbioportalR) - -# connect to the GENIE instance of cBioPortal -cbioportalR::set_cbioportal_db("https://genie.cbioportal.org/api") - -# view list of available studies from this instance of the portal -# NSCLC v2.0-public is: nsclc_public_genie_bpc -available_studies() -``` - -```{r, eval=genieBPC:::.is_connected_to_genie()} -# obtain genomic data for GENIE BPC NSCLC v2.0-public -mutations_extended_2_0 <- get_mutations_by_study("nsclc_public_genie_bpc") -cna_2_0 <- get_cna_by_study("nsclc_public_genie_bpc") -fusions_2_0 <- get_fusions_by_study("nsclc_public_genie_bpc") -``` - -## Data Formats - -The genomic data for GENIE BPC are stored both on Synapse and in cBioPortal. 
The data structure differs depending on where the genomic data are downloaded from. Therefore, the remainder of this vignette will proceed by outlining the process of annotating genomic data separately for genomic data downloaded from Synapse and genomic data downloaded from cBioPortal. - -### Differences Between Synapse and cBioPortal Genomic Data - -Please note that pulling genomic GENIE data from Synapse using `pull_data_synapse()` and pulling GENIE data from cBioPortal may result in small differences in the data due to systematic differences in the processing pipelines employed by Synapse and cBioPortal. These differences may include: - -* Data formatting - Some data sets (e.g. CNA files) may appear in wide format in Synapse data versus long format in cBioPortal data, or column attributes and names may appear sightly different (e.g. fusions files). - -* Default filtering - By default, cBioPortal filters out Silent, Intron, IGR, 3'UTR, 5'UTR, 3'Flank and 5'Flank, except for the promoter mutations of the TERT gene. See [cBioPortal documentation](https://docs.cbioportal.org/file-formats/#mutation-data) for more details. These are retained in Synapse processing pipelines. - -* Hugo Symbols - Some genes have more than one accepted Hugo Symbol and may be referred to differently between data sources (e.g. `NSD3` is an alias for `WHSC1L1`). Some tools exist to help you resolve gene aliases across genomic data sources. See `gnomeR::recode_alias()`, `cbioportal::get_alias()` and vignettes from the [{gnomeR}](https://mskcc-epi-bio.github.io/gnomeR/) and [{cbioportalR}](https://www.karissawhiting.com/cbioportalR/) for more information on how to use these functions and work with gene aliases. - -## Selecting a Cohort for Analysis - -The following code chunk uses the `genieBPC::create_analytic_cohort()` to create an analytic cohort of patients diagnosed with stage IV NSCLC of adenocarcinoma histology. 
Then, for patients with multiple genomic samples, the `genieBPC::select_unique_ngs()` function chooses the genomic sample with OncoTree code LUAD (if available). For patients with multiple samples with OncoTree code LUAD, we will select the metastatic genomic sample. If any patients have multiple metastatic samples with OncoTree code LUAD, take the latest of the samples. - -Note: for patients with exactly one genomic sample, that unique genomic sample will be returned *regardless of whether it meets the argument criteria specified below*. - -```{r, eval=genieBPC:::.is_connected_to_genie()} -# create analytic cohort of patients diagnosed with Stage IV adenocarcinoma -nsclc_2_0_example <- create_analytic_cohort( - data_synapse = nsclc_2_0$NSCLC_v2.0, - stage_dx = c("Stage IV"), - histology = "Adenocarcinoma" -) - -# select unique NGS samples for this analytic cohort -nsclc_2_0_samples <- select_unique_ngs( - data_cohort = nsclc_2_0_example$cohort_ngs, - oncotree_code = "LUAD", - sample_type = "Metastasis", - min_max_time = "max" -) -``` - -Create a dataframe of the corresponding panel and sample IDs: - -```{r, eval=genieBPC:::.is_connected_to_genie()} -# specify sample panels and IDs -nsclc_2_0_sample_panels <- nsclc_2_0_samples %>% - select(cpt_seq_assay_id, cpt_genie_sample_id) %>% - rename(panel_id = cpt_seq_assay_id, - sample_id = cpt_genie_sample_id) %>% - filter(!is.na(panel_id)) -``` - -## Process Data with `create_gene_binary()` - -The `create_gene_binary()` function takes inputs of mutations, fusions, and CNA data and returns a binary matrix with the alteration status for each gene, annotating missingness when genes were not included on a next generation sequencing panel. - -It is critical to utilize the `specify_panel` argument of `create_gene_binary()`. Samples included in GENIE BPC were sequenced across multiple sequencing platforms, with the genes included varying across panels. 
Without the `specify_panel` argument, missingness will not be correctly annotated, and genes that were not tested will be incorrectly documented as not being altered. - -Note: you can optionally check and recode any older gene names to their newer Hugo Symbol in your data set by passing the `genie` option to `create_gene_binary(recode_aliases=)`. - -**Using the genomic data from Synapse:** - -The fusions and CNA data as downloaded from Synapse require some modifications prior to being supplied to the `gnomeR::create_gene_binary()` function. - -First, the CNA file can be transposed to match the expected input for `create_gene_binary()` using `pivot_cna_longer()`: - -```{r, eval=genieBPC:::.is_connected_to_genie()} -# transpose CNA data from Synapse -cna_synapse_long <- pivot_cna_longer(nsclc_2_0$NSCLC_v2.0$cna) -``` - -Next, the fusions file can be transposed to match the expected input for `create_gene_binary()` - -```{r, eval=genieBPC:::.is_connected_to_genie()} -# transpose fusions data from Synapse -fusions_synapse_long <- reformat_fusion(nsclc_2_0$NSCLC_v2.0$fusions) -``` - -Finally, the reformatted genomic data can be supplied to `create_gene_binary()` to annotate genomic alterations for patients in the analytic cohort of interest. - -The CNA data as downloaded from cBioPortal only includes high level CNA (-2, 2), so we will specify `high_level_cna_only = TRUE` to be consistent with the results based on the genomic data as downloaded from cBioPortal. - -Additionally, we will use the built in 'genie` option to check gene aliases (see `?create_gene_binary` for more info). 
- -```{r, eval=genieBPC:::.is_connected_to_genie()} -nsclc_2_0_gen_dat_synapse <- - create_gene_binary( - mutation = nsclc_2_0$NSCLC_v2.0$mutations_extended, - cna = cna_synapse_long, - high_level_cna_only = TRUE, - fusion = fusions_synapse_long, - samples = nsclc_2_0_sample_panels$sample_id, - specify_panel = nsclc_2_0_sample_panels, - recode_aliases = "genie" - ) -``` - -**Using the genomic data from cBioPortal:** - -```{r, eval=genieBPC:::.is_connected_to_genie()} -nsclc_2_0_gen_dat_cbio <- - create_gene_binary( - mutation = mutations_extended_2_0, - cna = cna_2_0, - fusion = fusions_2_0, - samples = nsclc_2_0_sample_panels$sample_id, - specify_panel = nsclc_2_0_sample_panels, - recode_aliases = "genie" - ) -``` - -Binary genomic matrices created using the genomic data downloaded from Synapse and cBioPortal should be equal. We will proceed using the `nsclc_2_0_gen_dat_cbio` object. - -## Collapse Data with `summarize_by_gene()` - -We can summarize the presence of any alteration event (mutation, amplification, deletion, structural variant) with the `summarize_by_gene()` function, such that each gene is a column that captures the presence of any event regardless of alteration type. - -Summarizing the first 10 samples for KRAS alterations: - -**Using the genomic data from Synapse:** - -```{r, eval=genieBPC:::.is_connected_to_genie()} -nsclc_2_0_gen_dat_synapse[1:10, ] %>% - select(sample_id, KRAS, KRAS.Amp) %>% - summarize_by_gene() -``` - -## Analyzing Data - -After the data have been transformed into a binary format, we can create summaries and visualizations to better understand the data. - -### Summarize Data with `tbl_genomic()` - -The `tbl_genomic()` function summarizes the frequency of alteration events from the binary data returned from `create_gene_binary()` or `summarize_by_gene()`. 
- -**Using the genomic data from Synapse:** - -Summarizing the frequencies of KEAP1, STK11, and SMARCA4 alteration events: - -```{r, eval=genieBPC:::.is_connected_to_genie()} - -nsclc_2_0_gen_dat_synapse %>% - select(sample_id, KEAP1, STK11, SMARCA4) %>% - tbl_genomic() -``` - -Users can subset their data set to only include genes above a certain prevalence frequency threshold before passing to the function using the `subset_by_frequency()` function. - -Below, we summarize alteration events with >=10% frequency: - -```{r, eval=genieBPC:::.is_connected_to_genie()} - -nsclc_2_0_gen_dat_synapse %>% - subset_by_frequency(t = 0.1) %>% - tbl_genomic() -``` - -**Using the genomic data from cBioPortal:** - -Summarizing the frequencies of KEAP1, STK11, and SMARCA4 alteration events: - -```{r, eval=genieBPC:::.is_connected_to_genie()} - -nsclc_2_0_gen_dat_cbio %>% - select(sample_id, KEAP1, STK11, SMARCA4) %>% - tbl_genomic() -``` - -Summarizing alteration events with >=10% frequency: - -```{r, eval=genieBPC:::.is_connected_to_genie()} - -nsclc_2_0_gen_dat_cbio %>% - subset_by_frequency(t = 0.1) %>% - tbl_genomic() -``` - -### Data Visualizations - -We can use the `mutation_viz()` function to visualize several aspects of the mutation data, including variant classification, variant type, SNV class and top variant genes. - -For the purposes of this vignette we will visualize the genomic data from cBioPortal. - -**Using the genomic data from cBioPortal:** - -```{r, eval=genieBPC:::.is_connected_to_genie()} -mutation_viz_gen_dat_cbio <- mutation_viz(mutations_extended_2_0) - -mutation_viz_gen_dat_cbio -``` - -# References - -Additional details regarding the GENIE BPC data and the {genieBPC} R package are published in the following papers: - -* Lavery, J. A., Brown, S., Curry, M. A., Martin, A., Sjoberg, D. D., & Whiting, K. (2023). 
[A data processing pipeline for the AACR project GENIE biopharma collaborative data with the {genieBPC} R package](https://pubmed.ncbi.nlm.nih.gov/36519837/). Bioinformatics (Oxford, England), 39(1), btac796. https://doi.org/10.1093/bioinformatics/btac796 - -* Lavery, J. A., Lepisto, E. M., Brown, S., Rizvi, H., McCarthy, C., LeNoue-Newton, M., Yu, C., Lee, J., Guo, X., Yu, T., Rudolph, J., Sweeney, S., AACR Project GENIE Consortium, Park, B. H., Warner, J. L., Bedard, P. L., Riely, G., Schrag, D., & Panageas, K. S. (2022). [A Scalable Quality Assurance Process for Curating Oncology Electronic Health Records: The Project GENIE Biopharma Collaborative Approach](https://pubmed.ncbi.nlm.nih.gov/35192403/). JCO clinical cancer informatics, 6, e2100105. https://doi.org/10.1200/CCI.21.00105 - -Technical details regarding proper analysis of this data can be found in the following publication: - -* Brown, S., Lavery, J. A., Shen, R., Martin, A. S., Kehl, K. L., Sweeney, S. M., Lepisto, E. M., Rizvi, H., McCarthy, C. G., Schultz, N., Warner, J. L., Park, B. H., Bedard, P. L., Riely, G. J., Schrag, D., Panageas, K. S., & AACR Project GENIE Consortium (2022). [Implications of Selection Bias Due to Delayed Study Entry in Clinical Genomic Studies](https://pubmed.ncbi.nlm.nih.gov/34734967/). JAMA oncology, 8(2), 287–291. https://doi.org/10.1001/jamaoncol.2021.5153 - -* Kehl, K. L., Uno, H., Gusev, A., Groha, S., Brown, S., Lavery, J. A., Schrag, D., & Panageas, K. S. (2023). [Elucidating Analytic Bias Due to Informative Cohort Entry in Cancer Clinico-genomic Datasets](https://pubmed.ncbi.nlm.nih.gov/36626408/). Cancer epidemiology, biomarkers & prevention: a publication of the American Association for Cancer Research, cosponsored by the American Society of Preventive Oncology, 32(3), 344–352. https://doi.org/10.1158/1055-9965.EPI-22-0875 - -* Kehl, K. L., Riely, G. J., Lepisto, E. M., Lavery, J. A., Warner, J. L., LeNoue-Newton, M. L., Sweeney, S. M., Rudolph, J. 
E., Brown, S., Yu, C., Bedard, P. L., Schrag, D., Panageas, K. S., & American Association of Cancer Research (AACR) Project Genomics Evidence Neoplasia Information Exchange (GENIE) Consortium (2021). [Correlation Between Surrogate End Points and Overall Survival in a Multi-institutional Clinicogenomic Cohort of Patients With Non-Small Cell Lung or Colorectal Cancer](https://pubmed.ncbi.nlm.nih.gov/34309669/). JAMA network open, 4(7), e2117547. https://doi.org/10.1001/jamanetworkopen.2021.17547 - diff --git a/vignettes/qa-impact-data.Rmd b/vignettes/qa-impact-data.Rmd deleted file mode 100644 index 0904737a..00000000 --- a/vignettes/qa-impact-data.Rmd +++ /dev/null @@ -1,206 +0,0 @@ ---- -title: "How to QA Your IMPACT Data" -output: rmarkdown::html_vignette -author: Esther Drill -vignette: > - %\VignetteIndexEntry{How to QA Your IMPACT Data} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) - -``` - -```{r setup, include=FALSE, echo=FALSE,warning=FALSE} - -library(dplyr) -library(cbioportalR) -library(tidyr) -library(gnomeR) - - -data("clin_collab_df") -``` - -## Introduction - -The purpose of this vignette is to outline best practices for downloading, QA-ing and analyzing data generated from MSK IMPACT, a targeted tumor-sequencing test that can detect more than 468 gene mutations and other critical genetic changes in common and rare cancers. Using a hepatocellular cancer case study, we demonstrate a data analysis pipeline using {cbioportalR} functions that can help users generate reproducible analyses using this data. - -## Setting up - -For this vignette, we will be using {cbioportalR}, a package to download data from the cBioPortal website. 
We will also be using {dplyr} and {tidyr} to clean and manipulate the data: - -```{r message = FALSE, warning=FALSE} -library(gnomeR) -library(cbioportalR) -library(dplyr) -library(tidyr) -``` - -To access cBioPortal data using the [{cbioportalR}](https://www.karissawhiting.com/cbioportalR/) package, you must set the cBioPortal database using the `set_cbioportal_db()` function. To access public data, set this to `db = public`. If you are using a private version of cBioPortal, you would set the `db` argument to your institution's cBioPortal URL. - -```{r set_df} - -set_cbioportal_db(db = "public") - -``` - -## Case Study - -**Scenario:** You are a data analyst whose collaborator has sent you a clinical file of a cohort of patients with hepatocellular cancer that she is interested in for retrospective data analysis. In particular, she wants to look at IMPACT sequencing data for the cohort and investigate associations between genomic alterations and pathological and clinical characteristics. She asks if you can get the IMPACT data and do the analysis. - -She gives you a clinical file with 80 sample IDs: `clin_collab_df`. - -```{r head_clin} -head(clin_collab_df) -``` - -The sample IDs are in the `cbioportal_id` column. - -Before using cBioPortal to access the genomic data, you first want to do some QA on the clinical data and make sure it matches up with the clinical data in cBioPortal. - -### Check For Multiple Samples Per Patient - -One of the first things to check in your data is whether you have multiple sample IDs from the same patient. Sometimes a clinical file will have a patient_ID column as well; this one doesn't, so you can make your own. The patient ID is just the first 9 digits of the `cbioportal_id`: - -```{r patient_id} - -clin_collab_df <- clin_collab_df %>% - mutate( - patient_id = substr(cbioportal_id, 1, 9) - ) -``` - -If there is only one sample per patient, there should be the same number of samples as patients. 
- -```{r pt_samp_sum} -clin_collab_df %>% - summarize( patients = length(unique(patient_id)), - samples= length(unique(cbioportal_id))) -``` - -So it's clear that we have multiple samples per patient. To find out which patient/s, you can count the `patient_id` values and filter for \>1. - -```{r } -multiple_samps <- clin_collab_df %>% - count(patient_id) %>% - filter(n > 1) -multiple_samps -``` - -There are 2 patients who each have 2 samples in the collaborator's dataset. Filter the dataset to see the `cbioportal_id`'s in question: - -```{r count_patients } - -clin_collab_df %>% - filter( - patient_id %in% - (multiple_samps$patient_id)) - -``` - -These are patients and samples to ask your collaborator about: Does using both samples make sense? Often times the answer is no. And if not, which sample is the most appropriate one to include? (To get more info for yourself, you can enter the patient ids into the [cBioPortal website](https://www.cbioportal.org/).) - -### Check That All `cbioportal_ids` Are In cBioPortal Database - -To do this, you need to retrieve the clinical data from cBioPortal using [{cbioportalR}](https://www.karissawhiting.com/cbioportalR/). You can use the `get_clinical_by_sample()` function from {cbioportalR} to do this. Set the `sample_id` parameter to the `cbioportal_ids` from the clinical collaborator's file.\ -Store the sample data in a file called `clin_cbio`. - -```{r get_cbio_clinical} - -clin_cbio = get_clinical_by_sample(sample_id = clin_collab_df$cbioportal_id) - -``` - -(You can disregard the warning message for now, though you may be interested in specific clinical attributes later.) - -*Note: If you are using the public version of cBioPortal, this function will only query the `msk_impact_2017` study.* - -Notice that you now have 2 clinical files: one given to you by the collaborator (`clin_collab_df`) and one you have retrieved yourself from cBioPortal (`clin_cbio`). 
- -Here's the header of `clin_cbio`: - -```{r head_clin_cbio} -head(clin_cbio) %>% as.data.frame() -``` - -The sample IDs here are in the `sampleId` column. You may notice that this file is in "long" format and each sample has multiple rows. Later we will convert this file to "wide" format to do QA checking on attributes. - -But the first thing you want to know is whether you are able to find all of the `cbioportal_ids` from your `clin_collab_df` file in the `clin_cbio` file.\ -To do this, use the `setdiff()` function: - -```{r check_missing} - -setdiff(clin_collab_df$cbioportal_id, clin_cbio$sampleId) - -``` - -So there are two sample ID's from your clinical file (`clin_collab_df`) that are currently not found in cBioPortal (in your `clin_cbio` file). Include these in the list of cBioPortal questions to ask your collaborator. - -(Again, if you want to investigate a bit further, you could enter the patient cBioPortal IDs as queries into the [cBioPortal website](https://www.cbioportal.org/).) - -### Check Clinical Data Matches cBioPortal Database - -Now we need to check whether clinical information in collaborator's file (`clin_collab_df`) matches clinical information in cBioPortal (in your `clin_cbio` file). - -Look at the `clin_collab_df` again: - -```{r head_clin_v2} -head(clin_collab_df) -``` - -Aside from `cbioportal_id`, you have cancer type (`ctype`) and sample type (`primary_mets`) variables. Because it's a hepatocellular cancer study, all of the `ctype` values will be the same. To double check that, count `ctype`: - -```{r count_ctype} -clin_collab_df %>% count(ctype) -``` - -So the only variable you can check in this example is the `primary_mets`. To see if the `clin_cbio` file has an analogous variable to check, first see the attributes that are available in it. - -```{r count_attribute} -clin_cbio %>% count(clinicalAttributeId) -``` - -To quickly see values associated with a particular attribute, filter by the attribute and count the values. 
For example: - -```{r filter_and_count} -clin_cbio %>% filter(clinicalAttributeId=="SAMPLE_TYPE") %>% count(value) -``` - -The attribute `SAMPLE_TYPE` looks like the appropriate variable to check `primary_mets` against. To do this, we will convert `clin_cbio` to "wide" form (only for the `SAMPLE_TYPE` variable for now), merge it with `clin_collab_df` and then cross-tabulate the 2 variables. - -To convert `clin_cbio` to "wide" form: - -```{r convert_samps_to_df} - -clin_cbio_wide = clin_cbio %>% - select( sampleId, clinicalAttributeId, value) %>% - filter( clinicalAttributeId == "SAMPLE_TYPE") %>% - pivot_wider(names_from = clinicalAttributeId, values_from = value) -``` - -Take a look at the "wide" file: - -```{r head_wide} -head(clin_cbio_wide) %>% as.data.frame() -``` - -Now to check the `primary_mets` variable from `clin_collab_df` against the `SAMPLE_TYPE` variable from `clin_cbio_wide`, merge the files and tabulate the variables. - -```{r compare} -clin_merged <- clin_cbio_wide %>% left_join(clin_collab_df, by = c("sampleId" = "cbioportal_id")) -clin_merged %>% select(primary_mets, SAMPLE_TYPE) %>% table() -``` - -There is 1 sample that has a value of "Metastasis" for the `primary_mets` variable but "Primary" for the `SAMPLE_TYPE` variable. To find the sample ID, filter: - -```{r find_discordant} -clin_merged %>% filter(primary_mets == "Metastasis" & SAMPLE_TYPE == "Primary") -``` - -Include this sample in the list of questions for your collaborator. Either she will need to update her clinical file with the correct value or you/she will have to notify cBioPortal to update their database. 
From 2df1c4cfce19e439b9ef52d9d78ba06186de12d7 Mon Sep 17 00:00:00 2001 From: lins5 Date: Thu, 10 Oct 2024 14:14:52 -0400 Subject: [PATCH 11/12] adding gnomeverse into readme adding in gnomeverse sentence along with gnomeverse intro and linking to website --- README.Rmd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.Rmd b/README.Rmd index dc5718b8..43ab137c 100644 --- a/README.Rmd +++ b/README.Rmd @@ -54,6 +54,8 @@ the `gnomeR` package provides a consistent framework for genetic data processing - **Visualize processed data** - Create summary plots from processed data. - **Analyzing processed data**- Analyze associations between genomic variables and clinical variables or outcomes. +{gnomeR} is now also apart of the [gnomeverse](https://mskcc-epi-bio.github.io/genomeverse/), which is a collection of R packages that are designed to work together seamlessly to create reproducible clinico-genomic analysis pipelines. + ## Getting Set up {gnomeR} works with any genomic data that follows cBioPortal guidelines for [mutation](https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#data-file-5), [CNA](https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#discrete-copy-number-data), or [fusion](https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#structural-variant-data) data file formats. 
From 8d99c5e77c468bf630a081bcc985344d52a91637 Mon Sep 17 00:00:00 2001 From: karissawhiting Date: Tue, 15 Oct 2024 10:52:30 -0400 Subject: [PATCH 12/12] small update to language and add Jess as author --- DESCRIPTION | 4 ++++ README.Rmd | 2 +- README.md | 31 ++++++++++++++++++++++--------- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 74ec3ca3..5910c085 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -7,6 +7,10 @@ Authors@R: role = c("aut", "cre"), email = "karissa.whiting@gmail.com", comment = c(ORCID = "0000-0002-4683-1868")), + person(given = "Jessica", + family = "Lavery", + role = "aut", + comment = c(ORCID = "0000-0002-2746-5647")), person(given = "Michael", family = "Curry", role = "aut", diff --git a/README.Rmd b/README.Rmd index 43ab137c..9953339b 100644 --- a/README.Rmd +++ b/README.Rmd @@ -54,7 +54,7 @@ the `gnomeR` package provides a consistent framework for genetic data processing - **Visualize processed data** - Create summary plots from processed data. - **Analyzing processed data**- Analyze associations between genomic variables and clinical variables or outcomes. -{gnomeR} is now also apart of the [gnomeverse](https://mskcc-epi-bio.github.io/genomeverse/), which is a collection of R packages that are designed to work together seamlessly to create reproducible clinico-genomic analysis pipelines. +{gnomeR} is part of [gnomeverse](https://mskcc-epi-bio.github.io/genomeverse/), a collection of R packages designed to work together seamlessly to create reproducible clinico-genomic analysis pipelines. ## Getting Set up diff --git a/README.md b/README.md index a2884134..b75d81be 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,11 @@ cBioPortal. With {gnomeR} and {cbioportalR} you can: - **Analyzing processed data**- Analyze associations between genomic variables and clinical variables or outcomes. 
+{gnomeR} is part of +[gnomeverse](https://mskcc-epi-bio.github.io/genomeverse/), a collection +of R packages designed to work together seamlessly to create +reproducible clinico-genomic analysis pipelines. + ## Getting Set up {gnomeR} works with any genomic data that follows cBioPortal guidelines @@ -110,13 +115,15 @@ by_gene <- gen_dat %>% summarize_by_gene() head(by_gene[,1:6]) -#> sample_id ALK ARAF BLM CD79B CSF1R -#> 1 P-0004508-T01-IM5 1 0 0 0 0 -#> 2 P-0005806-T01-IM5 0 0 0 0 0 -#> 3 P-0007006-T01-IM5 0 0 0 0 0 -#> 4 P-0008682-T01-IM5 0 0 0 0 0 -#> 5 P-0001297-T01-IM3 0 0 0 0 0 -#> 6 P-0007538-T01-IM5 0 1 0 0 1 +#> # A tibble: 6 × 6 +#> sample_id ALK ARAF BLM CD79B CSF1R +#> +#> 1 P-0004508-T01-IM5 1 0 0 0 0 +#> 2 P-0005806-T01-IM5 0 0 0 0 0 +#> 3 P-0007006-T01-IM5 0 0 0 0 0 +#> 4 P-0008682-T01-IM5 0 0 0 0 0 +#> 5 P-0001297-T01-IM3 0 0 0 0 0 +#> 6 P-0007538-T01-IM5 0 1 0 0 1 ``` ## Visualize @@ -183,15 +190,21 @@ Thank you to all contributors! [@alrein-05](https://github.com/alrein-05), [@arorarshi](https://github.com/arorarshi), [@AxelitoMartin](https://github.com/AxelitoMartin), +[@brombergm](https://github.com/brombergm), [@carokos](https://github.com/carokos), [@ChristineZ-msk](https://github.com/ChristineZ-msk), +[@ddsjoberg](https://github.com/ddsjoberg), [@edrill](https://github.com/edrill), [@hfuchs5](https://github.com/hfuchs5), [@jalavery](https://github.com/jalavery), [@jflynn264](https://github.com/jflynn264), [@karissawhiting](https://github.com/karissawhiting), [@michaelcurry1123](https://github.com/michaelcurry1123), -[@mljaniczek](https://github.com/mljaniczek), and -[@slb2240](https://github.com/slb2240) +[@mljaniczek](https://github.com/mljaniczek), +[@slb2240](https://github.com/slb2240), +[@stl2137](https://github.com/stl2137), +[@toumban1](https://github.com/toumban1), +[@whitec4](https://github.com/whitec4), and +[@Yukodeng](https://github.com/Yukodeng) # The End