Merge branch 'main' into pr/346

MSKCC-Epi-Bio · Oct 15, 2024 · 9ea6e51 · 9ea6e51
2 parents d482294 + 9d6073e
commit 9ea6e51
Show file tree

Hide file tree

Showing 15 changed files with 224 additions and 534 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -7,6 +7,10 @@ Authors@R:
              role = c("aut", "cre"),
              email = "[email protected]",
              comment = c(ORCID = "0000-0002-4683-1868")), 
+     person(given = "Jessica",
+             family = "Lavery",
+             role = "aut",
+             comment = c(ORCID = "0000-0002-2746-5647")), 
      person(given = "Michael",
              family = "Curry",
              role = "aut",
@@ -46,23 +50,25 @@ Imports:
     tidyr (>= 1.3.0),
     cli,
     GGally,
-    gtsummary,
     broom.helpers,
     janitor,
     withr,
     scales,
-    lifecycle
+    lifecycle,
+    gtsummary (>= 2.0.2.9009)
 Suggests: 
     knitr,
     rmarkdown,
     testthat (>= 3.0.0),
     spelling,
-    covr, 
-    cbioportalR, 
+    covr,
+    cbioportalR (>= 1.1.0),
     genieBPC,
     vdiffr
-Remotes: karissawhiting/cbioportalR
 VignetteBuilder: knitr
 Language: en-US
 Roxygen: list(markdown = TRUE)
 Config/testthat/edition: 3
+Remotes:  
+    ddsjoberg/gtsummary,
+    karissawhiting/cbioportalR
diff --git a/R/subset-by-frequency.R b/R/subset-by-frequency.R
@@ -6,6 +6,7 @@
 #' @param t Threshold value between 0 and 1 to subset by. Default is 10% (.1).
 #' @param other_vars One or more column names (quoted or unquoted) in data to be retained
 #' in resulting data frame. Default is NULL.
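+#' @param by A single column name (quoted or unquoted) in `gene_binary` to group by. When supplied, alteration frequencies are calculated within each level of `by`, and an alteration is kept if its frequency is greater than `t` in at least one level. Cannot overlap with `other_vars`. Default is NULL.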
 #' @return a data frame with a `sample_id` column and columns for
 #' alterations over the given prevalence threshold of `t`.
 #'
@@ -22,12 +23,12 @@
 #'gene_binary %>%
 #'  subset_by_frequency()
 #'
-subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) {
+subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL, by = NULL) {
 
 
   # Checks ------------------------------------------------------------------
 
-  # check threshold argument
+  # check threshold `t` argument
   if(!(is.numeric(t) & (t >= 0 & t <= 1))) {
     cli::cli_abort("{.field t} must be a number between 0 and 1")
   }
@@ -46,11 +47,24 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) {
                         arg_name = "other_vars"
     )
 
+  # Define by variable
+
+  by <-
+    .select_to_varnames({{ by }},
+                        data = gene_binary,
+                        arg_name = "by", select_single = TRUE
+    )
+
+  # Check if 'by' is in 'other_vars' (only if both are non-NULL)
+  if (!is.null(by) && !is.null(other_vars) && by %in% other_vars) {
+    cli::cli_abort("{.code other_vars} cannot overlap with {.code by}.")
+  }
+
   # data frame of only alterations
   alt_only <- select(gene_binary, -"sample_id", -any_of(other_vars))
 
   # Remove all NA columns ----------------------------------------------
-  all_na_alt <- apply(alt_only,  2, function(x) {
+  all_na_alt <- apply(alt_only, 2, function(x) {
      sum(is.na(x)) == nrow(alt_only)
   })
 
@@ -59,20 +73,63 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) {
 
 
   # Check Numeric Class -----------------------------------------------------
-  .abort_if_not_numeric(alt_only)
+
+  if (is.null(by)) {
+    .abort_if_not_numeric(alt_only)
+  }
+  else {
+    .abort_if_not_numeric(select(alt_only, -any_of(by)))
+  }
 
 
   # Calc Frequency ----------------------------------------------------------
-  counts <- apply(alt_only, 2,  function(x) {sum(x, na.rm = TRUE)})
-  num_non_na <- apply(alt_only, 2, function(x) sum(!is.na(x)))
 
-  alt_freq <- counts/num_non_na
-  alts_over_thresh <- names(sort(alt_freq[alt_freq >= t], decreasing = TRUE))
+  if(is.null(by)){
+
+    counts <- apply(alt_only, 2,  function(x) {sum(x, na.rm = TRUE)})
+    num_non_na <- apply(alt_only, 2, function(x) sum(!is.na(x)))
+
+    alt_freq <- counts/num_non_na
+    alts_over_thresh <- names(sort(alt_freq[alt_freq >= t], decreasing = TRUE))
 
-  subset_binary <- select(gene_binary, "sample_id",
-                          any_of(other_vars),
-                          all_of(alts_over_thresh))
+    subset_binary <- select(gene_binary, "sample_id",
+                            any_of(other_vars),
+                            all_of(alts_over_thresh))
 
   return(subset_binary)
 
+  }
+  else{
+
+    alt_data <-
+      alt_only |>
+      group_by(across(all_of(by))) |>
+      summarise(across(everything(),
+                         list(sum = ~ sum(., na.rm = TRUE),
+                              total = ~ sum(!is.na(.), na.rm = T))))
+
+    alt_group_data <-
+      alt_data |>
+      pivot_longer(-any_of(by),
+                   names_to = c("gene")) |>
+      separate("gene", into = c("gene", "measure"), sep = "_") |>
+      pivot_wider(names_from = "measure",
+                  values_from = "value") |>
+      mutate(propo = .data$sum/.data$total) |>
+      arrange(desc(.data$propo))
+
+    alts_over_thresh_grp <-
+      alt_group_data |>
+      filter(.data$propo > t) |>
+      pull("gene") |>
+      unique()
+
+    subset_binary <- select(gene_binary, "sample_id",
+                            any_of(by),
+                            any_of(other_vars),
+                            all_of(alts_over_thresh_grp))
+
+    return(subset_binary)
+
+  }
 }
diff --git a/R/utils-gene-binary.R b/R/utils-gene-binary.R
@@ -58,7 +58,7 @@
 
 #' Infer mutation status and assume somatic if none
 #'
-#' @param mutation a mutation data frame
+#' @param mutation a mutation data frame in MAF format
 #' @return a mutations data frame with a mutation status column
 #' @keywords internal
 .infer_mutation_status <- function(mutation) {
@@ -78,7 +78,7 @@
 #'
 #' Infers variant_type from reference_allele or tumor_seq_allele data
 #'
-#' @param mutation data frame
+#' @param mutation a mutation data frame in MAF format
 #' @return a mutation data frame with a variant type column
 #' @keywords internal
 .infer_variant_type <- function(mutation, names_mut_dict = names_mut_dict) {

diff --git a/README.Rmd b/README.Rmd
@@ -18,6 +18,7 @@ knitr::opts_chunk$set(
 library(dplyr)
 library(gtsummary)
 library(gnomeR)
+
 ```
 
 # gnomeR
@@ -53,6 +54,8 @@ the `gnomeR` package provides a consistent framework for genetic data processing
 - **Visualize processed data** - Create summary plots from processed data. 
 - **Analyzing processed data**- Analyze associations between genomic variables and clinical variables or outcomes.
 
+{gnomeR} is part of [gnomeverse](https://mskcc-epi-bio.github.io/genomeverse/), a collection of R packages designed to work together seamlessly to create reproducible clinico-genomic analysis pipelines.
+
 ## Getting Set up
 
 {gnomeR} works with any genomic data that follows cBioPortal guidelines for [mutation](https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#data-file-5), [CNA](https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#discrete-copy-number-data), or [fusion](https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#structural-variant-data) data file formats.

diff --git a/README.md b/README.md
@@ -47,6 +47,11 @@ cBioPortal. With {gnomeR} and {cbioportalR} you can:
 - **Analyzing processed data**- Analyze associations between genomic
   variables and clinical variables or outcomes.
 
+{gnomeR} is part of
+[gnomeverse](https://mskcc-epi-bio.github.io/genomeverse/), a collection
+of R packages designed to work together seamlessly to create
+reproducible clinico-genomic analysis pipelines.
+
 ## Getting Set up
 
 {gnomeR} works with any genomic data that follows cBioPortal guidelines
@@ -110,13 +115,15 @@ by_gene <- gen_dat %>%
   summarize_by_gene()
 
 head(by_gene[,1:6])
-#>           sample_id ALK ARAF BLM CD79B CSF1R
-#> 1 P-0004508-T01-IM5   1    0   0     0     0
-#> 2 P-0005806-T01-IM5   0    0   0     0     0
-#> 3 P-0007006-T01-IM5   0    0   0     0     0
-#> 4 P-0008682-T01-IM5   0    0   0     0     0
-#> 5 P-0001297-T01-IM3   0    0   0     0     0
-#> 6 P-0007538-T01-IM5   0    1   0     0     1
+#> # A tibble: 6 × 6
+#>   sample_id           ALK  ARAF   BLM CD79B CSF1R
+#>   <chr>             <dbl> <dbl> <dbl> <dbl> <dbl>
+#> 1 P-0004508-T01-IM5     1     0     0     0     0
+#> 2 P-0005806-T01-IM5     0     0     0     0     0
+#> 3 P-0007006-T01-IM5     0     0     0     0     0
+#> 4 P-0008682-T01-IM5     0     0     0     0     0
+#> 5 P-0001297-T01-IM3     0     0     0     0     0
+#> 6 P-0007538-T01-IM5     0     1     0     0     1
 ```
 
 ## Visualize
@@ -183,15 +190,21 @@ Thank you to all contributors!
 [@alrein-05](https://github.com/alrein-05),
 [@arorarshi](https://github.com/arorarshi),
 [@AxelitoMartin](https://github.com/AxelitoMartin),
+[@brombergm](https://github.com/brombergm),
 [@carokos](https://github.com/carokos),
 [@ChristineZ-msk](https://github.com/ChristineZ-msk),
+[@ddsjoberg](https://github.com/ddsjoberg),
 [@edrill](https://github.com/edrill),
 [@hfuchs5](https://github.com/hfuchs5),
 [@jalavery](https://github.com/jalavery),
 [@jflynn264](https://github.com/jflynn264),
 [@karissawhiting](https://github.com/karissawhiting),
 [@michaelcurry1123](https://github.com/michaelcurry1123),
-[@mljaniczek](https://github.com/mljaniczek), and
-[@slb2240](https://github.com/slb2240)
+[@mljaniczek](https://github.com/mljaniczek),
+[@slb2240](https://github.com/slb2240),
+[@stl2137](https://github.com/stl2137),
+[@toumban1](https://github.com/toumban1),
+[@whitec4](https://github.com/whitec4), and
+[@Yukodeng](https://github.com/Yukodeng)
 
 # The End
diff --git a/man/dot-check_for_fus_in_mut.Rd b/man/dot-check_for_fus_in_mut.Rd
diff --git a/man/dot-infer_mutation_status.Rd b/man/dot-infer_mutation_status.Rd
diff --git a/man/dot-infer_variant_type.Rd b/man/dot-infer_variant_type.Rd
diff --git a/man/subset_by_frequency.Rd b/man/subset_by_frequency.Rd
diff --git a/tests/testthat/Rplots.pdf b/tests/testthat/Rplots.pdf
diff --git a/tests/testthat/test-plots.R b/tests/testthat/test-plots.R
@@ -287,5 +287,4 @@ test_that("mutation_viz works", {
 #
 # })
 
-# testthat::test_file("C:\\Users\\toumban\\OneDrive - Memorial Sloan Kettering Cancer Center\\Desktop\\gnomeR\\tests\\testthat\\test-plots.R")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -287,5 +287,4 @@ test_that("mutation_viz works", {
		#
		# })

		# testthat::test_file("C:\\Users\\toumban\\OneDrive - Memorial Sloan Kettering Cancer Center\\Desktop\\gnomeR\\tests\\testthat\\test-plots.R")