Skip to content

Commit

Permalink
Merge branch 'main' into pr/346
Browse files Browse the repository at this point in the history
  • Loading branch information
karissawhiting committed Oct 15, 2024
2 parents d482294 + 9d6073e commit 9ea6e51
Show file tree
Hide file tree
Showing 15 changed files with 224 additions and 534 deletions.
16 changes: 11 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ Authors@R:
role = c("aut", "cre"),
email = "[email protected]",
comment = c(ORCID = "0000-0002-4683-1868")),
person(given = "Jessica",
family = "Lavery",
role = "aut",
comment = c(ORCID = "0000-0002-2746-5647")),
person(given = "Michael",
family = "Curry",
role = "aut",
Expand Down Expand Up @@ -46,23 +50,25 @@ Imports:
tidyr (>= 1.3.0),
cli,
GGally,
gtsummary,
broom.helpers,
janitor,
withr,
scales,
lifecycle
lifecycle,
gtsummary (>= 2.0.2.9009)
Suggests:
knitr,
rmarkdown,
testthat (>= 3.0.0),
spelling,
covr,
cbioportalR,
covr,
cbioportalR (>= 1.1.0),
genieBPC,
vdiffr
Remotes: karissawhiting/cbioportalR
VignetteBuilder: knitr
Language: en-US
Roxygen: list(markdown = TRUE)
Config/testthat/edition: 3
Remotes:
ddsjoberg/gtsummary,
karissawhiting/cbioportalR
79 changes: 68 additions & 11 deletions R/subset-by-frequency.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#' @param t Threshold value between 0 and 1 to subset by. Default is 10% (.1).
#' @param other_vars One or more column names (quoted or unquoted) in data to be retained
#' in resulting data frame. Default is NULL.
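#' @param by A single column name (quoted or unquoted) in data used to group samples.
#' When supplied, alteration frequencies are calculated within each level of `by`, and
#' alterations whose frequency exceeds `t` in at least one group are retained. Default is NULL.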
#' @return a data frame with a `sample_id` column and columns for
#' alterations over the given prevalence threshold of `t`.
#'
Expand All @@ -22,12 +23,12 @@
#'gene_binary %>%
#' subset_by_frequency()
#'
subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) {
subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL, by = NULL) {


# Checks ------------------------------------------------------------------

# check threshold argument
# check threshold `t` argument
if(!(is.numeric(t) & (t >= 0 & t <= 1))) {
cli::cli_abort("{.field t} must be a number between 0 and 1")
}
Expand All @@ -46,11 +47,24 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) {
arg_name = "other_vars"
)

# Define by variable

by <-
.select_to_varnames({{ by }},
data = gene_binary,
arg_name = "by", select_single = TRUE
)

# Check if 'by' is in 'other_vars' (only if both are non-NULL)
if (!is.null(by) && !is.null(other_vars) && by %in% other_vars) {
cli::cli_abort("{.code other_vars} cannot overlap with {.code by}.")
}

# data frame of only alterations
alt_only <- select(gene_binary, -"sample_id", -any_of(other_vars))

# Remove all NA columns ----------------------------------------------
all_na_alt <- apply(alt_only, 2, function(x) {
all_na_alt <- apply(alt_only, 2, function(x) {
sum(is.na(x)) == nrow(alt_only)
})

Expand All @@ -59,20 +73,63 @@ subset_by_frequency <- function(gene_binary, t = .1, other_vars = NULL) {


# Check Numeric Class -----------------------------------------------------
.abort_if_not_numeric(alt_only)

if (is.null(by)) {
.abort_if_not_numeric(alt_only)
}
else {
.abort_if_not_numeric(select(alt_only, -any_of(by)))
}


# Calc Frequency ----------------------------------------------------------
counts <- apply(alt_only, 2, function(x) {sum(x, na.rm = TRUE)})
num_non_na <- apply(alt_only, 2, function(x) sum(!is.na(x)))

alt_freq <- counts/num_non_na
alts_over_thresh <- names(sort(alt_freq[alt_freq >= t], decreasing = TRUE))
if(is.null(by)){

counts <- apply(alt_only, 2, function(x) {sum(x, na.rm = TRUE)})
num_non_na <- apply(alt_only, 2, function(x) sum(!is.na(x)))

alt_freq <- counts/num_non_na
alts_over_thresh <- names(sort(alt_freq[alt_freq >= t], decreasing = TRUE))

subset_binary <- select(gene_binary, "sample_id",
any_of(other_vars),
all_of(alts_over_thresh))
subset_binary <- select(gene_binary, "sample_id",
any_of(other_vars),
all_of(alts_over_thresh))

return(subset_binary)

}
else{

alt_data <-
alt_only |>
group_by(across(all_of(by))) |>
summarise(across(everything(),
list(sum = ~ sum(., na.rm = TRUE),
total = ~ sum(!is.na(.), na.rm = T))))

alt_group_data <-
alt_data |>
pivot_longer(-any_of(by),
names_to = c("gene")) |>
separate("gene", into = c("gene", "measure"), sep = "_") |>
pivot_wider(names_from = "measure",
values_from = "value") |>
mutate(propo = .data$sum/.data$total) |>
arrange(desc(.data$propo))

alts_over_thresh_grp <-
alt_group_data |>
filter(.data$propo > t) |>
pull("gene") |>
unique()

subset_binary <- select(gene_binary, "sample_id",
any_of(by),
any_of(other_vars),
all_of(alts_over_thresh_grp))

return(subset_binary)

}
}
4 changes: 2 additions & 2 deletions R/utils-gene-binary.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@

#' Infer mutation status and assume somatic if none
#'
#' @param mutation a mutation data frame
#' @param mutation a mutation maf data frame
#' @return a mutations data frame with a mutation status column
#' @keywords internal
.infer_mutation_status <- function(mutation) {
Expand All @@ -78,7 +78,7 @@
#'
#' Infers variant_type from reference_allele or tumor_seq_allele data
#'
#' @param mutation data frame
#' @param mutation a mutation maf data frame
#' @return a mutation data frame with a variant type column
#' @keywords internal
.infer_variant_type <- function(mutation, names_mut_dict = names_mut_dict) {
Expand Down
3 changes: 3 additions & 0 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ knitr::opts_chunk$set(
library(dplyr)
library(gtsummary)
library(gnomeR)
```

# gnomeR
Expand Down Expand Up @@ -53,6 +54,8 @@ the `gnomeR` package provides a consistent framework for genetic data processing
- **Visualize processed data** - Create summary plots from processed data.
- **Analyze processed data** - Analyze associations between genomic variables and clinical variables or outcomes.

{gnomeR} is part of [genomeverse](https://mskcc-epi-bio.github.io/genomeverse/), a collection of R packages designed to work together seamlessly to create reproducible clinico-genomic analysis pipelines.

## Getting Set up

{gnomeR} works with any genomic data that follows cBioPortal guidelines for [mutation](https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#data-file-5), [CNA](https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#discrete-copy-number-data), or [fusion](https://docs.cbioportal.org/5.1-data-loading/data-loading/file-formats#structural-variant-data) data file formats.
Expand Down
31 changes: 22 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ cBioPortal. With {gnomeR} and {cbioportalR} you can:
- **Analyze processed data** - Analyze associations between genomic
variables and clinical variables or outcomes.

{gnomeR} is part of
[genomeverse](https://mskcc-epi-bio.github.io/genomeverse/), a collection
of R packages designed to work together seamlessly to create
reproducible clinico-genomic analysis pipelines.

## Getting Set up

{gnomeR} works with any genomic data that follows cBioPortal guidelines
Expand Down Expand Up @@ -110,13 +115,15 @@ by_gene <- gen_dat %>%
summarize_by_gene()

head(by_gene[,1:6])
#> sample_id ALK ARAF BLM CD79B CSF1R
#> 1 P-0004508-T01-IM5 1 0 0 0 0
#> 2 P-0005806-T01-IM5 0 0 0 0 0
#> 3 P-0007006-T01-IM5 0 0 0 0 0
#> 4 P-0008682-T01-IM5 0 0 0 0 0
#> 5 P-0001297-T01-IM3 0 0 0 0 0
#> 6 P-0007538-T01-IM5 0 1 0 0 1
#> # A tibble: 6 × 6
#> sample_id ALK ARAF BLM CD79B CSF1R
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 P-0004508-T01-IM5 1 0 0 0 0
#> 2 P-0005806-T01-IM5 0 0 0 0 0
#> 3 P-0007006-T01-IM5 0 0 0 0 0
#> 4 P-0008682-T01-IM5 0 0 0 0 0
#> 5 P-0001297-T01-IM3 0 0 0 0 0
#> 6 P-0007538-T01-IM5 0 1 0 0 1
```

## Visualize
Expand Down Expand Up @@ -183,15 +190,21 @@ Thank you to all contributors!
[@alrein-05](https://github.com/alrein-05),
[@arorarshi](https://github.com/arorarshi),
[@AxelitoMartin](https://github.com/AxelitoMartin),
[@brombergm](https://github.com/brombergm),
[@carokos](https://github.com/carokos),
[@ChristineZ-msk](https://github.com/ChristineZ-msk),
[@ddsjoberg](https://github.com/ddsjoberg),
[@edrill](https://github.com/edrill),
[@hfuchs5](https://github.com/hfuchs5),
[@jalavery](https://github.com/jalavery),
[@jflynn264](https://github.com/jflynn264),
[@karissawhiting](https://github.com/karissawhiting),
[@michaelcurry1123](https://github.com/michaelcurry1123),
[@mljaniczek](https://github.com/mljaniczek), and
[@slb2240](https://github.com/slb2240)
[@mljaniczek](https://github.com/mljaniczek),
[@slb2240](https://github.com/slb2240),
[@stl2137](https://github.com/stl2137),
[@toumban1](https://github.com/toumban1),
[@whitec4](https://github.com/whitec4), and
[@Yukodeng](https://github.com/Yukodeng)

# The End
2 changes: 1 addition & 1 deletion man/dot-check_for_fus_in_mut.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/dot-infer_mutation_status.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/dot-infer_variant_type.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion man/subset_by_frequency.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified tests/testthat/Rplots.pdf
Binary file not shown.
1 change: 0 additions & 1 deletion tests/testthat/test-plots.R
Original file line number Diff line number Diff line change
Expand Up @@ -287,5 +287,4 @@ test_that("mutation_viz works", {
#
# })

# testthat::test_file("C:\\Users\\toumban\\OneDrive - Memorial Sloan Kettering Cancer Center\\Desktop\\gnomeR\\tests\\testthat\\test-plots.R")

Loading

0 comments on commit 9ea6e51

Please sign in to comment.