-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #346 from jalavery/add_summarize_by_patient
Add summarize by patient
- Loading branch information
Showing
11 changed files
with
522 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
#' Simplify binary matrix to one column per patient that counts any alteration | ||
#' type across all samples as 1 | ||
#' | ||
#' This will reduce the number of columns in your binary matrix, and the | ||
#' resulting data frame will have only 1 col per gene, as opposed to separate | ||
#' columns for mutation/cna/fusion. | ||
#' | ||
#' Note that if samples to the same patient were sequenced on different panels, | ||
#' any indication of an alteration is counted as an alteration, but the absence | ||
#' of an alteration is only defined when all sequencing panels included the gene | ||
#' and indicated that it was not altered. | ||
#' | ||
#' @param gene_binary a 0/1 matrix of gene alterations | ||
#' @param other_vars One or more column names (quoted or unquoted) in data to be retained | ||
#' in resulting data frame. Default is NULL. | ||
#' | ||
#' @return a binary matrix with a row for each sample and one column per gene | ||
#' @export | ||
#' | ||
#' @examples | ||
#' samples <- unique(gnomeR::mutations$sampleId)[1:10] | ||
#' gene_binary <- create_gene_binary( | ||
#' samples = samples, mutation = mutations, cna = cna, | ||
#' mut_type = "somatic_only", | ||
#' include_silent = FALSE, | ||
#' specify_panel = "IMPACT341") | ||
#' | ||
#' gene_binary$patient_id = extract_patient_id(gene_binary$sample_id) | ||
#' | ||
#' summarize_by_patient(gene_binary) | ||
#' | ||
summarize_by_patient <- function(gene_binary, other_vars = NULL) { | ||
|
||
|
||
# Checks ------------------------------------------------------------------ | ||
|
||
if (!is.data.frame(gene_binary)) { | ||
cli::cli_abort("{.code gene_binary} must be a data.frame with sample ids") | ||
} | ||
|
||
# !!! I think we should allow sample ID as input but not require it | ||
# .check_required_cols( | ||
# gene_binary, | ||
# c("sample_id")) | ||
|
||
.check_required_cols( | ||
gene_binary, | ||
c("patient_id"), | ||
add_to_message = c(i = "To extract patient IDs from IMPACT sample IDs (e.g. `P-XXXXXX-TXX-IMX`), use {.code gnomeR::extract_patient_id(data$sample_id)}")) | ||
|
||
# Other Vars - Capture Other Columns to Retain ----------------------------------- | ||
|
||
other_vars <- | ||
.select_to_varnames({{ other_vars }}, | ||
data = gene_binary, | ||
arg_name = "other_vars" | ||
) | ||
|
||
|
||
# Create Sample Index ----------------------------------------------------- | ||
|
||
|
||
sample_index <- gene_binary %>% | ||
select("patient_id") %>% | ||
mutate(sample_index = paste0("samp", 1:nrow(gene_binary))) | ||
|
||
# data frame of only alterations | ||
|
||
alt_only <- as.data.frame(select(gene_binary, -"patient_id", -any_of("sample_id"), -any_of(other_vars))) | ||
|
||
row.names(alt_only) <- sample_index$sample_index | ||
|
||
# check numeric class --------- | ||
.abort_if_not_numeric(alt_only) | ||
|
||
# Transpose --------------------------------------------------------------- | ||
|
||
transp_alt_only <- as.data.frame(t(alt_only)) | ||
|
||
# remove endings of gene names | ||
transp_alt_only <- transp_alt_only %>% | ||
mutate(gene = str_remove_all(row.names(.), | ||
".Amp|.fus|.Del|.cna")) | ||
|
||
# check for genes that have more than one alt type | ||
gene_tab <- table(transp_alt_only$gene) | ||
|
||
genes_multiple <- names(gene_tab[which(gene_tab > 1)]) | ||
genes_single <- names(gene_tab[which(gene_tab == 1)]) | ||
|
||
# genes with one type of event | ||
all_bin_once <- transp_alt_only %>% | ||
filter(.data$gene %in% genes_single) | ||
|
||
# genes with more than one type of event | ||
all_bin_more <- transp_alt_only %>% | ||
filter(.data$gene %in% genes_multiple) | ||
|
||
if(length(genes_multiple) > 0) { | ||
all_bin_more <- all_bin_more %>% | ||
group_by(.data$gene) %>% | ||
summarize(across(everything(), max)) | ||
} | ||
|
||
# bind together and transpose | ||
all_bin <- rbind(all_bin_once, all_bin_more, make.row.names = FALSE) %>% | ||
tibble::column_to_rownames("gene") | ||
|
||
all_bin <- as.data.frame(t(all_bin)) %>% | ||
tibble::rownames_to_column("sample_index") | ||
|
||
# join back to sample ID and other vars | ||
simp_gene_binary <- all_bin %>% | ||
left_join(sample_index, ., by = "sample_index") %>% | ||
select(-c("sample_index")) %>% | ||
# identify patients | ||
# determine number of samples per patient | ||
group_by(.data$patient_id) %>% | ||
mutate(n_samples = n()) %>% | ||
ungroup() | ||
|
||
# summarize genomic information across patients | ||
# separate patients w/ only 1 sample vs multiple samples to improve run time | ||
simp_gene_binary_pt_single <- simp_gene_binary %>% | ||
filter(.data$n_samples == 1) | ||
|
||
if (nrow(simp_gene_binary %>% | ||
filter(.data$n_samples > 1)) >0){ | ||
|
||
simp_gene_binary_pt_multiple <- simp_gene_binary %>% | ||
filter(.data$n_samples > 1) %>% | ||
group_by(.data$patient_id) %>% | ||
summarize(across(.cols = c(everything()), | ||
.fns = ~case_when( | ||
# if any alteration, indicate altered | ||
max(c(.x, 0), na.rm = TRUE) == 1 ~ 1, | ||
# no alteration only if no NAs (no na.rm) | ||
max(.x) == 0 ~ 0 | ||
)), | ||
.groups = "drop") | ||
|
||
simp_gene_binary_pt <- bind_rows(simp_gene_binary_pt_single, | ||
simp_gene_binary_pt_multiple) %>% | ||
select(-"n_samples") | ||
} else { | ||
simp_gene_binary_pt <- simp_gene_binary_pt_single %>% | ||
select(-"n_samples") | ||
} | ||
|
||
# Join data | ||
simp_gene_binary <- left_join(simp_gene_binary_pt, | ||
gene_binary %>% | ||
select(any_of(c("patient_id", other_vars))) %>% | ||
distinct(), | ||
by = "patient_id") %>% | ||
select("patient_id", everything()) | ||
|
||
# warn if not unique resulting data | ||
n_occur <- data.frame(table(simp_gene_binary$patient_id)) | ||
n_occur[n_occur$Freq > 1,] | ||
dupe_samp_in_result <- n_occur$Var1[n_occur$Freq > 1] | ||
|
||
if(length(dupe_samp_in_result) > 0) { | ||
cli::cli_alert_warning("Returned data is not unique (1 row per patient) due to non-distinct values in data passed to {.code other_vars} argument. See {head(dupe_samp_in_result, 3)} ...") | ||
} | ||
|
||
return(simp_gene_binary) | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.