Skip to content

Commit

Permalink
Merge pull request #171 from umccr/sv_update
Browse files Browse the repository at this point in the history
Sash SV support and generalise SV input
  • Loading branch information
skanwal authored Sep 30, 2024
2 parents 80158c4 + 194ae5c commit 5286e06
Show file tree
Hide file tree
Showing 12 changed files with 135 additions and 229 deletions.
4 changes: 2 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ export(glanceExprPlot)
export(immune_summary)
export(known_trans)
export(known_translocations_cgi_process)
export(manta_process)
export(mutCNexprPlot)
export(nearest_position)
export(pca)
Expand All @@ -50,7 +49,8 @@ export(salmon_counts)
export(saveWidgetFix)
export(session_info_kable)
export(session_info_tbls)
export(sv_manta_summary)
export(sv_process)
export(sv_summary)
export(tpm_from_rpkm)
export(webplot)
importFrom(dplyr,"%>%")
Expand Down
8 changes: 4 additions & 4 deletions R/exprTable.R
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,9 @@ exprTable <- function(data = NULL, genes = NULL, keep_all = FALSE, cn_data = NUL
}
}

##### Add structural variants results from MANTA
##### Add structural variants results
if (!is.null(sv_data) && length(genes) > 0) {
##### NOTE: when merging per-gene expression data with SV data from MANTA the "gene" column is used since multiple entries are possible for one gene in MANTA output
##### NOTE: when merging per-gene expression data with SV data the "gene" column is used since multiple entries are possible for one gene
group.z <- merge(group.z, sv_data, by.x = "Gene", by.y = "Genes", all = TRUE, sort = FALSE)
}

Expand Down Expand Up @@ -260,11 +260,11 @@ exprTable <- function(data = NULL, genes = NULL, keep_all = FALSE, cn_data = NUL
group.z <- group.z[order(abs(group.z[, "Diff"]), decreasing = TRUE), ]
group.z <- group.z[order(group.z$TIER), ]

##### Order the data by MANTA increasing Tier (to prioritise SVs, based on https://github.com/AstraZeneca-NGS/simple_sv_annotation/blob/master/simple_sv_annotation.py), event type and then by the highest absolute values for Patient vs [comp_cancer] difference
##### Order the data by SV fusion genes (`fusion_genes`, decreasing) and then by the highest absolute values for Patient vs [comp_cancer] difference
} else if (!is.null(sv_data) && length(genes) > 0) {
group.z <- group.z[order(abs(group.z[["Diff"]]), decreasing = TRUE), ]
group.z <- group.z[order(group.z[["fusion_genes"]], decreasing = TRUE), ]
group.z <- group.z[order(group.z[["Tier"]]), ]
#group.z <- group.z[order(group.z[["Tier"]]), ]

##### Otherwise order table by the highest absolute values for Patient vs [comp_cancer] difference
} else if (length(genes) > 0) {
Expand Down
24 changes: 12 additions & 12 deletions R/sample_data.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#' Read Sample Data
#'
#' Reads sample data, including Arriba fusions, Arriba plots, Salmon counts,
#' DRAGEN fusions, DRAGEN mapping metrics, Manta SVs, PURPLE CNVs, and PCGR SNVs.
#' DRAGEN fusions, DRAGEN mapping metrics, SVs, PURPLE CNVs, and PCGR SNVs.
#'
#' @param p RNAsum params list.
#' @param results_dir Directory to output extracted Arriba PNGs to (created
Expand Down Expand Up @@ -80,8 +80,8 @@ read_sample_data <- function(p, results_dir, tx2gene = NULL) {

#' Read WGS Data
#'
#' Reads WGS data, including PCGR `tiers.tsv`, PURPLE `cnv.gene.tsv`, and Manta
#' `manta.tsv`. If the file path has been specified in the RNAsum params and is
#' Reads WGS data, including PCGR `tiers.tsv`, PURPLE `cnv.gene.tsv`, and
#' `sv.prioritised.tsv`. If the file path has been specified in the RNAsum params and is
#' valid, it is returned. As a fallback, if the umccrise directory param has
#' been specified, then there is an attempt to detect the file pattern in there.
#'
Expand All @@ -95,8 +95,8 @@ read_sample_data <- function(p, results_dir, tx2gene = NULL) {
#' "TEST-somatic.pcgr.snvs_indels.tiers.tsv",
#' package = "RNAsum"
#' ),
#' manta_tsv = system.file(
#' "rawdata/test_data/umccrised/test_sample_WGS/structural/TEST-prioritize-manta.tsv",
#' sv_tsv = system.file(
#' "rawdata/test_data/test_sample_WGS/structural/TEST.sv.prioritised.tsv",
#' package = "RNAsum"
#' )
#' )
Expand Down Expand Up @@ -143,16 +143,16 @@ read_wgs_data <- function(p) {
nm = "purple_gene_tsv", func = ppl_cnv_som_gene_read
)

manta_tsv <- .read(
sv_tsv <- .read(
p = p,
subdir = "structural", pat = "manta\\.tsv$",
nm = "manta_tsv", func = sv_prioritize_old
subdir = "structural", pat = "prioritised\\.tsv$",
nm = "sv_tsv", func = sv_prioritize
)

list(
pcgr_tiers_tsv = pcgr_tiers_tsv,
purple_gene_tsv = purple_gene_tsv,
manta_tsv = manta_tsv
sv_tsv = sv_tsv
)
}

Expand Down Expand Up @@ -212,12 +212,12 @@ fusions_summary <- function(tbl = NULL) {
res
}

#' Get Manta SV Summary
#' Get SV Summary
#'
#' @param tbl Tibble with melted SVs from umccrise, containing 'Genes' column.
#' @param tbl Tibble with melted SVs from sash, containing 'Genes' column.
#' @return Character vector of Genes.
#' @export
sv_manta_summary <- function(tbl) {
sv_summary <- function(tbl) {
assertthat::assert_that(
inherits(tbl, "data.frame"), (c("Genes") %in% names(tbl))
)
Expand Down
189 changes: 48 additions & 141 deletions R/sv.R
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
#' Process Manta Object
#' Process SV Object
#'
#' @param manta_tsv_obj Manta list object read via `sv_prioritize_old`.
#' @param sv_tsv_obj SV list object read via `sv_prioritize`.
#'
#' @return List with:
#' - total Manta (unmelted) variants
#' - total SV (unmelted) variants
#' - tibble with melted variants
#' - genes involved in multi-gene events
#' @export
manta_process <- function(manta_tsv_obj) {
total_variants <- manta_tsv_obj[["total_variants"]]
melted <- manta_tsv_obj[["melted"]] |>
sv_process <- function(sv_tsv_obj) {
total_variants <- sv_tsv_obj[["total_variants"]]
melted <- sv_tsv_obj[["melted"]] |>
dplyr::mutate(
is_fusion = grepl("&", .data$Genes),
fusion_genes = dplyr::if_else(.data$is_fusion, .data$Genes, "")
Expand Down Expand Up @@ -38,146 +38,53 @@ manta_process <- function(manta_tsv_obj) {
)
}

sv_prioritize_old <- function(sv_file) {
# grab the dplyr pipe (for now!)
`%>%` <- dplyr::`%>%`
subset_genes <- function(genes, ind) {
genes |>
stringr::str_split("&") %>%
purrr::map(
~ .[ind] %>%
replace("", NA) %>%
.[!is.na(.)]
) %>%
purrr::map_chr(~ ifelse(length(.) > 0, stringr::str_c(., collapse = "&"), ""))
}

format_val <- function(val, is_pct = F) {
ifelse(!is.na(val),
format(val, digits = 1) %>% stringr::str_c(ifelse(is_pct, "%", "")),
NA
)
}

split_sv_field <- function(.data, field, is_pct = F) {
f_q <- rlang::enquo(field)
f_str <- rlang::quo_name(f_q)
f1_str <- stringr::str_c(f_str, "1")
f2_str <- stringr::str_c(f_str, "2")
f1_q <- rlang::sym(f1_str)
f2_q <- rlang::sym(f2_str)
.data %>%
tidyr::separate(col = !!f_q, into = c(f1_str, f2_str), sep = ",", fill = "right") %>%
dplyr::mutate(
!!f1_q := as.double(!!f1_q) * ifelse(is_pct, 100, 1),
!!f2_q := as.double(!!f2_q) * ifelse(is_pct, 100, 1),
!!f_q := (!!f1_q + ifelse(is.na(!!f2_q), !!f1_q, !!f2_q)) / 2,
!!f_q := format_val(!!f_q, is_pct),
!!f1_q := format_val(!!f1_q, is_pct),
!!f2_q := format_val(!!f2_q, is_pct)
)
}
#' Read and Prioritise SVs
#'
#' Reads a tab-separated SV file and returns the genes it affects. Two input
#' layouts are handled:
#' - a user-supplied table containing a `Gene` column, returned as read;
#' - an internal table containing an `annotation` column with comma-separated
#'   `Event|Effect|Genes|Transcript|Detail|Tier` entries, which is melted into
#'   one row per distinct gene. Gene sets of `gene_fusion` effects are kept
#'   joined by `&`; all other `&`-joined gene sets are split into single genes.
#'
#' @param sv_file Path to the SV TSV file.
#'
#' @return `NULL` if `sv_file` has at most one line (empty or header only),
#'   otherwise a list with:
#'   - `melted`: tibble of SV genes (see above)
#'   - `total_variants`: number of rows read from `sv_file`
#' @noRd
sv_prioritize <- function(sv_file) {
  # Nothing to report for an empty or header-only file.
  if (length(readLines(con = sv_file, n = 2)) <= 1) {
    return(NULL)
  }

  sv_all <- readr::read_tsv(sv_file, col_names = TRUE)
  total_variants <- nrow(sv_all)

  # User-supplied gene table: return it untouched. The column is looked up by
  # name rather than by position, so a `Gene` column that is not the first one
  # no longer makes the function fall through and silently return NULL.
  if ("Gene" %in% colnames(sv_all)) {
    return(list(
      melted = sv_all,
      total_variants = total_variants
    ))
  }

  # Otherwise assume an internal input, which must carry an `annotation` column.
  if (!"annotation" %in% colnames(sv_all)) {
    stop(
      "SV file '", sv_file,
      "' has neither a 'Gene' nor an 'annotation' column.",
      call. = FALSE
    )
  }

  # NOTE(review): `melted` exposes a `Gene` column here, while other code in
  # this package refers to a `Genes` column (e.g. `.data$Genes` in
  # `sv_process()`) - confirm the naming expected downstream.
  melted <- sv_all |>
    dplyr::select("annotation") |>
    # Unpack multiple annotations per region. `as.character()` guards against
    # an all-NA column, which readr guesses as logical.
    dplyr::mutate(
      annotation = strsplit(as.character(.data$annotation), ",")
    ) |>
    tidyr::unnest("annotation") |>
    tidyr::separate_wider_delim(
      cols = "annotation", delim = "|",
      names = c("Event", "Effect", "Genes", "Transcript", "Detail", "Tier"),
      too_few = "align_start"
    ) |>
    dplyr::select("Effect", "Genes") |>
    dplyr::mutate(Gene = .sv_split_genes(.data$Genes, .data$Effect)) |>
    dplyr::select("Gene") |>
    tidyr::unnest_longer("Gene") |>
    # Annotations without a gene carry no information for per-gene reporting.
    dplyr::filter(!is.na(.data$Gene), .data$Gene != "") |>
    dplyr::distinct() |>
    dplyr::arrange(.data$Gene)

  list(
    melted = melted,
    total_variants = total_variants
  )
}

# Split `&`-joined gene sets into single genes, except for `gene_fusion`
# effects whose gene set is kept as one element. Always returns a list with
# one element per input gene set, so the result type does not depend on the
# data (unlike `ifelse()` with a list-valued branch).
.sv_split_genes <- function(genes, effect) {
  is_fusion <- grepl("gene_fusion", effect)
  gene_sets <- strsplit(genes, "&", fixed = TRUE)
  gene_sets[is_fusion] <- as.list(genes[is_fusion])
  gene_sets
}

Loading

0 comments on commit 5286e06

Please sign in to comment.