Merge pull request #171 from umccr/sv_update

Sash SV support and generalise SV input
umccr · Sep 30, 2024 · 5286e06 · 5286e06
2 parents 80158c4 + 194ae5c
commit 5286e06
Show file tree

Hide file tree

Showing 12 changed files with 135 additions and 229 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -31,7 +31,6 @@ export(glanceExprPlot)
 export(immune_summary)
 export(known_trans)
 export(known_translocations_cgi_process)
-export(manta_process)
 export(mutCNexprPlot)
 export(nearest_position)
 export(pca)
@@ -50,7 +49,8 @@ export(salmon_counts)
 export(saveWidgetFix)
 export(session_info_kable)
 export(session_info_tbls)
-export(sv_manta_summary)
+export(sv_process)
+export(sv_summary)
 export(tpm_from_rpkm)
 export(webplot)
 importFrom(dplyr,"%>%")

diff --git a/R/exprTable.R b/R/exprTable.R
@@ -196,9 +196,9 @@ exprTable <- function(data = NULL, genes = NULL, keep_all = FALSE, cn_data = NUL
     }
   }
 
-  ##### Add structural variants results from MANTA
+  ##### Add structural variants results
   if (!is.null(sv_data) && length(genes) > 0) {
-    ##### NOTE: when merging per-gene expression data with SV data from MANTA the "gene" column is used since multiple entries are possible for one gene in MANTA output
+    ##### NOTE: when merging per-gene expression data with SV data the "gene" column is used since multiple entries are possible for one gene
     group.z <- merge(group.z, sv_data, by.x = "Gene", by.y = "Genes", all = TRUE, sort = FALSE)
   }
 
@@ -260,11 +260,11 @@ exprTable <- function(data = NULL, genes = NULL, keep_all = FALSE, cn_data = NUL
     group.z <- group.z[order(abs(group.z[, "Diff"]), decreasing = TRUE), ]
     group.z <- group.z[order(group.z$TIER), ]
 
-    ##### Order the data by MANTA increasing Tier (to prioritise SVs, based on https://github.com/AstraZeneca-NGS/simple_sv_annotation/blob/master/simple_sv_annotation.py), event type and then by the highest absolute values for Patient vs [comp_cancer] difference
+    ##### Order the data by event type and then by the highest absolute values for Patient vs [comp_cancer] difference
   } else if (!is.null(sv_data) && length(genes) > 0) {
     group.z <- group.z[order(abs(group.z[["Diff"]]), decreasing = TRUE), ]
     group.z <- group.z[order(group.z[["fusion_genes"]], decreasing = TRUE), ]
-    group.z <- group.z[order(group.z[["Tier"]]), ]
+    # Tier ordering disabled (generalised SV input may not carry a "Tier" column - TODO confirm): group.z <- group.z[order(group.z[["Tier"]]), ]
 
     ##### Otherwise order table by the highest absolute values for Patient vs [comp_cancer] difference
   } else if (length(genes) > 0) {

diff --git a/R/sample_data.R b/R/sample_data.R
@@ -1,7 +1,7 @@
 #' Read Sample Data
 #'
 #' Reads sample data, including Arriba fusions, Arriba plots, Salmon counts,
-#' DRAGEN fusions, DRAGEN mapping metrics, Manta SVs, PURPLE CNVs, and PCGR SNVs.
+#' DRAGEN fusions, DRAGEN mapping metrics, SVs, PURPLE CNVs, and PCGR SNVs.
 #'
 #' @param p RNAsum params list.
 #' @param results_dir Directory to output extracted Arriba PNGs to (created
@@ -80,8 +80,8 @@ read_sample_data <- function(p, results_dir, tx2gene = NULL) {
 
 #' Read WGS Data
 #'
-#' Reads WGS data, including PCGR `tiers.tsv`, PURPLE `cnv.gene.tsv`, and Manta
-#' `manta.tsv`. If the file path has been specified in the RNAsum params and is
+#' Reads WGS data, including PCGR `tiers.tsv`, PURPLE `cnv.gene.tsv`, and SV
+#' `sv.prioritised.tsv`. If the file path has been specified in the RNAsum params and is
 #' valid, it is returned. As a fallback, if the umccrise directory param has
 #' been specified, then there is an attempt to detect the file pattern in there.
 #'
@@ -95,8 +95,8 @@ read_sample_data <- function(p, results_dir, tx2gene = NULL) {
 #'     "TEST-somatic.pcgr.snvs_indels.tiers.tsv",
 #'     package = "RNAsum"
 #'   ),
-#'   manta_tsv = system.file(
-#'     "rawdata/test_data/umccrised/test_sample_WGS/structural/TEST-prioritize-manta.tsv",
+#'   sv_tsv = system.file(
+#'     "rawdata/test_data/test_sample_WGS/structural/TEST.sv.prioritised.tsv",
 #'     package = "RNAsum"
 #'   )
 #' )
@@ -143,16 +143,16 @@ read_wgs_data <- function(p) {
     nm = "purple_gene_tsv", func = ppl_cnv_som_gene_read
   )
 
-  manta_tsv <- .read(
+  sv_tsv <- .read(
     p = p,
-    subdir = "structural", pat = "manta\\.tsv$",
-    nm = "manta_tsv", func = sv_prioritize_old
+    subdir = "structural", pat = "prioritised\\.tsv$",
+    nm = "sv_tsv", func = sv_prioritize
   )
 
   list(
     pcgr_tiers_tsv = pcgr_tiers_tsv,
     purple_gene_tsv = purple_gene_tsv,
-    manta_tsv = manta_tsv
+    sv_tsv = sv_tsv
   )
 }
 
@@ -212,12 +212,12 @@ fusions_summary <- function(tbl = NULL) {
   res
 }
 
-#' Get Manta SV Summary
+#' Get SV Summary
 #'
-#' @param tbl Tibble with melted SVs from umccrise, containing 'Genes' column.
+#' @param tbl Tibble with melted SVs from sash, containing 'Genes' column.
 #' @return Character vector of Genes.
 #' @export
-sv_manta_summary <- function(tbl) {
+sv_summary <- function(tbl) {
   assertthat::assert_that(
     inherits(tbl, "data.frame"), (c("Genes") %in% names(tbl))
   )

diff --git a/R/sv.R b/R/sv.R
@@ -1,15 +1,15 @@
-#' Process Manta Object
+#' Process SV Object
 #'
-#' @param manta_tsv_obj Manta list object read via `sv_prioritize_old`.
+#' @param sv_tsv_obj SV list object read via `sv_prioritize`.
 #'
 #' @return List with:
-#' - total Manta (unmelted) variants
+#' - total SV (unmelted) variants
 #' - tibble with melted variants
 #' - genes involved in multi-gene events
 #' @export
-manta_process <- function(manta_tsv_obj) {
-  total_variants <- manta_tsv_obj[["total_variants"]]
-  melted <- manta_tsv_obj[["melted"]] |>
+sv_process <- function(sv_tsv_obj) {
+  total_variants <- sv_tsv_obj[["total_variants"]]
+  melted <- sv_tsv_obj[["melted"]] |>
     dplyr::mutate(
       is_fusion = grepl("&", .data$Genes),
       fusion_genes = dplyr::if_else(.data$is_fusion, .data$Genes, "")
@@ -38,146 +38,53 @@ manta_process <- function(manta_tsv_obj) {
   )
 }
 
-sv_prioritize_old <- function(sv_file) {
-  # grab the dplyr pipe (for now!)
-  `%>%` <- dplyr::`%>%`
-  subset_genes <- function(genes, ind) {
-    genes |>
-      stringr::str_split("&") %>%
-      purrr::map(
-        ~ .[ind] %>%
-          replace("", NA) %>%
-          .[!is.na(.)]
-      ) %>%
-      purrr::map_chr(~ ifelse(length(.) > 0, stringr::str_c(., collapse = "&"), ""))
-  }
-
-  format_val <- function(val, is_pct = F) {
-    ifelse(!is.na(val),
-      format(val, digits = 1) %>% stringr::str_c(ifelse(is_pct, "%", "")),
-      NA
-    )
-  }
-
-  split_sv_field <- function(.data, field, is_pct = F) {
-    f_q <- rlang::enquo(field)
-    f_str <- rlang::quo_name(f_q)
-    f1_str <- stringr::str_c(f_str, "1")
-    f2_str <- stringr::str_c(f_str, "2")
-    f1_q <- rlang::sym(f1_str)
-    f2_q <- rlang::sym(f2_str)
-    .data %>%
-      tidyr::separate(col = !!f_q, into = c(f1_str, f2_str), sep = ",", fill = "right") %>%
-      dplyr::mutate(
-        !!f1_q := as.double(!!f1_q) * ifelse(is_pct, 100, 1),
-        !!f2_q := as.double(!!f2_q) * ifelse(is_pct, 100, 1),
-        !!f_q := (!!f1_q + ifelse(is.na(!!f2_q), !!f1_q, !!f2_q)) / 2,
-        !!f_q := format_val(!!f_q, is_pct),
-        !!f1_q := format_val(!!f1_q, is_pct),
-        !!f2_q := format_val(!!f2_q, is_pct)
-      )
-  }
+#' Read SV TSV and Extract Affected Genes
+#'
+#' Reads a tab-separated SV table and returns it in "melted" (one gene per
+#' row) form. Two layouts are handled:
+#' - a table that already has a `Gene` column (in any position), which is
+#'   returned as read;
+#' - a table with an `annotation` column (assumed to be internal, e.g. sash,
+#'   output) holding comma-separated annotations, each with `|`-separated
+#'   fields `Event|Effect|Genes|Transcript|Detail|Tier`.
+#'
+#' For the `annotation` layout, `&`-joined genes are kept together when the
+#' effect contains `gene_fusion`, and split into one row per gene otherwise.
+#' The result is de-duplicated and sorted by `Gene`.
+#'
+#' @param sv_file Path to the SV TSV file.
+#'
+#' @return `NULL` if the file has at most one line (i.e. no data rows),
+#' otherwise a list with:
+#' - `melted`: tibble with a `Gene` column
+#' - `total_variants`: number of rows read from the file
+#' @noRd
+sv_prioritize <- function(sv_file) {
 
+  # Check file is not empty (header only or nothing at all)
   sv_all <- NULL
   if (length(readLines(con = sv_file, n = 2)) <= 1) {
     return(sv_all)
   }
 
-  col_types_tab <- dplyr::tribble(
-    ~Column, ~Description, ~Type,
-    "caller", "Manta SV caller", "c",
-    "sample", "Tumor sample name", "c",
-    "chrom", "CHROM column in VCF", "c",
-    "start", "POS column in VCF", "i",
-    "end", "INFO/END: End position of the variant described in this record", "i",
-    "svtype", "INFO/SVTYPE: Type of structural variant", "c",
-    "split_read_support", "FORMAT/SR of tumor sample: Split reads for the ref and alt alleles in the order listed, for reads where P(allele|read)>0.999", "c",
-    "paired_support_PE", "FORMAT/PE of tumor sample: ??", "c",
-    "paired_support_PR", "FORMAT/PR of tumor sample: Spanning paired-read support for the ref and alt alleles in the order listed, for reads where P(allele|read)>0.999", "c",
-    "AF_BPI", "INFO/BPI_AF: AF at each breakpoint (so AF_BPI1,AF_BPI2)", "c",
-    "somaticscore", "INFO/SOMATICSCORE: Somatic variant quality score", "i",
-    "tier", "INFO/SV_TOP_TIER (or 4 if missing): Highest priority tier for the effects of a variant entry", "c",
-    "annotation", "INFO/SIMPLE_ANN: Simplified structural variant annotation: 'SVTYPE | EFFECT | GENE(s) | TRANSCRIPT | PRIORITY (1-4)'", "c",
-    "AF_PURPLE", "INFO/PURPLE_AF: AF at each breakend (purity adjusted) (so AF_PURPLE1,AF_PURPLE2)", "c",
-    "CN_PURPLE", "INFO/PURPLE_CN: CN at each breakend (purity adjusted) (so CN_PURPLE1,CN_PURPLE2)", "c",
-    "CN_change_PURPLE", "INFO/PURPLE_CN_CHANGE: change in CN at each breakend (purity adjusted) (so CN_change_PURPLE1,CN_change_PURPLE2)", "c",
-    "Ploidy_PURPLE", "INFO/PURPLE_PLOIDY: Ploidy of variant (purity adjusted)", "d",
-    "PURPLE_status", "INFERRED if FILTER=INFERRED, or RECOVERED if has INFO/RECOVERED, else blank. INFERRED: Breakend inferred from copy number transition", "c",
-    "START_BPI", "INFO/BPI_START: BPI adjusted breakend location", "i",
-    "END_BPI", "INFO/BPI_END: BPI adjusted breakend location", "i",
-    "ID", "ID column in VCF", "c",
-    "MATEID", "INFO/MATEID: ID of mate breakend", "c",
-    "ALT", "ALT column in VCF", "c"
-  )
-  ctypes <- paste(col_types_tab$Type, collapse = "")
-  sv_all <- readr::read_tsv(sv_file, col_names = TRUE, col_types = ctypes) |>
-    dplyr::select(-c("caller", "sample")) |>
-    split_sv_field("AF_BPI", is_pct = T) |>
-    split_sv_field("AF_PURPLE", is_pct = T) |>
-    split_sv_field("CN_PURPLE") |>
-    split_sv_field("CN_change_PURPLE") |>
-    dplyr::mutate(
-      Ploidy_PURPLE = as.double(.data$Ploidy_PURPLE),
-      Ploidy_PURPLE = format(.data$Ploidy_PURPLE, nsmall = 2)
-    ) |>
-    tidyr::separate_wider_delim(cols = "split_read_support", names = c("SR (ref)", "SR (alt)"), delim = ",", too_few = "align_start") |>
-    tidyr::separate_wider_delim(cols = "paired_support_PR", names = c("PR (ref)", "PR (alt)"), delim = ",", too_few = "align_start") |>
-    tidyr::separate_wider_delim(cols = "paired_support_PE", names = c("PE (ref)", "PE (alt)"), delim = ",", too_few = "align_start") |>
-    dplyr::mutate(
-      SR = as.integer(.data$`SR (alt)`), PR = as.integer(.data$`PR (alt)`), PE = as.integer(.data$`PE (alt)`)
-    )
+  sv_all <- readr::read_tsv(sv_file, col_names = TRUE)
   total_variants <- nrow(sv_all)
-  sv_all <- sv_all |>
-    # Unpack multiple annotations per region
-    dplyr::mutate(annotation = strsplit(.data$annotation, ",")) |>
-    tidyr::unnest("annotation") |>
-    tidyr::separate_wider_delim(
-      cols = "annotation", delim = "|",
-      names = c("Event", "Effect", "Genes", "Transcript", "Detail", "Tier"), too_few = "align_start"
-    ) |>
-    dplyr::mutate(
-      start = format(.data$start, big.mark = ",", trim = T),
-      end = format(.data$end, big.mark = ",", trim = T),
-      location = stringr::str_c(.data$chrom, ":", .data$start, sep = ""),
-      location = ifelse(is.na(.data$end), .data$location, stringr::str_c(.data$location))
-    ) |>
-    dplyr::mutate(
-      Gene = subset_genes(.data$Genes, c(1, 2)),
-      Gene = ifelse((stringr::str_split(.data$Genes, "&") |> purrr::map_int(base::length)) > 2,
-        stringr::str_c(.data$Gene, "...", sep = ", "),
-        .data$Gene
-      ),
-      `Other affected genes` = subset_genes(.data$Genes, -c(1, 2)) |> stringr::str_replace_all("&", ", "),
-      Gene = ifelse(stringr::str_detect(.data$Effect, "gene_fusion"),
-        .data$Gene,
-        .data$Gene |> stringr::str_replace_all("&", ", ")
-      )
-    ) |>
-    tidyr::separate_wider_delim(
-      cols = "Effect", delim = "&", names = c("Effect", "Other effects"),
-      too_few = "align_start", too_many = "merge"
-    ) |>
-    dplyr::select(
-      Tier = "tier", Event = "svtype", Genes = "Gene", Effect = "Effect",
-      Detail = "Detail", Location = "location", AF = "AF_PURPLE", `CN chg` = "CN_change_PURPLE",
-      "SR", "PR", CN = "CN_PURPLE", Ploidy = "Ploidy_PURPLE", "PURPLE_status",
-      "SR (ref)", "PR (ref)", "PE", "PE (ref)", `Somatic score` = "somaticscore",
-      "Transcript", "Other effects", "Other affected genes",
-      `AF at breakpoint 1` = "AF_PURPLE1", `AF at breakpoint 2` = "AF_PURPLE2",
-      `CN at breakpoint 1` = "CN_PURPLE1", `CN at breakpoint 2` = "CN_PURPLE2",
-      `CN change at breakpoint 1` = "CN_change_PURPLE1",
-      `CN change at breakpoint 2` = "CN_change_PURPLE2",
-      `AF before adjustment, bp 1` = "AF_BPI1",
-      `AF before adjustment, bp 2` = "AF_BPI2"
-    ) |>
-    # filter out empty gene rows
-    dplyr::filter(.data$Genes != "") |>
-    dplyr::distinct() |>
-    dplyr::arrange(.data$Tier, .data$Effect, dplyr::desc(.data$AF), .data$Genes)
-  total_melted <- nrow(sv_all)
-  return(list(
-    melted = sv_all,
-    total_variants = total_variants,
-    total_melted = total_melted
-  ))
+
+  sv_cols <- colnames(sv_all)
+
+  # A user-provided table already carries a `Gene` column and is passed through
+  # untouched. The column may sit in any position: previously a `Gene` column
+  # that was not first fell through every branch and the function silently
+  # returned NULL.
+  if (!"Gene" %in% sv_cols) {
+    # Otherwise assume it's an internal input, which must provide `annotation`.
+    if (!"annotation" %in% sv_cols) {
+      stop(
+        "SV file must contain either a 'Gene' or an 'annotation' column: ",
+        sv_file,
+        call. = FALSE
+      )
+    }
+
+    # Unpack multiple annotations per region
+    sv_all <- sv_all |>
+      dplyr::select("annotation") |>
+      dplyr::mutate(annotation = strsplit(.data$annotation, ",")) |>
+      tidyr::unnest("annotation") |>
+      tidyr::separate_wider_delim(
+        cols = "annotation", delim = "|",
+        names = c("Event", "Effect", "Genes", "Transcript", "Detail", "Tier"), too_few = "align_start"
+      ) |>
+      dplyr::select("Effect", "Genes") |>
+      dplyr::mutate(
+        # if gene_fusion, keep the '&'-joined genes as-is; otherwise split them
+        # so that each gene ends up on its own row after unnesting
+        gene_fusion_effect = grepl("gene_fusion", .data$Effect),
+        Gene = ifelse(
+          .data$gene_fusion_effect,
+          .data$Genes,
+          strsplit(.data$Genes, "&")
+        )
+      ) |>
+      dplyr::select("Gene") |>
+      tidyr::unnest_longer("Gene") |>
+      dplyr::distinct() |>
+      dplyr::arrange(.data$Gene)
+  }
+
+  # Single exit point for both supported layouts
+  list(
+    melted = sv_all,
+    total_variants = total_variants
+  )
+}
+