From 705a0fb0f5df023ff87103754db5d8c3112914f1 Mon Sep 17 00:00:00 2001 From: Nicolai-vKuegelgen Date: Tue, 22 Oct 2024 16:11:51 +0200 Subject: [PATCH] - Rename highimpact and highlight to stemcell_hotspot and cancer_gene - add dosage_sensitive_gene annotation - change scoring to score each category separately (not score once per gene) --- .../control_files/allowedvalues_config.yaml | 7 +- .../control_files/default_config.yaml | 18 +++- .../scripts/R/R_plotting_functions.R | 31 ++++-- stemcnv_check/scripts/R/R_table_functions.R | 84 ++++++++------- stemcnv_check/scripts/R/helper_functions.R | 10 +- .../R/processCNVs_annotate_check-score.R | 50 ++++----- .../R/processCNVs_annotate_impact_lists.R | 74 ++++++++++++- stemcnv_check/scripts/R/vcf_io_functions.R | 12 ++- stemcnv_check/scripts/process_CNV_calls.R | 13 ++- stemcnv_check/scripts/report_template.Rmd | 45 +++++--- stemcnv_check/scripts/summarise_stats.R | 4 +- ...ots.tsv => genelist-stemcell-hotspots.tsv} | 0 tests/testthat/test_helper_functions.R | 8 +- ...est_processCNVs_annotate_cnv-check-score.R | 100 +++++++++++------- .../testthat/test_processCNVs_impact_lists.R | 86 ++++++++------- tests/testthat/test_report_table_functions.R | 75 +++++++++---- tests/testthat/test_vcf_io_functions.R | 10 +- 17 files changed, 405 insertions(+), 222 deletions(-) rename stemcnv_check/supplemental-files/{HighImpact-stemcell-hotspots.tsv => genelist-stemcell-hotspots.tsv} (100%) diff --git a/stemcnv_check/control_files/allowedvalues_config.yaml b/stemcnv_check/control_files/allowedvalues_config.yaml index 1ae446f..ce55dae 100644 --- a/stemcnv_check/control_files/allowedvalues_config.yaml +++ b/stemcnv_check/control_files/allowedvalues_config.yaml @@ -119,10 +119,13 @@ settings: gene_overlap: exclude_gene_type_regex: list__str include_only_these_gene_types: list__str - high_impact_list: str - highlight_list: str + stemcell_hotspot_list: str + cancer_gene_list: str Check_score_values: + pHaplo_threshold: float_le1_ge0 + 
pTriplo_threshold: float_le1_ge0 + dosage_sensitive_gene: float_ge0 any_roi_hit: float_ge0 any_other_gene: float_ge0 large_CN_size_modifier: float_ge1 diff --git a/stemcnv_check/control_files/default_config.yaml b/stemcnv_check/control_files/default_config.yaml index 9d749ee..3fa93ed 100644 --- a/stemcnv_check/control_files/default_config.yaml +++ b/stemcnv_check/control_files/default_config.yaml @@ -226,23 +226,31 @@ settings: # description & description_doi will be used to display extra info in the report # mapping can be 'gene_name', 'gband', and 'position' and should describe the hotspot # call_type can be 'any', 'gain', 'loss' or 'LOH' - high_impact_list: '__inbuilt__/supplemental-files/HighImpact-stemcell-hotspots.tsv' - highlight_list: '__inbuilt__/supplemental-files/genelist-cancer-drivers.tsv' + stemcell_hotspot_list: '__inbuilt__/supplemental-files/genelist-stemcell-hotspots.tsv' + cancer_gene_list: '__inbuilt__/supplemental-files/genelist-cancer-drivers.tsv' # also available: '__inbuilt__/supplemental-files/genelist-cancer-hotspots.tsv' # Scoring for CNV and LOH calls # scoring combines a Size based contribution with scores for overlapping annotated regions Check_score_values: - # HighImpact & Highlight scores need to be defined in the respective tables + # stemcell_hotspot & cancer_gene scores need to be defined in the respective tables # CNVs/LOHs get the summed scored of each overlapping annotated gene or region (gband/position) - # genes are only scored _once_ per call, i.e. a gene with both highimpact and highlight match will only + # genes are only scored _once_ per call, i.e. a gene with both stemcell_hotspot and cancer_gene match will only # contribute the higher of the two annotated scores. - # Genes without a score in the genelists are scored as 'any_other_gene' + + # Dosage sensitivity prediction is based on Collins et al. 
2022 (doi:10.1016/j.cell.2022.06.036) + # CNV loss calls overlapping a gene with pHaplo score >= threshold are scored with the 'dosage_sensitive_gene' score + # CNV gain calls are respectively scored for the pTriplo score + pHaplo_threshold: 0.86 + pTriplo_threshold: 0.94 + dosage_sensitive_gene: 5 + # Genes without any score from the hotspot lists or dosage sensitivity are scored as 'any_other_gene' any_other_gene: 0.2 # Calls overlapping any sample defined ROI get this one-time score any_roi_hit: 50 # CNVs with a large CN (<1 or >3) have their size contribution multiplied by this factor large_CN_size_modifier: 1.5 + ##!advanced Precision: diff --git a/stemcnv_check/scripts/R/R_plotting_functions.R b/stemcnv_check/scripts/R/R_plotting_functions.R index f4821dd..0afabc0 100644 --- a/stemcnv_check/scripts/R/R_plotting_functions.R +++ b/stemcnv_check/scripts/R/R_plotting_functions.R @@ -73,8 +73,9 @@ make_LRR_BAF_plots <- function( filter_by_overlaps(GRanges(seqnames = chr, strand = '*', ranges = IRanges(start = call.row$start, end = call.row$end))) %>% as_tibble() - high_impact_list <- call.row$HighImpact %>% str_split('\\|') %>% unlist() - highlight_list <- call.row$Highlight %>% str_split('\\|') %>% unlist() + stemcell_hotspot_list <- call.row$stemcell_hotspot %>% str_split('\\|') %>% unlist() + dosage_sensitive_gene_list <- call.row$dosage_sensitive_gene %>% str_split('\\|') %>% unlist() + cancer_gene_list <- call.row$cancer_gene %>% str_split('\\|') %>% unlist() gene.data <- gr_genes %>% filter_by_overlaps(GRanges(seqnames = chr, strand = '*', ranges = IRanges(start = win_start, end = win_end))) %>% as_tibble() %>% @@ -83,8 +84,9 @@ make_LRR_BAF_plots <- function( y_pos = ifelse(strand == '+', 1, 0), Sample_Name = paste(sample_headers, collapse = '---'), direct_hit = gene_id %in% direct_genes$gene_id, - high_impact = gene_name %in% high_impact_list, - highlight = gene_name %in% highlight_list, + stemcell_hotspot = gene_name %in% stemcell_hotspot_list, + 
dosage_sensitive_gene = gene_name %in% dosage_sensitive_gene_list, + cancer_gene = gene_name %in% cancer_gene_list, ) %>% separate_rows(Sample_Name, sep = '---') %>% # Need to ensure table contains reference so everything is properly facet_wrapped @@ -103,8 +105,9 @@ make_LRR_BAF_plots <- function( y_pos = 0, Sample_Name = paste(sample_headers, collapse = '---'), color = case_when( - str_detect(section_name, paste(high_impact_list, collapse = '|')) ~ 'red', - str_detect(section_name, paste(highlight_list, collapse = '|')) ~ 'orange', + str_detect(section_name, paste(stemcell_hotspot_list, collapse = '|')) ~ 'red', + # Only the stemcell list contains gbands + # str_detect(section_name, paste(cancer_gene_list, collapse = '|')) ~ 'orange', band_staining == 'gpos100' ~ 'black', band_staining == 'gpos50' ~ 'grey30', band_staining == 'gpos25' ~ 'grey70', @@ -160,8 +163,9 @@ make_LRR_BAF_plots <- function( aes( x = x_pos, y = y_pos, width = width, height = .9, fill = case_when( - high_impact ~ 'red', - highlight ~ 'orange', + stemcell_hotspot ~ 'red', + dosage_sensitive_gene ~ 'orange', + cancer_gene ~ 'orange', direct_hit ~ 'black', TRUE ~ 'grey50' ) @@ -275,10 +279,17 @@ make_LRR_BAF_plots <- function( gene.data <- gene.data %>% filter(!is.na(x_pos)) %>% - dplyr::select(seqnames, start, end, width, strand, high_impact, highlight, direct_hit, gene_name, gene_type, gene_id) %>% + dplyr::select( + seqnames, start, end, width, strand, stemcell_hotspot, dosage_sensitive_gene, cancer_gene, + direct_hit, gene_name, gene_type, gene_id + ) %>% mutate(CNVtype = as.character(call.row$CNV_type)) %>% unique() - list('gg' = gg, 'genes' = gene.data, 'hotspots' = c(high_impact_list, highlight_list) %>% na.omit()) + list( + 'gg' = gg, + 'genes' = gene.data, + 'hotspots' = c(stemcell_hotspot_list, dosage_sensitive_gene_list, cancer_gene_list) %>% na.omit() + ) } diff --git a/stemcnv_check/scripts/R/R_table_functions.R b/stemcnv_check/scripts/R/R_table_functions.R index e171d0b..5501ebe 
100644 --- a/stemcnv_check/scripts/R/R_table_functions.R +++ b/stemcnv_check/scripts/R/R_table_functions.R @@ -145,14 +145,14 @@ summary_table <- function(summary_stat_table, sample_headers, config) { } format_hotspots_to_badge <- function( - hotspot_vec, CNVtype_vec, hotspot_table, listname = 'high_impact', include_hover = TRUE + hotspot_vec, CNVtype_vec, hotspot_table, listname = 'stemcell_hotspot', include_hover = TRUE ) { - if (listname == "high_impact") { + if (listname == "stemcell_hotspot") { shorthand <- 'HI' - } else if (listname == "highlight") { + } else if (listname %in% c("cancer_gene", "dosage_sensitive_gene")) { shorthand <- 'HL' } else { - stop(str_glue("Unsupported list type '{listname}', only 'high_impact' and 'highlight' are defined")) + stop(str_glue("Unsupported list type '{listname}', only 'stemcell_hotspot', 'dosage_sensitive_gene' and 'cancer_gene' are defined")) } hotspot_table.any <- hotspot_table %>% @@ -206,7 +206,9 @@ format_hotspots_to_badge <- function( } } -CNV_table_output <- function(tb, plotsection, high_impact_tb, highlight_tb, gr_info, report_config, caption = NULL) { +CNV_table_output <- function( + tb, plotsection, stemcell_hotspot_tb, dosage_sensitive_gene_tb, cancer_gene_tb, gr_info, report_config, caption = NULL +) { always_include <- report_config$call.data.and.plots[[plotsection]]$always_include # Reorder & subset columns tb <- tb %>% @@ -226,8 +228,9 @@ CNV_table_output <- function(tb, plotsection, high_impact_tb, highlight_tb, gr_i ) ), Precision_Estimate = ifelse(is.na(Precision_Estimate), '-', as.character(Precision_Estimate)), - HighImpact = map2_chr(HighImpact, CNV_type, \(hi,c) format_hotspots_to_badge(hi, c, high_impact_tb, 'high_impact')), - Highlight = map2_chr(Highlight, CNV_type, \(hi,c) format_hotspots_to_badge(hi, c, highlight_tb, 'highlight')), + stemcell_hotspot = map2_chr(stemcell_hotspot, CNV_type, \(hi,c) format_hotspots_to_badge(hi, c, stemcell_hotspot_tb, 'stemcell_hotspot')), + dosage_sensitive_gene 
= map2_chr(dosage_sensitive_gene, CNV_type, \(hi,c) format_hotspots_to_badge(hi, c, dosage_sensitive_gene_tb, 'dosage_sensitive_gene')), + cancer_gene = map2_chr(cancer_gene, CNV_type, \(hi,c) format_hotspots_to_badge(hi, c, cancer_gene_tb, 'cancer_gene')), genome_bands = pmap_chr(., \(chrom, start, end, ...) { gr_info %>% filter_by_overlaps(GRanges(seqnames = chrom, strand = '*', ranges = IRanges(start, end))) %>% @@ -248,9 +251,9 @@ CNV_table_output <- function(tb, plotsection, high_impact_tb, highlight_tb, gr_i Plot, Call_label, Check_Score, CNV_type, chrom, Size, genome_bands, start, end, #invis 10-11 - CNV_caller, HighImpact, Highlight, ROI_hits, + CNV_caller, stemcell_hotspot, dosage_sensitive_gene, cancer_gene, ROI_hits, Precision_Estimate, probe_coverage_gap, high_probe_density, - # invis: 19++ + # invis: 20++ copynumber, LRR, n_probes, n_uniq_probes, #n_premerged_calls, caller_confidence, caller_merging_coverage, Gap_percent ) @@ -264,7 +267,7 @@ CNV_table_output <- function(tb, plotsection, high_impact_tb, highlight_tb, gr_i 'Number of the CNV call, sorted by descending Check_Score', 'Link to the plot of the CNV call\\nNote: For the Top20/critical CNVs clicking on the link will switch the active plot below. 
For other CNVs it will open the plot in a new browser tab.', 'Designation label for the CNV call, (Critical, Reportable, Reference genotype, ROI)', - 'Check_Score of the CNV call, calculated based on size, overlap high impact or highlight list, or other genes', + 'Check_Score of the CNV call, calculated based on size, overlap with stemcell_hotspot, dosage_sensivity or cancer_gene list, or other genes', 'Type of CNV call (gain, loss, LOH)', 'Chromosome of the CNV call', 'Size of the CNV call (in base pairs)', @@ -272,8 +275,9 @@ CNV_table_output <- function(tb, plotsection, high_impact_tb, highlight_tb, gr_i 'Start position of the CNV call', 'End position of the CNV call', 'Caller tools detecting this CNV call', - 'High impact hotspots overlapping with this CNV call', - 'Highlight hotspots overlapping with this CNV call', + 'Stemcell hotspots overlapping with this CNV call', + 'Dosage sensitive genes overlapping with this CNV call', + 'Cancer genes overlapping with this CNV call', 'Regions of interest overlapping with this CNV call', #FIXME (future): add a doi for precision benchmark once available 'Precision estimate of the CNV call, based on internal benchmarking', @@ -302,7 +306,7 @@ CNV_table_output <- function(tb, plotsection, high_impact_tb, highlight_tb, gr_i buttons = c('colvis', 'copy', 'csv', 'excel', 'print'), columnDefs = list( #This uses 0-indexing vs the usual R 1-indexing - list(targets = c(0:2,10:11,19:(ncol(tb)-1)), visible = FALSE) + list(targets = c(0:2,10:11,20:(ncol(tb)-1)), visible = FALSE) ) ), callback = JS( @@ -321,7 +325,7 @@ CNV_table_output <- function(tb, plotsection, high_impact_tb, highlight_tb, gr_i select( CNV_type, Check_Score, chrom, start, end, Size, genome_bands, CNV_caller, - HighImpact, Highlight, + stemcell_hotspot, dosage_sensitive_gene, cancer_gene, Precision_Estimate, probe_coverage_gap, high_probe_density ) %>% rename_with(format_column_names) @@ -330,7 +334,7 @@ CNV_table_output <- function(tb, plotsection, 
high_impact_tb, highlight_tb, gr_i } gene_table_output <- function( - tb, plotsection, high_impact_tb, highlight_tb, report_config, caption = NULL, extra_cols = c() + tb, plotsection, stemcell_hotspot_tb, dosage_sensitive_gene_tb, cancer_gene_tb, report_config, caption = NULL, extra_cols = c() ) { if (report_config$call.data.and.plots[[plotsection]]$include.gene.table.details == 'Call') { @@ -346,8 +350,12 @@ gene_table_output <- function( tb <- tb %>% mutate( across(any_of(c('direct_hit', 'gene_type', 'strand')), ~ factor(.)), - high_impact = factor(ifelse(high_impact, 'hit', '-'), levels = c('hit', '-')), - highlight = factor(ifelse(highlight, 'hit', '-'), levels = c('hit', '-')), + across( + any_of(c('stemcell_hotspot', 'dosage_sensitive_gene', 'cancer_gene')), + ~ factor(ifelse(., 'yes', '-'), levels = c('yes', '-')) + ), + # stemcell_hotspot = factor(ifelse(stemcell_hotspot, 'hit', '-'), levels = c('hit', '-')), + # cancer_gene = factor(ifelse(cancer_gene, 'hit', '-'), levels = c('hit', '-')), name_is_geneid = str_detect(gene_name, 'ENSG[0-9]{11}'), # REEV: gene_id *should* work, but won't if they are deprectated/not in Annonars REEV = str_glue("{gene_name}"), @@ -355,44 +363,48 @@ gene_table_output <- function( NCBI = ifelse(name_is_geneid, '-', str_glue("{gene_name}")), Ensembl = str_glue("{gene_id}"), #Reformat gene name - gene_name = ifelse( - high_impact == 'hit', - map2_chr(gene_name, CNVtype, \(g, c) format_hotspots_to_badge(g,c, high_impact_tb,'high_impact', FALSE)), - map2_chr(gene_name, CNVtype, \(g, c) format_hotspots_to_badge(g,c, highlight_tb, 'highlight', FALSE)) + gene_name = case_when( + stemcell_hotspot == 'yes' ~ map2_chr(gene_name, CNVtype, \(g, c) format_hotspots_to_badge(g,c, stemcell_hotspot_tb,'stemcell_hotspot', FALSE)), + dosage_sensitive_gene == 'yes' ~ map2_chr(gene_name, CNVtype, \(g, c) format_hotspots_to_badge(g,c, dosage_sensitive_gene_tb, 'dosage_sensitive_gene', FALSE)), + cancer_gene == 'yes' ~ map2_chr(gene_name, CNVtype, 
\(g, c) format_hotspots_to_badge(g,c, cancer_gene_tb, 'cancer_gene', FALSE)), + TRUE ~ gene_name ), ) %>% - arrange(high_impact, highlight, desc(direct_hit), start) %>% + arrange(stemcell_hotspot, cancer_gene, desc(direct_hit), start) %>% select( - gene_name, gene_id, seqnames, start, end, strand, high_impact, highlight, + gene_name, gene_id, seqnames, start, end, strand, stemcell_hotspot, dosage_sensitive_gene, cancer_gene, any_of(extra_cols), REEV, GTex, NCBI, Ensembl ) if (params$out_format == 'html') { - colors1 <- ifelse(tb$high_impact == 'hit', 'red' , 'white') - colors2 <- ifelse(tb$highlight == 'hit', 'orange', 'white') + colors1 <- ifelse(tb$stemcell_hotspot == 'yes', 'red' , 'white') + colors2 <- ifelse(tb$dosage_sensitive_gene == 'yes', 'orange', 'white') + colors3 <- ifelse(tb$cancer_gene == 'yes', 'orange', 'white') dt <- datatable( tb, rownames = FALSE, escape = FALSE, options = list( dom = 'Bftilp', pageLength = 10, extensions = c('Buttons'), buttons = c('colvis', 'copy', 'csv', 'excel', 'print'), - columnDefs = list(list(targets = 1:7, visible = FALSE)) + columnDefs = list(list(targets = 1:8, visible = FALSE)) ) ) %>% - formatStyle('high_impact', backgroundColor = styleRow(1:nrow(tb), colors1)) %>% # textAlign = 'center' - formatStyle('highlight', backgroundColor = styleRow(1:nrow(tb), colors2)) + formatStyle('stemcell_hotspot', backgroundColor = styleRow(1:nrow(tb), colors1)) %>% # textAlign = 'center' + formatStyle('dosage_sensitive_gene', backgroundColor = styleRow(1:nrow(tb), colors2)) %>% + formatStyle('cancer_gene', backgroundColor = styleRow(1:nrow(tb), colors3)) return(dt) } else { - tb <- tb %>% select(gene_name, gene_id, high_impact, highlight, any_of(extra_cols)) + tb <- tb %>% select(gene_name, gene_id, stemcell_hotspot, dosage_sensitive_gene, cancer_gene, any_of(extra_cols)) return(kable(tb, caption = caption)) } } hotspot_table_output <- function( - hotspots, cnv_type, plotsection, high_impact_tb, highlight_tb, report_config, 
out_format, caption = NULL + hotspots, cnv_type, plotsection, stemcell_hotspot_tb, dosage_sensitive_gene_tb, cancer_gene_tb, report_config, out_format, caption = NULL ){ tb <- bind_rows( - high_impact_tb, - highlight_tb + stemcell_hotspot_tb, + dosage_sensitive_gene_tb, + cancer_gene_tb ) %>% filter(hotspot %in% hotspots & call_type %in% c('any', cnv_type)) @@ -403,10 +415,10 @@ hotspot_table_output <- function( dplyr::rename(description = description_htmllinks) %>% select(hotspot, call_type, list_name, description, check_score, any_of(colnames(tb))) %>% mutate( - hotspot = ifelse( - list_name %in% unique(high_impact_tb$list_name), - map2_chr(hotspot, call_type, \(g, c) format_hotspots_to_badge(g,c, high_impact_tb,'high_impact', FALSE)), - map2_chr(hotspot, call_type, \(g, c) format_hotspots_to_badge(g,c, highlight_tb, 'highlight', FALSE)) + hotspot = case_when( + list_name == unique(stemcell_hotspot_tb$list_name) ~ map2_chr(hotspot, call_type, \(g, c) format_hotspots_to_badge(g,c, stemcell_hotspot_tb,'stemcell_hotspot', FALSE)), + list_name == unique(dosage_sensitive_gene_tb$list_name) ~ map2_chr(hotspot, call_type, \(g, c) format_hotspots_to_badge(g,c, dosage_sensitive_gene_tb, 'dosage_sensitive_gene', FALSE)), + list_name == unique(cancer_gene_tb$list_name) ~ map2_chr(hotspot, call_type, \(g, c) format_hotspots_to_badge(g,c, cancer_gene_tb, 'cancer_gene', FALSE)) ), description = str_replace_all(description, ' ', '
') ) %>% diff --git a/stemcnv_check/scripts/R/helper_functions.R b/stemcnv_check/scripts/R/helper_functions.R index 27897b5..9bbf953 100644 --- a/stemcnv_check/scripts/R/helper_functions.R +++ b/stemcnv_check/scripts/R/helper_functions.R @@ -132,12 +132,12 @@ load_genomeInfo <- function(ginfo_file, config, target_style='UCSC') { gr_info } -load_hotspot_table <- function(config, table = 'HighImpact') { +load_hotspot_table <- function(config, table = 'stemcell_hotspot') { - if (table == 'HighImpact') { - filename <- config$settings$CNV_processing$gene_overlap$high_impact_list - } else if (table == 'Highlight') { - filename <- config$settings$CNV_processing$gene_overlap$highlight_list + if (table == 'stemcell_hotspot') { + filename <- config$settings$CNV_processing$gene_overlap$stemcell_hotspot_list + } else if (table == 'cancer_gene') { + filename <- config$settings$CNV_processing$gene_overlap$cancer_gene_list } else { stop('Invalid table name') } diff --git a/stemcnv_check/scripts/R/processCNVs_annotate_check-score.R b/stemcnv_check/scripts/R/processCNVs_annotate_check-score.R index 6976163..359e6bd 100644 --- a/stemcnv_check/scripts/R/processCNVs_annotate_check-score.R +++ b/stemcnv_check/scripts/R/processCNVs_annotate_check-score.R @@ -13,7 +13,7 @@ annotate_gene_overlaps <- function(gr, gr_genes) { } -annotate_cnv.check.score <- function(tb, high_impact_gr, highlight_gr, check_scores) { +annotate_cnv.check.score <- function(tb, stemcell_hotspots_gr, dosage_sensitive_gene_gr, cancer_genes_gr, check_scores) { tb %>% rowwise() %>% @@ -33,30 +33,31 @@ annotate_cnv.check.score <- function(tb, high_impact_gr, highlight_gr, check_sco ) + # Base score for any ROI hit ifelse(!is.na(ROI_hits), check_scores$any_roi_hit, 0) + - # Per gene/region score: - # - scores per non-gene HI / HL - (high_impact_gr %>% as_tibble() %>% - filter(mapping != 'gene_name' & hotspot %in% unlist(str_split(high_impact_hits, '\\|'))) %>% + # Scores for each hotspot hit/dosage gene/cancer gene 
(cumulative, even if it's the same gene) + (stemcell_hotspots_gr %>% as_tibble() %>% + filter(hotspot %in% unlist(str_split(stemcell_hotspot, '\\|'))) %>% + pull(check_score) %>% sum()) + + # dosage_sensitive & cancer lists should only have genes, but are included anyway + (dosage_sensitive_gene_gr %>% as_tibble() %>% + filter(hotspot %in% unlist(str_split(dosage_sensitive_gene, '\\|'))) %>% + pull(check_score) %>% sum()) + + (cancer_genes_gr %>% as_tibble() %>% + filter(hotspot %in% unlist(str_split(cancer_gene, '\\|'))) %>% pull(check_score) %>% sum()) + - (highlight_gr %>% as_tibble() %>% - filter(mapping != 'gene_name' & hotspot %in% unlist(str_split(highlight_hits, '\\|'))) %>% - pull(check_score) %>% sum()) + - # - score all genes that aren't also an ROI, use max score of matching overlaps (but only one score per gene) + # Score all remaining (not hotspot/dosage/cancer and not ROI) # dplyr will generate a bunch of warnings for calls without any genes - suppressWarnings(bind_rows( - high_impact_gr %>% as_tibble(), - highlight_gr %>% as_tibble(), - str_split(overlapping_genes, '\\|') %>% unlist() %>% - as_tibble() %>% dplyr::rename(hotspot = value) %>% - mutate(mapping = 'gene_name', check_score = check_scores$any_other_gene) - ) %>% - filter( - hotspot %in% unlist(str_split(overlapping_genes, '\\|')) & - !is.na(hotspot) & - mapping == 'gene_name' + str_split(overlapping_genes, '\\|') %>% unlist() %>% + as_tibble() %>% dplyr::rename(gene_name = value) %>% + filter( + gene_name %!in% c( + unlist(str_split(stemcell_hotspot, '\\|')), + unlist(str_split(dosage_sensitive_gene, '\\|')), + unlist(str_split(cancer_gene, '\\|')), + unlist(str_split(ROI_hits, '\\|')) + ) ) %>% - group_by(hotspot) %>% summarise(check_score = max(check_score)) %>% - pull(check_score) %>% sum()) + mutate(check_score = check_scores$any_other_gene) %>% + pull(check_score) %>% sum() ) %>% ungroup() } @@ -95,7 +96,8 @@ annotate_call.label <- function(gr.or.tb, call_cat_config) { 
check_score.critical <- ifelse(is.null(call_cat_config$check_score.critical), NA, call_cat_config$check_score.critical) critical_excl <- call_cat_config$filters.exclude.critical check_score.reportable <- ifelse(is.null(call_cat_config$check_score.reportable), NA, call_cat_config$check_score.reportable) - reportable_excl <- call_cat_config$filters.exclude.reportable + check_score.reportable <- ifelse(is.na(check_score.reportable), check_score.critical, call_cat_config$check_score.reportable) + reportable_excl <- call_cat_config$filters.exclude.reportable gr.or.tb %>% as_tibble() %>% @@ -108,8 +110,6 @@ annotate_call.label <- function(gr.or.tb, call_cat_config) { !is.na(ref_cov) ~ 'Reference genotype', check_score >= check_score.critical & !any(filters %in% critical_excl) ~ 'Critical', - check_score >= check_score.critical & - any(filters %in% critical_excl) ~ 'Reportable', check_score >= check_score.reportable & !any(filters %in% reportable_excl) ~ 'Reportable', TRUE ~ NA_character_ diff --git a/stemcnv_check/scripts/R/processCNVs_annotate_impact_lists.R b/stemcnv_check/scripts/R/processCNVs_annotate_impact_lists.R index b8b64c4..6e61ee4 100644 --- a/stemcnv_check/scripts/R/processCNVs_annotate_impact_lists.R +++ b/stemcnv_check/scripts/R/processCNVs_annotate_impact_lists.R @@ -103,9 +103,19 @@ parse_hotspot_table <- function(tb, gr_genes, gr_info) { filter(gene_name %in% sub_tb_name$hotspot) %>% as_tibble() %>% dplyr::rename(hotspot = gene_name) %>% - left_join(sub_tb_name, by = 'hotspot') %>% + mutate(call_type = 'any;loss;gain;LOH') %>% + separate_rows(call_type, sep = ';') %>% + # Specifically merge each call_type + inner_join(sub_tb_name, by = c('hotspot', 'call_type')) %>% + # Remove duplicates (can result from non-unique gene_names) + slice_max(check_score, by = c(hotspot, call_type), with_ties = FALSE) %>% as_granges() #message('parsed gene names') + unmatched_genes <- setdiff(sub_tb_name$hotspot, gr_name$hotspot) + if (length(unmatched_genes) > 0) { + 
message('The following gene names could not be identified in the gtf file (they are likely alternave names):', paste(unmatched_genes, collapse = ', ')) + } + } else { gr_name <- empty_gr } @@ -124,18 +134,76 @@ parse_hotspot_table <- function(tb, gr_genes, gr_info) { bind_ranges(empty_gr, gr_name, gr_pos, gr_gband) } + +get_dosage_sensivity_tb <- function(score_settings) { + + dosage_data <- 'https://zenodo.org/records/6347673/files/Collins_rCNV_2022.dosage_sensitivity_scores.tsv.gz' + doi <- '10.1016/j.cell.2022.06.036' + + tb <- read_tsv(dosage_data, show_col_types = F) %>% + rename_with( + ~str_replace(., '#gene', 'hotspot') %>% + str_replace('pHaplo', 'loss') %>% + str_replace('pTriplo', 'gain') + ) %>% + pivot_longer(cols = -hotspot, names_to = 'call_type', values_to = 'dosage_score') %>% + mutate( + list_name = 'Dosage-sensivity', + mapping = 'gene_name', + check_score = case_when( + call_type == 'loss' & dosage_score >= score_settings$pHaplo_threshold ~ score_settings$dosage_sensitive_gene, + call_type == 'gain' & dosage_score >= score_settings$pTriplo_threshold ~ score_settings$dosage_sensitive_gene, + TRUE ~ NA_integer_ + ), + description = paste0( + 'Gene with predicted dosage sensitivity (', + ifelse(call_type == 'loss', 'haploinsufficiency', 'triplosensitivity'), ')\\n', + 'Source: Collins et al. 
2022 {1}.\\n', + ifelse(call_type == 'loss', 'pHaplo', 'pTriplo'), + ' score: ', round(dosage_score, 3) + ), + description_doi = doi, + ) %>% + filter(!is.na(check_score)) %>% + select(-dosage_score) + + description_html_pattern <- str_replace_all( + tb$description, + '([:,] ?)([^:,]+?)\\{([0-9]+)\\}(?=, ?|\\\\\\\\n|\\\\n|$)', + '\\1\\2' + ) %>% + str_replace_all('\\\\n', ' ') %>% + str_replace_all('\\n', ' ') + + tb$description_htmllinks <- map2_chr( + tb$description_doi, description_html_pattern, + \(doi, pattern) { + args <- doi %>% + str_split(', ?') %>% + # Make the doi text into an actual link + sapply(\(x) paste0('https://doi.org/', x)) %>% + unlist() %>% + set_names(paste0('a', 1:length(.))) + rlang::inject(str_glue(pattern, !!!args)) + } + ) + + tb +} + + annotate_impact_lists <- function(gr, hotspot_gr, list_name) { message('Annotation calls with gene lists') # Make an extra col listing all overlapping directly defined genes - gr@elementMetadata[[paste0(list_name, '_hits')]] <- NA_character_ + gr@elementMetadata[[list_name]] <- NA_character_ ov <- group_by_overlaps(gr, hotspot_gr) if (length(ov) > 0) { ov_hits <- ov %>% mutate(type_check = str_detect(CNV_type, str_replace(call_type, '^any$', '.*'))) %>% filter(type_check) %>% reduce_ranges(hits = paste(unique(sort(hotspot)),collapse = '|')) - gr[ov_hits$query,]@elementMetadata[[paste0(list_name, '_hits')]] <- ov_hits$hits + gr[ov_hits$query,]@elementMetadata[[list_name]] <- ov_hits$hits } return(gr) diff --git a/stemcnv_check/scripts/R/vcf_io_functions.R b/stemcnv_check/scripts/R/vcf_io_functions.R index 4ef774c..a62f629 100644 --- a/stemcnv_check/scripts/R/vcf_io_functions.R +++ b/stemcnv_check/scripts/R/vcf_io_functions.R @@ -84,8 +84,9 @@ static_cnv_vcf_header <- function(toolconfig, extra_annotation = FALSE, INFO = T '##INFO=', '##INFO=', '##INFO=', - '##INFO=', - '##INFO=', + '##INFO=', + '##INFO=', + '##INFO=', '##INFO=', '##INFO=', '##INFO=' @@ -186,7 +187,8 @@ get_fix_section <- function(tb) { 
extra_info_str <- paste( base_info_str, 'Check_Score={Check_Score};Precision_Estimate={Precision_Estimate};Call_label={Call_label}', - 'HighImpact={high_impact_hits};Highlight={highlight_hits};ROI_hits={ROI_hits}', + 'stemcell_hotspot={stemcell_hotspot};dosage_sensitive_gene={dosage_sensitive_gene}', + 'cancer_gene={cancer_gene};ROI_hits={ROI_hits}', 'Gap_percent={Gap_percent};Genes={overlapping_genes}', sep=';' ) @@ -198,7 +200,7 @@ get_fix_section <- function(tb) { # Need to use across + any_of to make this work if the columns aren't there # replace , by | for separator in INFO cols across( - any_of(c("high_impact_hits", "highlight_hits", "ROI_hits", "overlapping_genes")), + any_of(c("stemcell_hotspot", "dosage_sensitive_gene", "cancer_gene", "ROI_hits", "overlapping_genes")), ~ str_replace_all(., ',', '|') ), # round numbers @@ -206,7 +208,7 @@ get_fix_section <- function(tb) { # convert NA or empty string to ".", all columns with possible NA to character across( any_of(c("Check_Score", "Precision_Estimate", "Call_label", - "high_impact_hits", "highlight_hits", + "stemcell_hotspot", "cancer_gene", "dosage_sensitive_gene", "ROI_hits", "Gap_percent", "overlapping_genes")), ~ ifelse(is.na(.) | . 
== "", '.', as.character(.)) ), diff --git a/stemcnv_check/scripts/process_CNV_calls.R b/stemcnv_check/scripts/process_CNV_calls.R index f01d14b..4b60f9f 100644 --- a/stemcnv_check/scripts/process_CNV_calls.R +++ b/stemcnv_check/scripts/process_CNV_calls.R @@ -86,9 +86,11 @@ precision_estimates<- config$settings$CNV_processing$Precision$estimate_values gr_genes <- load_gtf_data(snakemake@params$gtf_file, config, target_chrom_style) gr_info <- load_genomeInfo(snakemake@params$ginfo_file, config, target_chrom_style) -high_impact_gr <- load_hotspot_table(config, 'HighImpact') %>% +stemcell_hotspots_gr <- load_hotspot_table(config, 'stemcell_hotspot') %>% parse_hotspot_table(gr_genes, gr_info) -highlight_gr <- load_hotspot_table(config, 'Highlight') %>% +cancer_genes_gr <- load_hotspot_table(config, 'cancer_gene') %>% + parse_hotspot_table(gr_genes, gr_info) +dosage_sensitive_gene_gr <- get_dosage_sensivity_tb(config$settings$CNV_processing$Check_score_values) %>% parse_hotspot_table(gr_genes, gr_info) array <- sampletable %>% @@ -100,8 +102,9 @@ density_file <- config$array_definition[[array]]$array_density_file cnvs <- cnvs %>% plyranges::select(-LRR) %>% get_median_LRR(snp_vcf_gr) %>% - annotate_impact_lists(high_impact_gr, 'high_impact') %>% - annotate_impact_lists(highlight_gr, 'highlight') %>% + annotate_impact_lists(stemcell_hotspots_gr, 'stemcell_hotspot') %>% + annotate_impact_lists(dosage_sensitive_gene_gr, 'dosage_sensitive_gene') %>% + annotate_impact_lists(cancer_genes_gr, 'cancer_gene') %>% annotate_roi(sample_id, sampletable, gr_genes, gr_info) %>% annotate_gaps( gap_file, @@ -116,7 +119,7 @@ cnvs <- cnvs %>% ) %>% annotate_gene_overlaps(gr_genes) %>% as_tibble() %>% - annotate_cnv.check.score(high_impact_gr, highlight_gr, check_scores) %>% + annotate_cnv.check.score(stemcell_hotspots_gr, dosage_sensitive_gene_gr, cancer_genes_gr, check_scores) %>% annotate_precision.estimates(size_categories, precision_estimates) %>% 
annotate_call.label(config$evaluation_settings$CNV_call_categorisation) diff --git a/stemcnv_check/scripts/report_template.Rmd b/stemcnv_check/scripts/report_template.Rmd index 605a7bf..638aaee 100644 --- a/stemcnv_check/scripts/report_template.Rmd +++ b/stemcnv_check/scripts/report_template.Rmd @@ -68,6 +68,7 @@ report_config <- params$report_config # Load helper functions source(file.path(config$snakedir, 'scripts/R/helper_functions.R')) +source(file.path(config$snakedir, 'scripts/R/processCNVs_annotate_impact_lists.R')) source(file.path(config$snakedir, 'scripts/R/R_table_functions.R')) source(file.path(config$snakedir, 'scripts/R/R_plotting_functions.R')) source(file.path(config$snakedir, 'scripts/R/vcf_io_functions.R')) @@ -141,8 +142,9 @@ include_roi_plots <- include.section('regions.of.interest') & ifelse( FALSE ) -high_impact_tb <- load_hotspot_table(config, 'HighImpact') -highlight_tb <- load_hotspot_table(config, 'Highlight') +stemcell_hotspot_tb <- load_hotspot_table(config, 'stemcell_hotspot') +dosage_sensitive_gene_tb <- get_dosage_sensivity_tb(config$settings$CNV_processing$Check_score_values) +cancer_gene_tb <- load_hotspot_table(config, 'cancer_gene') ## general functions @@ -198,12 +200,16 @@ make_CNV_plot_section <- function(call.table, plotsection = 'denovo') { # message(str_glue('N hotspots: {length(res$hotspots)}')) if (report_config$call.data.and.plots[[plotsection]]$include.hotspot.table & length(res$hotspots)>0) { res$hotspots %>% - hotspot_table_output(as.character(row$CNV_type), plotsection, high_impact_tb, highlight_tb, report_config, params$out_format) %>% + hotspot_table_output( + as.character(row$CNV_type), plotsection, + stemcell_hotspot_tb, dosage_sensitive_gene_tb, cancer_gene_tb, + report_config, params$out_format + ) %>% subchunkify(str_glue('CNV_call.{plotsection}.nr{i}.hotspots')) cat('\n\n') } if (report_config$call.data.and.plots[[plotsection]]$include.gene.table.details != 'None') { - gene_table_output(res$genes, plotsection, 
high_impact_tb, highlight_tb, report_config) %>% + gene_table_output(res$genes, plotsection, stemcell_hotspot_tb, dosage_sensitive_gene_tb, cancer_gene_tb, report_config) %>% subchunkify(str_glue('CNV_call.{plotsection}.nr{i}.table.genes')) cat('\n\n') } @@ -269,9 +275,10 @@ repo_excl_filters <- ifelse( ) cat(paste( - 'Call designation takes into account the Check_Score thresholds for critical and optionally reportable level.', - 'Additionally, calls flagged with an exclusion filter (i.e. below minimum size, or having a probe coverage gap) ', - 'for a specific designation are assigend the next lower level (i.e. critical -> reportable or reportable -> no label).\n\n', + 'Call designation takes into account minimum Check_Score thresholds for critical and (optionally) reportable level, ', + 'and for exclusion overlap with a reference call (these calls are not denovo and never criticcal or reportable) and ', + 'call filter flags (i.e. calls below minimum size, or having a probe coverage gap). 
', + 'If no reportable Check_Score threshold is defined the critical one is used (but the reportable exclusion filters).\n\n', 'The current thresholds and exclsuion filters are:\n\n', ' - Critical Check_Score:', config$evaluation_settings$CNV_call_categorisation$check_score.critical, '\n\n', ' - Critical exclusion Filters:', crit_excl_filters, '\n\n', @@ -436,7 +443,11 @@ cat('This section describes all de-novo CNV calls, meaning calls without a match ``` ```{r denovo_calls.table, results='asis', eval = include.section('denovo_calls.table')} -CNV_table_output(denovo_calls.table, 'denovo', high_impact_tb, highlight_tb, gr_info, report_config, caption = 'de-novo CNV calls') +CNV_table_output( + denovo_calls.table, 'denovo', + stemcell_hotspot_tb, dosage_sensitive_gene_tb, cancer_gene_tb, + gr_info, report_config, caption = 'de-novo CNV calls' +) cat('\n\n') ``` @@ -450,7 +461,11 @@ cat('This section describes all CNV calls that match the reference sample.\n\n') ``` ```{r reference_gt_calls.table, results='asis', eval = !is.na(ref_id) & include.section('reference_gt_calls.table')} -CNV_table_output(reference_calls.table, 'reference_gt', high_impact_tb, highlight_tb, gr_info, report_config, caption = 'reference genotype CNV calls') +CNV_table_output( + reference_calls.table, 'reference_gt', + stemcell_hotspot_tb, dosage_sensitive_gene_tb, cancer_gene_tb, + gr_info, report_config, caption = 'reference genotype CNV calls' +) cat('\n\n') ``` @@ -519,13 +534,17 @@ fake_call_tb <- tibble( Call_label = ID, caller_merging_coverage = NA_real_, Precision_Estimate = NA_character_, - HighImpact = NA_character_, - Highlight = NA_character_, + stemcell_hotspot = NA_character_, + dosage_sensitive_gene = NA_character_, + cancer_gene = NA_character_, LRR = NA_real_, #FIXME: this could be calculated ) -CNV_table_output(fake_call_tb, 'regions_of_interest', high_impact_tb, highlight_tb, gr_info, report_config, caption = 'Regions of Interest') - +CNV_table_output( + fake_call_tb, 
'regions_of_interest', + stemcell_hotspot_tb, dosage_sensitive_gene_tb, cancer_gene_tb, + gr_info, report_config, caption = 'Regions of Interest' +) make_CNV_plot_section(fake_call_tb, 'regions_of_interest') cat('\n\n') diff --git a/stemcnv_check/scripts/summarise_stats.R b/stemcnv_check/scripts/summarise_stats.R index 433af90..3b2dfd0 100644 --- a/stemcnv_check/scripts/summarise_stats.R +++ b/stemcnv_check/scripts/summarise_stats.R @@ -116,8 +116,8 @@ get_call_stats <- function(gr.or.tb, call_count_excl_filters = list(), name_addi mutate( loss_gain_log2ratio = log2(sum(CNV_type == 'gain') / sum(CNV_type == 'loss')) %>% round(digits = 2), loss_gain_log2ratio = ifelse(is.infinite(loss_gain_log2ratio) | is.nan(loss_gain_log2ratio), NA, loss_gain_log2ratio), - highlight_calls = sum(!is.na(Highlight)), - high_impact_calls = sum(!is.na(HighImpact)), + cancer_gene_calls = sum(!is.na(cancer_gene)), + stemcell_hotspot_calls = sum(!is.na(stemcell_hotspot)), CNV_type = ifelse(CNV_type == 'LOH', 'LOH', 'CNV'), ) %>% group_by(sample_id, CNV_type, loss_gain_log2ratio) %>% diff --git a/stemcnv_check/supplemental-files/HighImpact-stemcell-hotspots.tsv b/stemcnv_check/supplemental-files/genelist-stemcell-hotspots.tsv similarity index 100% rename from stemcnv_check/supplemental-files/HighImpact-stemcell-hotspots.tsv rename to stemcnv_check/supplemental-files/genelist-stemcell-hotspots.tsv diff --git a/tests/testthat/test_helper_functions.R b/tests/testthat/test_helper_functions.R index 68cc4ff..02853cf 100644 --- a/tests/testthat/test_helper_functions.R +++ b/tests/testthat/test_helper_functions.R @@ -183,15 +183,15 @@ test_that('load_hotspot_table', { 'settings' = list( 'CNV_processing' = list( 'gene_overlap' = list( - 'high_impact_list' = test_path('../data/minimal-hotspots.tsv'), - 'highlight_list' = test_path('../data/minimal-hotspots.tsv') + 'stemcell_hotspot_list' = test_path('../data/minimal-hotspots.tsv'), + 'cancer_gene_list' = test_path('../data/minimal-hotspots.tsv') ) ) 
) ) - load_hotspot_table(config, 'HighImpact') %>% + load_hotspot_table(config, 'stemcell_hotspot') %>% # remove 'spec_tbl_df' class from readr .[] %>% expect_equal(minimal_probes) - load_hotspot_table(config, 'Highlight') %>% + load_hotspot_table(config, 'cancer_gene') %>% .[] %>% expect_equal(minimal_probes) }) \ No newline at end of file diff --git a/tests/testthat/test_processCNVs_annotate_cnv-check-score.R b/tests/testthat/test_processCNVs_annotate_cnv-check-score.R index dd8b4bc..9feae57 100644 --- a/tests/testthat/test_processCNVs_annotate_cnv-check-score.R +++ b/tests/testthat/test_processCNVs_annotate_cnv-check-score.R @@ -32,32 +32,33 @@ ginfo_file <- test_path('../data/gr_info_minimal.tsv') base_tb <- tibble( seqnames = 'chr1', - start = c(4000, 10000, 28000000, 28060000, 40000, 5000000, 3000) %>% as.integer(), - end = c(5500, 14000, 28055000, 28065000, 50000, 7000000, 60000) %>% as.integer(), + start = c(4000, 10000, 40000, 28000000, 28060000, 40000, 5000000, 3000) %>% as.integer(), + end = c(5500, 14000, 50000, 28055000, 28065000, 50000, 7000000, 60000) %>% as.integer(), sample_id = 'test_sample', - CNV_type = c('gain', 'gain', 'gain', 'loss', 'loss', 'LOH', 'LOH'), + CNV_type = c('gain', 'gain', 'gain', 'gain', 'loss', 'loss', 'LOH', 'LOH'), ID = paste('combined', CNV_type, seqnames, start, end, sep='_'), - CNV_caller = c('StemCNV-check', 'toolA', 'StemCNV-check', 'StemCNV-check', 'toolA', 'toolB', 'toolB'), - n_probes = c(15, 100, 150, 100, 100, 50, 50), - n_uniq_probes = c(15, 100, 150, 100, 100, 50, 50), - CN = c(3, 3, 4, 1, 1, 2, 2), - FILTER = c('min_size', 'Probe_dens;probe_gap', 'probe_gap;high_probe_dens', 'test-dummy;high_probe_dens', NA_character_, NA_character_, NA_character_), - reference_overlap = c(T, T, F, F, F, F, T), - reference_coverage = c(100, 85.01, NA_real_, NA_real_, NA_real_, NA_real_, 60), - reference_caller = c('StemCNV-check', 'faketool', NA_character_, NA_character_,NA_character_,NA_character_, 'toolA'), - high_impact_hits 
= c(NA, NA, 'dummyC', NA, '1p36|chr1:40000-50000', NA, NA), - highlight_hits = c(NA, 'DDX11L1', NA, NA, NA, NA, 'DDX11L1|dummyB'), - ROI_hits = c('fake-ROI', NA, NA, 'dummyC', NA, NA, NA), - Gap_percent = c(0, 2000/4001, 25000/55001, 1000/5001, 2000/10001, 1e6/(2e6+1), 0), - probe_coverage_gap = c(FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE), - high_probe_density = c(NA, NA, TRUE, TRUE, NA, FALSE, FALSE) + CNV_caller = c('StemCNV-check', 'toolA', 'toolA', 'StemCNV-check', 'StemCNV-check', 'toolA', 'toolB', 'toolB'), + n_probes = c(15, 100, 100, 150, 100, 100, 50, 50), + n_uniq_probes = c(15, 100, 100, 150, 100, 100, 50, 50), + CN = c(3, 3, 3, 4, 1, 1, 2, 2), + FILTER = c('min_size', 'Probe_dens;probe_gap', NA_character_, 'high_probe_dens', 'test-dummy;high_probe_dens', 'probe_gap', NA_character_, NA_character_), + reference_overlap = c(T, T, F, F, F, F, F, T), + reference_coverage = c(100, 85.01, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 60), + reference_caller = c('StemCNV-check', 'faketool', NA_character_, NA_character_, NA_character_,NA_character_,NA_character_, 'toolA'), + stemcell_hotspot = c(NA, NA, 'chr1:40000-50000', 'dummyC', NA, '1p36|chr1:40000-50000', NA, NA), + dosage_sensitive_gene = c(NA, NA, 'dummyB', NA, 'dummyC', NA, NA, NA), + cancer_gene = c(NA, 'DDX11L1', NA, NA, NA, NA, NA, 'DDX11L1'), + ROI_hits = c('fake-ROI', NA, NA, NA, 'dummyC', NA, NA, NA), + Gap_percent = c(0, 2000/4001, 2000/10001, 25000/55001, 1000/5001, 2000/10001, 1e6/(2e6+1), 0), + probe_coverage_gap = c(FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE), + high_probe_density = c(NA, NA, NA, TRUE, TRUE, NA, FALSE, FALSE) ) expected_gene_tb <- base_tb %>% mutate( - width = c( 1501, 4001, 55001, 5001, 10001, 2000001, 57001) %>% as.integer(), - n_genes = c(1, 1, 1, 1, 1, 0, 3) %>% as.integer(), - overlapping_genes = c('dummyA', 'DDX11L1', "dummyC", "dummyC", 'dummyB', NA, 'dummyA|DDX11L1|dummyB'), + width = c( 1501, 4001, 10001, 55001, 5001, 10001, 2000001, 57001) %>% 
as.integer(), + n_genes = c(1, 1, 1, 1, 1, 1, 0, 3) %>% as.integer(), + overlapping_genes = c('dummyA', 'DDX11L1', 'dummyB', "dummyC", "dummyC", 'dummyB', NA, 'dummyA|DDX11L1|dummyB'), ) expected_final_tb <- expected_gene_tb %>% @@ -87,7 +88,7 @@ test_that("Annotate CNV check scores", { 'large_CN_size_modifier' = 1.5 ) - hi_gr <- tibble( + stemcell_hotspot_gr <- tibble( seqnames = 'chr1', start = c(28050000, 40000, 0), end = c(28070000, 50000, 7200000), @@ -101,7 +102,7 @@ test_that("Annotate CNV check scores", { comment = NA ) %>% as_granges() - hl_gr <- tibble( + cancer_gene_gr <- tibble( seqnames = 'chr1', start = 11873, end = 14409, @@ -114,6 +115,20 @@ test_that("Annotate CNV check scores", { source = 'dummy', comment = NA ) %>% as_granges() + + dosage_sensitive_gene_gr <- tibble( + seqnames = 'chr1', + start = c(4000, 20000, 28050000), + end = c(5000, 50000, 28070000), + strand = '+', + list_name = 'test-dosage', + hotspot = c('dummyA', 'dummyB', 'dummyC'), + mapping = 'gene_name', + call_type = c('loss', 'gain', 'loss'), + check_score = 7, + source = 'dummy', + comment = NA + ) %>% as_granges() expected_tb <- expected_final_tb %>% mutate( @@ -121,25 +136,28 @@ test_that("Annotate CNV check scores", { Check_Score = c( # ROI (50) + 1 other gene w/o hotspot CNV_size_score(1501) + 50 + 0.2, - # HL gene (5) + # cancer gene (5) CNV_size_score(4001) + 5, - # CN4, HI gene (15) + # hotspot + dosage gene (30 & 7) + CNV_size_score(10001) + 30 + 7, + # CN4, hotspot gene (15) CNV_size_score(55001) * 1.5 + 15, - # ROI hit (50) + HI gene (15) - CNV_size_score(5001) + 50 + 15, - # HI hits (30 & 10) + 1 other gene + # ROI hit (50) + dosage gene (7) + CNV_size_score(5001) + 50 + 7, + # 2 hotspots (30 & 10) + 1 other gene CNV_size_score(10001) + 30 + 10 + 0.2, # no genes LOH_size_score(2000001) + 0, - # HL hit (5) + 2 other genes + # cancer gene (5) + 2 other genes LOH_size_score(57001) + 5 + 0.4 ) ) annotate_cnv.check.score( expected_final_tb, - hi_gr, - hl_gr, + 
stemcell_hotspot_gr, + dosage_sensitive_gene_gr, + cancer_gene_gr, config$settings$CNV_processing$Check_score_values ) %>% expect_equal(expected_tb) @@ -156,14 +174,14 @@ test_that("Annotate CNV check scores", { test_that("Annotate call label", { # Test scenarios: - # - ref GT ( = ref coverage >= X% ?!) [1,2,7] - # - critical score [4] - # - reportable score [5] - # - critical score, but excl list (-> reportable) [3] - # - reportable score, but excl list (-> NA) - # - NA [6] + # - ref GT ( = ref coverage >= X% ?!) [1,2,8] + # - critical score [5] + # - reportable score [4] + # - critical score, but crit. excl list (-> reportable) [6] + # - reportable score, but excl list (-> NA) [added, 9] + # - NA [7] call_cat_config <- list( - check_score.critical = 55, + check_score.critical = 53, filters.exclude.critical = c('probe_gap'), check_score.reportable = 50, filters.exclude.reportable = c('test-dummy') @@ -171,15 +189,15 @@ test_that("Annotate call label", { input_tb <- expected_final_tb %>% mutate( - Check_Score = c(63.03098, 12.93180, 59.71318, 69.18200, 53.47740, 42.88782, 23.37815) + Check_Score = c(53.03098, 12.93180, 50.27740, 52.06978, 66.18200, 53.47740, 42.88782, 23.37815) ) %>% bind_rows( - expected_final_tb[5,] %>% mutate(FILTER = 'test-dummy') + expected_final_tb[6,] %>% mutate(FILTER = 'test-dummy') ) expected_tb <- input_tb %>% mutate( - Call_label = c('Reference genotype', 'Reference genotype', 'Reportable', 'Critical', 'Reportable', NA, 'Reference genotype', NA) + Call_label = c('Reference genotype', 'Reference genotype', 'Reportable', 'Reportable', 'Critical', 'Reportable', NA, 'Reference genotype', NA) ) expect_equal(annotate_call.label(input_tb, call_cat_config), expected_tb) @@ -193,7 +211,7 @@ test_that("Annotate call label", { expected_tb <- input_tb %>% mutate( - Call_label = c('Reference genotype', 'Reference genotype', 'Reportable', 'Critical', NA, NA, 'Reference genotype', NA) + Call_label = c('Reference genotype', 'Reference genotype', NA, 
NA, 'Critical', NA, NA, 'Reference genotype', NA) ) expect_equal(annotate_call.label(input_tb, call_cat_config), expected_tb) }) diff --git a/tests/testthat/test_processCNVs_impact_lists.R b/tests/testthat/test_processCNVs_impact_lists.R index eee4a25..5619dd1 100644 --- a/tests/testthat/test_processCNVs_impact_lists.R +++ b/tests/testthat/test_processCNVs_impact_lists.R @@ -19,7 +19,7 @@ config <- list( 'gene_overlap' = list( 'exclude_gene_type_regex' = c(), 'include_only_these_gene_types' = c('lncRNA', 'miRNA', 'protein_coding'), - 'high_impact_list' = test_path('../data/minimal-hotspots.tsv') + 'stemcell_hotspot_list' = test_path('../data/minimal-hotspots.tsv') ) ) ) @@ -58,7 +58,7 @@ test_that('tb_to_gr_by_position', { ) }) -#TODO: somehow this didn't/doesn't catch all possible issues +#Note: somehow this didn't/doesn't catch all possible issues # function failed before, due to not correctly checking for all matched gband names test_that('tb_to_gr_by_gband', { gr_info <- load_genomeInfo(ginfo_file, config) @@ -127,41 +127,49 @@ test_that('parse_hotspot_table', { ) }) -# FIXME: enable skipping on all but manual execution -# test_that('parse inbuilt tables', { -# config <- list( -# 'genome_version' = 'hg19', -# 'snakedir' = test_path('../../stemcnv_check/'), -# 'global_settings' = list( -# 'hg19_gtf_file' = test_path('../../test_folders/static-data/gencode.v42.basic.annotation.gtf.gz'), -# 'hg19_genomeInfo_file' = test_path('../../test_folders/static-data/UCSC_hg38_chromosome-info.tsv') -# ), -# 'settings' = list( -# 'CNV_processing' = list( -# 'gene_overlap' = list( -# 'exclude_gene_type_regex' = c(), -# 'include_only_these_gene_types' = c('lncRNA', 'miRNA', 'protein_coding'), -# 'high_impact_list' = '__inbuilt__/supplemental-files/HighImpact-stemcell-hotspots.tsv', -# 'highlight_list' = '__inbuilt__/supplemental-files/genelist-cancer-drivers.tsv' -# ) -# ) -# ) -# ) -# gtf_file <- test_path('../../test_folders/static-data/gencode.v42.basic.annotation.gtf.gz') 
-# ginfo_file <- test_path('../../test_folders/static-data/gencode.v42.basic.annotation.gtf.gz') -# -# high_impact_tb <- load_hotspot_table(config) -# highlight_tb <- load_hotspot_table(config, 'Highlight') -# config$settings$CNV_processing$gene_overlap$highlight_list <- '__inbuilt__/supplemental-files/genelist-cancer-hotspots.tsv' -# highlight_tb2 <- load_hotspot_table(config, 'Highlight') -# gr_info <- load_genomeInfo(ginfo_file, config) -# gr_genes <- load_gtf_data(gtf_file, config) -# -# expect_no_error(parse_hotspot_table(high_impact_tb, gr_genes, gr_info)) -# expect_no_error(parse_hotspot_table(highlight_tb, gr_genes, gr_info)) -# expect_no_error(parse_hotspot_table(highlight_tb2, gr_genes, gr_info)) -# -# }) +# This requires the default cache files for hg19, do *not* run this in CI +# Also parsing the whole hg19 gtf takes a bit, so allow manual skipping as well? +test_that('parse inbuilt tables', { + skip_on_ci() + skip_on_covr() + config <- list( + 'genome_version' = 'hg19', + 'snakedir' = test_path('../../stemcnv_check/'), + 'global_settings' = list( + 'hg19_gtf_file' = '~/.cache/stemcnv-check/static-data/gencode.hg19.v45.gtf.gz', + 'hg19_genomeInfo_file' = '~/.cache/stemcnv-check/static-data/UCSC_hg19_chromosome-info.tsv' + ), + 'settings' = list( + 'CNV_processing' = list( + 'gene_overlap' = list( + 'exclude_gene_type_regex' = c(), + 'include_only_these_gene_types' = c('lncRNA', 'miRNA', 'protein_coding'), + 'stemcell_hotspot_list' = '__inbuilt__/supplemental-files/genelist-stemcell-hotspots.tsv', + 'cancer_gene_list' = '__inbuilt__/supplemental-files/genelist-cancer-drivers.tsv' + ) + ) + ) + ) + gtf_file <- config$global_settings$hg19_gtf_file + ginfo_file <- config$global_settings$hg19_genomeInfo_file + + stemcell_hotspot_tb <- load_hotspot_table(config) + cancer_gene_tb <- load_hotspot_table(config, 'cancer_gene') + config$settings$CNV_processing$gene_overlap$cancer_gene_list <- '__inbuilt__/supplemental-files/genelist-cancer-hotspots.tsv' + 
cancer_gene_tb2 <- load_hotspot_table(config, 'cancer_gene') + gr_info <- load_genomeInfo(ginfo_file, config) + gr_genes <- load_gtf_data(gtf_file, config) + score_settings <- list( + 'pHaplo_threshold' = 0.86, + 'pTriplo_threshold' = 0.94, + 'dosage_sensitive_gene' = 5 + ) + + expect_no_error(parse_hotspot_table(stemcell_hotspot_tb, gr_genes, gr_info)) + expect_no_error(parse_hotspot_table(cancer_gene_tb, gr_genes, gr_info)) + expect_no_error(parse_hotspot_table(cancer_gene_tb2, gr_genes, gr_info)) + expect_no_error(get_dosage_sensivity_tb(score_settings) %>% parse_hotspot_table(gr_genes, gr_info)) +}) # 1 - not hit @@ -196,12 +204,12 @@ test_that('annotate_impact_lists', { hotspots <- parse_hotspot_table(read_tsv(test_path('../data/minimal-hotspots.tsv')), gr_genes, gr_info) expected_gr <- sample_cnvs %>% - mutate(test_hits = c(NA, 'DDX11L1', 'dummyC', NA, '1p36|chr1:40000-50000', '1p36', '1p35.2')) + mutate(test = c(NA, 'DDX11L1', 'dummyC', NA, '1p36|chr1:40000-50000', '1p36', '1p35.2')) expect_equal(annotate_impact_lists(sample_cnvs, hotspots, 'test'), expected_gr) # test empty hotspots expected_gr <- sample_cnvs %>% - mutate(test_hits = NA_character_) + mutate(test = NA_character_) expect_equal(annotate_impact_lists(sample_cnvs, GRanges(), 'test'), expected_gr) }) diff --git a/tests/testthat/test_report_table_functions.R b/tests/testthat/test_report_table_functions.R index 730e63d..dbee3a7 100644 --- a/tests/testthat/test_report_table_functions.R +++ b/tests/testthat/test_report_table_functions.R @@ -22,15 +22,15 @@ config <- list( 'settings' = list( 'CNV_processing' = list( 'gene_overlap' = list( - 'high_impact_list' = test_path('../data/minimal-hotspots.tsv'), - 'highlight_list' = test_path('../data/minimal-hotspots.tsv') + 'stemcell_hotspot_list' = test_path('../data/minimal-hotspots.tsv'), + 'cancer_gene_list' = test_path('../data/minimal-hotspots.tsv') ) ) ) ) # Test `format_hotspots_to_badge` function -# format_hotspots_to_badge <- function(hotspot_vec, 
CNVtype_vec, gene_details, listname = 'high_impact') +# format_hotspots_to_badge <- function(hotspot_vec, CNVtype_vec, gene_details, listname = 'stemcell_hotspot') test_that("format_hotspots_to_badge", { testthat::local_edition(3) hotspot_vec <- c("", "1q21", "1q21", "dummyC", "1p36|DDX11L1", "1p36|DDX11L1") @@ -42,7 +42,7 @@ test_that("format_hotspots_to_badge", { # 5 - gene hit & gband hit matching CNV (loss) # 6 - gene hit & gband hit not matching CNV (Note: theoretically possible) - gene_details <- load_hotspot_table(config, 'HighImpact') + gene_details <- load_hotspot_table(config, 'stemcell_hotspot') expected <- c( '-', @@ -58,11 +58,11 @@ test_that("format_hotspots_to_badge", { '1p36DDX11L1' ) expect_equal( - format_hotspots_to_badge(hotspot_vec, CNVtype_vec, gene_details, 'high_impact'), + format_hotspots_to_badge(hotspot_vec, CNVtype_vec, gene_details, 'stemcell_hotspot'), expected ) - #test with include_hover = FALSE & listname = highlight + #test with include_hover = FALSE & listname = cancer_gene expected <- c( '-', '1q21', @@ -72,22 +72,31 @@ test_that("format_hotspots_to_badge", { '1p36DDX11L1' ) expect_equal( - format_hotspots_to_badge(hotspot_vec, CNVtype_vec, gene_details, 'highlight', FALSE), + format_hotspots_to_badge(hotspot_vec, CNVtype_vec, gene_details, 'cancer_gene', FALSE), expected ) }) -# hotspot_table_output(hotspots, cnv_type, plotsection, high_impact_tb, highlight_tb, report_config, out_format) %>% +# hotspot_table_output(hotspots, cnv_type, plotsection, stemcell_hotspot_tb, cancer_gene_tb, report_config, out_format) %>% test_that("hotspot_table_output", { hotspots <- c('DDX11L1', '1p36') cnv_type <- 'loss' - high_impact_tb <- load_hotspot_table(config, 'HighImpact') - highlight_tb <- tibble() + stemcell_hotspot_tb <- load_hotspot_table(config, 'stemcell_hotspot') + dosage_sensitive_gene_tb <- tibble( + list_name = NA_character_, + hotspot = NA_character_, + call_type = NA_character_, + description = NA_character_, + check_score = 
NA_real_, + mapping = NA_character_, + description_doi = NA_character_ + ) + cancer_gene_tb <- dosage_sensitive_gene_tb # these aren't used so far plotsection <- 'test' report_config <- list() - expected_tb <- high_impact_tb %>% + expected_tb <- stemcell_hotspot_tb %>% filter(hotspot %in% hotspots) expected <- expected_tb %>% @@ -110,7 +119,11 @@ test_that("hotspot_table_output", { escape = FALSE ) - hotspot_table_output(hotspots, cnv_type, plotsection, high_impact_tb, highlight_tb, report_config, 'html') %>% + hotspot_table_output( + hotspots, cnv_type, plotsection, + stemcell_hotspot_tb, dosage_sensitive_gene_tb, cancer_gene_tb, + report_config, 'html' + ) %>% expect_equal(expected) # test non-html output @@ -119,26 +132,34 @@ test_that("hotspot_table_output", { dplyr::rename(dois = description_doi) %>% rename_with(format_column_names) %>% kable() - hotspot_table_output(hotspots, cnv_type, plotsection, high_impact_tb, highlight_tb, report_config, 'not-html') %>% + hotspot_table_output( + hotspots, cnv_type, plotsection, + stemcell_hotspot_tb, dosage_sensitive_gene_tb, cancer_gene_tb, + report_config, 'not-html' + ) %>% expect_equal(expected) - # test with highlight table - highlight_tb <- high_impact_tb %>% + # test with cancer_gene table + cancer_gene_tb <- stemcell_hotspot_tb %>% filter(hotspot == '1p36') %>% - mutate(list_name = 'highlight') - high_impact_tb.no_ov <- high_impact_tb %>% + mutate(list_name = 'cancer_gene') + stemcell_hotspot_tb.no_ov <- stemcell_hotspot_tb %>% filter(hotspot != '1p36') expected <- expected_tb %>% - mutate(list_name = ifelse(hotspot == '1p36', 'highlight', list_name)) %>% + mutate(list_name = ifelse(hotspot == '1p36', 'cancer_gene', list_name)) %>% select(hotspot, call_type, list_name, description, check_score, description_doi) %>% dplyr::rename(dois = description_doi) %>% rename_with(format_column_names) %>% kable() - hotspot_table_output(hotspots, cnv_type, plotsection, high_impact_tb.no_ov, highlight_tb, report_config, 
'not-html') %>% + hotspot_table_output( + hotspots, cnv_type, plotsection, + stemcell_hotspot_tb.no_ov, dosage_sensitive_gene_tb, cancer_gene_tb, + report_config, 'not-html' + ) %>% expect_equal(expected) - # test with same hotspot in both HighImpact and highlight table + # test with same hotspot in both stemcell_hotspot and cancer_gene table expected <- expected_tb %>% - mutate(list_name = ifelse(hotspot == '1p36', 'test-list|highlight', list_name)) %>% + mutate(list_name = ifelse(hotspot == '1p36', 'test-list|cancer_gene', list_name)) %>% separate_rows(list_name, sep = '\\|') %>% select(hotspot, call_type, list_name, description_htmllinks, check_score, mapping, description_doi) %>% dplyr::rename(description = description_htmllinks) %>% @@ -158,7 +179,11 @@ test_that("hotspot_table_output", { rownames = FALSE, escape = FALSE ) - hotspot_table_output(hotspots, cnv_type, plotsection, high_impact_tb, highlight_tb, report_config, 'html') %>% + hotspot_table_output( + hotspots, cnv_type, plotsection, + stemcell_hotspot_tb, dosage_sensitive_gene_tb, cancer_gene_tb, + report_config, 'html' + ) %>% expect_equal(expected) # test with only partially matching cnv_type cnv_type <- 'LOH' @@ -168,7 +193,11 @@ test_that("hotspot_table_output", { dplyr::rename(dois = description_doi) %>% rename_with(format_column_names) %>% kable() - hotspot_table_output(hotspots, cnv_type, plotsection, high_impact_tb, highlight_tb, report_config, 'not-html') %>% + hotspot_table_output( + hotspots, cnv_type, plotsection, + stemcell_hotspot_tb, dosage_sensitive_gene_tb, cancer_gene_tb, + report_config, 'not-html' + ) %>% expect_equal(expected) }) \ No newline at end of file diff --git a/tests/testthat/test_vcf_io_functions.R b/tests/testthat/test_vcf_io_functions.R index 1b17569..37fad36 100644 --- a/tests/testthat/test_vcf_io_functions.R +++ b/tests/testthat/test_vcf_io_functions.R @@ -94,8 +94,9 @@ cnv_tb_annotated <- cnv_tb %>% Call_label = c('Critical', NA, 'Reportable', NA, NA, NA, NA, rep 
('Reference genotype', 2)), # reference_caller reference_coverage = c(rep(NA, 4), runif(5, 0, 1)), - high_impact_hits = c(NA, NA, NA, NA, NA, 'gene1,gene2', NA, NA, NA), - highlight_hits = c(NA, NA, 'gene3', NA, NA, NA, NA, NA, NA), + stemcell_hotspot = c(NA, NA, NA, NA, NA, 'gene1,gene2', NA, NA, NA), + dosage_sensitive_gene = NA_character_, + cancer_gene = c(NA, NA, 'gene3', NA, NA, NA, NA, NA, NA), ROI_hits = c(NA, NA, NA, NA, NA, NA, NA, NA, 'ROI1'), Gap_percent = c(0, 0, 0, runif(6, 0, 1)), # not actually used in the function @@ -141,8 +142,9 @@ test_that('get_fix_section', { str_glue('Check_Score={cnv_tb_annotated_out$Check_Score};'), str_glue('Precision_Estimate={cnv_tb_annotated_out$Precision_Estimate};'), str_glue('Call_label={cnv_tb_annotated_out$Call_label};'), - str_glue('HighImpact={cnv_tb_annotated_out$high_impact_hits};'), - str_glue('Highlight={cnv_tb_annotated_out$highlight_hits};'), + str_glue('stemcell_hotspot={cnv_tb_annotated_out$stemcell_hotspot};'), + str_glue('dosage_sensitive_gene={cnv_tb_annotated_out$dosage_sensitive_gene};'), + str_glue('cancer_gene={cnv_tb_annotated_out$cancer_gene};'), str_glue('ROI_hits={cnv_tb_annotated_out$ROI_hits};'), str_glue('Gap_percent={cnv_tb_annotated_out$Gap_percent};'), str_glue('Genes={cnv_tb_annotated_out$overlapping_genes}')