diff --git a/stemcnv_check/control_files/allowedvalues_config.yaml b/stemcnv_check/control_files/allowedvalues_config.yaml index a1aa4fa..098133b 100644 --- a/stemcnv_check/control_files/allowedvalues_config.yaml +++ b/stemcnv_check/control_files/allowedvalues_config.yaml @@ -190,23 +190,22 @@ settings: allowed_sections: - - sample.information - - QC.summary - - QC.GenCall - - QC.PennCNV - - QC.CBS - - QC.settings - - SNV.table - - SNV.hotspot.coverage - - SNV.QC.details - - denovo_calls.table - - denovo_calls.plots - - reference_gt_calls.table - - reference_gt_calls.plots - - regions.of.interest - - virtual.karyotype - #- circo.plot - - SNP.dendrogram + - sample.information + - QC.summary + - QC.GenCall + - QC.PennCNV + - QC.CBS + - QC.settings + - SNV.table + - SNV.hotspot.coverage + - SNV.QC.details + - denovo_calls.table + - denovo_calls.plots + - reference_gt_calls.table + - reference_gt_calls.plots + - regions.of.interest + - SNP.dendrogram + - genome.overview allowed_plotsections: - denovo @@ -234,8 +233,10 @@ reports: SNP_comparison: dendrogram.color.by: str_insamplesheet dendrogram.shape.by: str_insamplesheet - - ideogram.exclude_filter: list__str__(probe_gap|high_probe_dens|min_size|min_probes|min_density) + + genome_overview: + call.exclude_filter: list__str__(probe_gap|high_probe_dens|min_size|min_probes|min_density) + show_reference: bool wildcard_constraints: sample_id: str diff --git a/stemcnv_check/control_files/default_config.yaml b/stemcnv_check/control_files/default_config.yaml index c9b9786..644ce1c 100644 --- a/stemcnv_check/control_files/default_config.yaml +++ b/stemcnv_check/control_files/default_config.yaml @@ -404,7 +404,7 @@ reports: # - reference_gt_calls.plots # - regions.of.interest # - SNP.dendrogram - # - virtual.karyotype + # - genome.overview # List of columns from the sample_table that should be included in the "Sample Information section" sample.info.extra.cols: ['Chip_Name', 'Chip_Pos'] @@ -446,6 +446,10 @@ reports: # Calls with 
any of these Filters are excluded from the digital karyotype plot ideogram.exclude_filter: ['min_size', 'min_probes', 'min_density'] + + genome_overview: + call.exclude_filter: ['min_size', 'min_probes', 'min_density'] + show_reference: True ##!complete # These contraints define which sample_ids, sentrix_pos (Chip_Pos) and sentrix_name (Chip_Name) are valid diff --git a/stemcnv_check/scripts/R/report_plotting_functions.R b/stemcnv_check/scripts/R/report_plotting_functions.R index 0afabc0..ccd0f17 100644 --- a/stemcnv_check/scripts/R/report_plotting_functions.R +++ b/stemcnv_check/scripts/R/report_plotting_functions.R @@ -2,81 +2,156 @@ library(tidyverse) library(patchwork) library(ggrepel) +add_CNV_plot_styling <- function (gg, panel_space_val = unit(5, units = 'mm')) { + gg + + facet_wrap(~Sample_Name, nrow = 1) + + theme( + strip.background = element_blank(), + strip.text.x = element_blank(), + panel.spacing = panel_space_val + ) +} -make_LRR_BAF_plots <- function( - call.row, raw_LRR_BAF, cnv_calls, gr_genes, gr_info, - total_min_size = 2e6, flank_factor = 2 -) { - - chr <- call.row$chrom - - if (total_min_size < 1e4) { - warning(str_glue('Re-Setting Minimum size window around primary plot area (CNV/region: {chr}:{call.row$start}-{call.row$end}) to at least 10kb')) - total_min_size <- 1e4 - } - - cnv_tools <- cnv_calls$CNV_caller %>% unique() %>% str_subset('StemCNV-check', TRUE) %>% sort() - get_cnv_y <- function(CNV_type, CNV_caller) { - # Go by tool order (start from 1) - out <- match(CNV_caller, cnv_tools) - # For PennCNV LOH & gain/loss may overlap and need separate tracks - out <- ifelse(CNV_type %!in% c('gain', 'loss'), 1-out, out) - out - } - # Set initial plot window as Call size * (1+2*flank_factor) - win_start <- call.row$start - call.row$Size * flank_factor - win_end <- call.row$end + call.row$Size * flank_factor - # If it is below minimum increase accordingly - if (win_end - win_start < total_min_size) { - #Don't want .5 positions - win_start <- 
win_start - floor((total_min_size - win_end + win_start)/2) - win_end <- win_end + ceiling((total_min_size - win_end + win_start)/2) +make_BAF_panel <- function( + chr, win_start, win_end, + plot.data, + highlight.tb = NULL, + area_tb = NULL, area_alpha = 0.3 +) { + gg <- plot.data %>% + mutate(color = ifelse(filter.passed, 'blue', 'grey70')) %>% + ggplot() + + geom_hline(yintercept = 0, col = 'black', linewidth=0.5) + + geom_hline(yintercept = 1, col = 'black', linewidth=0.5) + if(!is.null(area_tb)){ + gg <- gg + geom_rect( + data = area_tb, + aes(xmin = start, xmax = end, fill = color,ymin = 0, ymax = 1), + alpha = area_alpha + ) + + scale_fill_identity() } - - #Shrink down ends if they overlap 0/Chr_Max (from what is covered by probes) - chr_max <- raw_LRR_BAF %>% filter(Chr == chr) %>% pull(Position) %>% max() - win_start <- max(win_start, 0) - win_end <- min(win_end, chr_max) + gg <- gg + + geom_point( + aes(x = Position, y = `B Allele Freq`, color = color), + size = 0.5, shape = 20, show.legend = F + ) + if (!is.null(highlight.tb)) { + highlight.plotdata <- left_join( + highlight.tb %>% select(Sample_Name, ID, color), + plot.data %>% select(ID, Position, `B Allele Freq`, Sample_Name), + by = c('Sample_Name', 'ID') + ) + gg <- gg + geom_point( + data = highlight.plotdata, + aes(x = Position, y = `B Allele Freq`, color = color), + size = 1, shape = 20, show.legend = F + ) + } + gg + + scale_color_identity() + + theme_classic() + + scale_x_continuous( + expand = expansion(), + labels = label_number(big.mark = '.', decimal.mark = ','), + limits = c(win_start, win_end), + oob = oob_keep + ) + + scale_y_continuous(expand = expansion(), limits = c(-0.1, 1.1), oob = oob_squish, breaks = c(0, 0.5, 1)) + + labs(y = 'B Allele Frequency', x = paste0('Position (', chr, ')')) +} - # get raw LRR & BAF data; mark filtered points - plot.data <- raw_LRR_BAF %>% - filter(Chr == chr & Position >= win_start & Position <= win_end) %>% - mutate(Sample_Name = 
factor(sapply(sample_id, function(x) sample_headers[x]), levels = sample_headers)) - - if (nrow(plot.data)==0){ - warn_msg <- str_glue('No SNP probes found in primary plot area: {chr}:{win_start}-{win_end}') - warning(warn_msg) - return(list('gg' = warn_msg, 'genes' = tibble(), 'hotspots' = c())) +make_LRR_panel <- function( + chr, win_start, win_end, + plot.data, + highlight.tb = NULL, + area_tb = NULL, area_alpha = 0.3 +) { + gg <- plot.data %>% + mutate(color = ifelse(filter.passed, 'blue', 'grey70')) %>% + ggplot() + if(!is.null(area_tb)){ + gg <- gg + geom_rect( + data = area_tb, + aes(xmin = start, xmax = end, fill = color,ymin = -1.5, ymax = 1.5), + alpha = area_alpha + ) + + scale_fill_identity() } - - calls <- cnv_calls %>% - filter(seqnames == chr & end >= win_start & start < win_end) %>% - as_granges() %>% - unsplit_merged_CNV_callers() %>% - as_tibble() %>% - mutate( - x_pos = (end + start) / 2, - y_pos = map2_int(CNV_type, CNV_caller, get_cnv_y), - color = ifelse(CNV_type %!in% c('gain', 'loss'), 'grey50', '#1a9850'), - color = ifelse(CNV_type == 'loss', '#f46d43', color), - Sample_Name = factor(sapply(sample_id, function(x) sample_headers[x]), levels = sample_headers) - ) %>% - # Need to ensure table contains reference so everything is properly facet_wrapped - bind_rows( - tibble( - Sample_Name = factor(sample_headers, levels = sample_headers), - x_pos = NA_integer_, y_pos = NA_integer_ - ) + gg <- gg + + geom_hline(yintercept = 0, col = 'grey10', linewidth=0.2) + + geom_point( + aes(x = Position, y = `Log R Ratio`,color = color), + size = 0.5, shape = 20, show.legend = F + ) + if (!is.null(highlight.tb)) { + highlight.plotdata <- left_join( + highlight.tb %>% select(Sample_Name, ID, color), + plot.data %>% select(ID, Position, `Log R Ratio`, Sample_Name), + by = c('Sample_Name', 'ID') + ) + gg <- gg + geom_point( + data = highlight.plotdata, + aes(x = Position, y = `Log R Ratio`, color = color), + size = 1, shape = 20, show.legend = F ) + } + gg + + 
scale_color_identity() + + theme_classic() + + scale_x_continuous( + expand = expansion(), + labels = label_number(big.mark = '.', decimal.mark = ','), + limits = c(win_start, win_end), + position = 'top' + ) + + scale_y_continuous(expand = expansion(), limits = c(-1.5, 1.5), oob = oob_squish) + + labs(y = 'Log R Ratio', x = paste0('Position (', chr, ')')) + + facet_wrap(~Sample_Name, nrow = 1) +} + +make_CNV_panel <- function( + chr, win_start, win_end, + calls, + label_column = 'call_label' +) { + ggplot(calls) + + geom_tile(aes(x = x_pos, y = y_pos, width = width, height = .9, fill = color)) + + scale_fill_identity() + + geom_text( + aes(label = !!sym(label_column), x = x_pos, y = y_pos), + vjust = 0.5, hjust = 0.5, size = 2.5 + ) + + scale_x_continuous( + expand = expansion(), + labels = label_number(big.mark = '.', decimal.mark = ','), + limits = c(win_start, win_end), + oob = oob_keep + ) + + scale_y_continuous(expand = expansion()) + + theme_classic() + + theme( + axis.title = element_blank(), + axis.line = element_blank(), + axis.text = element_blank(), + axis.ticks = element_blank(), + plot.background = element_blank(), + panel.background = element_blank(), + panel.border = element_blank() + ) + + labs(y = 'Calls') # x = paste0('Position (', chr, ')') +} +make_gene_data <- function( + chr, win_start, win_end, sample_headers, + gr_genes, area_tb, + stemcell_hotspot_list, dosage_sensitive_gene_list, cancer_gene_list +) { direct_genes <- gr_genes %>% - filter_by_overlaps(GRanges(seqnames = chr, strand = '*', ranges = IRanges(start = call.row$start, end = call.row$end))) %>% + filter_by_overlaps(GRanges(seqnames = chr, strand = '*', ranges = IRanges(start = area_tb$start, end = area_tb$end))) %>% as_tibble() - - stemcell_hotspot_list <- call.row$stemcell_hotspot %>% str_split('\\|') %>% unlist() - dosage_sensitive_gene_list <- call.row$dosage_sensitive_gene %>% str_split('\\|') %>% unlist() - cancer_gene_list <- call.row$cancer_gene %>% str_split('\\|') %>% 
unlist() - gene.data <- gr_genes %>% + + gr_genes %>% filter_by_overlaps(GRanges(seqnames = chr, strand = '*', ranges = IRanges(start = win_start, end = win_end))) %>% as_tibble() %>% mutate( @@ -89,14 +164,51 @@ make_LRR_BAF_plots <- function( cancer_gene = gene_name %in% cancer_gene_list, ) %>% separate_rows(Sample_Name, sep = '---') %>% - # Need to ensure table contains reference so everything is properly facet_wrapped + # Need to ensure table contains all samples (reference) so everything is properly facet_wrapped bind_rows( tibble( Sample_Name = factor(sample_headers, levels = sample_headers), x_pos = NA_integer_, y_pos = NA_integer_ ) ) + +} + +make_gene_panel <- function( + chr, win_start, win_end, + gene.data +) { + + gene_track <- ggplot(gene.data) + + geom_tile( + aes( + x = x_pos, y = y_pos, width = width, height = .9, + fill = case_when( + stemcell_hotspot ~ 'red', + dosage_sensitive_gene ~ 'orange', + cancer_gene ~ 'orange', + direct_hit ~ 'black', + TRUE ~ 'grey50' + ) + ), + show.legend = F + ) + + scale_x_continuous(expand = expansion(), limits = c(win_start, win_end), oob = oob_keep) + + scale_y_continuous(expand = expansion(add = c(0.25, 0.25))) + + scale_fill_identity() + + theme_void() + + theme( + axis.title.y = element_text(angle = 90, vjust = 1) + ) + + labs(y = 'Genes') # x = paste0('Position (', chr, ')') + +} +make_header_data <- function( + chr, win_start, win_end, sample_headers, + gr_info, stemcell_hotspot_list +) { + info_data <- gr_info %>% filter_by_overlaps(GRanges(seqnames = chr, strand = '*', ranges = IRanges(start = win_start, end = win_end))) %>% as_tibble() %>% @@ -124,126 +236,16 @@ make_LRR_BAF_plots <- function( Sample_Name = factor(sample_headers, levels = sample_headers), x_pos = NA_integer_, y_pos = NA_integer_ ) - ) - - panel_space_val <- unit(5, units = 'mm') - - cnv_track <- ggplot(calls) + - geom_tile(aes(x = x_pos, y = y_pos, width = width, height = .9, fill = color)) + - scale_fill_identity() + - geom_text( - 
aes(label = paste0(CNV_caller, ': ', CNV_type), x = x_pos, y = y_pos), - vjust = 0.5, hjust = 0.5, size = 2.5 - ) + - scale_x_continuous( - expand = expansion(), - labels = label_number(big.mark = '.', decimal.mark = ','), - limits = c(win_start, win_end), - oob = oob_keep - ) + - scale_y_continuous(expand = expansion()) + - facet_wrap(~Sample_Name, nrow = 1) + - theme_classic() + - theme( - axis.title = element_blank(), - axis.line = element_blank(), - axis.text = element_blank(), - axis.ticks = element_blank(), - plot.background = element_blank(), - panel.background = element_blank(), - panel.border = element_blank(), - strip.background = element_blank(), - strip.text.x = element_blank(), - panel.spacing = panel_space_val - ) + - labs(y = 'Calls') - - gene_track <- ggplot(gene.data) + - geom_tile( - aes( - x = x_pos, y = y_pos, width = width, height = .9, - fill = case_when( - stemcell_hotspot ~ 'red', - dosage_sensitive_gene ~ 'orange', - cancer_gene ~ 'orange', - direct_hit ~ 'black', - TRUE ~ 'grey50' - ) - ), - show.legend = F - ) + - scale_x_continuous(expand = expansion(), limits = c(win_start, win_end), oob = oob_keep) + - scale_y_continuous(expand = expansion(add = c(0.25, 0.25))) + - scale_fill_identity() + - facet_wrap(~Sample_Name, nrow = 1) + - theme_void() + - theme( - strip.background = element_blank(), - strip.text.x = element_blank(), - axis.title.y = element_text(angle = 90, vjust = 1), - panel.spacing = panel_space_val - ) + - labs(y = 'Genes') - - lrr <- ggplot(plot.data) + - geom_rect( - data = tibble(Sample_Name = sample_headers), - aes(xmin = call.row$start, xmax = call.row$end, ymin = -1.5, ymax = 1.5), - fill = 'grey50', alpha = 0.3 - ) + - geom_hline(yintercept = 0, col = 'grey10', linewidth=0.2) + - geom_point( - aes(x = Position, y = `Log R Ratio`,color = filter.passed), - size = 0.5, shape = 20, show.legend = F - ) + - scale_color_manual(values=c('TRUE' = 'blue', 'FALSE' = 'grey70')) + - theme_classic() + - scale_x_continuous( - expand 
= expansion(), - labels = label_number(big.mark = '.', decimal.mark = ','), - limits = c(win_start, win_end), - position = 'top' - ) + - scale_y_continuous(expand = expansion(), limits = c(-1.5, 1.5), oob = oob_squish) + - labs(y = 'Log R Ratio', x = paste0('Position (', chr, ')')) + - facet_wrap(~Sample_Name, nrow = 1) + - theme( - strip.background = element_blank(), - strip.text.x = element_blank(), - panel.spacing = panel_space_val - ) + ) # + + # labs(y = 'gBand', x = paste0('Position (', chr, ')')) +} - baf <- ggplot(plot.data) + - geom_hline(yintercept = 0, col = 'black', linewidth=0.5) + - geom_hline(yintercept = 1, col = 'black', linewidth=0.5) + - geom_rect( - data = tibble(Sample_Name = sample_headers), - aes(xmin = call.row$start, xmax = call.row$end, ymin = 0, ymax = 1), - fill = 'grey50', alpha = 0.3 - ) + - geom_point( - aes(x = Position, y = `B Allele Freq`,color = filter.passed), - size = 0.5, shape = 20, show.legend = F - ) + - scale_color_manual(values=c('TRUE' = 'blue', 'FALSE' = 'grey70')) + +make_header_panel <- function( + chr, win_start, win_end, header_data, label_colunm = 'section_name' +) { + + ggplot(header_data) + theme_classic() + - scale_x_continuous( - expand = expansion(), - labels = label_number(big.mark = '.', decimal.mark = ','), - limits = c(win_start, win_end), - oob = oob_keep - ) + - scale_y_continuous(expand = expansion(), limits = c(-0.1, 1.1), oob = oob_squish, breaks = c(0, 0.5, 1)) + - labs(y = 'B Allele Frequency', x = paste0('Position (', chr, ')')) + - facet_wrap(~Sample_Name, nrow = 1) + - theme( - strip.background = element_blank(), - strip.text.x = element_blank(), - panel.spacing = panel_space_val - ) - - header <- ggplot(info_data) + - facet_wrap(~Sample_Name, nrow=1) + theme_classic() + geom_tile( aes(x = x_pos, y = y_pos, width = width, height = .9, fill = color), color = 'black', linewidth = 0.2 @@ -251,7 +253,7 @@ make_LRR_BAF_plots <- function( scale_fill_identity() + # Use repel to keep gband names in the 
plot area geom_text( - aes(label = section_name, x = x_pos, y = y_pos, color = textcolor), + aes(label = !!sym(label_colunm), x = x_pos, y = y_pos, color = textcolor), vjust = 0.5, hjust = 0.5, size = 2.5, show.legend = F ) + scale_color_identity() + @@ -270,9 +272,118 @@ make_LRR_BAF_plots <- function( axis.text = element_blank(), plot.background = element_blank(), panel.background = element_blank(), - panel.border = element_blank(), - panel.spacing = panel_space_val + panel.border = element_blank() ) +} + + +make_call_plot <- function( + call.row, raw_LRR_BAF, cnv_calls, gr_genes, gr_info, + sample_headers, + total_min_size = 2e6, flank_factor = 2 +) { + + chr <- call.row$chrom + area_tb <- tibble( + Sample_Name = factor(sample_headers, levels = sample_headers), + start = call.row$start, end = call.row$end, color = 'grey70' + ) + + if (total_min_size < 1e4) { + warning(str_glue('Re-Setting Minimum size window around primary plot area (CNV/region: {chr}:{area_tb$start}-{area_tb$end}) to at least 10kb')) + total_min_size <- 1e4 + } + + # Set initial plot window as Call size * (1+2*flank_factor) + win_start <- call.row$start - call.row$Size * flank_factor + win_end <- call.row$end + call.row$Size * flank_factor + # If it is below minimum increase accordingly + if (win_end - win_start < total_min_size) { + #Don't want .5 positions + win_start <- win_start - floor((total_min_size - win_end + win_start)/2) + win_end <- win_end + ceiling((total_min_size - win_end + win_start)/2) + } + + #Shrink down ends if they overlap 0/Chr_Max (from what is covered by probes) + chr_max <- raw_LRR_BAF %>% filter(Chr == chr) %>% pull(Position) %>% max() + win_start <- max(win_start, 0) + win_end <- min(win_end, chr_max) + + # get raw LRR & BAF data; mark filtered points + plot.data <- raw_LRR_BAF %>% + # Assume all samples in sample_headers have been loaded + filter(sample_id %in% names(sample_headers) & Chr == chr & Position >= win_start & Position <= win_end) %>% + mutate(Sample_Name 
= factor(sapply(sample_id, function(x) sample_headers[x]), levels = sample_headers)) + + if (nrow(plot.data)==0){ + warn_msg <- str_glue('No SNP probes found in primary plot area: {chr}:{win_start}-{win_end}') + warning(warn_msg) + return(list('gg' = warn_msg, 'genes' = tibble(), 'hotspots' = c())) + } + + cnv_tools <- cnv_calls$CNV_caller %>% unique() %>% str_subset('StemCNV-check', TRUE) %>% sort() + get_cnv_y <- function(CNV_type, CNV_caller) { + # Go by tool order (start from 1) + out <- match(CNV_caller, cnv_tools) + # For PennCNV LOH & gain/loss may overlap and need separate tracks + out <- ifelse(CNV_type %!in% c('gain', 'loss'), 1-out, out) + out + } + + calls <- cnv_calls %>% + filter(sample_id %in% names(sample_headers) & seqnames == chr & end >= win_start & start < win_end) %>% + as_granges() %>% + unsplit_merged_CNV_callers() %>% + as_tibble() %>% + mutate( + x_pos = (end + start) / 2, + y_pos = map2_int(CNV_type, CNV_caller, get_cnv_y), + color = case_when( + CNV_type == 'gain' ~ '#1a9850', + CNV_type == 'loss' ~ '#f46d43', + CNV_type == 'LOH' ~ 'grey50' + ), + Sample_Name = factor(sapply(sample_id, function(x) sample_headers[x]), levels = sample_headers), + call_label = str_glue('{CNV_caller}: {CNV_type}') + ) %>% + # Need to ensure table contains reference so everything is properly facet_wrapped + bind_rows( + tibble( + Sample_Name = factor(sample_headers, levels = sample_headers), + x_pos = NA_integer_, y_pos = NA_integer_ + ) + ) + + stemcell_hotspot_list <- call.row$stemcell_hotspot %>% str_split('\\|') %>% unlist() + dosage_sensitive_gene_list <- call.row$dosage_sensitive_gene %>% str_split('\\|') %>% unlist() + cancer_gene_list <- call.row$cancer_gene %>% str_split('\\|') %>% unlist() + panel_space_val <- unit(5, units = 'mm') + header.data <- make_header_data( + chr, win_start, win_end, sample_headers, + gr_info, stemcell_hotspot_list + ) + gene.data <- make_gene_data( + chr, win_start, win_end, sample_headers, + gr_genes, area_tb, + 
stemcell_hotspot_list, dosage_sensitive_gene_list, cancer_gene_list + ) + + header <- make_header_panel(chr, win_start, win_end, header.data) + + facet_wrap(~Sample_Name, nrow = 1) + cnv_track <- make_CNV_panel(chr, win_start, win_end, calls) %>% + add_CNV_plot_styling(panel_space_val) + lrr <- make_LRR_panel( + chr, win_start, win_end, plot.data, + area_tb = area_tb + ) %>% + add_CNV_plot_styling(panel_space_val) + baf <- make_BAF_panel( + chr, win_start, win_end, plot.data, + area_tb = area_tb + ) %>% + add_CNV_plot_styling(panel_space_val) + gene_track <- make_gene_panel(chr, win_start, win_end, gene.data) %>% + add_CNV_plot_styling(panel_space_val) n_cnvs <- length(na.omit(unique(calls$CNV_type))) gg <- header / cnv_track / lrr / baf / cnv_track / gene_track + plot_layout(heights = c(1, n_cnvs, 10, 10, n_cnvs, 2)) @@ -293,3 +404,62 @@ make_LRR_BAF_plots <- function( ) } + +make_chromsome_overview_plot <- function( + chr, sample_headers, label_SNVs, + raw_LRR_BAF, cnv_calls, gr_info +) { + plot.data <- raw_LRR_BAF %>% + filter(sample_id %in% names(sample_headers) & Chr == chr ) %>% + mutate(Sample_Name = factor(sapply(sample_id, function(x) sample_headers[x]), levels = sample_headers)) + + area_tb <- cnv_calls %>% + filter(sample_id %in% names(sample_headers) & seqnames == chr) %>% + mutate( + color = case_when( + CNV_type == 'gain' ~ '#1a9850', + CNV_type == 'loss' ~ '#f46d43', + CNV_type == 'LOH' ~ 'grey50' + ), + Sample_Name = factor(sapply(sample_id, function(x) sample_headers[x]), levels = sample_headers) + ) %>% + # Need to ensure table contains reference so everything is properly facet_wrapped + bind_rows( + tibble( + Sample_Name = factor(sample_headers, levels = sample_headers) + ) + ) + + SNV_label_data <- label_SNVs %>% + filter(sample_id %in% names(sample_headers) & seqnames == chr) %>% + mutate( + Sample_Name = factor(sapply(sample_id, function(x) sample_headers[x]), levels = sample_headers), + ) %>% + # Need to ensure table contains reference so 
everything is properly facet_wrapped + bind_rows( + tibble( + Sample_Name = factor(sample_headers, levels = sample_headers), + ) + ) + + chr_end <- gr_info %>% as_tibble() %>% filter(seqnames == chr) %>% pull(end) %>% max() + + header.data <- make_header_data( + #need the full data here + chr = chr, win_start = 0, win_end = chr_end, sample_headers = sample_headers, + gr_info = gr_info, stemcell_hotspot_list = 'dummy_string' + ) %>% + mutate(dummy_label = NA_character_) + panel_space_val <- unit(5, units = 'mm') + + header <- make_header_panel(chr, 0, chr_end, header.data, 'dummy_label') + + facet_wrap(~Sample_Name, nrow = 1) + lrr <- make_LRR_panel(chr, 0, chr_end, plot.data, SNV_label_data, area_tb, 0.7) %>% + add_CNV_plot_styling(panel_space_val) + baf <- make_BAF_panel(chr, 0, chr_end, plot.data, SNV_label_data, area_tb, 0.7) %>% + add_CNV_plot_styling(panel_space_val) + + gg <- header / lrr / baf + plot_layout(heights = c(1, 8, 8)) + + gg +} \ No newline at end of file diff --git a/stemcnv_check/scripts/report_template.Rmd b/stemcnv_check/scripts/report_template.Rmd index e2c9ca0..d4d9cd3 100644 --- a/stemcnv_check/scripts/report_template.Rmd +++ b/stemcnv_check/scripts/report_template.Rmd @@ -44,6 +44,29 @@ params: height: 40px; width: 250px; } +# collapsible text boxes +details { + user-select: none; +} +details>summary span.icon { + width: 24px; + height: 24px; + transition: all 0.3s; + margin-left: auto; +} +details[open] summary span.icon { + transform: rotate(180deg); +} +summary { + display: flex; + cursor: pointer; + background-color:#eee; + color: #444; +} +summary::-webkit-details-marker { + display: none; +} + @@ -73,21 +96,6 @@ source(file.path(config$snakedir, 'scripts/R/report_table_functions.R')) source(file.path(config$snakedir, 'scripts/R/report_plotting_functions.R')) source(file.path(config$snakedir, 'scripts/R/vcf_io_functions.R')) -include.section <- function(section) { - include <- ifelse( - all(report_config$include_sections == 
'__all__'), - T, - section %in% report_config$include_sections - ) - exclude <- section %in% report_config$exclude_sections - # also exclude QC sections if they are not defined in settings - include <- include & !( - str_detect(section, '^QC\\.(?!(summary|GenCall|settings))') & - str_remove(section, 'QC\\.') %!in% config$settings$CNV.calling.tools - ) - include & !exclude -} - # General variables & settings image_folder <- paste0(params$report_name, '-', params$out_format, '_images/') @@ -135,11 +143,6 @@ gr_genes <- load_gtf_data(params$gtf_file, config, target_chrom_style) gr_info <- load_genomeInfo(params$ginfo_file, config, target_chrom_style) roi_gr <- get_roi_gr(sample_id, sampletable, config, gr_genes, gr_info, target_chrom_style) -include_roi_plots <- include.section('regions.of.interest') & ifelse( - 'Regions_of_Interest' %in% colnames(sampletable), - length(roi_gr) > 0, - FALSE -) stemcell_hotspot_tb <- load_hotspot_table(config, 'stemcell_hotspot') dosage_sensitive_gene_tb <- get_dosage_sensivity_tb(config$settings$CNV_processing$Check_score_values) @@ -161,6 +164,28 @@ tr_tibble <- function(tb) { ## Rmd related functions + +include.section <- function(section) { + include <- ifelse( + all(report_config$include_sections == '__all__'), + T, + section %in% report_config$include_sections + ) + exclude <- section %in% report_config$exclude_sections + # also exclude QC sections if they are not defined in settings + include <- include & !( + str_detect(section, '^QC\\.(?!(summary|GenCall|settings))') & + str_remove(section, 'QC\\.') %!in% config$settings$CNV.calling.tools + ) + include & !exclude +} + +include_roi_plots <- include.section('regions.of.interest') & ifelse( + 'Regions_of_Interest' %in% colnames(sampletable), + length(roi_gr) > 0, + FALSE +) + CNV_ID_str <- function(ID, i, region_name = NULL, ...) 
{ #ROIs might have user defined names if(!is.null(region_name)) id_str <- paste0('roi-', region_name) @@ -180,8 +205,8 @@ make_CNV_plot_section <- function(call.table, plotsection = 'denovo') { for (i in call.table$i) { row <- call.table[i,] - res <- make_LRR_BAF_plots( - row, raw_LRR_BAF, cnv_calls, gr_genes, gr_info, + res <- make_call_plot( + row, raw_LRR_BAF, cnv_calls, gr_genes, gr_info, sample_headers, report_config$call.data.and.plots[[plotsection]]$plot.region.minsize, report_config$call.data.and.plots[[plotsection]]$plot.flanking.region.relative ) @@ -234,6 +259,18 @@ subchunkify <- function(g, id, fig_height=7, fig_width=7, options = '') { cat(knitr::knit(text = knitr::knit_expand(text = sub_chunk), quiet = TRUE)) } +collapsible_html_text <- function(text, header = 'Description' ) { + paste0( + '
\n\n\n\n', header, '\n\n', + '\n\n', + '\n\n', + '

\n\n', + text, + '

\n\n', + '
\n\n' + ) %>% cat() +} + ``` @@ -283,28 +320,28 @@ snv_critical_desc <- list( 'any-protein-changing' = ' - changes the protein sequence' ) - -cat(paste( +paste( 'This summary table is meant to serve as a quick overview of the quality of an hPSC sample. ', - 'Note that without a reference sample all detected variants are considered "de-novo", even if most are likely from the original donor germline.\n\n', + 'Note that without a reference sample all detected variants are considered "de-novo", even if most are likely from ', + 'the original donor germline.\n\n', 'Coloring of all fields is based on (usually) two thresholds defined in the config file (under the evaluation_settings section),', 'one for a signalling (yellow) level, and one for a more serious warning (orange) or even critical (red) level. ', 'Only certain values are potentially considered critical and are marked by bold text in the table, ', 'which values behave like this is also defined in the config.\n\n', - 'For assigning SNVs (as determined by the Chip Array) as critical, the SNV genotype needs to differs from reference ', - 'sample (if one is defined) and (by default preselection) it needs to change protein sequence. ', + 'For assigning SNVs (as determined by the Chip Array) as critical, the SNV genotype needs to differs from the ', + 'reference sample (if one is defined) and (by default preselection) it needs to change the protein sequence. ', 'Additionally one of the following criteria needs to be met:\n\n', paste(sapply(config$settings$SNV_analysis$critical_SNV, \(x){snv_critical_desc[[x]]}), collapse = '\n\n'), '\n\n', 'CNV Call designation takes into account minimum Check_Score thresholds for critical and (optionally) reportable level, ', 'and for exclusion overlap with a reference call (these calls are not denovo and never criticcal or reportable) and ', 'call filter flags (i.e. calls below minimum size, or having a probe coverage gap). 
', - 'If no reportable Check_Score threshold is defined the critical one is used (but the reportable exclusion filters).\n\n', + 'If no reportable Check_Score threshold is defined the critical one is used (but with the reportable exclusion filters).\n\n', 'The current thresholds and exclsuion filters are:\n\n', ' - Critical Check_Score:', config$evaluation_settings$CNV_call_categorisation$check_score.critical, '\n\n', ' - Critical exclusion Filters:', crit_excl_filters, '\n\n', ' - Reportable Check_Score:', config$evaluation_settings$CNV_call_categorisation$check_score.reportable, '\n\n', ' - Reportable exclusion Filters: ', repo_excl_filters, '\n\n' -)) +) %>% collapsible_html_text('Summary table explanations') read_excel( fix_rel_filepath(params$input$summary_xlsx, config), @@ -316,6 +353,11 @@ cat('\n\n') ```{r qc.gencall, eval = include.section('QC.GenCall'), results='asis'} cat('### GenCall\n\n') + +paste( + 'This table displays the direct quality metrics from the GenCall software.\n\n' +) + read_excel( fix_rel_filepath(params$input$summary_xlsx, config), sheet = 'gencall_stats' @@ -326,6 +368,11 @@ cat('\n\n') ```{r qc.penncnv, eval = include.section('QC.PennCNV'), results='asis'} cat('### PennCNV\n\n') +paste( + 'The first table displays quality metrics from the PennCNV algorythm. 
', + 'The second displays CNV call statistcs for only PennCNV\n\n' +) + read_excel( fix_rel_filepath(params$input$summary_xlsx, config), sheet = 'PennCNV_QC_info' @@ -342,6 +389,10 @@ cat('\n\n') ```{r qc.CBS, eval = include.section('QC.CBS'), results='asis'} cat('### CBS\n\n') +paste( + 'This table displays CNV call statistcs for only CBS.\n\n' +) + read_excel( fix_rel_filepath(params$input$summary_xlsx, config), sheet = 'CBS_stats' @@ -388,10 +439,10 @@ raw_LRR_BAF <- c(params$input$snp_vcf, params$input$ref_snp_vcf) %>% fix_CHROM_format(target_chrom_style) %>% as_tibble() %>% #FIXME / future: adapt this - dplyr::rename(Chr = seqnames, Position = start, Name = ID, + dplyr::rename(Chr = seqnames, Position = start, `B Allele Freq` = BAF, `Log R Ratio` = LRR) %>% mutate(filter.passed = FILTER == 'PASS') %>% - dplyr::select(sample_id, Name, Chr, Position, filter.passed, `Log R Ratio`, `B Allele Freq`) + dplyr::select(sample_id, ID, Chr, Position, filter.passed, `Log R Ratio`, `B Allele Freq`) cnv_calls <- c(params$input$cnv_vcf, params$input$ref_cnv_vcf) %>% sapply(fix_rel_filepath, config=config) %>% @@ -573,6 +624,15 @@ snc_qc_tb <- params$inputs$snv_analysis %>% ```{r SNV.table, eval = include.section('SNV.table'), results='asis'} cat('## Table of SNVs\n\n') +paste( + 'This table lists all SNVs/SNPs detected by the Chip Array which are different from the reference genome and', + 'are annotated at least as protein changing. The "SNV label" further categorizes the SNVs into critical,', + 'unreliable critical (due to missing reference or bad quality calls), protein changing, and reference genotype.', + 'The hidden "Critical reason" column has further details on why a SNV is considered critical. 
These are also', + 'highlighted in the "ROI Hits", "Gene Name" (curated hotspots for stem cells) and "Annotation" (specific effect of', + 'the SNP on the protein) columns.\n\n' +) %>% collapsible_html_text('SNV table explanations') + snv_annotation_tb <- tibble( list_name = 'critical_annotations', hotspot = config$settings$SNV_analysis$critical_annotations, @@ -685,7 +745,60 @@ cat('\n\n') ``` -# Sample comparison {.tabset} +# Sample comparison + +```{r genome.overview, eval = include.section('genome.overview'), results='asis'} + +cat('## Genome Overview {.tabset}\n\n') + +paste( + 'The following plots each show a whole chromosome overview of the sample, combing to a whole genome view.', + 'CNV calls (filtered based on the config settings) are shown as colored background bars, with the color indicating the', + 'type of call: green for gains, red for losses, and grey for LOH.', + 'Additionally, if the sample has a reference, SNVs that are labelled critical or protein changing/unreliable', + 'critical are also highlighted in red and orange, respectively.\n\n' +) %>% collapsible_html_text('Genome overview explanations') + +if (!is.na(ref_id)) { + label_SNVs <- SNV_table %>% + filter(SNV_label != 'reference_GT') %>% + mutate( + sample_id = sample_id, + color = ifelse(SNV_label == 'critical', 'red', 'orange') + ) +} else { + label_SNVs <- SNV_table %>% mutate(color = 'red', sample_id = sample_id) %>% filter(SNV_label == 'dummy') +} + +call_filter_regex <- ifelse( + is.null(report_config$genome_overview$call.exclude_filter), + 'dummy', + report_config$genome_overview$call.exclude_filter %>% paste(collapse = '|') +) +filtered_calls <- cnv_calls %>% + filter(!str_detect(FILTER, call_filter_regex)) + +for (chrom in levels(raw_LRR_BAF$Chr) %>% str_subset('M', T)) { + cat('### ', chrom, '\n\n') + + subchunkify( + make_chromsome_overview_plot(chrom, sample_headers[1], label_SNVs, raw_LRR_BAF, filtered_calls, gr_info), + paste0('chromosome_overview.', chrom), 5, 12 + ) + + if 
(!is.na(ref_id) & report_config$genome_overview$show_reference) { + subchunkify( + make_chromsome_overview_plot(chrom, sample_headers[2], label_SNVs, raw_LRR_BAF, filtered_calls, gr_info), + paste0('chromosome_overview.ref.', chrom), 5, 12 + ) + } + + cat('\n\n') +} + +cat('\n\n') + +``` ```{r snp.dendrogram, results='asis', eval = include.section('SNP.dendrogram')} @@ -698,23 +811,23 @@ snp.distance.tb <- read_excel( if (nrow(snp.distance.tb) == 1) { - cat('No additional samples found for the comparison, no dendrogram can be built.\n\n') + cat('No additional samples used for comparison, no dendrogram can be built.\n\n') } else { - cat( - 'Sample identities can be comparsed based on the dendrogram built on the SNP genotypes. ', - 'The dendrogram is built using the manhattan distance between samples, counting both alleles ', - 'from Probes that are not quality in every included sample. ', - 'Accoringly, the distance between two samples is the sum of the absolute differences between the two ', - 'alleles at each SNP (also shown in the table below) after QC filters. ', - 'Samples that are very close together are likely identical or clonally related. 
', + paste( + 'Sample identities can be compared based on the dendrogram built on the SNP genotypes.', + 'The dendrogram is built using the Manhattan distance between samples, counting both alleles', + 'from Probes that are not quality in every included sample.', + 'Accordingly, the distance between two samples is the sum of the absolute differences between the two', + 'alleles at each SNP (also shown in the table below) after QC filters.', + 'Samples that are very close together are likely identical or clonally related.', 'Sample selection as well as color and shape lables are controlled by the config file.\n\n' - ) + ) %>% collapsible_html_text('Dendrogram explanations') #Build annotation table - color_by <- report_config$SNP_comparison$dendrogram.color.by #%>% check_cols_exits() - shape_by <- report_config$SNP_comparison$dendrogram.shape.by #%>% check_cols_exits() + color_by <- report_config$SNP_comparison$dendrogram.color.by + shape_by <- report_config$SNP_comparison$dendrogram.shape.by dend.format.df <- sampletable %>% filter(Sample_ID %in% snp.distance.tb$sample_distance_to) @@ -878,88 +991,3 @@ if (nrow(snp.distance.tb) == 1) { } ``` - -```{r virtual.karyotype, eval = include.section('virtual.karyotype'), results='asis'} - -cat('## Virtual Karyotype\n\n') - -if (!is.na(ref_id)) { - cat(str_glue('The first plot shows only this sample ({sample_name}), the second a side by side comparison with the reference sample (left: {sample_name}; right: {ref_name}).\n\n')) -} - -# Use prepared static data table (genome build specific, UCSC derived) instead of inbuilt RIdeogram -# data(human_karyotype, package="RIdeogram") -genome_data <- gr_info %>% - filter(centromer) %>% - group_by(size) %>% - reduce_ranges() %>% - as.data.frame() %>% - dplyr::rename(Chr = seqnames, CE_start = start, CE_end = end, End = size) %>% - mutate(Start = 0, Chr = str_remove(Chr, 'chr')) %>% - select(Chr, Start, End, CE_start, CE_end) - - -ideo_filter_regex <- ifelse( - 
is.null(report_config$ideogram.exclude_filter), - 'dummy', - report_config$ideogram.exclude_filter %>% paste(collapse = '|') -) -sample.calls <- cnv_calls %>% - filter(!str_detect(FILTER, ideo_filter_regex)) %>% - dplyr::select(seqnames, start, end, CNV_type, sample_id) %>% - dplyr::rename(Chr = seqnames, Start = start, End = end) %>% - mutate(Value = ifelse(CNV_type %!in% c('gain', 'loss'), 0, 1), - Value = ifelse(CNV_type == 'loss', -1, Value), - Chr = str_remove(Chr, 'chr')) -overlay.tb <- sample.calls %>% - filter(sample_id == !!sample_id) %>% - dplyr::select(Chr, Start, End, Value) - -#This should not be the case unless there are 0 calls that generated images -if (!dir.exists(file.path(workdir, image_folder))) { - dir.create(file.path(workdir, image_folder)) -} - -svg_file1 <- tempfile(fileext = '.svg') -#somehow 'convertSVG' can't deal with absolute paths -png_file1 <- file.path(image_folder, 'virtualKaryotype_ideogram.png') - -ideogram(genome_data, overlaid = overlay.tb, colorset1 = c('red', 'grey50', 'green'), output = svg_file1) -convertSVG(svg_file1, file = png_file1) - -if (params$out_format == 'html') { - cat(paste0('')) -} else { - #For reasons tiny-tex/latex executes from the directoy this template is located in (rather than the defined outdir), so relative paths don't work for pdf - include_graphics(file.path(workdir, png_file1) %>% normalizePath, rel_path = FALSE) -} - -if (!is.na(ref_id)) { - refdata <- sample.calls %>% - filter(sample_id == ref_id) %>% - dplyr::select(Chr, Start, End, Value) - - svg_file2 <- tempfile(fileext = '.svg') - png_file2 <- file.path(image_folder, 'virtualKaryotype_ideogram_vsRef.png') - - ideogram( - genome_data, - overlaid = overlay.tb, - label = refdata, - label_type = 'heatmap', - colorset1 = c('red', 'grey50', 'green'), - colorset2 = c('red', 'grey50', 'green'), - output = svg_file2 - ) - convertSVG(svg_file2, file = png_file2) - - if (params$out_format == 'html') { - cat(paste0('')) - } else { - 
include_graphics(file.path(workdir, png_file2) %>% normalizePath, rel_path = FALSE) - } -} - -cat('\n\n') - -```