Skip to content

Commit

Permalink
- fixes in report (SNP dendrogram and dosage links)
Browse files Browse the repository at this point in the history
- fixing pseudo-autosomal filter to actually work
  • Loading branch information
Nicolai-vKuegelgen committed Oct 30, 2024
1 parent 4f098fb commit 4ab9223
Show file tree
Hide file tree
Showing 6 changed files with 200 additions and 159 deletions.
2 changes: 1 addition & 1 deletion stemcnv_check/control_files/allowedvalues_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ settings:
GenTrainScore: float_le1_ge0
GenCallScore: float_le1_ge0
Position.duplicates: str_(keep|remove|highest-GenCall|highest-GenTrain)
pseudoautosomal: str_(keep|remove|remove-male)
Pseudoautosomal: str_(keep|remove|remove-male)

default-filter-set: filtersetnodefault

Expand Down
2 changes: 1 addition & 1 deletion stemcnv_check/control_files/default_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ settings:
GenTrainScore: 0.7
GenCallScore: 0.7 # Anything above this should be a stable genotype call
Position.duplicates: highest-GenCall # keep|remove|highest-GenCall|highest-GenTrain
pseudoautosomal: remove-male # keep|remove|remove-male
Pseudoautosomal: remove-male # keep|remove|remove-male
## Other filter settings we have tested:
## This is closest to other Illumina defaults, but susceptible to noise
# basic:
Expand Down
1 change: 1 addition & 0 deletions stemcnv_check/rules/SNP_processing.smk
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ rule filter_snp_vcf:
err=os.path.join(LOGPATH, "filter_snp_vcf", "{sample_id}", "{filter}.error.log"),
#out=os.path.join(LOGPATH, "filter_snp_vcf", "{sample_id}", "{filter}.out.log")
params:
filter=lambda wildcards: config['settings']['probe-filter-sets'][wildcards.filter],
sample_sex=lambda wildcards: get_ref_id(wildcards, True)[2],
genome_version=lambda wildcards: get_static_input('genome_version')(wildcards)
conda:
Expand Down
4 changes: 2 additions & 2 deletions stemcnv_check/scripts/R/hotspot_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,8 @@ get_dosage_sensivity_tb <- function(score_settings) {

description_html_pattern <- str_replace_all(
tb$description,
'([:,] ?)([^:,]+?)\\{([0-9]+)\\}(?=, ?|\\\\\\\\n|\\\\n|$)',
'\\1<a href="{a\\3}" target="_blank" rel="noopener noreferrer">\\2</a>'
'Source: Collins et al. 2022 \\{1\\}.',
str_glue('Source: <a href="{doi}" target="_blank" rel="noopener noreferrer">Collins et al. 2022</a>.')
) %>%
str_replace_all('\\\\n', '&#013;') %>%
str_replace_all('\\n', '&#013;')
Expand Down
323 changes: 169 additions & 154 deletions stemcnv_check/scripts/report_template.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -689,178 +689,193 @@ cat('\n\n')

```{r snp.dendrogram, results='asis', eval = include.section('SNP.dendrogram')}
cat('## Identity comparison (SNP dendrogram)\n\n')
cat('## Identity comparison\n\n')
snp.distance.tb <- read_excel(
fix_rel_filepath(params$input$snv_analysis, config),
sheet = 'SNP_GT_distances'
)
#Build annotation table
color_by <- report_config$SNP_comparison$dendrogram.color.by #%>% check_cols_exits()
shape_by <- report_config$SNP_comparison$dendrogram.shape.by #%>% check_cols_exits()
dend.format.df <- sampletable %>%
filter(Sample_ID %in% snp.distance.tb$sample_distance_to)
if (length(color_by) == 1) {
if (nrow(unique(dend.format.df[, color_by])) > 10) {
warn_str <- str_glue('More than 10 colors are needed to use "{color_by}" for coloring. Consider using fewer unique entries.')
cat(warn_str, '\n\n')
warning(warn_str)
}
dend.format.df <- dend.format.df %>%
mutate(across(all_of(color_by), ~ factor(., levels = sort(unique(.)))) ) %>%
arrange(!!sym(color_by)) %>%
mutate(col = viridis_pal(option='H')(length(unique(!!sym(color_by))))[match(!!sym(color_by), unique(!!sym(color_by)))],
col = ifelse(is.na(!!sym(color_by)), 'grey90', col)
)
col_map <- dend.format.df$col %>% unique()
names(col_map) <- levels(pull(dend.format.df, !!sym(color_by)))
if (nrow(snp.distance.tb) == 1) {
cat('No additional samples found for the comparison, no dendrogram can be built.\n\n')
} else {
warning('No matching column to use for coloring!')
cat('No matching column to use for coloring!\n')
dend.format.df <- dend.format.df %>%
mutate(col = 'black', `_dummy_color` = 'bar')
col_map <- c('bar' = 'black')
color_by <- '_dummy_color'
}
if (length(shape_by) == 1) {
dend.format.df <- dend.format.df %>%
mutate(across(all_of(shape_by), ~ factor(., levels = sort(unique(.), na.last = T))),
) %>%
arrange(!!sym(shape_by))
if (nrow(na.omit(unique(dend.format.df[, shape_by]))) <= 5) {
use.shapes <- 15:18
na.shape <- 1
} else if (nrow(na.omit(unique(dend.format.df[, shape_by]))) <= 15) {
use.shapes <- 0:14
na.shape <- 16
cat(
'Sample identities can be compared based on the dendrogram built on the SNP genotypes. ',
'The dendrogram is built using the manhattan distance between samples, counting both alleles. ',
'Accordingly, the distance between two samples is the sum of the absolute differences between the two ',
'alleles at each SNP (also shown in the table below). Samples that are very close together are likely ',
'identical or clonally related. ',
'Sample selection as well as color and shape labels are controlled by the config file.\n\n'
)
#Build annotation table
color_by <- report_config$SNP_comparison$dendrogram.color.by #%>% check_cols_exits()
shape_by <- report_config$SNP_comparison$dendrogram.shape.by #%>% check_cols_exits()
dend.format.df <- sampletable %>%
filter(Sample_ID %in% snp.distance.tb$sample_distance_to)
if (length(color_by) == 1) {
if (nrow(unique(dend.format.df[, color_by])) > 10) {
warn_str <- str_glue('More than 10 colors are needed to use "{color_by}" for coloring. Consider using fewer unique entries.')
cat(warn_str, '\n\n')
warning(warn_str)
}
dend.format.df <- dend.format.df %>%
mutate(across(all_of(color_by), ~ factor(., levels = sort(unique(.)))) ) %>%
arrange(!!sym(color_by)) %>%
mutate(col = viridis_pal(option='H')(length(unique(!!sym(color_by))))[match(!!sym(color_by), unique(!!sym(color_by)))],
col = ifelse(is.na(!!sym(color_by)), 'grey90', col)
)
col_map <- dend.format.df$col %>% unique()
names(col_map) <- levels(pull(dend.format.df, !!sym(color_by)))
} else {
others <- dend.format.df[pull(dend.format.df, !!sym(shape_by)) %>% as.integer() > 15, shape_by] %>% na.omit() %>% unlist()
warn_str <- str_glue('Only 15 shapes are available, but "{shape_by}" would need {length(unique(pull(dend.format.df, !!sym(shape_by))))}. ',
'Consider using fewer unique entries. These values are summarised as "Other": {paste(others, collapse = ", ")}')
cat(warn_str, '\n\n')
warning(warn_str)
use.shapes <- 0:14
na.shape <- 16
warning('No matching column to use for coloring!')
cat('No matching column to use for coloring!\n')
dend.format.df <- dend.format.df %>%
mutate(col = 'black', `_dummy_color` = 'bar')
col_map <- c('bar' = 'black')
color_by <- '_dummy_color'
}
if (length(shape_by) == 1) {
dend.format.df <- dend.format.df %>%
mutate(across(all_of(shape_by), ~ factor(., levels = sort(unique(.), na.last = T))),
) %>%
arrange(!!sym(shape_by))
if (nrow(na.omit(unique(dend.format.df[, shape_by]))) <= 5) {
use.shapes <- 15:18
na.shape <- 1
} else if (nrow(na.omit(unique(dend.format.df[, shape_by]))) <= 15) {
use.shapes <- 0:14
na.shape <- 16
} else {
others <- dend.format.df[pull(dend.format.df, !!sym(shape_by)) %>% as.integer() > 15, shape_by] %>% na.omit() %>% unlist()
warn_str <- str_glue('Only 15 shapes are available, but "{shape_by}" would need {length(unique(pull(dend.format.df, !!sym(shape_by))))}. ',
'Consider using fewer unique entries. These values are summarised as "Other": {paste(others, collapse = ", ")}')
cat(warn_str, '\n\n')
warning(warn_str)
use.shapes <- 0:14
na.shape <- 16
dend.format.df <- dend.format.df %>%
mutate(
across(all_of(shape_by), ~ ifelse(as.integer(.) > 15, 'Other', as.character(.))),
across(all_of(shape_by), ~ factor(., levels = unique(.)))
)
}
dend.format.df <- dend.format.df %>%
mutate(
across(all_of(shape_by), ~ ifelse(as.integer(.) > 15, 'Other', as.character(.))),
across(all_of(shape_by), ~ factor(., levels = unique(.)))
shape = use.shapes[match(!!sym(shape_by), unique(!!sym(shape_by)))],
shape = ifelse(is.na(!!sym(shape_by)), na.shape, shape)
)
shape_map <- dend.format.df$shape %>% unique()
names(shape_map) <- levels(pull(dend.format.df, !!sym(shape_by)))
} else {
warning('No matching column to use for shapes!')
cat('No matching column to use for shapes!\n\n')
dend.format.df <- dend.format.df %>%
mutate(shape = 16, `_dummy_shape` = 'foo')
shape_map <- c('foo' = 16)
shape_by <- '_dummy_shape'
}
dend.format.df <- dend.format.df %>%
mutate(
shape = use.shapes[match(!!sym(shape_by), unique(!!sym(shape_by)))],
shape = ifelse(is.na(!!sym(shape_by)), na.shape, shape)
SNP.genotype.distances <- snp.distance.tb %>% select(-sample_distance_to) %>% as.matrix()
# Build dendrogram & sort annotation table by it
hc <- snp.distance.tb %>%
column_to_rownames('sample_distance_to') %>%
as.dist() %>%
hclust()
dd <- as.dendrogram(hc)
dend.format.df <- dend.format.df[match(labels(dd), dend.format.df$Sample_ID),]
dend <- dd %>%
set('labels_col', dend.format.df$col) %>%
set('leaves_col', dend.format.df$col) %>%
set('leaves_pch', dend.format.df$shape)
gg1 <- dend %>%
raise.dendrogram (max(SNP.genotype.distances)/50) %>%
set('labels_cex', .8) %>%
set('leaves_cex', 2) %>%
set('branches_lwd', .5) %>%
as.ggdend() %>%
ggplot(offset_labels = -max(SNP.genotype.distances)/25, theme = theme_classic()) +
scale_y_continuous(
name = 'SNP distance (manhattan method)',
breaks = function(limits) breaks_pretty()(c(max(limits[1], 0), limits[2]), 5) + max(SNP.genotype.distances)/50,
labels = function(breaks) number(breaks - max(SNP.genotype.distances)/50, big.mark = ','),
limits = c(-max(SNP.genotype.distances)/2, NA),
expand = expansion()
) +
scale_x_continuous(limits = c(0, NA), expand = expansion(add = c(0, .5))) +
geom_line(
data = tibble(x = c(0, 0), y = max(SNP.genotype.distances) * c(1/50, 51/50)),
aes(x=x, y=y)
) +
theme(
legend.position = "bottom",
axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.line.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line.y = element_blank(),
axis.title.y = element_text(angle = 90, hjust = .75)
)
# make legend
gg2 <- dend.format.df %>%
#arrange(SampleGroup) %>%
ggplot(aes(x = Sample_ID, y = 1, col = !!sym(color_by), shape = !!sym(shape_by))) +
geom_point() +
scale_color_manual(
values = col_map,
guide = ifelse(
color_by == '_dummy_color',
list('none'),
list(guide_legend(direction = 'horizontal',title.position = 'top', ncol = 8, byrow=T))
)[[1]]
) +
scale_shape_manual(
values = shape_map,
guide = ifelse(
shape_by == '_dummy_shape',
list('none'),
list(guide_legend(direction = 'horizontal',title.position = 'top', ncol = 8, byrow=T))
)[[1]]
) +
theme(legend.box = "vertical", legend.text = element_text(size = 10), legend.title = element_text(size = 12))
gg <- gg1 + as_ggplot(get_legend(gg2)) + plot_layout(ncol = 1, heights = c(5, 1))
subchunkify(gg, 'snp.dendrogram.plot', 8, 10)
shape_map <- dend.format.df$shape %>% unique()
names(shape_map) <- levels(pull(dend.format.df, !!sym(shape_by)))
} else {
warning('No matching column to use for shapes!')
cat('No matching column to use for shapes!\n\n')
dend.format.df <- dend.format.df %>%
mutate(shape = 16, `_dummy_shape` = 'foo')
shape_map <- c('foo' = 16)
shape_by <- '_dummy_shape'
snp.distance.tb %>%
filter(sample_distance_to == sample_id) %>%
dplyr::select(-sample_distance_to) %>%
select(all_of(labels(dd))) %>%
simple_table_output(params$out_format, paste0('Distances to "', sample_id, '"'))
}
SNP.genotype.distances <- snp.distance.tb %>% select(-sample_distance_to) %>% as.matrix()
# Build dendrogram & sort annotation table by it
hc <- snp.distance.tb %>%
column_to_rownames('sample_distance_to') %>%
as.dist() %>%
hclust()
dd <- as.dendrogram(hc)
dend.format.df <- dend.format.df[match(labels(dd), dend.format.df$Sample_ID),]
dend <- dd %>%
set('labels_col', dend.format.df$col) %>%
set('leaves_col', dend.format.df$col) %>%
set('leaves_pch', dend.format.df$shape)
gg1 <- dend %>%
raise.dendrogram (max(SNP.genotype.distances)/50) %>%
set('labels_cex', .8) %>%
set('leaves_cex', 2) %>%
set('branches_lwd', .5) %>%
as.ggdend() %>%
ggplot(offset_labels = -max(SNP.genotype.distances)/25, theme = theme_classic()) +
scale_y_continuous(
name = 'SNP distance (manhattan method)',
breaks = function(limits) breaks_pretty()(c(max(limits[1], 0), limits[2]), 5) + max(SNP.genotype.distances)/50,
labels = function(breaks) number(breaks - max(SNP.genotype.distances)/50, big.mark = ','),
limits = c(-max(SNP.genotype.distances)/2, NA),
expand = expansion()
) +
scale_x_continuous(limits = c(0, NA), expand = expansion(add = c(0, .5))) +
geom_line(
data = tibble(x = c(0, 0), y = max(SNP.genotype.distances) * c(1/50, 51/50)),
aes(x=x, y=y)
) +
theme(
legend.position = "bottom",
axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.line.x = element_blank(),
axis.ticks.x = element_blank(),
axis.line.y = element_blank(),
axis.title.y = element_text(angle = 90, hjust = .75)
)
# make legend
gg2 <- dend.format.df %>%
#arrange(SampleGroup) %>%
ggplot(aes(x = Sample_ID, y = 1, col = !!sym(color_by), shape = !!sym(shape_by))) +
geom_point() +
scale_color_manual(
values = col_map,
guide = ifelse(
color_by == '_dummy_color',
list('none'),
list(guide_legend(direction = 'horizontal',title.position = 'top', ncol = 8, byrow=T))
)[[1]]
) +
scale_shape_manual(
values = shape_map,
guide = ifelse(
shape_by == '_dummy_shape',
list('none'),
list(guide_legend(direction = 'horizontal',title.position = 'top', ncol = 8, byrow=T))
)[[1]]
) +
theme(legend.box = "vertical", legend.text = element_text(size = 10), legend.title = element_text(size = 12))
gg <- gg1 + as_ggplot(get_legend(gg2)) + plot_layout(ncol = 1, heights = c(5, 1))
subchunkify(gg, 'snp.dendrogram.plot', 8, 10)
snp.distance.tb %>%
filter(sample_distance_to == sample_id) %>%
dplyr::select(-sample_distance_to) %>%
select(all_of(labels(dd))) %>%
simple_table_output(params$out_format, paste0('Distances to "', sample_id, '"'))
```

```{r virtual.karyotype, eval = include.section('virtual.karyotype'), results='asis'}
Expand Down
Loading

0 comments on commit 4ab9223

Please sign in to comment.