CTG_report.Rmd

---
params:
    data_file: "/Volumes/cmap_obelix/pod/custom/CPS/jdwork/CPS004/CPS004_ctg_viability_data.Rds"
title: "`r basename(dirname(params$data_file))`"
author: "Andrew Boghossian"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output: html_document
---

```{r setup, include=FALSE}
# Chunk defaults for the whole report: code, output, warnings and messages are
# hidden unless a chunk opts back in (e.g. `include = TRUE`). TRUE/FALSE are
# spelled out because T/F are ordinary variables that can be reassigned.
knitr::opts_chunk$set(
  echo = FALSE, include = FALSE, warning = FALSE, error = FALSE,
  message = FALSE, fig.width = 10
)
library(plyr)
library(tidyverse)
library(ggthemes)
library(magrittr)
library(future.apply)
library(psych)
library(ComplexHeatmap)
library(viridis)
library(circlize)
library(ggridges)
# Black-and-white ggplot theme with a 12pt base size for every figure.
theme_set(theme_bw(base_size = 12))
# Project helper scripts. NOTE(review): functions used later that are not
# defined in this file (calculate_ssmd_robust, brd_id_to_structure_id,
# get_sigmoid, fitDoseCurve) presumably come from these -- confirm.
suppressMessages(source("src/helperFunctions.R"))
suppressMessages(source("src/make_dose_curves.R", chdir = TRUE))

# Project label: everything after the first "_" in the name of the directory
# holding the data file (e.g. "CPS004_FOO_BAR" -> "FOO_BAR").
# word() returns NA when the directory name contains no "_", which would put
# "NA" into the report text and plate labels and make the str_detect() filter
# on the project label drop every row. Fall back to the full directory name.
data_dir_name <- basename(dirname(params$data_file))
project <- word(data_dir_name, 2, -1, sep = fixed("_"))
if (is.na(project)) {
  project <- data_dir_name
}
```

```{r load data}
# Load the normalized CTG viability table named by the report parameter.
normalized_data <- read_rds(params$data_file)

# When no plate map name is recorded, use the source plate barcode instead
# (only if that column exists). The condition is a single TRUE/FALSE, so the
# scalar, short-circuiting `&&` is used rather than the elementwise `&`.
if (all(is.na(normalized_data$plate_map_name)) &&
    "Source Barcode" %in% colnames(normalized_data)) {
  normalized_data %<>% dplyr::mutate(plate_map_name = `Source Barcode`)
}
```

# Introduction

This report summarizes quality control (QC) results for the `r project` CTG plates.

# Luminescence and viability distributions

## Luminescence distributions

```{r luminescence, include = T, fig.width = 10, fig.height = 8}
# Luminescence on a log10 x-axis, drawn as a density curve plus a
# density-scaled histogram per perturbation type. Panels: plate map and
# replicate in rows, cell line in columns.
normalized_data %>%
  ggplot(aes(x = lum, color = pert_type)) +
  geom_density() +
  # after_stat(density) replaces the deprecated `..density..` notation.
  geom_histogram(aes(y = after_stat(density), fill = pert_type),
                 alpha = .3, position = "identity") +
  scale_x_log10() +
  labs(x = "Luminescence", y = "Density", color = "Type") +
  ggtitle("Distribution of luminescence, separated by treatment type") +
  # Only the fill legend is shown; the colour legend would duplicate it.
  scale_color_tableau(guide = "none") +
  scale_fill_tableau(name = "pert_type") +
  facet_grid(plate_map_name + replicate ~ ccle_name)
```

## Control luminescence distributions

```{r, include=T, fig.width=10, fig.height=8}
# Density-scaled luminescence histograms for every non-treatment well.
# Panels: plate map and replicate in rows; cell line (text before the first
# "_" of ccle_name) and perturbation type in columns, each with free axes.
normalized_data %>%
  dplyr::filter(pert_type != "treatment") %>%
  ggplot(aes(x = lum)) +
  # after_stat(density) replaces the deprecated `..density..` notation.
  geom_histogram(aes(y = after_stat(density)), position = "identity") +
  # No colour aesthetic is mapped in this plot, so no colour label is set.
  labs(x = "Luminescence", y = "Density") +
  ggtitle("Distribution of luminescence, separated by treatment type") +
  facet_grid(
    plate_map_name + replicate ~
      word(ccle_name, 1, sep = fixed("_")) + pert_type,
    scales = "free"
  )
```

## Viability distributions

```{r viability, include = T, fig.width = 10, fig.height = 8}
# Viability on a log10 x-axis, drawn as a density curve plus a density-scaled
# histogram per perturbation type. Panels: plate map and replicate in rows,
# cell line (text before the first "_" of ccle_name) in columns.
normalized_data %>%
  ggplot(aes(x = viability, color = pert_type)) +
  geom_density() +
  # after_stat(density) replaces the deprecated `..density..` notation.
  geom_histogram(aes(y = after_stat(density), fill = pert_type),
                 alpha = .3, position = "identity") +
  scale_x_log10() +
  labs(x = "Viability", y = "Density", color = "Type") +
  # Only the fill legend is shown; the colour legend would duplicate it.
  scale_color_tableau(guide = "none") +
  scale_fill_tableau(name = "pert_type") +
  ggtitle("Distribution of viability, separated by treatment type") +
  facet_grid(plate_map_name + replicate ~ word(ccle_name, 1, sep = fixed("_")))
```

# SSMDs

```{r calc ssmd}
# One SSMD value per plate / cell line / plate map / replicate, computed by
# calculate_ssmd_robust() from the positive-control and negative-control
# log fold changes of that group. The result is returned ungrouped.
ssmd_group_cols <- c("arp_barcode", "ccle_name", "plate_map_name", "replicate")
ssmds <- normalized_data %>%
  dplyr::group_by(dplyr::across(dplyr::all_of(ssmd_group_cols))) %>%
  dplyr::summarise(
    ssmd = calculate_ssmd_robust(
      log_fc[pert_type == "poscon"],
      log_fc[pert_type == "negcon"]
    ),
    .groups = "drop"
  )
```

```{r ssmd plot, include = T}
# Interactive table of the SSMD values.
DT::datatable(ssmds)

# Histogram of SSMD values, filled by "cell line / plate map / replicate".
# The dashed vertical line is drawn at SSMD = -2.
ssmd_histogram <- ggplot(
  ssmds,
  aes(
    x = ssmd,
    fill = paste(word(ccle_name, 1, sep = fixed("_")), plate_map_name, replicate)
  )
) +
  geom_histogram() +
  labs(x = "SSMD", y = "Count", fill = "Cell Line Replicate") +
  scale_y_continuous(limits = c(0, NA)) +
  geom_vline(aes(xintercept = -2), linetype = "dashed")
ssmd_histogram
```

# Replicate correlations
```{r, include = T, fig.height=8}
# Pairwise scatter/correlation panels of viability between replicate columns.
#
# Rows are (cell line, perturbation type, dose, compound) conditions; columns
# are plate map x replicate. With a single replicate value in the data, the
# repeated wells of each condition are numbered (`fake_rep`) and used as
# pseudo-replicates instead.
has_replicates <- length(unique(normalized_data$replicate)) > 1

# Keep controls and dosed treatments that have a finite viability.
replicate_input <- normalized_data %>%
  dplyr::filter(pert_type != "treatment" | !is.na(dose), is.finite(viability))

if (has_replicates) {
  replicate_col <- "replicate"
} else {
  replicate_input <- replicate_input %>%
    dplyr::group_by(ccle_name, pert_type, dose, broad_id) %>%
    dplyr::mutate(fake_rep = seq_len(dplyr::n())) %>%
    dplyr::ungroup()
  replicate_col <- "fake_rep"
}

# One column per plate map x replicate; duplicate wells collapse to the median.
replicate_wide <- replicate_input %>%
  tidyr::pivot_wider(
    id_cols = c("ccle_name", "pert_type", "dose", "broad_id"),
    names_from = dplyr::all_of(c("plate_map_name", replicate_col)),
    values_from = "viability",
    values_fn = median
  ) %>%
  dplyr::select(-ccle_name, -dose, -pert_type, -broad_id)

# Drop columns with four or fewer observed values.
enough_points <- unname(colSums(!is.na(replicate_wide)) > 4)
replicate_wide <- replicate_wide[, enough_points, drop = FALSE]

# pairs.panels() needs at least two columns; say so instead of failing the knit.
if (ncol(replicate_wide) >= 2) {
  psych::pairs.panels(replicate_wide, ellipses = FALSE,
                      hist.col = tableau_color_pal()(1))
} else {
  cat("Fewer than two replicate columns have more than 4 observations;",
      "replicate correlation plot skipped.\n")
}

```

# Does viability decrease with dose?

## Spearman correlation

We can use the Spearman correlation between dose and viability to measure how monotonically viability changes with dose.

Negative correlations indicate dose-response behavior.

```{r correlations, include = TRUE}
# Spearman correlation between viability and numeric dose for each
# plate / compound / cell line / plate map / replicate group that has more
# than three distinct dose values (an NA dose counts as one distinct value).
spearman_correlations <- normalized_data %>%
  dplyr::group_by(arp_barcode, broad_id, ccle_name, plate_map_name, replicate) %>%
  dplyr::filter(length(unique(dose)) > 3) %>%
  dplyr::summarise(
    # `use` is spelled out in full instead of the abbreviation "p".
    spearman = cor(viability, as.numeric(dose),
                   method = "spearman", use = "pairwise.complete.obs"),
    .groups = "drop"
  )

# Boxplot of the correlations per replicate; the dashed line marks zero.
# Panels: plate map in rows, cell line in columns.
ggplot(spearman_correlations, aes(y = spearman, x = replicate)) +
  geom_boxplot(aes(fill = replicate)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  theme(axis.text.x = element_text(angle = 45)) +
  scale_fill_tableau(guide = "none") +
  facet_grid(plate_map_name ~ word(ccle_name, 1, sep = fixed("_")), scales = "free") +
  labs(x = "Replicate", y = "Spearman Correlation")
```

## Treatment dose trends

```{r, include = T, fig.width = 12, fig.height = 12}
# Treatment wells with a usable numeric dose.
dose_trend_data <- normalized_data %>%
  dplyr::mutate(dose = as.numeric(dose)) %>%
  dplyr::filter(pert_type == "treatment", !is.na(dose))

# When the data carries a second dose column, panels are additionally split by
# the second compound and its dose (rounded to 3 significant digits);
# otherwise there is one panel per compound and cell line.
if ("dose_2" %in% colnames(normalized_data)) {
  dose_trend_data <- dplyr::mutate(dose_trend_data,
                                   dose_2 = signif(dose_2, digits = 3))
  dose_trend_facets <- facet_wrap(
    broad_id + broad_id_2 + dose_2 ~ word(ccle_name, 1, sep = fixed("_")),
    scales = "free_x", ncol = 6
  )
} else {
  dose_trend_facets <- facet_wrap(
    broad_id ~ word(ccle_name, 1, sep = fixed("_")),
    scales = "free_x", ncol = 6
  )
}

# Viability against dose (log10 x-axis) with a smoothed trend per plate map.
ggplot(dose_trend_data, aes(x = dose, y = viability, color = plate_map_name)) +
  geom_smooth() +
  geom_point() +
  scale_x_log10() +
  dose_trend_facets +
  labs(color = "Plate") +
  scale_color_tableau(palette = "Tableau 20") +
  ylim(c(-5, 150))
```
**Note:** the viability axis of these plots is limited to the range $[-5, 150]$, so some outlying points (particularly negative values) on worse-performing plates are excluded.


# Expected sensitivity compounds 

## Comparison with past screens

```{r comparison processing}
# Validation compound info: read the mapping once and take the ids from it.
val_mapping <- data.table::fread("val_compounds.csv", data.table = FALSE)
val_compounds <- val_mapping$pert_id

# Columns whose names differ between old files; each file keeps whichever of
# these it has.
old_mts_optional_cols <- c("brd_id", "broad_sample", "broad_id",
                           "compound_plate", "plate_map_name",
                           "dose", "pert_dose", "mmoles_per_liter",
                           "pert_type", "trt_type")

# Read in old MTS files and clean up to have the same columns (lazy fix).
# list.files() takes a regular expression, not a glob: match names ending in
# ".Rds" so that stripping the last four characters yields the file stem.
old_mts_dir <- "old_processed_ctg_data"
old_mts_files <- list.files(old_mts_dir, pattern = "\\.Rds$")
old_mts_data <- old_mts_files %>%
  purrr::set_names(str_sub(., 1, -5)) %>%
  purrr::map_dfr(~read_rds(file.path(old_mts_dir, .x)) %>%
                   dplyr::select(ccle_name,
                                 any_of(old_mts_optional_cols),
                                 source, viability, log_fc),
                 .id = "file")

# The harmonization below references every optional column by name, so add
# any column that none of the files provided as all-NA instead of erroring.
for (missing_col in setdiff(old_mts_optional_cols, colnames(old_mts_data))) {
  old_mts_data[[missing_col]] <- NA
}

old_mts_data <- old_mts_data %>%
  # Fill each canonical column from its alternative names, in priority order.
  dplyr::mutate(broad_sample = ifelse(is.na(broad_sample), brd_id, broad_sample),
                broad_sample = ifelse(is.na(broad_sample), broad_id, broad_sample),
                dose = ifelse(is.na(dose), pert_dose, dose),
                dose = ifelse(is.na(dose), mmoles_per_liter, dose),
                plate_map_name = ifelse(is.na(plate_map_name), compound_plate, plate_map_name),
                pert_type = ifelse(is.na(pert_type), trt_type, pert_type)) %>%
  dplyr::select(-brd_id, -broad_id, -pert_dose, -compound_plate, -trt_type, -mmoles_per_liter) %>%
  # Tag each plate map with the dataset it came from.
  dplyr::mutate(pert_id = brd_id_to_structure_id(broad_sample),
                plate_map_name = paste0(plate_map_name, " (", source, ")")) %>%
  dplyr::rename(broad_id = broad_sample) %>%
  # Ids containing BRD-A12230535 are remapped to BRD-K73472992.
  dplyr::mutate(pert_id = ifelse(str_detect(pert_id, fixed("BRD-A12230535")),
                                 "BRD-K73472992", pert_id))

# Filter old data to validation compounds only. The join has no `by`, so it
# matches on every column name the two tables share. Cell line names are
# reduced to the text before the first "_" with non-alphanumerics removed.
old_mts_val <- old_mts_data %>%
  dplyr::inner_join(val_mapping) %>%
  dplyr::mutate(ccle_name = word(ccle_name, 1, sep = fixed("_")),
                ccle_name = str_replace_all(ccle_name, "[^[:alnum:] ]", ""))

# Same preparation for the current screen: keep validation compounds, tag the
# plate map with the project label, and normalize cell line names. pert_name
# is dropped before the join so it is taken from the mapping table.
val_compound_ctg <- normalized_data %>%
  dplyr::mutate(pert_id = brd_id_to_structure_id(broad_id)) %>%
  dplyr::filter(pert_id %in% val_compounds) %>%
  dplyr::mutate(plate_map_name = paste0(plate_map_name, " (", project, ")")) %>%
  dplyr::mutate(ccle_name = word(ccle_name, 1, sep = fixed("_")),
                ccle_name = str_replace_all(ccle_name, "[^[:alnum:] ]", "")) %>%
  dplyr::select(-any_of(c("pert_name"))) %>%
  dplyr::inner_join(val_mapping)
```

```{r}
# Compare one validation compound between the current screen and one
# historical dataset.
#
# Args:
#   new_df:    rows from the current screen; the code reads pert_name, dose,
#              viability, plate_map_name and ccle_name.
#   old_df:    rows from past screens; same columns plus `source`.
#   comp_name: compound name matched against `pert_name`.
#   dataset:   value of `source` selecting the historical dataset.
#
# Returns:
#   A ggplot of viability vs dose (log10 x-axis), one panel per cell line and
#   coloured by plate map. Points and fitted curves from plate maps whose name
#   contains the global `project` label are drawn solid, the rest faded.
#
# Uses the global `project` and the helpers get_sigmoid() / fitDoseCurve(),
# none of which are defined in this chunk.
make_val_compound_plot <- function(new_df, old_df, comp_name, dataset) {

  # Restrict both screens to the requested compound; the old screen is also
  # restricted to the requested dataset and to cell lines in the new screen.
  new_df <- new_df %>%
    dplyr::filter(pert_name == comp_name) %>%
    dplyr::mutate(dose = as.numeric(dose))
  old_df <- old_df %>%
    dplyr::filter(source == dataset,
                  pert_name == comp_name) %>%
    dplyr::mutate(dose = as.numeric(dose)) %>%
    dplyr::filter(ccle_name %in% unique(new_df$ccle_name))

  # The per-dose median summaries that used to be computed here were never
  # used by the curve fit or the plot, so they have been removed.
  combined_viability <- dplyr::bind_rows(new_df, old_df)

  # Fit one dose-response curve per compound / plate map / cell line. The
  # helper output is on a log-dose scale (`x`), hence exp() to recover dose.
  curves <- combined_viability %>%
    dplyr::mutate(pert_effective_dose = as.numeric(dose), grouping = 1) %>%
    plyr::ddply(.(pert_name, plate_map_name, ccle_name),
                function(x) get_sigmoid(x, fitDoseCurve(x))) %>%
    dplyr::mutate(dose = exp(x), viability = y)

  # Match the project label literally: it is plain text taken from a directory
  # name and may contain characters that are special in a regular expression.
  project_curves <- dplyr::filter(
    curves, str_detect(plate_map_name, fixed(project))
  )

  combined_viability %>%
    ggplot(aes(x = dose, y = viability, color = plate_map_name)) +
    geom_point(alpha = .3) +
    geom_point(data = new_df) +
    geom_line(data = project_curves) +
    geom_line(data = curves, alpha = .3) +
    scale_x_log10() +
    labs(x = "Dose", y = "Viability", color = "Plate") +
    facet_grid(. ~ ccle_name) +
    scale_color_tableau() +
    ggtitle(paste(dataset, comp_name))
}

# Compound / dataset pairs from the old screens whose compound also appears in
# the current screen, sorted by dataset name in descending order.
compounds <- old_mts_val %>%
  distinct(pert_name, source) %>%
  dplyr::filter(pert_name %in% unique(val_compound_ctg$pert_name)) %>%
  dplyr::arrange(desc(source))

# Build one comparison plot per row. `comp_plots` is only created when there
# is at least one pair to compare.
if (nrow(compounds) > 0) {
  comp_plots <- future_apply(
    compounds, 1,
    FUN = function(pair) {
      make_val_compound_plot(val_compound_ctg, old_mts_val,
                             pair["pert_name"], pair["source"])
    }
  )
}
```

```{r comparison plots, fig.width = 10, fig.height = 4, include=T}
# Print every comparison plot. `comp_plots` exists only when `compounds` has
# at least one row, hence the same guard as where it is built.
if (nrow(compounds) > 0) {
  purrr::walk(comp_plots, print)
}
```

# Heatmaps {.tabset .tabset-pills}
```{r, results='asis', include = T, fig.height=3, fig.width=8}
# One tab per cell line with finite luminescence; inside it, one plate-layout
# heatmap of raw luminescence per plate barcode. Each cell is labelled with
# the first letter of its perturbation type.
measured_cell_lines <- unique(
  dplyr::filter(normalized_data, is.finite(lum))$ccle_name
)

for (cell_line in measured_cell_lines) {
  # Markdown header that opens the tab (the chunk uses results = 'asis').
  cat(sprintf("\n\n## %s {.tabset .tabset-pills}\n\n", cell_line))

  # Split the well position into row (first character) and column (the rest).
  df <- normalized_data %>%
    dplyr::filter(ccle_name == cell_line) %>%
    dplyr::mutate(plate_row = str_sub(Well_Position, 1, 1),
                  plate_col = str_sub(Well_Position, 2, -1)) %>%
    dplyr::arrange(replicate)

  # Order plate columns by numeric value while keeping their original labels.
  # As plain character they sort as "1", "10", "11", ..., "2" when the well
  # positions are not zero-padded, which scrambles the plate layout.
  col_labels <- unique(df$plate_col)
  col_labels <- col_labels[order(suppressWarnings(as.numeric(col_labels)))]
  df$plate_col <- factor(df$plate_col, levels = col_labels)

  for (plate in unique(df$arp_barcode)) {
    plot_df <- df %>%
      dplyr::filter(arp_barcode == plate) %>%
      dplyr::distinct(replicate, plate_row, plate_col, lum, pert_type) %>%
      dplyr::arrange(plate_col, plate_row)
    # Nothing to draw for a plate without any luminescence reading.
    if (all(is.na(plot_df$lum))) next

    # Named to avoid shadowing base::rep().
    plate_replicate <- unique(plot_df$replicate)

    # Row x column matrices of luminescence and of the one-letter type code;
    # both come from the same table, so their indices line up in cell_fun.
    plot_mat <- plot_df %>%
      reshape2::acast(plate_row ~ plate_col, value.var = "lum")
    type_mat <- plot_df %>%
      dplyr::mutate(pert_type = str_sub(pert_type, 1, 1)) %>%
      reshape2::acast(plate_row ~ plate_col, value.var = "pert_type")

    p <- ComplexHeatmap::Heatmap(
      plot_mat, cluster_rows = FALSE, cluster_columns = FALSE,
      heatmap_legend_param = list(title = "Luminescence"),
      row_title = paste(plate, plate_replicate),
      col = viridis(1000),
      cell_fun = function(j, i, x, y, w, h, col) {
        grid.text(type_mat[i, j], x, y)
      }
    )
    print(p)
    cat("\n\n")
  }
  # Disabled alternative: collect the heatmaps of a cell line in a list and
  # draw them stacked with reduce(plots, `%v%`) instead of printing each one.
}
```