4.Performance on real EWAS-data.Rmd

---
title: "Performance on real EWAS-data"
author: "Bai Ling"
date: "`r format(Sys.Date(), format = '%d %B %Y')`"
output: 
  html_document:
    theme: cerulean
    css: style.css
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, fig.align = "center")
options(stringsAsFactors = FALSE)
setwd("/Users/apple/Documents/GitHub/EWAS/")
```

## Contents
***
- [1. Run time for each method on real EWAS data](#runtime)
- [2. Signal numbers: ST vs BH](#STvsBH)
- [3. Overview of changes across methods and covariates](#overview)  
- [4. An example demonstrated the strong informativeness of covariate mean](#example)
- [5. Informative covariates enhance detection power](#informative)
<br>
<br>
<br>

#### Library R packages and basic settings
```{r library prepare, message=FALSE, warning=FALSE}
library(ggplot2)
library(tidyverse)
library(reshape2)
library(circlize)
library(ComplexHeatmap)
library(RColorBrewer)
library(cowplot)
library(ggplotify)
library(ggpubr)
library(ggforce)
method_col <- c("CAMT" = "#FF0000", "AdaPT" = "#E69F00", "IHW" = "#56B4E9", 
                "FDRreg" = "#009E73", "BL" = "#0072B2")
measure_col <- c("BH"="#1B9E77", "ST"="#D95F02", "sd.b"="#7570B3", "sd.m"="#E7298A", "mean"="#A6CEE3", 
                 "mad"="#1F78B4", "dip"="#B2DF8A", "precision"="#FFFF00", "icc.b"="#FB9A99", 
                 "icc.m"="#E31A1C", "refgene.pos"="#FDBF6F", "chr"="#A6761D", "cpg.loc"="#CAB2D6",
                 "dhs"="#949494", "direction"="#7FFFD4", "probe.type" ="#FFE4E1")
```

### 1. Run time for each method on real EWAS data {#runtime}
***
```{r run time, fig.height=4, fig.width=6}
runtime <- readRDS("Data/runtime.RDs")
runtime <- mutate(runtime, log_runtime = log10(Run_Time)) %>%
  group_by(measure, method) %>%
  summarise(n = n(),
            mean = mean(log_runtime),
            sd = sd(log_runtime),
            se = sd / sqrt(n))

selected_methods <- c("AdaPT", "BL", "CAMT", "FDRreg", "IHW")
selected_measure <- c("dip", "icc.b", "icc.m", "mad", "mean", "precision", "sd.b", "sd.m",
                         "dhs", "direction", "probe.type", "cpg.loc", "refgene.pos", "chr")

fs6 <- runtime %>%
  filter(method %in% selected_methods, measure %in% selected_measure) %>%
  ggplot(aes(x = measure, y = mean, group = method, color = method)) +
  geom_line(lwd = 0.4) +
  geom_point(size = 0.5) +
  geom_errorbar(aes(ymin = mean-1.96*se, ymax = mean+1.96*se), width = 0.15, lwd = 0.35) +
  ylab("log10 Runtime(s)") + xlab("") +
  theme_bw() +
  theme(title = element_text(size = 10), 
         text = element_text(size = 8), 
         axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 8), 
         axis.text.y = element_text(size = 8),
         legend.position = "right", 
         legend.justification = "center",
         legend.title = element_blank()) +
   scale_x_discrete(limits = selected_measure) +
   scale_color_manual(values = method_col)

fs6
```

```{r, include=FALSE}
save_plot("Figures/FigureS6.pdf", fs6)
```

### 2. Signal numbers: ST vs BH {#STvsBH}
*** 
For each data set, the number of ST detected significant differential CpGs is always no less than that of BH. So the result of ST is used as reference in the comparison with other covariate-adaptive FDR control methods.
```{r, message=FALSE, warning=FALSE, fig.height=4, fig.width=6}
reference_discovery <- read.csv("Data/reference_discovery.csv", header = TRUE)
fs7 <- reference_discovery %>%
  ggplot(aes(x = BH + 1, y = ST + 1)) +
  geom_point() +
  geom_abline(slope = 1, color = "red") +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  theme_bw() +
  xlab("BH") + ylab("ST") +
  theme(title = element_text(size = 10), 
         text = element_text(size = 8))
fs7
```

```{r, include=FALSE}
save_plot("Figures/FigureS7.pdf", fs7)
```

### 3. Overview of changes across methods and covariates{#overview}
***
#### 3.1 Heatmap: Performance evaluation by signal changes
```{r heatmp, message=FALSE, fig.height=6, fig.width=7.5}
discovery_number <- read.csv("Data/discovery_number.csv", header = TRUE)
covariate_order <- c("mean", "sd.b", "mad", "precision", "sd.m", "icc.m", "cpg.loc",
                     "direction", "icc.b", "refgene.pos", "probe.type", "dip", "dhs", "chr")

plot_data <- left_join(discovery_number, reference_discovery, by = "dataset") %>%
  filter(! measure %in% c("icc.b", "icc.m")) %>%
  arrange(pi0) %>%
  mutate(logFC = log2((discovery + 1)/(ST + 1)))

# prepare the matrix for heatmap
plot_data$measure <- factor(plot_data$measure, levels = covariate_order)
plot_data$dataset <- factor(plot_data$dataset, levels = unique(plot_data$dataset))
heatmap_data <- dcast(plot_data, dataset ~ method + measure, value.var = "logFC")
rownames(heatmap_data) <- heatmap_data$dataset
heatmap_data$dataset <- NULL
heatmap_data <- as.matrix(heatmap_data)

# prepare row and column annotation
annotation_row <- plot_data %>%
  select(dataset, ST, pi0)
annotation_row <- annotation_row[! duplicated.data.frame(annotation_row), ]

# top annotation: ranks of discovered signal number recovered by different covariates within each dataset
top_anno <- plot_data %>%
  group_by(dataset, method) %>%
  mutate(rank = rank(discovery, ties.method = "average")) %>%
  dcast(dataset ~ method + measure, value.var = "rank")
top_anno$dataset <- NULL

# heatmap
col_fun <- colorRamp2(c(-1, -0.5, 0, 0.5, 1), c("royalblue", "blue", "white", "red", "red3"), space = "RGB")
ST_color <- colorRamp2(c(0, 200, 1000, 10000), c("#DEEBF7", "#C6DBEF", "#6BAED6", "#08306B"))
pi0_color <- colorRamp2(c(0.6, 0.9, 0.95, 1), c("#F7FCF5", "#A1D99B", "#41AB5D", "#00441B"))

f3 <- Heatmap(heatmap_data,
              name = "logFC",
              rect_gp = gpar(col = "grey", lwd = 0.05),
              cluster_rows = FALSE,
              cluster_columns = FALSE,
              col = col_fun, 
              left_annotation = rowAnnotation(
                ST = annotation_row$ST,
                pi0 = annotation_row$pi0,
                col = list(ST = ST_color,
                           pi0 = pi0_color),
                simple_anno_size = unit(0.25, "cm"),
                annotation_name_gp = gpar(fontsize = 8),
                annotation_legend_param = list(title_gp = gpar(fontsize = 9),
                                               labels_gp = gpar(fontsize = 7))),
              top_annotation = columnAnnotation(
                rank = anno_boxplot(top_anno, outline = FALSE, 
                gp = gpar(fill = measure_col[covariate_order[-grep("icc", covariate_order)]]))),
              row_names_gp = gpar(fontsize = 5), 
              column_names_gp = gpar(fontsize = 8),
              column_split = rep(c("AdaPT", "BL", "CAMT", "FDRreg", "IHW"), each = 12),
              column_gap = unit(0, "mm"),
              column_labels = unlist(lapply(str_split(colnames(heatmap_data), pattern = "_"),
                                                 function(x)x[[2]])),
             heatmap_legend_param = list(title_gp = gpar(fontsize = 9),
                                         labels_gp = gpar(fontsize = 7)))

f3
```

```{r, include=FALSE}
pdf("Figures/Figure3.pdf", width = 6.69, height = 6)
draw(f3)
dev.off()
```

#### 3.2 Boxplot of logFC corresponding to the heatmap
```{r, fig.height=4, fig.width=6}
# Method AdaPT's performance is very different from other methods, so an alternative range is set for it.

p1 <- plot_data %>%
  filter(method == "AdaPT") %>%
  ggplot(aes(x = measure, y = logFC)) +
  geom_boxplot(aes(fill = measure), outlier.shape = NA, fatten = 1) +
  scale_fill_manual(values = measure_col) +
  coord_cartesian(ylim = c(-6, 6)) +
  facet_wrap(~ method) +
  theme_bw() +
  xlab("") +
  theme(title = element_text(size = 10),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "none")

p2 <- plot_data %>%
  filter(method != "AdaPT") %>%
  ggplot(aes(x = measure, y = logFC)) +
  geom_boxplot(aes(fill = measure), outlier.shape = NA, fatten = 1) +
  scale_fill_manual(values = measure_col) +
  coord_cartesian(ylim = c(-2, 2)) +
  facet_wrap(~ method, nrow = 1, scales = "free_y") +
  theme_bw() +
  xlab("") + ylab("") +
  theme(title = element_text(size = 10),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.key.size = unit(0.3, "cm"),
        legend.position = "right",
        legend.justification = "center")

fs8_1 <- plot_grid(p1, p2, rel_widths = c(1, 4))
```
<br>
<br>

#### 3.3 Boxplot of logFC for each Method with the best informative covariate
```{r, message=FALSE, warning=FALSE, fig.height=4, fig.width=6}
# select the most informative covariate (with min omnibus test p-value)
omnibus_result <- read.csv("Data/omnibus.result.csv", header = TRUE)

# select the most significant covariate for each dataset
selected <- omnibus_result %>%
  group_by(dataset) %>%
  arrange(factor(dataset), p.value, desc(stat.o)) %>%
  filter(rank(p.value, ties.method = "first") == 1, p.value < 0.05) %>%
  select(dataset, measure) %>%
  mutate(id = paste0(dataset, "_", measure))

fs8_2 <- left_join(discovery_number, reference_discovery, by = "dataset") %>%
  mutate(logFC = log2((discovery + 1)/(ST + 1))) %>%
  mutate(id = paste0(dataset, "_", measure)) %>%
  filter(id %in% selected$id) %>%
  ggplot(aes(x = method, y = logFC, color = method)) +
  geom_boxplot(outlier.shape = NA, fatten = 1, lwd = 0.3) +
  geom_sina(size = 1, alpha = 0.5, shape = 16) +
  coord_cartesian(ylim = c(-3.75, 2.5)) +
  scale_color_manual(values = method_col) +
  theme_bw() +
  theme(legend.position = "none", 
        axis.text.x = element_text(size = 8),
        axis.text.y = element_text(size = 8),
        title = element_text(size = 10)) +
  xlab("") + ylab("logFC")
```
<br>
<br>
```{r}
fs8 <- plot_grid(fs8_1, fs8_2, labels = c("A", "B"), ncol = 1, label_size = 18)
fs8
```

```{r, include=FALSE}
save_plot("Figures/FigureS8.pdf", fs8, base_width = 6.69, base_height = 4)
```

### 4. An example demonstrated the strong informativeness of covariate mean {#example}
***
```{r, fig.width=7, fig.height=2.5}
EWAS33 <- readRDS("Data/dataset/EWAS33.RDS")
candidate_covariates <- c("sd.b", "mad", "mean", "precision")

EWAS33_data <- data.frame()
for(s in candidate_covariates){
  df <- data.frame(covariate_value = EWAS33[[s]],
                   p_value = EWAS33$P.value)
  df$covariate <- s
  f <- ecdf(df$covariate_value)
  df$percentile_covariate <- f(df$covariate_value)
  EWAS33_data <- rbind(EWAS33_data, df)
}

fs9 <- ggplot(data = EWAS33_data, mapping = aes(x = percentile_covariate, y = -log10(p_value))) +
  geom_hex(bins = 100) + 
  facet_wrap(~covariate, nrow = 1) + 
  theme_bw() +
  theme(legend.key.size = unit(0.3, "cm"))
fs9
```

```{r, include=FALSE}
save_plot("Figures/FigureS9.pdf", plot = fs9, base_width = 7, base_height = 2.5)
```

### 5. Informative covariates enhance detection power {#informative}
***
#### 5.1 Heatmap only with the most informative covariates
```{r}
plotdata_subset <- discovery_number %>%
  mutate(id = paste0(dataset, "_", measure)) %>%
  filter(id %in% selected$id, measure != "icc.b", measure != "icc.m") %>%
  left_join(reference_discovery, by = "dataset") %>%
  filter(ST > 0) %>%
  arrange(pi0) %>%
  mutate(logFC = log2((discovery + 1)/(ST + 1)))

heatmapdata_subset <- dcast(plotdata_subset, dataset ~ method, value.var = "logFC")
rownames(heatmapdata_subset) <- heatmapdata_subset$dataset
heatmapdata_subset$dataset <- NULL
rownames(reference_discovery) <- reference_discovery$dataset
heatmapdata_subset$ST <- reference_discovery[rownames(heatmapdata_subset), "ST"]
heatmapdata_subset$pi0 <- reference_discovery[rownames(heatmapdata_subset), "pi0"]
heatmapdata_subset <- heatmapdata_subset[order(heatmapdata_subset$pi0), ]
fs10 <- Heatmap(as.matrix(heatmapdata_subset[, 1:5]),
             name = "logFC",
             rect_gp = gpar(col = "grey", lwd = 0.05),
             col = col_fun,
             cluster_rows = FALSE,
             left_annotation = rowAnnotation(
               ST = heatmapdata_subset$ST,
               pi0 = heatmapdata_subset$pi0,
               col = list(ST = ST_color,
                          pi0 = pi0_color),
               simple_anno_size = unit(0.1, "inch"),
               annotation_name_gp = gpar(fontsize = 6),
               annotation_legend_param = list(title_gp = gpar(fontsize = 6),
                                              labels_gp = gpar(fontsize = 5),
                                              grid_height = unit(0.1, "inch"),
                                              grid_width = unit(0.1, "inch"))),
             row_names_gp = gpar(fontsize = 4), 
             column_names_gp = gpar(fontsize = 6),
             column_names_rot = 45,
             column_names_centered = TRUE,
             show_row_dend = FALSE,
             column_dend_height = unit(0.2, "inch"),
             width = unit(1, "inch"),
             height = unit(4, "inch"),
             heatmap_legend_param = list(title_gp = gpar(fontsize = 6),
                                         labels_gp = gpar(fontsize = 5),
                                         grid_height = unit(0.1, "inch"),
                                         grid_width = unit(0.1, "inch")))
fs10
```

```{r, include=FALSE}
pdf("Figures/FigureS10.pdf", width = 2.5, height = 5)
draw(fs10)
dev.off()
```

### 5.2 Signal number changes vs omnibus test p-value
Three groups ("p<=0.001", "0.001<p<=0.01", "p>0.01") are classified according to omnibus test p-value. We explored the signal number changes accross the three groups.
```{r, fig.width=6.69, fig.height=3.5, message=FALSE, warning=FALSE}
# Method AdaPT's performance is very different from other methods, so an alternative range is set for it.

p1 <- left_join(plot_data, omnibus_result) %>%
  filter(method == "AdaPT") %>%
  mutate(group = factor(ifelse(p.value <= 0.001, "p<=0.001", ifelse(p.value <= 0.01, "0.001<p<=0.01", "p>0.01")),
                        levels = c("p<=0.001", "0.001<p<=0.01", "p>0.01"))) %>%
  ggplot(aes(x = group, y = logFC)) +
  geom_boxplot(aes(fill = group), outlier.shape = NA, fatten = 1) +
  stat_summary(fun.y = mean, geom = "point", shape = 18, color = "black") +
  coord_cartesian(ylim = c(-6, 6)) +
  facet_wrap(~method, nrow = 1, scales = "free") +
  theme_bw() +
  theme(title = element_text(size = 10),
        axis.text = element_text(size = 8),
        axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "none")

p2 <- left_join(plot_data, omnibus_result) %>%
  filter(method != "AdaPT") %>%
  mutate(group = factor(ifelse(p.value <= 0.001, "p<=0.001", ifelse(p.value <= 0.01, "0.001<p<=0.01", "p>0.01")),
                        levels = c("p<=0.001", "0.001<p<=0.01", "p>0.01"))) %>%
  ggplot(aes(x = group, y = logFC)) +
  geom_boxplot(aes(fill = group), outlier.shape = NA, fatten = 1) +
  stat_summary(fun.y = mean, geom = "point", shape = 18, color = "black") +
  coord_cartesian(ylim = c(-2, 2)) +
  facet_wrap(~method, nrow = 1, scales = "free") +
  theme_bw() +
  theme(title = element_blank(),
        axis.text = element_text(size = 8),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "right")

fs11 <- plot_grid(p1, p2, rel_widths = c(1, 4.5))
fs11
```

```{r, include=FALSE}
save_plot("Figures/FigureS11.pdf", fs11, base_width = 8)
```

### 5.3 Two strategies improve detection power: the most informative covariate and combination of multiple informative covariates
```{r, warning=FALSE, message=FALSE, fig.width=4.5, fig.height=6}
CAMT_combined <- read.csv("Data/CAMT_combined.csv", header = TRUE)
CAMT_combined$fromST <- NULL

long_data <- melt(CAMT_combined, id.vars = "X", value.name = "discovery", variable.name = "measure")

selected <- omnibus_result %>%
  group_by(dataset) %>%
  arrange(factor(dataset), p.value, desc(stat.o)) %>%
  filter(rank(p.value, ties.method = "first") == 1, p.value < 0.05) %>%
  select(dataset, measure) %>%
  left_join(long_data, by = c("dataset" = "X", "measure" = "measure")) %>%
  select(dataset, measure, discovery) %>%
  ungroup()
selected$measure <- "most.sig"

colnames(long_data) <- colnames(selected)
long_data <- rbind(long_data, selected)

long_data$discovery <- as.numeric(long_data$discovery)
long_data <- long_data %>%
  left_join(reference_discovery) %>%
  mutate(logFC = log2((1+discovery)/(1+ST)),
         id = paste0(dataset, "_", measure)) 

wide_data <- dcast(long_data, dataset ~ measure, value.var = "logFC")
rownames(wide_data) <- wide_data$dataset
wide_data$dataset <- NULL
wide_data$icc.b <- NULL
wide_data$icc.m <- NULL
wide_data$most.sig[is.na(wide_data$most.sig)] <- 0
covariate_order <- c("mean", "sd.b", "mad", "precision", "sd.m", "cpg.loc",
                     "direction", "refgene.pos", "probe.type", "dip", "dhs", "chr",
                     "most.sig", "multiple")
wide_data <- wide_data[, covariate_order]

# heatmap top annotation
top_anno <- long_data %>%
  group_by(measure) %>%
  dcast(dataset ~ measure, value.var = "logFC")
rownames(top_anno) <- top_anno$dataset
top_anno <- top_anno[rownames(wide_data), colnames(wide_data)]

# row annotation
rownames(reference_discovery) <- reference_discovery$dataset
wide_data$ST <- reference_discovery[rownames(wide_data), "ST"]
wide_data$pi0 <- reference_discovery[rownames(wide_data), "pi0"]
wide_data <- wide_data[order(wide_data$pi0), ]

col_fun <- colorRamp2(c(-4, -2, 0, 2, 4), c("royalblue", "blue", "white", "red", "red3"), 
                      space = "RGB")
measrue_color <- c("most.sig"="#FB9A99", "multiple"="#E31A1C", "sd.b"="#7570B3", "sd.m"="#E7298A",
                   "mean"="#A6CEE3", "mad"="#1F78B4", "dip"="#B2DF8A", "precision"="#FFFF00",
                   "refgene.pos"="#FDBF6F", "chr"="#A6761D", "cpg.loc"="#CAB2D6", "dhs"="#949494",
                   "direction"="#7FFFD4", "probe.type" ="#FFE4E1")
fs12 <- Heatmap(as.matrix(wide_data[, 1:14]),
        col = col_fun,
        border = "grey",
        na_col = "lightgrey",
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        name = "logFC",
        row_names_gp = gpar(fontsize = 4), 
        column_names_gp = gpar(fontsize = 7),
        width = unit(2, "inch"),
        height = unit(4, "inch"),
        heatmap_legend_param = list(title_gp = gpar(fontsize = 6),
                                    labels_gp = gpar(fontsize = 5),
                                    grid_height = unit(0.1, "inch"),
                                    grid_width = unit(0.13, "inch")),
        top_annotation = columnAnnotation(
          logFC = anno_boxplot(top_anno, outline = FALSE,
                               gp = gpar(fill = measrue_color[covariate_order])),
          annotation_height = unit(0.7, "inch"),
          annotation_name_gp = gpar(fontsize = 6)),
        left_annotation = rowAnnotation(
          ST = wide_data$ST,
          pi0 = wide_data$pi0,
          col = list(ST = ST_color,
                     pi0 = pi0_color),
          simple_anno_size = unit(0.1, "inch"),
          annotation_name_gp = gpar(fontsize = 6),
          annotation_legend_param = list(title_gp = gpar(fontsize = 6),
                                         labels_gp = gpar(fontsize = 5),
                                         grid_height = unit(0.1, "inch"),
                                         grid_width = unit(0.13, "inch")))
        )
fs12
```

```{r, include=FALSE}
pdf("Figures/FigureS12.pdf", width = 4.5, height = 6)
draw(fs12)
dev.off()
```