report/7.1_Prioritize_ID_RDconnect.Rmd


# RD-connect data analyses

Note: Access to the data is restricted to validated users. Please see the details of our access, queries and filtering in the Methods section of the manuscript. 

```{r}
library(pacman)
p_load(cowplot)
# NOTE(review): read_tsv() and %>% are used below without a visible library
# call in this file -- presumably the tidyverse is attached by an earlier
# chapter of the report; confirm before running this file on its own.


# RD-connect patient queries (via the RD-Connect platform)
patient_data <- read_tsv("../data/RDconnect_queries.tsv")

# Quick interactive overview of solved / in-platform patient counts:
#patient_data %>% group_by(`Solved status`,`In platform`) %>% count 

```

```{r load patient files}
# Directory holding one variant table per experiment, named
# "<Experiment ID>.tsv".
variant_dir <- "../data/rd-connect/"

# Only pick up .tsv files so stray files in the directory are not parsed.
file_variants <- list.files(variant_dir, pattern = "\\.tsv$")

# Experiment IDs are the file names without the extension. The dot is escaped
# and the pattern anchored; an unescaped ".tsv" would match any character
# followed by "tsv" anywhere in the file name.
ID_files <- str_remove(file_variants, "\\.tsv$")

# Read every variant file into a list named by Experiment ID.
# col_types skips the 7th column and reads all others as character.
patient_variants <- lapply(file_variants, function(variant_file) {
  read_tsv(
    paste0(variant_dir, variant_file),
    skip_empty_rows = TRUE,
    col_types = "cccccc-ccccccccccc"
  )
})
names(patient_variants) <- ID_files

patient_variants_df <- bind_rows(patient_variants, .id = "Experiment ID")

# Patients that are in the platform and have a gene flagged "(solved)";
# one row per solved gene, with the flag stripped from the gene name.
solved_patients <- patient_data %>%
  separate_rows(Genes, sep = ";") %>%
  filter(`In platform`, grepl("solved", Genes)) %>%
  mutate(causal_gene = str_remove(Genes, pattern = " \\(solved\\)"))

# Causal gene per patient, used as the reference for all ranking methods.
solved_patient_annotated <- solved_patients %>%
  select(
    Patient = `Participant ID`,
    GeneName = causal_gene,
    `Experiment ID`
  ) %>%
  mutate(causal = TRUE)

# Optional sanity check: does every solved patient have a variant file?
# It runs after solved_patient_annotated is defined, because it reads it.
check.missID <- FALSE
if (check.missID) {
  # IDs with no uploaded variant file
  missed_IDs <- setdiff(solved_patient_annotated$`Experiment ID`, ID_files)

  if (length(missed_IDs) == 0) {
    print("All patients have corresponding files")
  } else {
    print(sprintf(
      "missing Experimental IDs: %s",
      paste(missed_IDs, collapse = ", ")
    ))
  }
}

# Attach the patient ID to each variant and keep canonical chromosomes only.
# The lookup is de-duplicated so that a patient with several solved genes does
# not multiply the variant rows. The join keys are left implicit, as in the
# original (the variant file columns are not visible from here).
patient_variants_df <- patient_variants_df %>%
  left_join(distinct(solved_patient_annotated, Patient, `Experiment ID`)) %>%
  filter(Chr %in% c(1:22, "X", "Y", "MT"))

# check if there are likely missing variants
#chr_count <- patient_variants_df %>%
#  count(`Experiment ID`, Chr)

#ggplot(chr_count) + geom_tile(aes(y = `Experiment ID`, x = Chr, fill = n))

```


## Adding internal cohort + add causal gene to the gene list if missing

- A number of patients in the list (e.g. 40) have no variants in the gene indicated to be causal. This might be because the causal gene was detected through other methods outside of the original data deposit. These genes are nevertheless included in the patient's suspect gene list.

```{r}
# Genes observed in each RD-Connect patient's variant list
# (one row per patient/gene pair).
RDConnect_variant_genes <- patient_variants_df %>%
  select(`Experiment ID`, Patient, Gene) %>%
  dplyr::rename(GeneName = Gene) %>%
  distinct()

# Reported causal genes, appended so that each is guaranteed to be in the
# patient's gene list.
RDConnect_causal_genes <- solved_patient_annotated %>%
  select(-causal)

# After stacking both tables, a count of 2 means the gene appears in the
# variant list AND in the causal list, i.e. the causal gene was found among
# the patient's variants.
RDConnect_patient_gene_df <- rbind(
  RDConnect_variant_genes,
  RDConnect_causal_genes
) %>%
  count(`Experiment ID`, Patient, GeneName, name = "causalInVariantList") %>%
  mutate(
    causalInVariantList = factor(
      causalInVariantList,
      levels = 1:2,
      labels = c(FALSE, TRUE)
    ),
    cohort = "RDConnect"
  ) %>%
  left_join(., solved_patient_annotated)


# Internal cohort: genes flagged Diagnostic == "YES" are marked causal,
# all others get NA.
internal_patient_gene_list <- read_csv("../data/patient_gene_list.csv") %>%
  mutate(
    `Experiment ID` = paste0("INT_", Patient_ID),
    causal = ifelse(Diagnostic == "YES", TRUE, NA),
    causalInVariantList = TRUE,
    cohort = "Internal"
  ) %>%
  dplyr::rename(GeneName = GENE)

# Stack the RD-Connect and internal cohorts on the shared columns.
patient_gene_df <- rbind(
  RDConnect_patient_gene_df,
  select(internal_patient_gene_list, all_of(colnames(RDConnect_patient_gene_df)))
)


# Check that the causal gene labels can be converted to Entrez IDs.
all_causal_genes <- unique(
  patient_gene_df$GeneName[!is.na(patient_gene_df$causal)]
)

source("../functions/fn_source.R")
all_causal_genes_ID <- IDconvert(all_causal_genes, from = "SYMBOL", to = "ENTREZID")

# Causal genes with no equivalent ID (kept for inspection).
all_causal_genes_without_ID <- all_causal_genes[is.na(all_causal_genes_ID)]

# Manual fixes for labels that are not valid symbols: name = label as found
# in the data, value = replacement symbol.
genes_without_ID_replace <- c(
  "*608005" = "SIL1",
  "*615320" = "GMPPB",
  "MT-TL1" = "TRNL1",
  "TRAPP9" = "TRAPPC9"
)

needs_replacement <- patient_gene_df$GeneName %in% names(genes_without_ID_replace)
patient_gene_df$GeneName[needs_replacement] <- unname(
  genes_without_ID_replace[patient_gene_df$GeneName[needs_replacement]]
)
```


# Filtering patients without causal genes described in their variant list

```{r }

# One row per patient/gene pair from the variant lists; `causal` is TRUE where
# the gene is the reported causal gene for that patient and NA otherwise.
patient_all_genes_df <- patient_variants_df %>%
  left_join(
    solved_patient_annotated,
    by = c("Experiment ID", "Patient", Gene = "GeneName")
  ) %>%
  distinct(`Experiment ID`, Patient, Gene, causal)

# Experiments for which at least one causal gene occurs in the variant list.
patients_withGenesFound <- patient_all_genes_df %>%
  count(`Experiment ID`, causal) %>%
  filter(!is.na(causal)) %>%
  mutate(genefound = TRUE)

# Flag each solved patient with whether the causal gene was found. Columns are
# selected by name rather than by position, and the join key is explicit.
patient_genes_df <- solved_patient_annotated %>%
  left_join(
    select(patients_withGenesFound, `Experiment ID`, genefound),
    by = "Experiment ID"
  )

# Named vector: causal gene per Experiment ID, for patients whose causal gene
# was found (rows with genefound == NA are dropped by filter()).
solved_patient_genes <- patient_genes_df %>%
  filter(genefound) %>%
  pull(GeneName, name = `Experiment ID`)
```  


# Number of genes per patient

```{r}
library(report)

# Number of distinct genes per patient, restricted to patients whose causal
# gene was found in their variant list.
solved_experiment_ids <- names(solved_patient_genes)
genes_per_solved_patients <- count(
  filter(patient_all_genes_df, `Experiment ID` %in% solved_experiment_ids),
  `Experiment ID`
)

# Violin of the distribution with the individual patients jittered on top.
p <- ggplot(genes_per_solved_patients, aes(x = n, y = "")) +
  geom_violin(fill = "#fccf66") +
  geom_jitter(col = "#ebaf26") +
  # ggforce::geom_sina() +
  cowplot::theme_minimal_vgrid() +
  labs(
    title = "Distribution of genes",
    subtitle = report(genes_per_solved_patients$n),
    x = "# genes per patient, filtered by rare variants",
    y = ""
  )

#ggsave("../Figs/RDconnect_genes_per_patient_violin.pdf", width = 6, height = 3)

p
```


```{r ggstatsplot histogram, eval=F, include=FALSE}
# pacman::p_load(ggstatsplot, hrbrthemes)
# p_histopatient <- gghistostats(
#   data = genes_per_solved_patients, # dataframe from which variable is to be taken
#   x = n, # numeric variable whose distribution is of interest
#   title = "Number of genes with rare variants per patient", # title for the plot
#   caption = sprintf("Intellectual disability patients (solved), RD-Connect, range: [ %i, %i]", min(genes_per_solved_patients$n), max(genes_per_solved_patients$n)),
#   xlab = "#genes with potentially pathogenic variants",
#   test.value = 0, # default value is 0
#   binwidth = 10, # binwidth value (experiment)
#   ggtheme = cowplot::theme_cowplot()
# )
# 
# #ggsave("../Figs/RDconnect_genes_per_patient_histogram.pdf", plot = p_histopatient, width = 6, height = 4)
# 
# p_histopatient
```


## HPO per patient - RDconnect
```{r top phenotypes for RDconnect patients ALL}

# Experiment IDs of the RD-Connect cohort.
RDConnect_solved_IDs <- patient_gene_df %>%
  filter(cohort == "RDConnect") %>%
  pull(`Experiment ID`) %>%
  unique()

# One row per patient/HPO term.
patient_hpo_df <- patient_data %>%
  filter(`Experiment ID` %in% RDConnect_solved_IDs) %>%
  separate_rows(`HPO ids`, sep = ";") %>%
  # keep only well-formed HPO IDs: start with "HP" and are 10 characters long
  filter(str_starts(`HPO ids`, "HP"), str_length(`HPO ids`) == 10) %>%
  distinct(`Experiment ID`, `HPO ids`)

# Number of patients per HPO term, most frequent first.
HPO_count <- patient_hpo_df %>%
  count(`HPO ids`) %>%
  arrange(desc(n))

# HPO ID -> term name lookup (first two columns of the file; header skipped).
HPO_details <- read_tsv("../data/raw_data/HPO_phenotype_to_genes.txt",
                        col_types = c('cc-----'), col_names = c("HPOID", "HPOname"),
                        skip = 1) %>%
  distinct()

HPO_count <- left_join(HPO_count, HPO_details, by = c(`HPO ids` = "HPOID"))

# Reverse the level order so the most frequent term is drawn at the top of
# the bar chart.
HPO_count$HPOname <- factor(HPO_count$HPOname, levels = rev(HPO_count$HPOname))

# Most abundant phenotypes. head() is used instead of [1:20, ] so that fewer
# than 20 available terms do not produce all-NA padding rows.
top_HP_terms <- head(HPO_count, 20)

# relabel the terms that are too long for the axis
top_HP_terms$HPOname <- fct_recode(top_HP_terms$HPOname, !!!c("Delayed speech"="Delayed speech and language development"))
# highlight the phenotype term used in the cohort query
top_HP_terms$queried_term <- grepl("Intellectual disability", top_HP_terms$HPOname)

p_HPOcount <- ggplot(top_HP_terms, aes(x = n, y = HPOname)) +
  geom_col(aes(fill = queried_term)) +
  theme_cowplot() +
  ggtitle("Most frequent phenotypes") +
  xlab("# Patients") +
  ylab("Phenotype term") +
  scale_fill_manual(values = rev(c("#fccf66", "#8DB5CD"))) +
  # "none" replaces the deprecated logical form guides(fill = F)
  guides(fill = "none")

#ggsave("../Figs/RDconnect_top20_phenotypes.pdf", plot = p_HPOcount, width = 6, height = 4)

p_HPOcount
```


```{r phenotype per RDconnect patient}
# Number of HPO terms annotated per patient, largest first.
HPO_count_per_patient <- patient_hpo_df %>%
  count(`Experiment ID`) %>%
  arrange(desc(n))


# A patient counts as diagnosed when the ORDO disorder field is not "unknown".
Patients_with_diagnostics <- tibble(
  `Experiment ID` = patient_data$`Experiment ID`,
  Diagnosed = patient_data$`ORDO disorder` != "unknown"
)
#Diagnosed = ifelse(patient_data$`OMIM disorder` != "unknown" | patient_data$`ORDO disorder` != "unknown" , T, F) )

# One tile per patient: x = number of terms, y = stacking position among the
# patients that share that number of terms.
HPO_count_per_patient_tile <- HPO_count_per_patient %>%
  left_join(Patients_with_diagnostics, by = "Experiment ID") %>%
  arrange(Diagnosed) %>%
  group_by(n) %>%
  mutate(pos = row_number()) %>%
  # drop the grouping so it does not leak into later operations
  ungroup()

p_HPO_count_per_patient_tile <- ggplot(HPO_count_per_patient_tile, aes(x = n, y = pos)) +
  geom_tile(aes(fill = Diagnosed), col = "#DADADA", size = 0.8) +
  theme_cowplot() +
  ggtitle("# Phenotypic terms per patient") +
  xlab("# Phenotype terms") +
  ylab("# Patients") +
  scale_y_continuous(breaks = c(1, 3, 5, 7, 9)) +
  scale_fill_manual(values = rev(c("#fccf66", "#8DB5CD"))) +
  # "none" replaces the deprecated logical form guides(fill = F)
  guides(fill = "none")

#ggsave("../Figs/RDconnect_HPO_count_per_patient_tiled.pdf", plot = p_HPO_count_per_patient_tile, width = 6*1.25, height = 2*1.25)

p_HPO_count_per_patient_tile
```


## HPO association to seed genes
```{r}
# HPO terms of the internal cohort, mapped to their "INT_" Experiment IDs.
# The lookup is de-duplicated before joining because the gene list holds one
# row per patient/gene, which would otherwise multiply every HPO row.
internal_patient_hpo_df <- read_csv("../data/patient_hpo_terms.csv",
                           col_names = c("Patient", "hpo_id", "hpo_label"), skip = 1) %>%
  mutate(hpo_id = str_trim(hpo_id, side = "both")) %>%
  left_join(
    distinct(internal_patient_gene_list, Patient, Patient_ID, `Experiment ID`),
    by = "Patient"
  ) %>%
  distinct()


# Patient/HPO pairs of both cohorts in a single table.
patient_hpo_df_all_cohort <- rbind(
  patient_hpo_df,
  internal_patient_hpo_df %>%
    select(`Experiment ID`, `HPO ids` = hpo_id)
)


file_hpo <- "../cache/rdconnect_hpo_to_genes.RDS"

if (!file.exists(file_hpo)) {
  pacman::p_load(pbapply)

  # Retrieve the gene symbols associated with one HPO term from the HPO web
  # API. Returns the symbols as a character vector wrapped in a list; the
  # pbsapply() call below flattens that wrapper, giving a list of character
  # vectors named by term.
  # A failed request yields a warning and an empty gene set. It no longer
  # passes silently into the cache.
  hpo_genes_query <- function(term) {
    response <- httr::GET(paste0("https://hpo.jax.org/api/hpo/term/", term, "/genes?max=-1"))

    if (httr::http_error(response)) {
      warning(
        "HPO gene query failed for ", term,
        " (HTTP status ", httr::status_code(response), ")",
        call. = FALSE
      )
      return(list(character(0)))
    }

    gene_list <- httr::content(response)

    # unlist() drops missing symbols and always gives a flat character vector
    gene_names <- as.character(unlist(
      lapply(gene_list$genes, function(gene) gene$entrezGeneSymbol)
    ))
    return(list(gene_names))
  }

  hpo_gene_association <- pbapply::pbsapply(unique(patient_hpo_df_all_cohort$`HPO ids`), hpo_genes_query)


  saveRDS(hpo_gene_association, file_hpo)
} else {
  print("read precomputed HPO association")
  hpo_gene_association <- readRDS(file_hpo)
}


# Per patient: the list of HPO terms and, for each term, its associated genes.
seed_genes <- patient_hpo_df_all_cohort %>%
  group_by(`Experiment ID`) %>%
  summarise(hpo_terms = list(`HPO ids`)) %>%
  rowwise() %>%
  mutate(associated_genes = list(hpo_gene_association[hpo_terms])) #%>%
 # left_join(., solved_patients[,c("Participant ID", "Experiment ID")])
```

# Prediction based on genes associated with the phenotypes alone

```{r}
# Prediction based on HPO: for each patient, the union of all genes associated
# with any of the patient's HPO terms (list named by Experiment ID).
genes_associated_with_patient_phenotype <- seed_genes %>%
  rowwise() %>%
  mutate(all_hpo_associated_genes = list(unique(unlist(associated_genes)))) %>%
  pull(all_hpo_associated_genes, name = `Experiment ID`)

# How many of each patient's genes are associated with the patient's
# phenotypes? Flag each patient gene accordingly.
patient_all_genes_with_features_df <- patient_gene_df %>%
  rowwise() %>%
  mutate(inHPO = GeneName %in% genes_associated_with_patient_phenotype[[`Experiment ID`]]) %>%
  # drop the row-wise grouping so later verbs operate on the whole table
  ungroup()


# Per RD-Connect patient: total gene count (n) versus the number of genes
# linked to the patient's phenotypes (n_inHPO), in long format for plotting.
sum_patient_genes_in_HPO_df <- patient_all_genes_with_features_df %>%
  filter(cohort == "RDConnect") %>%
  count(`Experiment ID`, inHPO, name = "n_inHPO") %>%
  filter(inHPO) %>%
  select(-inHPO) %>%
  inner_join(genes_per_solved_patients, by = "Experiment ID") %>%
  pivot_longer(cols = c("n", "n_inHPO"), names_to = "group", values_to = "count") %>%
  mutate(group = factor(group, levels = rev(c("n","n_inHPO")), labels = rev(c("All genes", "Genes related to \n patient phenotypes"))))


p <- ggplot(sum_patient_genes_in_HPO_df, aes(x = count, y = group)) +
  geom_violin(fill = "#fccf66") +
  geom_jitter(col = "#ebaf26") +
  # ggforce::geom_sina() +
  cowplot::theme_minimal_vgrid() +
  ggtitle("Distribution of genes") + #, subtitle = report(genes_per_solved_patients$n)) +
  xlab("# genes per patient, filtered by rare variants") +
  ylab("") +
  scale_x_log10()

#ggsave("../Figs/RDconnect_genes_per_patient_violin_with_HPO_filter.pdf", width = 6, height = 3)

p
```
## Gene prioritization based on different feature-level

```{r}

# Per-gene features used as baseline ranking criteria.
gene_prop_df <- read_tsv("../cache/gene_features.tsv")

patient_all_genes_with_features_df <- patient_all_genes_with_features_df %>%
  left_join(., gene_prop_df, by = c(GeneName = "gene"))


# Rank the genes within each patient, once per feature. A smaller rank means
# the gene is prioritised higher by that feature.
# NOTE: rank_inHP breaks ties at random, so it is not reproducible across
# runs unless a seed is set beforehand.
patient_all_genes_df_with_rank <- patient_all_genes_with_features_df %>%
  group_by(`Experiment ID`) %>%
  mutate(rank_inHP = rank(!inHPO, ties.method = "random"),
         rank_HP = rank(-HP_terms),
         rank_popularity = rank(PubMedRank),
         rank_expression_BrainAll = rank(-AllBrainAvg),
         rank_expression_FrontalCortex = rank(-FrontalCortex),
         rank_pathwaycounts = rank(-n_pathways)
         )


# Long format for the ROC analysis: one row per patient/gene/feature.
# pos is the rank scaled to [0, 1] within the patient (the table is still
# grouped by Experiment ID, so n() is the patient's gene count); label is 1
# for causal genes and 0 otherwise.
feature_based_AUC_plot <- patient_all_genes_df_with_rank %>%
  select(starts_with("rank_"), `Experiment ID`, GeneName, causal) %>%
  mutate(total_gene = n()) %>%
  pivot_longer(cols = starts_with("rank_"), names_to = "group", values_to = "rank") %>%
  mutate(pos = (rank - 1) / (total_gene - 1),
         label = ifelse(is.na(causal), 0, 1))

pacman::p_load(pROC)
# rocvals and auc_vals are extended by later chunks with the network-based
# rankings; rocvals_df only holds the curves of this chunk.
rocvals <- list()
rocvals_df <- list()
auc_vals <- list()

for (i in unique(feature_based_AUC_plot$group)) {
  df <- feature_based_AUC_plot %>% filter(group == i)
  # levels and direction are fixed explicitly: causal genes (label 1) are
  # expected to have SMALLER relative ranks than non-causal genes (label 0).
  # With pROC's default direction = "auto", a ranking that is worse than
  # random would be silently flipped and reported with an AUC above 0.5.
  rocvals[[i]] <- pROC::roc(
    response = df$label,
    predictor = df$pos,
    levels = c(0, 1),
    direction = ">"
  )
  # reuse the fitted curve instead of recomputing it from the raw vectors
  auc_vals[[i]] <- pROC::auc(rocvals[[i]])
  rocvals_df[[i]] <- tibble(Sensitivity = rocvals[[i]]$sensitivities, Specificity = rocvals[[i]]$specificities)
}


rocvals_merged_df <- bind_rows(rocvals_df, .id = "rank_type")

# Not used by the plot below (the manual colour scale is commented out).
palettes <- c('#66c2a5','#fc8d62','#8da0cb')

p_roc_gene_prop <- ggplot(rocvals_merged_df, aes(x = Specificity, y = Sensitivity, col = rank_type)) +
  geom_line(size = 2) +
  scale_x_reverse() +
  #scale_color_manual(values = palettes) +
  scale_color_brewer() +
#  guides(col = FALSE) +
  cowplot::theme_cowplot()


p_roc_gene_prop
```

## Comparison with the informed multiplex propagation

```{r}

patient_network_ranking_results <- "../cache/test_rdconnect_ranking_results.RDS"

# If the cached ranking results do not exist yet, run the script that
# produces them.
if (!file.exists(patient_network_ranking_results)) {
  source("../source/prioritize_patient_rdconnect.R")
}

patient_annotated_df <- readRDS(patient_network_ranking_results)
# Stack the per-network-set results; this compares the causal gene with the
# entire genome.
patient_annotated_df_all_methods <- bind_rows(patient_annotated_df, .id = "network_set") %>%
  mutate(`Experiment ID` = Patient) %>%
  select(network_set, `Experiment ID`, GeneName, seed, avg)


# Number of candidate genes per patient.
gene_per_patients <- patient_gene_df %>%
  count(`Experiment ID`, name = "total_genes")

# Restrict the network scores to each patient's own gene list and rank the
# genes within patient and network set (rank 1 = highest score).
patient_all_genes_rank <- patient_annotated_df_all_methods %>%
  # keep only the patient's genes and add the causal column
  inner_join(patient_gene_df, by = c("Experiment ID", "GeneName")) %>%
  # add the gene count per patient
  inner_join(gene_per_patients, by = "Experiment ID") %>%
  group_by(`Experiment ID`, network_set) %>%
  arrange(-avg) %>%
  mutate(network_rank = rank(-avg))

# Rank of the causal genes only, binned. The last break is Inf (it was 500)
# so that ranks above 500 fall into "over20" instead of becoming NA.
patient_all_genes_rank_filtered <- patient_all_genes_rank %>%
  filter(causal) %>%
  filter(!is.na(network_set)) %>%
  mutate(rank_group = base::cut(network_rank, c(0:5, 10, 20, Inf), labels = c(1, 2, 3, 4, 5, "top10", "top20", "over20")))

# Wide table: one column of causal-gene ranks per network set.
patient_causal_gene_rank_df <- patient_all_genes_rank_filtered %>%
  select(-c(avg, rank_group)) %>%
  pivot_wider(names_from = "network_set", values_from = "network_rank")


###### AUC plot for patients

# pos is the rank scaled by the patient's total gene count; label is 1 for
# causal genes and 0 otherwise.
AUC_plot <- patient_all_genes_rank %>%
  mutate(pos = (network_rank - 1) / (total_genes - 1),
         label = ifelse(is.na(causal), 0, 1))

pacman::p_load(pROC)
# rocvals and auc_vals were created in the feature-based chunk and are
# extended here; rocvals_df is reset so it only holds the network curves.
rocvals_df <- list()
for (i in unique(AUC_plot$network_set)) {
  df <- AUC_plot %>% filter(network_set == i)
  # Fixed levels and direction: causal genes (label 1) are expected to have
  # smaller relative ranks. pROC's default direction = "auto" would silently
  # flip a ranking that is worse than random.
  rocvals[[i]] <- pROC::roc(
    response = df$label,
    predictor = df$pos,
    levels = c(0, 1),
    direction = ">"
  )
  auc_vals[[i]] <- pROC::auc(rocvals[[i]])
  rocvals_df[[i]] <- tibble(Sensitivity = rocvals[[i]]$sensitivities, Specificity = rocvals[[i]]$specificities)
}


rocvals_merged_df_network <- bind_rows(rocvals_df, .id = "rank_type")

```

```{r}
 # look at the two of them closely

# Greens followed by blues, applied in the order of the rank_type levels.
green_shades <- RColorBrewer::brewer.pal(6, "Greens")
blue_shades <- RColorBrewer::brewer.pal(3, "Blues")
palettes <- c(green_shades, blue_shades)

# Feature-based curves first, then the network-based ones; the factor levels
# follow the same order.
rank_type_levels <- c(
  unique(rocvals_merged_df$rank_type),
  unique(rocvals_merged_df_network$rank_type)
)
results_combine <- rbind(rocvals_merged_df, rocvals_merged_df_network)
results_combine$rank_type <- factor(results_combine$rank_type, levels = rank_type_levels)

p <- ggplot(
  results_combine,
  aes(x = Specificity, y = Sensitivity, col = rank_type)
) +
  geom_line(size = 2) +
  cowplot::theme_cowplot() +
  scale_x_reverse() +
  scale_color_manual(values = palettes) #guides(col = FALSE)

p
```

Reduce the information to network examples

```{r}
#palettes <-  c(RColorBrewer::brewer.pal(5, "Greens")[2:5], "#3182BD")
#palettes <-  c(RColorBrewer::brewer.pal(5, "Greens")[2:5],  "grey80")
# Four blues for the feature baselines, one accent colour for the reference.
palettes <- c(paste0("LightSkyBlue", 1:4), "#FCCE67")

# Ranking criteria shown in the figures, with their display labels in the
# same order.
selected_features <- c(
  "rank_inHP",
  "rank_popularity",
  "rank_expression_BrainAll",
  "rank_pathwaycounts",
  "informedMultiPlex"
)

selected_features_label <- c(
  "Feature: Phenotype overlap",
  "Feature: PubMed counts",
  "Feature: Brain expression level",
  "Feature: Pathway counts",
  "Reference: Informed Multiplex\nPropagation"
)

# Keep only the curves of the selected criteria and relabel them.
results_combine <- rbind(rocvals_merged_df, rocvals_merged_df_network)
results_combine <- filter(results_combine, rank_type %in% selected_features)
results_combine$rank_type <- factor(
  results_combine$rank_type,
  levels = selected_features,
  labels = selected_features_label
)

# ROC curves with the chance diagonal as a dashed reference line.
p <- ggplot(
  results_combine,
  aes(x = Specificity, y = Sensitivity, col = rank_type)
) +
  geom_line(size = 2) +
  geom_abline(slope = 1, intercept = 1, col = "grey50", linetype = "dashed") +
  scale_x_reverse() +
  scale_color_manual(values = palettes) + #guides(col = FALSE) +
  labs(color = "Ranking Criterion") +
  cowplot::theme_cowplot() +
  theme(legend.position = c(0.4, 0.25))

#ggsave("../Figs/Ranking_AUC_Network_vs_Features_comparison.pdf", p, width = 4, height = 4, scale = 1.25)

p
```

## Comparing AUC levels

Although this might look like an incremental improvement over ranking by HPO association alone, it is valuable when narrowing down the variant list, or in cases where the causal genes are not yet associated with the phenotypes (TODO: elaborate on a few cases here as well; it is tedious but needed). Also, under the omnigenic model, a network-based approach for assessing pathogenic effects is required anyway.

```{r}
# AUC per selected ranking criterion.

# Display labels without the "Reference: " / "Feature: " prefix.
method_labels <- str_remove_all(selected_features_label, "Reference: |Feature: ")
# Colour per criterion, keyed by the raw method name, so the assignment does
# not depend on row order or on the AUC-based level order.
method_colours <- setNames(palettes, selected_features)

auc_vals_df <- tibble(Method = names(auc_vals), AUC = unlist(auc_vals)) %>%
  filter(Method %in% selected_features) %>%
  mutate(col = unname(method_colours[Method]),
         Method = factor(Method, levels = selected_features, labels = method_labels),
         # order the levels from highest to lowest AUC
         Method = fct_reorder(Method, -AUC)) %>%
  arrange(AUC)


# Bar chart of the AUCs. Each method keeps the colour used in the other
# figures; "none" replaces the deprecated logical form guides(fill = FALSE).
p_auc <- ggplot(auc_vals_df, aes(x = AUC, y = Method, fill = Method)) +
  geom_col() +
  scale_fill_manual(values = setNames(auc_vals_df$col, as.character(auc_vals_df$Method))) +
  guides(fill = "none") +
  cowplot::theme_cowplot()

auc_vals_df
```

Computing p-values based on DeLong's test

```{r}


# All unordered pairs of the selected ranking criteria, one pair per row.
# Column names are set explicitly: as_tibble() on an unnamed matrix relies on
# deprecated automatic name repair.
feature_pairs <- t(combn(selected_features, 2))
colnames(feature_pairs) <- c("V1", "V2")

# p-value of pROC::roc.test() for each pair of ROC curves
pval_df <- as_tibble(feature_pairs) %>%
  rowwise() %>%
  mutate(pval = pROC::roc.test(rocvals[[as.character(V1)]], rocvals[[as.character(V2)]])$p.value) %>% #,
     # pval = ifelse(pval==1, NA, pval))
  ungroup()
# p-value between the two best performing methods

#ggplot(pval_df, aes(x=Var1, y=Var2, fill = -log10(pval))) + geom_tile(na.rm = T) + theme_nothing() + scale_fill_distiller(direction = -1, na.value = "Transparent")

pval_df
```


```{r}
# Rank bins shared by both tables. The last break is Inf (it was 1000) so
# that ranks above 1000 are binned instead of becoming NA.
rank_breaks <- c(0, 5, 10, 20, 50, 100, 200, Inf)

# Number of RD-Connect causal genes per rank bin, for each network set.
# Internal patients are excluded by their "INT_" ID prefix; the pattern is
# anchored so an ID that merely contains "INT" is not dropped.
network_based_rank <- patient_all_genes_rank %>%
  ungroup() %>%
  filter(!is.na(causal), !grepl("^INT_", `Experiment ID`)) %>%
  mutate(group = cut(network_rank, breaks = rank_breaks)) %>%
  count(network_set, group)


# The same counts for the feature-based rankings; the feature name takes the
# place of the network set so that both tables can be stacked.
feature_based_rank <- feature_based_AUC_plot %>%
  ungroup() %>%
  dplyr::rename(network_set = "group") %>%
  filter(!is.na(causal), !grepl("^INT_", `Experiment ID`)) %>%
  mutate(group = cut(rank, breaks = rank_breaks)) %>%
  count(network_set, group)


rank_group <- rbind(network_based_rank, feature_based_rank) %>%
  filter(n > 0)

```

```{r}
#palettes <-  c(RColorBrewer::brewer.pal(4, "Greens"), "#3182BD")

# Rank bins shown in the figure. The factor levels are given explicitly:
# labels without levels fail with "invalid 'labels'" as soon as one of the
# three bins is empty.
top_rank_bins <- c("(0,5]", "(5,10]", "(10,20]")

rank_group_top <- rank_group %>%
  filter(network_set %in% selected_features, group %in% top_rank_bins) %>%
  mutate(
    network_set = factor(
      network_set,
      levels = selected_features,
      labels = str_remove(selected_features_label, "Reference: ")
    ),
    group = factor(
      group,
      levels = top_rank_bins,
      labels = paste0("Top ", c(5, 10, 20))
    )
  )

# Number of patients whose causal gene falls into each top-rank bin, per
# ranking criterion.
p <- ggplot(rank_group_top, aes(x = network_set, y = n, fill = network_set)) +
  geom_col(position = "dodge") +
  facet_grid(. ~ group) +
  scale_fill_manual(values = palettes) + #guides(col = FALSE) +
  cowplot::theme_cowplot() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Ranking criteria") +
  ylab("# Patients") + labs(fill = "Ranking criterion")

#ggsave("../Figs/ranking_patient_top_compare_methods.pdf", p, width = 8, height = 2)

p
```

- Number supporting the plot above

The differences between the methods are presumably due to causal genes being absent from the respective feature data.

```{r}
# Per ranking criterion: number of causal genes ranked in the top 5 (top5),
# total number of ranked causal genes (all_patients), and top5 as a share of
# 131 patients.
# mutate() + filter() replaces a summarise() call that returned several rows
# per group, which is deprecated since dplyr 1.1.
# NOTE(review): 131 is presumably the total number of solved RD-Connect
# patients -- confirm and derive it from the data if possible.
rank_group %>%
  group_by(network_set) %>%
  mutate(all_patients = sum(n)) %>%
  filter(group == "(0,5]") %>%
  select(network_set, group, top5 = n, all_patients) %>%
  mutate(percent = top5 / 131 * 100)
```

# Temporal-holdout benchmark (Suppl. Fig. 8)

```{r}
# PanelApp panel file; only the 3rd and 4th columns are read.
PanelApp_data <- read_tsv("../data/raw_data/PanelApp_ID_v3.0_2019-12-10.tsv", col_types = paste0('--cc', str_dup("-", 32)))

# Genes whose sources include "Expert Review Green".
PanelApp_IDgenes <- PanelApp_data %>%
  filter(grepl("Expert Review Green", `Sources(; separated)`)) %>%
  pull(`Gene Symbol`) %>%
  unique()

# Network-based ranks reshaped to the column layout of the feature-based table.
network_rank_data <- AUC_plot %>%
  mutate(rank = rank(-avg)) %>%
  dplyr::rename(group = "network_set", total_gene = "total_genes") %>%
  select(all_of(colnames(feature_based_AUC_plot)))

# All rankings of the RD-Connect patients, flagged by whether the gene is
# associated with HP:0001249 (inID) and whether it is on the panel
# (inIDPanel). The last rank break is Inf (it was 1000) so that larger ranks
# are binned instead of becoming NA.
gene_rank_data <- rbind(feature_based_AUC_plot, network_rank_data) %>%
  filter(!grepl("^INT_", `Experiment ID`), group %in% c(selected_features, "rank_HP")) %>%
  mutate(inID = GeneName %in% hpo_gene_association[["HP:0001249"]],
         inIDPanel = GeneName %in% PanelApp_IDgenes,
         rank_binned = cut(rank, breaks = c(0, 5, 10, 20, 50, Inf)))

# Patients whose causal gene is not in the ID panel.
Patient_gene_not_in_Panel <- gene_rank_data %>%
  ungroup() %>%
  filter(causal, !inIDPanel) %>%
  pull(`Experiment ID`) %>%
  unique()

# The column holding the ranking criterion is renamed from `group` to
# `network_set`, which all later code expects. dplyr::rename(new = old) is
# called explicitly: the previous bare rename(group = "network_set") only
# produced this result when a rename() with old = "new" semantics masked
# dplyr's.
AUC_plot_nonPanel <- gene_rank_data %>%
  ungroup() %>%
  filter(`Experiment ID` %in% Patient_gene_not_in_Panel) %>%
  dplyr::rename(network_set = "group")

pacman::p_load(pROC)

rocvals_nonPanel <- list()
auc_vals_nonPanel <- list()
rocvals_df_nonPanel <- list()
for (i in unique(AUC_plot_nonPanel$network_set)) {
  df <- AUC_plot_nonPanel %>% filter(network_set == i)
  # Fixed levels and direction: causal genes (label 1) are expected to have
  # smaller relative ranks. pROC's default direction = "auto" would silently
  # flip a ranking that is worse than random.
  rocvals_nonPanel[[i]] <- pROC::roc(
    response = df$label,
    predictor = df$pos,
    levels = c(0, 1),
    direction = ">"
  )
  auc_vals_nonPanel[[i]] <- pROC::auc(rocvals_nonPanel[[i]])
  rocvals_df_nonPanel[[i]] <- tibble(Sensitivity = rocvals_nonPanel[[i]]$sensitivities, Specificity = rocvals_nonPanel[[i]]$specificities)
}


rocvals_merged_df_network_nonPanel <- bind_rows(rocvals_df_nonPanel, .id = "rank_type")


# number of causal genes that are not on the panel, per ranking criterion
gene_rank_data %>%
  ungroup() %>%
  filter(causal, !inIDPanel) %>%
  count(group, rank_binned) %>%
  group_by(group) %>%
  summarise(sum = sum(n))
```


```{r}
# ROC curves of the selected criteria for patients whose causal gene is not
# on the panel. The factor argument `labels` is spelled out in full.
rocvals_merged_df_network_nonPanel_plot <- rocvals_merged_df_network_nonPanel %>%
  filter(rank_type %in% selected_features) %>%
  mutate(rank_type = factor(rank_type, levels = selected_features, labels = selected_features_label))

p <- ggplot(rocvals_merged_df_network_nonPanel_plot, aes(x = Specificity, y = Sensitivity, col = rank_type)) +
  geom_line(size = 2) +
  scale_x_reverse() +
  # chance diagonal
  geom_abline(slope = 1, intercept = 1, col = "grey50", linetype = "dashed") +
  scale_color_manual(values = palettes) + #guides(col = FALSE) +
  cowplot::theme_cowplot() +
  # hide the legend; "none" replaces the deprecated logical form guides(col = F)
  guides(col = "none") +
  #theme(legend.position = c(0.5,0.25)) +
  labs(color = "Ranking Criterion")

#ggsave("../Figs/Ranking_AUC_Network_vs_Features_comparison_genesNotFoundinPanel.pdf", p, width = 4, height = 4, scale = 1.25)

p
```

### Ranking based on group:
```{r}
# Number of off-panel causal genes per rank bin and ranking criterion.
AUC_plot_nonPanel_count <- AUC_plot_nonPanel %>%
  filter(causal, !inIDPanel, network_set %in% selected_features) %>%
  count(network_set, rank_binned)

# Rank bins shown in the figure. The factor levels are given explicitly:
# labels without levels fail with "invalid 'labels'" as soon as one of the
# three bins is empty.
top_rank_bins <- c("(0,5]", "(5,10]", "(10,20]")

AUC_plot_nonPanel_top <- AUC_plot_nonPanel_count %>%
  filter(rank_binned %in% top_rank_bins) %>%
  mutate(
    network_set = factor(
      network_set,
      levels = selected_features,
      labels = str_remove(selected_features_label, "Reference: ")
    ),
    rank_binned = factor(
      rank_binned,
      levels = top_rank_bins,
      labels = paste0("Top ", c(5, 10, 20))
    )
  )

p <- ggplot(AUC_plot_nonPanel_top, aes(x = network_set, y = n, fill = network_set)) +
  geom_col(position = "dodge") +
  facet_grid(. ~ rank_binned) +
  scale_fill_manual(values = palettes) + #guides(col = FALSE) +
  cowplot::theme_cowplot() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Ranking criteria") +
  ylab("# Patients") + labs(fill = "Ranking criterion")

#ggsave("../Figs/ranking_patient_top_compare_methods_genes_not_in_IDpanel.pdf", p, width = 8, height = 2)

p
```
### scatter plot comparison:

```{r}
# Rank of each off-panel causal gene under the HP-count ranking and under the
# informed multiplex propagation, side by side (one row per patient/gene).
AUC_plot_nonPanel_compare <- AUC_plot_nonPanel %>%
  filter(network_set %in% c("rank_HP", "informedMultiPlex"), causal) %>%
  select(`Experiment ID`, GeneName, network_set, rank) %>%
  pivot_wider(names_from = "network_set", values_from = "rank")

# Genes labelled in the scatter plot.
# NOTE(review): "CARPRIN1" was replaced by "CAPRIN1", presumably the intended
# gene symbol -- confirm against the data.
selected_genes <- c("PAX3", "MAPK8IP3", "ZNF292", "FA2H", "PHIP", "CAPRIN1", "AGO2", "NFIB", "CSF1R", "SPAST")

# Genes ranked below 200 by both methods; points under the dashed diagonal
# are ranked better by the multiplex propagation than by the HP count.
p <- ggplot(
  AUC_plot_nonPanel_compare %>% filter(rank_HP < 200, informedMultiPlex < 200),
  aes(x = rank_HP, y = informedMultiPlex, label = ifelse(GeneName %in% selected_genes, GeneName, ""))
) +
  geom_point(col = "grey20") +
  ggrepel::geom_text_repel() +
  geom_abline(slope = 1, intercept = 0, col = "grey50", linetype = "dashed") +
  xlab("Rank: HP count") +
  ylab("Rank: Informed Multiplex Propagation") +
  theme_cowplot()

#ggsave("../Figs/ranking_patient_top_compare_methods_genes_not_in_IDpanel_scattered.pdf", p, width = 4, height = 4)

p
```


## Compare the performance between the full and Temporal holdout set 
```{r,  layout="l-body-outset"}
# Display labels without the "Reference: " / "Feature: " prefix.
method_labels <- str_remove_all(selected_features_label, "Reference: |Feature: ")

# AUC of the selected criteria on the full patient list. The repeated literal
# feature vector was identical to selected_features and is replaced by it.
auc_vals_df <- tibble(Method = names(auc_vals), AUC = unlist(auc_vals)) %>%
  filter(Method %in% selected_features) %>%
  mutate(
    Method = factor(Method, levels = selected_features, labels = method_labels),
    testList = "Full"
  )

# AUC of the same criteria for patients whose causal gene is not on the
# panel. Method stays a factor with the same levels as above, so both tables
# combine without type coercion (the previous as.character(Method, -AUC)
# carried a stray second argument that was silently ignored).
auc_vals_nonPanel_df <- tibble(Method = names(auc_vals_nonPanel), AUC = unlist(auc_vals_nonPanel)) %>%
  filter(Method %in% selected_features) %>%
  mutate(
    Method = factor(Method, levels = selected_features, labels = method_labels),
    testList = "-Panel"
  ) %>%
  arrange(AUC)


auc_vals_df_combn <- rbind(auc_vals_df, auc_vals_nonPanel_df) #%>%
 # mutate(Method = factor(Method, levels = selected_features_label))#,
         # Method = fct_reorder(Method, -(AUC)))

# One facet per method, comparing the full and the off-panel patient list.
# The y axis is zoomed to [0.5, 1], so an AUC below 0.5 is not visible.
p <- ggplot(auc_vals_df_combn, aes(x = testList, y = AUC, fill = Method)) +
  geom_col() +
  facet_grid(. ~ Method) +
  coord_cartesian(ylim = c(0.5, 1)) +
  scale_fill_manual(values = palettes) +
  theme_cowplot() +
  xlab("Patient list")

#ggsave("../Figs/barplot_compare_AUC_full_vs_nonPanel_patientList.pdf", height = 3, width = 8)

p
```