fix: modifications after stageR correction

iaradsouza1 · Oct 24, 2023 · 4c38efe · 4c38efe
1 parent 6f90c81
commit 4c38efe
Show file tree

Hide file tree

Showing 5 changed files with 54 additions and 47 deletions.
diff --git a/scripts/diff_tx_correct.R b/scripts/diff_tx_correct.R
@@ -5,18 +5,18 @@ library(stageR)
 library(GenomicFeatures)
 
 # Load TX data from differential expression
-load("results/diff_exp/tx_rin_ph_diff.rda")
+load("results/diff_exp/edger_tx_rin_ph_diff.rda")
 df_res <- df_edger_ph_rin_group_tx
 colnames(df_res)[1] <- "tx"
 
 # remove transcript version
 df_res$tx <- gsub("\\.+\\d+", "", rownames(df_res))
 
 # Load transcript-gene info -----------------------------------------------
-# gtf <- "data/genome/Homo_sapiens.GRCh38.97.gtf.gz"
-# txdb.filename <- "data/genome/Homo_sapiens.GRCh38.97.gtf.sqlite"
-gtf <- "Homo_sapiens.GRCh38.97.gtf.gz"
-txdb.filename <- "Homo_sapiens.GRCh38.97.gtf.sqlite"
+gtf <- "data/genome/Homo_sapiens.GRCh38.97.gtf.gz"
+txdb.filename <- "data/genome/Homo_sapiens.GRCh38.97.gtf.sqlite"
+#gtf <- "Homo_sapiens.GRCh38.97.gtf.gz"
+#txdb.filename <- "Homo_sapiens.GRCh38.97.gtf.sqlite"
 
 # Load db
 txdb <- loadDb(txdb.filename)
@@ -72,7 +72,7 @@ for (i in 1:length(regions)) {
 
     # Get the corrected values
     padj <- getAdjustedPValues(stageRObj, order = TRUE, onlySignificantGenes = T)
-    padj <- padj[!padj$transcript == 0,]
+    # padj <- padj[!padj$transcript == 0,]
 
     if (nrow(padj) == 0) {
       ls_temp[[j]] <- NULL
@@ -93,7 +93,7 @@ if(!dir.exists("results/diff_exp/")) {
 }
 
 # Save results
-save(df_res_padj, file = "results/diff_exp/diff_tx_corrected.rda")
+save(df_res_padj_tx, file = "results/diff_exp/diff_tx_corrected.rda")
 
 
 
diff --git a/scripts/network.R b/scripts/network.R
@@ -10,7 +10,7 @@ library(ggraph)
 library(magrittr)
 library(RedeR)
 
-# ------------------ GET GENES AND INTERACTIONS -------------------------
+# ------------------ GET GENES AND INTERACTIONS ---------------------------
 # Load diff genes table
 load("results/diff_exp/diff_df.rda")
 gwas_intersections <- read_csv("results/tables/gwas_intersection.csv")
@@ -82,6 +82,9 @@ addGraph(rdp, g)
 nodes <- read_tsv("results/networks/model_nodes.txt")
 edges <- read_delim("results/networks/model_edges.txt")
 
+nodes <- read_tsv("~/model_nodes.txt")
+edges <- read_delim("~/model_edges.txt")
+
 # Import nodes coordinates determined by vivagraph
 layout <- read.csv("results/networks/layout.csv")
 
@@ -115,7 +118,7 @@ ggraph(g, x = x, y = y) +
     pie_scale = 0.2,
     show.legend = F
   ) +
-  geom_node_text(aes(label = alias), size = 1.1, nudge_x = 2, nudge_y = 4) + 
+  geom_node_text(aes(label = alias), size = 0.9, nudge_x = 2, nudge_y = 4) + 
   #geom_node_label(aes(label = alias)) + 
   scale_fill_manual(values = c("#0ac80aff", "#4f4affff", "#ff822fff")) +
   coord_fixed() +
@@ -131,7 +134,7 @@ svg(filename = "results/plots_paper/network.svg", height = 10, width = 10)
 print(p)
 dev.off()
 
-# Percentage of total genes in the network: 51,52%%
+# Percentage of total genes in the network: 51,24%
 n_distinct(nodes$alias) / n_distinct(diff_df$hgnc_symbol)
 
 
@@ -226,7 +229,7 @@ set_graph_params <- function(g, dict, f_ls) {
 
 l_groups <- map(split(diff_df$hgnc_symbol, diff_df$group), unique)
 
-graphs_by_group <- imap(l_groups, function(x, i){
+graphs_by_group <- imap(l_groups, function(x, i) {
 
   # CHANGE HERE IF YOU WANT FIRST NEIGHBORS
   # (also remember to change the path)
@@ -241,14 +244,14 @@ graphs_by_group <- imap(l_groups, function(x, i){
   # To set all gene labels, set diff_df instead of diff_temp
   g <- set_graph_params(g, dc, f_ls)
 
-  if(include_first_neighbors){
+  if(include_first_neighbors) {
     colors_list <- V(g)$pie
     degrees <- degree(g,v=V(g))
     filter <- !(paste(colors_list) == "c(1, 0, 0, 0)" & degrees == 1)
     g <- induced_subgraph(g, filter)
   }
 
-  if(length(V(g)$pie.color) != 0){
+  if(length(V(g)$pie.color) != 0) {
     pdf(paste0("results/networks/", i, ".pdf"), width = 10, height = 10)
     plot(g)
     dev.off()

diff --git a/scripts/network_layout.R b/scripts/network_layout.R
@@ -3,6 +3,12 @@
 
 library(easylayout)
 
+# Read igraph data
+load("results/networks/int.rda")
+
+# Create graph
+g <- graph_from_edgelist(as.matrix(int[,1:2]), directed = F)
+
 # Organize main layout
 layout <- easylayout::vivagraph(g)
 

diff --git a/scripts/plots.rmd b/scripts/plots.rmd
@@ -62,7 +62,7 @@ diff_df %>%
   ggplot(aes(x = col, y = n, fill = type)) +
   geom_bar(position = "stack", stat = "identity") +
   labs(x = "", y = "Number of transcriptionally altered genes", fill = "") +
-  scale_y_continuous(limits = c(0, 1200), breaks = seq(0, 1200, 200)) + 
+  scale_y_continuous(limits = c(0, 1400), breaks = seq(0, 1400, 200)) + 
   scale_fill_manual(values = color_scale) + 
   theme_classic() + 
   theme(
@@ -152,7 +152,7 @@ df_plot %>%
 ggplot(aes(x = as.numeric(x_axis), y = n, fill = type)) +
   geom_bar(stat = "identity", position = "stack") +
   facet_grid(cols = vars(region)) + 
-  scale_y_continuous(name = "Number of transcriptionally altered genes", limits = c(0, 460), breaks = seq(0, 460, 50), minor_breaks = F) +
+  scale_y_continuous(name = "Number of transcriptionally altered genes", limits = c(0, 500), breaks = seq(0, 500, 50), minor_breaks = F) +
   #facet_zoom(x = x_axis %in% c("Female", "Male", "Intersection")) +
   scale_fill_manual(name = "", values = color_scale) +
   scale_x_continuous("",
@@ -181,7 +181,7 @@ diff_df %>%
   )) -> tmp8
 
 # Create list of genes in female, in male, and in both sexes
-l_genes <- split(tmp8$gene, tmp8$sex)
+l_genes <- split(tmp8$hgnc_symbol, tmp8$sex)
 
 # Plot Venn diagram
 cairo_pdf(file = "results/plots_paper/fig2B.pdf", width = 4, height = 4)
@@ -357,11 +357,8 @@ ggsave("results/plots_paper/fig2C_2.png", height = 4, width = 5, dpi = 300)
 
 ## Figure 3
 
-Enrichment plot was built as described in `enrichment.R` in `script` directory. 
+Figure 3 was produced as part of the biotype analysis, in the `summarise_biotypes.R` script.
 
-## Figure 4 and 5
-
-Figures 4 and 5 were built as described in `plot_dtu.R` in `script` directory.
 
 # Supplementary Figures
 
@@ -397,14 +394,15 @@ diff_df %>%
   ggplot(aes(x = sex, y = p_gt, fill = gt)) +
     geom_bar(stat = "identity") + 
     facet_grid(.~ region) +
-    scale_y_continuous(breaks = seq(0,1,0.2), 
-                       labels = scales::percent(seq(0,1,0.2))) +
+    scale_y_continuous(breaks = seq(0,1,0.25), 
+                       labels = scales::percent(seq(0,1,0.25))) +
     scale_x_discrete(labels = c("female" = expression("\u2640"), 
                                 "male" = expression("\u2642"))) +
     scale_fill_manual(values = c("G" = "#5E835Fff", "T" = "#85587C"),
                       labels = c("G" = "Genes", "T" = "Transcripts")) + 
     labs(x = "", y = "Pergentage of transcriptionally altered genes", fill = "") + 
     theme_bw() + 
+    geom_hline(yintercept = 0.5, lty = 2, lwd = 0.2) +
     theme(
       strip.background = element_rect(fill = "white"),
       axis.text.x = element_text(size = 15, colour = "black"),
@@ -489,7 +487,7 @@ load("results/important_variables/ann.rda")
 ann %>% 
   rownames_to_column("run") %>% 
   dplyr::select(run, phenotype, gender, region) %>% 
-  count(phenotype, gender, region, name = "number_of_samples") %>% 
+  dplyr::count(phenotype, gender, region, name = "number_of_samples") %>% 
   arrange(gender, region) %>% 
   openxlsx::write.xlsx(file = "results/tables/number_of_samples.xlsx", rowNames = F)
 ```
@@ -504,21 +502,21 @@ diff_df %>%
 ## Supplementary Table 3
 
 ```{r}
-genes_by_group_female %>% 
-  openxlsx::write.xlsx(file = "results/tables/intersection_tables.xlsx", row.names = F, sheetName = "Female_Intersections")
+wb <- createWorkbook()
+
+addWorksheet(wb, sheetName = "Female_Intersections")
+writeData(wb, sheet = "Female_Intersections", genes_by_group_female)
 
-genes_by_group_male %>% 
-  openxlsx::write.xlsx(file = "results/tables/intersection_tables.xlsx", row.names = F, sheetName = "Male_Intersections",
-             append = T)
+addWorksheet(wb, sheetName = "Male_Intersections")
+writeData(wb, sheet = "Male_Intersections", genes_by_group_male)
 
-genes_by_sex %>% 
-  openxlsx::write.xlsx(file = "results/tables/intersection_tables.xlsx", row.names = F, sheetName = "Sex_Intersections",
-             append = T)
+addWorksheet(wb, sheetName = "Sex_Intersections")
+writeData(wb, sheet = "Sex_Intersections", genes_by_sex)
 
-genes_by_regions %>% 
-  openxlsx::write.xlsx(file = "results/tables/intersection_tables.xlsx", row.names = F, sheetName = "Regions_Intersections",
-             append = T)
+addWorksheet(wb, sheetName = "Regions_Intersections")
+writeData(wb, sheet = "Regions_Intersections", genes_by_regions)
 
+saveWorkbook(wb, "results/tables/intersection_tables.xlsx", overwrite = TRUE)
 ```
 
 

diff --git a/scripts/summarise_biotypes.R b/scripts/summarise_biotypes.R
@@ -63,19 +63,19 @@ dtu_w_biotype <- dtu_w_biotype %>%
 
 dge_plot <- dge_w_biotype %>% 
   group_by(biotype) %>% 
-  summarise(biotype_n = n() / length(unique(dge_w_biotype$gene_id)) * 100) %>% 
+  dplyr::summarise(biotype_n = dplyr::n() / length(unique(dge_w_biotype$gene_id)) * 100) %>% 
   ungroup() %>% 
-  mutate(type = "DGE")
+  dplyr::mutate(type = "DGE")
 
 dte_plot <- dte_w_biotype %>%
   group_by(biotype) %>%
-  summarise(biotype_n = n() / length(unique(dte_w_biotype$transcript_id))* 100) %>%
+  summarise(biotype_n = dplyr::n() / length(unique(dte_w_biotype$transcript_id))* 100) %>%
   ungroup() %>% 
   mutate(type = "DTE")
 
 dtu_plot <- dtu_w_biotype %>%
   group_by(biotype) %>%
-  summarise(biotype_n = n() / length(unique(dtu_w_biotype$isoform_id))* 100) %>%
+  summarise(biotype_n = dplyr::n() / length(unique(dtu_w_biotype$isoform_id))* 100) %>%
   ungroup() %>% 
   mutate(type = "DTU")
 
@@ -105,7 +105,7 @@ ggsave(biotype_plot, file = "results/plots_paper/biotype_plot.pdf", width = 7, h
 dge_plot <- dge_w_biotype %>% 
   separate(group, into = c("region", "sex")) %>% 
   group_by(biotype, sex) %>% 
-  summarise(biotype_n = n()) %>% 
+  summarise(biotype_n = dplyr::n()) %>% 
   ungroup() %>% 
   group_by(sex) %>% 
   mutate(prop = biotype_n / sum(biotype_n) * 100,
@@ -114,7 +114,7 @@ dge_plot <- dge_w_biotype %>%
 dte_plot <- dte_w_biotype %>% 
   separate(group, into = c("region", "sex")) %>% 
   group_by(biotype, sex) %>% 
-  summarise(biotype_n = n()) %>% 
+  summarise(biotype_n = dplyr::n()) %>% 
   ungroup() %>% 
   group_by(sex) %>% 
   mutate(prop = biotype_n / sum(biotype_n) * 100,
@@ -123,7 +123,7 @@ dte_plot <- dte_w_biotype %>%
 dtu_plot <- dtu_w_biotype %>% 
   separate(group, into = c("region", "sex")) %>% 
   group_by(biotype, sex) %>% 
-  summarise(biotype_n = n()) %>% 
+  summarise(biotype_n = dplyr::n()) %>% 
   ungroup() %>% 
   group_by(sex) %>% 
   mutate(prop = biotype_n / sum(biotype_n) * 100,
@@ -149,7 +149,7 @@ ggplot(df_plot, aes(x = reorder(biotype, dplyr::desc(prop)), y = prop, fill = ty
         strip.background = element_rect(fill = "white")) -> biotype_plot_by_sex
 
 # Save
-ggsave(biotype_plot_by_sex, filename = "results/plots_paper/biotype_by_sexplot.pdf", width = 7, height = 4)
+ggsave(biotype_plot_by_sex, filename = "results/plots_paper/fig3.pdf", width = 7, height = 4)
 
 # Test feature prevalence differences between female and male -------------
 
@@ -165,7 +165,7 @@ biotypes_by_sex %>%
   group_map(~ {
     cat(.y$type, sep = "\n")
     cont_table <- table(.x$biotype, .x$sex)
-    return(list(fisher = fisher.test(cont_table), count_table = cont_table))
+    return(list(fisher = fisher.test(cont_table, simulate.p.value = T), count_table = cont_table))
   }) -> biot_tests_fisher
 
 biotypes_by_sex %>% 
@@ -214,10 +214,10 @@ biotypes_by_sex %>%
   arrange(type, biotype) %>% 
   filter(sex == "female") %>%  
   group_by(region, type) %>% 
-  mutate(n1 = n()) %>% 
+  mutate(n1 = dplyr::n()) %>% 
   ungroup() %>% 
   group_by(biotype, type, region) %>% 
-  mutate(n2 = n(),
+  mutate(n2 = dplyr::n(),
          prop_by_region = (n2 / n1) * 100) %>% 
   arrange(desc(type), desc(region)) %>% 
   ungroup() %>% 
@@ -245,10 +245,10 @@ biotypes_by_sex %>%
   arrange(type, biotype) %>% 
   filter(sex == "male") %>%  
   group_by(region, type) %>% 
-  mutate(n1 = n()) %>% 
+  mutate(n1 = dplyr::n()) %>% 
   ungroup() %>% 
   group_by(biotype, type,region) %>% 
-  mutate(n2 = n(),
+  mutate(n2 = dplyr::n(),
          prop_by_region = (n2 / n1) * 100) %>% 
   arrange(desc(type), desc(region)) %>% 
   ungroup() %>%