Use limma instead. It's more straightforward and gives essentially the…

… same results
AlexsLemonade · cansavvy · Nov 10, 2020 · Nov 11, 2020 · Nov 11, 2020 · Nov 11, 2020
commit d16639aa40d7eab61ef4515d1be32a6ed0a0d211
diff --git a/04-advanced-topics/network-analysis_rnaseq_01_wgcna.Rmd b/04-advanced-topics/network-analysis_rnaseq_01_wgcna.Rmd
@@ -401,83 +401,50 @@ module_eigengenes <- bwnet$MEs
 head(module_eigengenes)
 ```
 
-## Which modules have biggest differences across treatment groups
+## Which modules have biggest differences across treatment groups?
 
 We can also see if our eigengenes relate to our metadata labels. 
-First we would have to make an annotation data frame that we can give to heatmap.
+First, we double-check that the samples in our metadata are in the same order as the rows of `module_eigengenes`.
 
 ```{r}
-# Let's prepare the annotation data.frame for taking a look at our eigengenes
-labeled_eigengenes_df <- module_eigengenes %>% 
-  tibble::rownames_to_column("sample") %>%
-  dplyr::inner_join(metadata %>% 
-                      dplyr::select(refinebio_accession_code, refinebio_treatment), 
-                                        by = c("sample" = "refinebio_accession_code"))
+all.equal(metadata$refinebio_accession_code, rownames(module_eigengenes))
 ```
 
-Set up a custom function for running ANOVA and extracting summary stats. 
+```{r}
+# Create the design matrix from the refinebio_treatment variable
+des_mat <- model.matrix(~ metadata$refinebio_treatment)
+```
+
+limma fits a separate model to each row of its input, so we need to transpose our data so that the eigengenes are rows and the samples are columns.
 
 ```{r}
-run_lineaar_model <- function(anova_df = labeled_eigengenes_df, 
-                      eigengene_col = "ME52", 
-                      group_variable_col = "refinebio_treatment") {
-  # Given a data frame with the variables you want to teest
-  #
-  # Args:
-  #   anova_df: a data fram that contains the variables you want to use ANOVA to test
-  #   eigengene_col: a string that is the column name for the eigengene you want to test
-  #   group_variable_col: a string that is the column name for the grouping variable you would like   #                       to look for differences across
-  #
-  # Returns:
-  # The summary statistics from running ANOVA
-
-  # Set up a formula using the eigengene column name and grouping variable name
-  aov_formula <- formula(paste(eigengene_col, "~", group_variable_col))
-
-  # Run ANOVA based on that formula
-  aov_results <- lmFit(aov_formula, data = anova_df)
-
-  # Extract the summary stats 
-  summary_stats <- data.frame(summary(aov_results)$coefficients) %>% 
-    tibble::rownames_to_column("group") %>%
-    dplyr::filter(variable != )
-
-    # Make a new column that has the eigengene module name
-    dplyr::mutate(module = eigengene_col)
-
-  # Return the summary stats
-  return(summary_stats)
-}
+# Transpose so the eigengenes are rows and samples are columns
+module_eigengenes_t <- t(module_eigengenes)
 ```
 
-Obtain a vector of the module names we can supply to the `run_anova()` function in the next step. 
+Run a linear model on each module.
 
 ```{r}
-# We need the module column names but not the other column names
-module_names <- grep("ME", colnames(labeled_eigengenes_df), value = TRUE)
+# Apply linear model to data
+fit <- limma::lmFit(module_eigengenes_t, design = des_mat)
+
+# Apply empirical Bayes to smooth standard errors
+fit <- limma::eBayes(fit)
 ```
 
-Run ANOVA on each eigengene module and bind results together. 
+Apply multiple testing correction and obtain stats in a data frame. 
 
 ```{r}
-# Run ANOVA on each module
-aov_results_df <- purrr::map(module_names, 
-                             ~ run_anova(anova_df = labeled_eigengenes_df, 
-                                         eigengene_col = .x, 
-                                         group_variable_col = "refinebio_treatment")) %>%
-  # Bind all the modules results into one data frame
-  dplyr::bind_rows() %>% 
-  # The p value column has an annoying name, let's change it, let's also reearrange so module name comes first
-  dplyr::select(module, dplyr::everything(), p_val = Pr..F.) %>% 
-  # Make a new column of adjusted p values based on multiple testing corrections
-  dplyr::mutate(p_adjust = p.adjust(p_val, method = "hochberg"))
+# Apply multiple testing correction and obtain stats
+stats_df <- limma::topTable(fit, number = nrow(module_eigengenes_t)) %>%
+  tibble::rownames_to_column("module")
 ```
 
 Let's arrange the results by the smallest adjusted p values. 
 
 ```{r}
-aov_results_df %>% 
-  dplyr::arrange(p_adjust)
+stats_df %>% 
+  dplyr::arrange(adj.P.Val)
 ```
 
 Module 52 seems to be the most differentially expressed across `refinebio_treatment` groups. 
@@ -520,7 +487,8 @@ module_52_genes <- gene_modules_df %>%
 Let's save this gene to module key to a TSV file for future use. 
 
 ```{r}
-readr::write_tsv(gene_modules_df, file = file.path(results, "SRP133573_wgcna_gene_to_module.tsv"))
+readr::write_tsv(gene_modules_df, 
+                 file = file.path("results", "SRP133573_wgcna_gene_to_module.tsv"))
 ```
 
 # Resources for further learning