-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathseparation_data_analysis.Rmd
297 lines (244 loc) · 12 KB
/
separation_data_analysis.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
---
title: "Separation RNA-seq analysis"
author: "Liza Brusman"
date: "2024-10-30"
output: github_document
---
```{r setup, include=FALSE}
# Global chunk defaults for the knitted document: echo the code of every
# chunk and suppress warnings in the output.
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE
)
```
```{r, message=FALSE}
library(tidyverse)
library(ggplot2)
library(ggfortify)
library(ggrepel)
library(GGally)
library(wesanderson)
library(viridis)
library(readxl)
library(car)
library(gridExtra)
library(ez)
library(ggpubr)
library(DESeq2)
library(EnhancedVolcano)
library(RRHO2)
library(ggpubr)
library(UpSetR)
library(scales)
library(stringi)
library(SuperExactTest)
library(ComplexHeatmap)
library(colorRamp2)
library(ggridges)
library(vioplot)
source("separation_data_fxns.R")
```
import sequencing data and associated metadata
```{r}
# ---- metadata ----
# One row per sequenced sample; read from the curated sheet of the metrics
# workbook.
metrics <- read_excel(
  "../../docs/Input_RNAseq_metrics.xlsx",
  sheet = "Correct order prelim"
)
# Treat identifier / grouping columns as categorical variables.
metrics <- mutate_at(
  metrics,
  vars(
    "Animal", "Cohort", "Pairing", "Timepoint", "Separation",
    "ProcessingGroup", "PPrefCat", "DissectionOrder", "CageChange",
    "ParentCode", "PairingAgeCat", "SACAgeCat", "RIN", "InputRNACat",
    "TRAP", "InputRNA"
  ),
  as.factor
)
# Remove animal 2945, which didn't have a partner preference.
metrics <- metrics %>% filter(Animal != 2945)

# ---- RNA-seq counts ----
inputs <- read.csv("../../docs/merged_all_inputs.txt", stringsAsFactors = FALSE)
# Keep genes with more than 10 counts in every sample. The threshold is applied
# to numeric columns only: at this point `genes` is still a regular column, and
# comparing gene names with 10 is a string comparison that would drop genes
# purely because of how their name sorts relative to "10".
inputs <- inputs %>%
  filter_if(is.numeric, all_vars(. > 10)) %>%
  distinct(genes, .keep_all = TRUE) %>%
  filter(!is.na(genes)) %>%
  column_to_rownames("genes")
# Remove animal 2945 from the count matrix as well, so the columns stay
# consistent with the rows kept in `metrics`.
inputs <- inputs %>% select(-c(Counts2945I))
```
first, filter for only cohorts i care about (cohorts analogous to snRNA-seq experiment)
```{r}
# Animal IDs belonging to the four cohorts of interest.
want_anis <- unfactor(
  metrics$Animal[metrics$Cohort %in% c("SS48P", "SS4I", "OS48P", "OS4I")]
)
# Row positions of those animals in `metrics`, reused as COLUMN positions of
# `inputs`.
# NOTE(review): this assumes the columns of `inputs` are in the same order as
# the rows of `metrics` -- confirm against the input files.
want_idxs <- match(want_anis, metrics$Animal)
want_inputs <- inputs[, want_idxs]
want_metrics <- filter(metrics, Animal %in% want_anis)
```
import module genes
```{r}
# Gene-to-module assignments (columns used below: Gene, Module).
mod_genes <- read.csv("G:/My Drive/pateiv/snRNAseq_voles/src/hotspot/docs/new_clusts_hotspot-gene-modules.csv")
# Module genes that are also present in the filtered count matrix.
all_genes <- intersect(mod_genes$Gene, rownames(want_inputs))
# Same restriction, but for Module-6 genes only.
mod6_genes <- intersect(
  mod_genes$Gene[mod_genes$Module == 6],
  rownames(want_inputs)
)
# Count matrix limited to module genes.
mod_inputs <- want_inputs[all_genes, ]
```
get normalized counts
```{r}
# Normalized counts for the selected cohorts.
# get_norm_cts() is not defined in this file -- presumably it comes from
# separation_data_fxns.R (sourced above); see there for the normalization used.
norm_cts_all <- get_norm_cts(want_inputs, want_metrics)
```
data wrangling to get dfs that i can use for heatmaps, etc.
```{r}
# The three helpers below are not defined in this file (presumably from
# separation_data_fxns.R); the descriptions follow the author's own notes.
# transpose normalized counts
# (the result is later indexed by "Animal" and "Separation" columns, so the
# helper presumably also attaches sample metadata -- TODO confirm)
mod_inputs_meta <- transpose_norm_cts(norm_cts_all)
# normalize counts (z-score), restricted to the module genes in `all_genes`
norm_new <- zscore_norm_cts(mod_inputs_meta, all_genes)
# get group mean of z-scored counts
norm_new_summ_mtx <- avg_zscored_cts(norm_new)
```
first, let's check to see if any animals are outliers based on raw counts and z-scored counts \
look at histogram of z-scored counts per animal (and raw counts per animal if you want) \
animal 2599 appears to be an outlier based on the distribution of z-scored counts (a very wide distribution compared to the others)
```{r}
# z-scored counts: draw one histogram per animal on shared axes so the widths
# of the distributions can be compared by eye
for (ani in norm_new$Animal) {
  print(ani)
  ani_df <- filter(norm_new, Animal == ani)
  # drop the first three columns (presumably the metadata columns -- confirm
  # against zscore_norm_cts()) and transpose so values run down the rows
  ani_cts <- t(ani_df[, -c(1:3)])
  hist(
    ani_cts[, 1],
    xlim = c(-4, 4),
    ylim = c(0, 800),
    main = paste0(ani, " ", ani_df$Cohort)
  )
}
# #for raw counts
# for (ani in colnames(mod_inputs)) {
# ani_df <- mod_inputs %>% select(ani)
# ani_cts <- ani_df[1]
# hist(ani_cts[,1], xlim = c(0, 200000), main = ani)
# }
```
animals with medium partner preference scores: "Counts2599I", "Counts2887P", "Counts2758P", "Counts2647I" \
excluding animal 2599
```{r}
# Drop animal 2599 from both the count matrix and the metadata.
# Other candidates noted by the author (not excluded here):
#   columns "Counts2887I", "Counts2758I", "Counts2647I"
#   animals "2887", "2758", "2647"
want_inputs_excl <- select(want_inputs, -c("Counts2599I"))
want_metrics_excl <- filter(want_metrics, !Animal %in% c("2599"))

# Repeat the normalization / wrangling steps on the reduced data set.
norm_cts_all_excl <- get_norm_cts(want_inputs_excl, want_metrics_excl)
mod_inputs_meta_excl <- transpose_norm_cts(norm_cts_all_excl)
# z-score the normalized counts
norm_new_excl <- zscore_norm_cts(mod_inputs_meta_excl, all_genes)
norm_new_summ_mtx_excl <- avg_zscored_cts(norm_new_excl)
```
```{r, fig.width = 7, fig.height = 22}
# Draw a heatmap for each requested module (save = FALSE is passed, so
# presumably nothing is written to disk -- see make_heatmap()).
# `mods` lists the modules to plot; all modules would be 1:23.
mods <- 6
for (m in mods) {
  make_heatmap(
    norm_new_summ_mtx_excl,
    mod = m,
    save = FALSE
  )
}
```
extract the clustering from the heatmap and make ridgeline plots of z-scored expression for each Module-6 gene
```{r, fig.width = 7, fig.height = 22}
# Ridgeline plots of z-scored expression for each requested module.
# NOTE(review): `norm_new` and `metrics` are the versions that still include
# animal 2599, while the other two inputs are the `_excl` versions -- confirm
# against make_ridgelines() whether `norm_new_excl` / `want_metrics_excl` were
# intended here.
mods <- 6 # modules to make ridgeline plots for (all modules would be 1:23)
for (m in mods) {
make_ridgelines(norm_new_summ_mtx_excl, norm_new, norm_cts_all_excl, metrics, mod = m, save = FALSE)
}
```
this block is to look at the difference between TRUE distribution of gene change scores (per module) vs. the shuffled distribution (animal IDs shuffled). \
first, i calculated the median for each gene in "paired" and "separated" animals, then subtracted these values to find a change score \
then, i shuffled animal ID to create a null distribution of change scores (should be centered around zero) \
then, i used an F-test (`var.test`) to compare the variances of the true and shuffled change-score distributions \
if the module is truly "socially sensitive", the variance of the TRUE change scores should be greater than the variance of the shuffled change scores \
here i have combined OS and SS to JUST look at paired vs. isolated
```{r, warning = FALSE, eval = FALSE}
# Per-gene change scores: median z-score in "Isolated" animals minus median
# z-score in "Paired" animals, with module membership attached.
#
#   meta_df      : data frame with "Animal", "Separation" and one column per
#                  gene (here: the output of transpose_norm_cts())
#   genes        : character vector naming the gene columns to use
#   gene_modules : data frame with (at least) "Gene" and "Module" columns
#
# Returns a data frame with Gene, one median column per Separation level,
# IsoMinPair, and the remaining columns of `gene_modules`.
#
# The true and the shuffled data both go through this one function, so the two
# sets of change scores are always computed the same way and always from their
# own medians.
calc_change_scores <- function(meta_df, genes, gene_modules) {
  scaled <- meta_df %>% select(all_of(c("Animal", "Separation", genes)))
  # z-score each gene across animals
  scaled[genes] <- scale(scaled[genes])
  # median of every numeric column within each separation condition
  group_medians <- scaled %>%
    group_by(Separation) %>%
    dplyr::summarise_if(is.numeric, median)
  group_names <- as.character(group_medians$Separation)
  stopifnot(all(c("Isolated", "Paired") %in% group_names))
  # genes in rows, separation conditions in columns
  scores <- as.data.frame(t(group_medians[, -1]))
  colnames(scores) <- group_names
  scores$IsoMinPair <- scores$Isolated - scores$Paired
  scores$Gene <- rownames(scores)
  # merge() sorts by Gene; any non-gene row is dropped here because it has no
  # match in `gene_modules`
  merge(scores, gene_modules, by = "Gene")
}

## non-shuffled change scores - only looking at separation condition
combine_OSSS_mat <- calc_change_scores(mod_inputs_meta_excl, all_genes, mod_genes)

# we need to iterate through seeds for the shuffle because there aren't that
# many animals, so the seed can make a big difference
seeds <- 1:100
modules <- 1:23
seed_results <- vector("list", length(seeds))
for (i in seq_along(seeds)) {
  s <- seeds[i]
  print(s)
  set.seed(s)
  # shuffle animal IDs, then do the same data manipulation as above
  norm_cts_shuff <- norm_cts_all_excl
  colnames(norm_cts_shuff) <- sample(colnames(norm_cts_shuff))
  mod_inputs_meta_shuff <- transpose_norm_cts(norm_cts_shuff)
  # The shuffled change score uses the SHUFFLED Paired medians. Previously the
  # true (and already merged / re-sorted) Paired column was subtracted, which
  # made the null scores neither shuffled nor aligned by gene.
  combine_OSSS_mat_shuff <- calc_change_scores(mod_inputs_meta_shuff, all_genes, mod_genes)
  # one F-test (true vs. shuffled variance) per module for this seed; over all
  # seeds this gives 100 p-values per module
  mod_results <- lapply(modules, function(mod) {
    mod_df <- combine_OSSS_mat %>% filter(Module == mod)
    mod_df_shuff <- combine_OSSS_mat_shuff %>% filter(Module == mod)
    all_var_test <- var.test(mod_df$IsoMinPair, mod_df_shuff$IsoMinPair)
    ## you can plot as a split violin if you want, but probably don't want to make 2300 plots
    # vioplot(IsoMinPair~Module, data=mod_df, col = "palevioletred", plotCentre = "line", side = "left", ylim = c(-3, 3))
    # vioplot(IsoMinPair~Module, data=mod_df_shuff, col = "lightblue", plotCentre = "line", side = "right", ylim = c(-3, 3), add = T, areaEqual = F)
    data.frame(
      Module = mod,
      seed = s,
      pval = all_var_test$p.value,
      Fstat = unname(all_var_test$statistic)
    )
  })
  seed_results[[i]] <- bind_rows(mod_results)
}
# data frame of F-test results: one row per (seed, module), with columns
# Module, seed, pval, Fstat (row names are now plain 1..n).
# NOTE: results differ from CSVs generated before the shuffled-score fix, so
# the saved output needs to be regenerated.
hold_comb_values <- bind_rows(seed_results)
```
SAVE csv
```{r, eval = FALSE}
# Write the per-seed F-test results into output/. The path is built with
# file.path() instead of calling setwd(): knitr restores the working directory
# after each chunk (with a warning), and changing it makes the other relative
# paths in this document fragile.
write.csv(
  hold_comb_values,
  file.path("output", "shuffled_seed_module_combineOSSS_pvals_100iter_excl2599.csv")
)
```
read back in csv
```{r}
# Load the saved F-test results (the chunk that computes them has eval = FALSE,
# so the knitted document relies on this file). Columns used below: Module,
# pval, Fstat.
hold_comb_values <- read.csv("output/shuffled_seed_module_combineOSSS_pvals_100iter_excl2599.csv")
```
get summary stats from these tests
```{r}
# Per-module summary across all shuffle seeds: mean and median p-value and
# mean F statistic.
iter_summ_comb <- hold_comb_values %>%
  group_by(Module) %>%
  summarise(
    mean_pval = mean(pval),
    mean_F = mean(Fstat),
    median_pval = median(pval)
  )
```
plot the data! \
the first plot is a bubble plot where point size is the % of iterations in which the log10(p-value) is under the cutoff (-5), and color is the absolute mean log10(p-value) \
second plot is a boxplot of the log10(p-values) for each module
```{r}
hold_comb_values$log10pval <- log10(hold_comb_values$pval)
# Per module: percent of iterations with log10(p-val) < -5, plus the mean and
# median log10(p-val). The percentage is computed as 100 * mean(...) so it stays
# a true percentage whatever the number of iterations (the previous sum() was a
# count, equal to the percentage only for exactly 100 iterations).
iter_summ_comb_cutoff <- hold_comb_values %>%
  group_by(Module) %>%
  summarise(
    perc_iters = 100 * mean(log10pval < -5),
    mean_pval = mean(log10pval),
    median_pval = median(log10pval)
  )
# set up color palette for plotting mean p-values
myPalette <- colorRampPalette(c("whitesmoke", "#67001F"))(200) #, "#f0f921"
# bubble plot: one point per module, size = % of iterations under the cutoff,
# fill = |mean log10(p-val)|
p <- ggplot(iter_summ_comb_cutoff, aes(x = as.factor(Module), y = 1, fill = abs(mean_pval))) +
  geom_point(aes(size = perc_iters), alpha = 1, shape = 21) +
  scale_fill_gradientn(
    colors = myPalette,
    limits = c(0, max(abs(iter_summ_comb_cutoff$mean_pval))),
    guide = guide_colorbar(frame.colour = "black", ticks.colour = "black")
  ) +
  # scale_size_continuous(limits = c(0, min(iter_summ_comb_cutoff$mean_pval))) +
  # scale_fill_manual(values = clust_cols, guide = FALSE) +
  # scale_color_manual(values = clust_cols, guide = FALSE) +
  scale_y_discrete(labels = 1:23) +
  theme(panel.background = element_blank(), panel.border = element_rect(colour = "grey70", fill = NA, size = 1))
print(p)
# boxplot of log10(p-val) per module; the dashed red line marks the -5 cutoff
p2 <- ggplot(hold_comb_values, aes(x = as.factor(Module), y = log10(pval))) +
  geom_boxplot() +
  geom_hline(yintercept = -5, color = "red", linetype = "dashed") +
  theme_classic()
print(p2)
# save both plots into output/ (explicit paths instead of setwd(), which knitr
# resets after the chunk and which breaks the other relative paths)
ggsave(file.path("output", "bubbleplot_combineSSOS_pvals_cutoff_Ftests_excl2599.pdf"), p)
ggsave(file.path("output", "boxplot_combineSSOS_pvals_Ftests_excl2599.pdf"), p2)
```
heatmap clustering animals by separation condition
```{r, fig.height = 8, fig.width = 10}
# For each requested module: heatmap of per-animal z-scored expression
# (animals in rows, module genes in columns), annotated by separation
# condition. The heatmap is drawn in the document and then saved to output/
# with the same row dendrogram cut into 4 groups.
mods <- 6
for (mod in mods) {
  # genes of this module that are present in the normalized counts
  m_genes <- mod_genes$Gene[mod_genes$Module == mod]
  m_genes <- intersect(m_genes, rownames(norm_cts_all_excl))
  # all_of() makes it explicit that m_genes is an external character vector
  norm_cts_mod <- norm_new_excl %>% select(Animal, Cohort, Separation, all_of(m_genes))
  # Build the numeric matrix once and label rows by animal. Setting row names
  # on the matrix (rather than on norm_cts_mod) works whether norm_new_excl is
  # a data.frame or a tibble; drop = FALSE keeps a matrix for a 1-gene module.
  mod_mat <- as.matrix(norm_cts_mod[, m_genes, drop = FALSE])
  rownames(mod_mat) <- as.character(norm_cts_mod$Animal)
  # diverging palette centered on z = 0
  col_fun <- colorRamp2(c(min(mod_mat), 0, max(mod_mat)), c("#053061", "white", "#67001F"))
  # row_ha <- rowAnnotation(Cohort = norm_cts_mod$Cohort, col = list(Cohort = c("OS48P" = "deeppink3", "SS48P" = "darkgreen", "OS4I" = "lightpink", "SS4I" = "lightgreen")))
  row_ha <- rowAnnotation(Separation = norm_cts_mod$Separation, col = list(Separation = c("Paired" = "darkblue", "Isolated" = "lightblue")))
  ht <- Heatmap(mod_mat, right_annotation = row_ha, column_title = paste0("Module-", mod), col = col_fun) #km = 4,
  draw(ht)
  # get out dendrogram info
  og_dend <- row_dend(ht)
  # Save the heatmap. The path is explicit (no setwd()), and the pdf device is
  # closed even if drawing fails so no device is left open.
  pdf(file.path("output", paste0("module_", mod, "_heatmap_paired_isolated_excl2599.pdf")))
  tryCatch(
    draw(Heatmap(mod_mat, right_annotation = row_ha, cluster_rows = og_dend, split = 4, column_title = paste0("Module-", mod), col = col_fun)), #km = 4,
    finally = dev.off()
  )
}
```
```{r}
# Record the R version and attached package versions in the knitted output for
# reproducibility.
sessionInfo()
```