-
Notifications
You must be signed in to change notification settings - Fork 5
/
2_Co-expression_network_characterisation.Rmd
executable file
·271 lines (195 loc) · 12 KB
/
2_Co-expression_network_characterisation.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
## Co-expression network characterisation (Fig. 1e)
### Core- and tissue-specific co-expression
We characterised our tissue-specific co-expression networks based on GTEx. Our hypothesis is that genes that are highly co-expressed across all tissues are likely required for cellular development and survival, and should therefore show a strong correlation with essentiality. In this analysis, we downloaded the list of human essential genes from the [OGEE database (v2)](http://ogee.medgenius.info), also included in `/data/OGEE_esential_genes_20190416.txt`.
```{r}
## Co-expression: edges shared between tissues
## Goal: check whether edges shared among co-expression networks involve essential genes
# 0 - attach the packages and read the cached, grouped edge counts
library(pacman)
p_load(tidyverse, cowplot, knitr)
coex_el_sum_grouped <- readRDS("../cache/coexpression_edge_counts_by_group.RDS")

# Named vector holding the total edge count (sum of `n`) for every
# essential_edge_score class; the names are the score values.
coex_el_sum_score <- coex_el_sum_grouped %>%
  ungroup() %>%
  group_by(essential_edge_score) %>%
  summarise(count = sum(n)) %>%
  pull(count, name = essential_edge_score)

# Table of edge counts per tissue bin, with each bin's share (in percent)
# of all edges counted in coex_el_sum_score.
coex_el_sum_grouped %>%
  dplyr::rename(n_tissues = n_binned_relabel) %>%
  group_by(n_tissues) %>%
  summarise(
    n_edges = sum(n),
    percent = n_edges * 100 / sum(coex_el_sum_score)
  ) %>%
  kable()
```
The plot below shows higher essentiality as the number of tissues in which an edge is found increases.
```{r bar plot essentiality per tissue pinning}
# Bar chart of edge counts per tissue bin, normalised to proportions within
# each bin (position = "fill") and coloured by the `score` column.
bar_essential <- ggplot(
  coex_el_sum_grouped,
  aes(x = n_binned_relabel, y = n, fill = score)
) +
  geom_bar(stat = "identity", position = "fill") +
  labs(x = "Number of tissues", y = "Edge proportion") +
  theme_cowplot() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(size = 10)
  ) +
  guides(fill = guide_legend(title = "Essential gene in edge")) +
  scale_fill_manual(values = c('grey60','#9ecae1','#3182bd'))
bar_essential
#ggsave("./Figs/essentiality_co-expression.pdf", bar_essential, width = 4, height = 4)
```
```{r}
# Read the cached raw edge table and drop any grouping stored with it.
coex_el_sum <- readRDS("../cache/coexpression_raw_edge_counts.RDS") %>%
  ungroup()
# Number of rows for every (n, essential_edge_score) combination, stored in
# a column called `count`.
# NOTE(review): `n` is presumably the number of tissues an edge occurs in - confirm.
coex_el_sum_by_tissue <- count(
  coex_el_sum,
  n, essential_edge_score,
  name = "count"
)
```
```{r edge count in absolute value, include=FALSE, eval=FALSE}
## Absolute number of edges in each tissue bin
# Same table as coex_el_sum_grouped with `n` renamed to `count`,
# grouped by tissue bin.
coex_bin_count <- coex_el_sum_grouped %>%
  dplyr::rename(count = n) %>%
  group_by(n_binned_relabel)

# Row counts of the raw edge table per (bin, essentiality score).
EE_count <- coex_el_sum %>%
  ungroup() %>%
  count(n_binned, essential_edge_score)

# Bar chart of the absolute counts per bin.
coex_bin_count_plot <- ggplot(coex_bin_count) +
  geom_bar(
    aes(x = n_binned_relabel, y = count),
    stat = "identity",
    alpha = 0.7
  ) +
  labs(x = "Tissues", y = "edge count") +
  theme_cowplot() #+ scale_y_log10()
coex_bin_count_plot
```
```{r assessment, include=FALSE, eval=FALSE}
#### Enrichment of essential-essential (EE) edges in each tissue bin
# NOTE(review): the original heading announced a hypergeometric test, but this
# section uses the binomial distribution throughout.

# EE edge count per bin (score == 2).
coex_bin_both_essential <- coex_el_sum_grouped %>%
  filter(essential_edge_score == 2) %>%
  select(n_binned_relabel, n)
# Count of all remaining edges per bin (score < 2).
coex_bin_not_essential <- coex_el_sum_grouped %>%
  filter(essential_edge_score < 2) %>%
  group_by(n_binned_relabel) %>%
  summarise(n_non = sum(n))
# Count of edges with at least one essential gene per bin (score > 0).
coex_bin_any_essential <- coex_el_sum_grouped %>%
  filter(essential_edge_score > 0) %>%
  group_by(n_binned_relabel) %>%
  summarise(n_one = sum(n))

# Combine by bin key. `n_one` used to be attached by row position, which
# silently misaligns (or fails) whenever the bins of the two tables are not
# in exactly the same order.
coex_bin_pval <- coex_bin_both_essential %>%
  inner_join(coex_bin_not_essential, by = "n_binned_relabel") %>%
  left_join(coex_bin_any_essential, by = "n_binned_relabel")
#all_genes = unique(c(levels(coex_el_sum$A), levels(coex_el_sum$B)))
#n_essential = sum(all_genes %in% essential_genes_sym)

# use binomial distribution to estimate the significance
# Background proportions are taken over all rows of the raw edge table.
allessentialpairs <- coex_el_sum %>% filter(essential_edge_score == 2) %>% nrow()
oneessentialpairs <- coex_el_sum %>% filter(essential_edge_score == 1) %>% nrow()
allpairs <- nrow(coex_el_sum)
p_essential <- allessentialpairs / allpairs
p_oneessential <- (oneessentialpairs + allessentialpairs) / allpairs

coex_bin_pval <- coex_bin_pval %>%
  rowwise() %>%
  mutate(
    # Upper tail including the observed count: P(X >= n) = P(X > n - 1).
    # The value is on the natural-log scale because log.p = TRUE.
    pval = pbinom(n - 1, n + n_non, p_essential, lower.tail = FALSE, log.p = TRUE),
    # Share of EE edges in the bin and its ratio to the background share.
    # NOTE(review): `oddratio` is a ratio of proportions, not an odds ratio.
    ratio = n / (n + n_non),
    oddratio = ratio / p_essential,
    # Same quantities for edges with at least one essential gene.
    ratio_one = n_one / (n + n_non),
    oddratio_one = ratio_one / p_oneessential
  )
ggplot(coex_bin_pval %>% ungroup) + geom_point(aes(x = n_binned_relabel, y = oddratio))
## roc
# Order edges by decreasing `n` (ties: higher essentiality score first),
# store the resulting rank and flag the essentiality status of every edge.
coex_el_sum <- coex_el_sum %>% arrange(-n, -essential_edge_score)
coex_el_sum$rank <- seq_len(nrow(coex_el_sum))
coex_el_sum <- coex_el_sum %>%
  mutate(bothEssential = essential_edge_score == 2,
         atLeastOneEssential = essential_edge_score > 0)
#coex_el_sum %>% bothEssentialCumSum = cumsum(coex_el_sum$bothEssential)
library(pROC)

# Per-bin cumulative TPR/FPR. This table is kept under its own name: it used
# to be overwritten by the global table below, whose select() drops n_binned,
# so the per-bin sampling and the per-bin plot could never run.
coex_el_roc_by_bin <- coex_el_sum %>%
  group_by(n_binned) %>%
  arrange(-n) %>%
  mutate(EdgeRank = seq_len(n()),
         EdgeRankProp = EdgeRank / n(),
         bothEssentialCumSum = cumsum(bothEssential),
         # TrueProb = (bothEssentialCumSum)/sum(),
         FPR = (EdgeRank - bothEssentialCumSum) / (max(EdgeRank) - max(bothEssentialCumSum)),
         TPR = bothEssentialCumSum / max(bothEssentialCumSum)) %>%
  select(n_binned, EdgeRank, EdgeRankProp, FPR, TPR) #%>% filter(!duplicated(n))

# Global cumulative TPR/FPR over all edges.
coex_el_roc <- coex_el_sum %>%
  ungroup() %>%
  arrange(-n) %>%
  mutate(EdgeRank = seq_len(n()),
         EdgeRankProp = EdgeRank / n(),
         bothEssentialCumSum = cumsum(bothEssential),
         # TrueProb = (bothEssentialCumSum)/sum(),
         FPR = (EdgeRank - bothEssentialCumSum) / (max(EdgeRank) - max(bothEssentialCumSum)),
         TPR = bothEssentialCumSum / max(bothEssentialCumSum)) %>%
  select(EdgeRank, EdgeRankProp, FPR, TPR) #%>% filter(!duplicated(n))

# Thin both tables before plotting. slice_sample() keeps whole groups that
# hold fewer than 1000 rows instead of failing on them.
coex_el_roc_by_bin_sampled <- coex_el_roc_by_bin %>% dplyr::slice_sample(n = 1000)
coex_el_roc_sampled <- coex_el_roc %>% dplyr::sample_frac(0.0005)

#roc1 = ggplot(coex_el_roc, aes(factor(n, levels = rev(1:53), ordered = T), maxval)) + geom_point() + theme_cowplot()
# NOTE(review): both plots map FPR to x and TPR to y, but the axis labels still
# read "number of tissues found" / "Pboth edges are essential"; they look like
# leftovers from the commented-out plot above - confirm and relabel.
roc_by_bin <- ggplot(coex_el_roc_by_bin_sampled, aes(FPR, TPR)) +
  geom_line(aes(col = factor(n_binned, ordered = TRUE))) +
  theme_cowplot() +
  xlab("number of tissues found") + ylab("Pboth edges are essential")
roc1 <- ggplot(coex_el_roc_sampled, aes(FPR, TPR)) +
  geom_line() +
  theme_cowplot() +
  xlab("number of tissues found") + ylab("Pboth edges are essential")
# Only the global curve is written to disk, as before.
ggsave("./analysis/network_characterisation/essentiality/ROC_essential.pdf", roc1)
###################
## essentiality and rare disease genes
# NOTE(review): `essential_genes_sym` is only created in the `assessment2`
# chunk further down this file, and `process_disease_genes_data()` is not
# defined in this file - both must be available before running this section.
rarediseasegenes <- process_disease_genes_data("./Disease_gene_assoc/Orphanet/output/table_disease_gene_assoc_orphanet.tsv", 20, 2000)
alldiseasegenes <- unique(unlist(rarediseasegenes$disgene_list))
table(alldiseasegenes %in% essential_genes_sym)

# Long table with one row per (disease, essential TRUE/FALSE) and the number
# of genes in that class. Built directly with tidyverse verbs: the previous
# sapply() + melt() route returned differently shaped objects depending on
# whether every disease had both classes, so the column names assigned
# afterwards could end up on the wrong columns.
# assumes disgene_list is a (named) list of gene-symbol vectors - TODO confirm
disease_gene_essential <- rarediseasegenes$disgene_list %>%
  enframe(name = "diseasename", value = "gene") %>%
  unnest(cols = gene) %>%
  mutate(essential = gene %in% essential_genes_sym) %>%
  count(diseasename, essential, name = "count") %>%
  select(essential, count, diseasename)

# Wide table with the proportion of essential genes per disease. A class that
# is absent for a disease is filled with 0, so such diseases are no longer
# dropped because of an NA sum.
disease_gene_essential_prop <- disease_gene_essential %>%
  pivot_wider(names_from = essential, values_from = count, values_fill = 0L) %>%
  mutate(sum = `FALSE` + `TRUE`,
         propTrue = `TRUE` / sum) %>%
  filter(sum > 20)

# NOTE(review): this reuses the name `bar_essential` and therefore replaces the
# Fig. 1e plot object created earlier in this file.
bar_essential <- ggplot(disease_gene_essential %>% filter(count > 20)) +
  geom_bar(aes(diseasename, count, fill = essential), stat = "identity") +
  scale_y_log10() +
  coord_flip() +
  theme_cowplot()
ggsave("./analysis/network_characterisation/essentiality/essential_by_disease.pdf", bar_essential)
#######################
# add the hypergeometric test to the results
# Total number of edges per bin.
coex_el_sum_stat <- coex_el_sum_grouped %>%
  group_by(n_binned) %>%
  summarise(n_edge = sum(n))
# map the number of essential-essential edges (column `n`) onto each bin
coex_el_sum_stat <- left_join(
  coex_el_sum_stat,
  coex_el_sum_grouped %>%
    dplyr::filter(score == 2) %>%
    ungroup() %>%
    dplyr::select(n_binned, n),
  by = "n_binned"
)

# Genes appearing at either end of an edge. unique(x, y) treats `y` as the
# `incomparables` argument, so the union has to be built explicitly.
all_genes <- union(as.character(coex_el_sum$A), as.character(coex_el_sum$B))
n_essential <- sum(all_genes %in% essential_genes_sym)
n_nonessential <- sum(!all_genes %in% essential_genes_sym)
# possibiility of two essential genes connecting to each other (EE), essential-nonessential (NE), nonessential-nonessential (NN)
#n_EE = n_essential * (n_essential-1) / 2
#n_NN = n_nonessential * (n_nonessential-1) / 2
#n_NE = n_essential * n_nonessential
# computed based on what in the data
n_EE <- sum(coex_el_sum$essential_edge_score == 2)
n_NN <- sum(coex_el_sum$essential_edge_score == 0)
n_NE <- sum(coex_el_sum$essential_edge_score == 1)

# Background probability that an edge is EE: EE edges over ALL edges
# (the previous denominator left out the EE edges themselves).
binom_p <- n_EE / (n_EE + n_NE + n_NN)

coex_el_sum_stat <- coex_el_sum_stat %>%
  # Bins without any EE edge come out of the left join as NA; they have 0.
  mutate(n = ifelse(is.na(n), 0, n)) %>%
  rowwise() %>%
  mutate(
    # Urn model: n_EE "white" edges, n_NN + n_NE "black" edges, n_edge edges
    # drawn for this bin. phyper(..., lower.tail = FALSE) returns P(X > q),
    # so q = n - 1 gives the enrichment p-value P(X >= n).
    hyp_pval = phyper(n - 1, n_EE, n_NN + n_NE, n_edge, lower.tail = FALSE, log.p = FALSE),
    # One-sided binomial test of the bin's EE share against the background.
    bin_pval = binom.test(n, n_edge, binom_p, alternative = "greater")$p.value
  )
```
```{r assessment2, include=FALSE, eval=FALSE}
# Feb 2021
# Binomial enrichment test for essential-essential (EE, score "2") edges in
# one tissue bin, using the EE share observed over all edges as the null
# probability.
#
# Reads two objects from the calling environment:
#   coex_el_sum_grouped - table with columns n_binned, score and n
#   coex_el_sum_score   - edge totals named by score (must contain "2")
#
# group:  value of `n_binned` identifying the bin to test.
# method: only "binomial" is implemented; anything else is rejected up front
#         instead of failing later with "object 'testresult' not found".
# Returns the `htest` object produced by binom.test().
pval_test_from_obs <- function(group, method = "binomial") {
  method <- match.arg(method, choices = "binomial")

  # Edge counts of the requested bin, named by score.
  in_group <- which(coex_el_sum_grouped$n_binned == group)
  counts <- setNames(
    coex_el_sum_grouped$n[in_group],
    as.character(coex_el_sum_grouped$score[in_group])
  )
  allcounts <- sum(counts)

  # A bin without a score-2 row has zero EE edges (previously NA, which made
  # binom.test() fail).
  n_EE <- if ("2" %in% names(counts)) counts[["2"]] else 0
  p_EE_expected <- coex_el_sum_score[["2"]] / sum(coex_el_sum_score)

  binom.test(n_EE, allcounts, p_EE_expected, alternative = "greater")
}
# Run the observed-proportion test once for every distinct tissue bin.
result_node_binom_from_obs <- sapply(
  unique(coex_el_sum_grouped$n_binned),
  function(bin) pval_test_from_obs(bin)
)
#=======================
#2 - incorporate essential genes data
source("../functions/fn_source.R")
essential_genes <- read_tsv("../data/OGEE_esential_genes_20190416.txt")
# Add gene symbols converted from the `locus` column (ENSEMBL -> SYMBOL).
essential_genes$geneSymbol <- IDconvert(
  essential_genes$locus,
  from = "ENSEMBL",
  to = "SYMBOL"
)
# Distinct symbols of the rows whose `essential` column equals "E".
essential_genes_sym <- essential_genes %>%
  filter(essential == "E") %>%
  pull(geneSymbol) %>%
  unique()
# new implementation
# Binomial enrichment test for essential-essential (EE) edges in one tissue
# bin, with the null probability derived from the bin's own nodes: the chance
# that both members of a random ordered pair of distinct nodes are essential,
# nE * (nE - 1) / (N * (N - 1)).
#
# Reads two objects from the calling environment:
#   coex_el_sum         - edge table with columns A, B, n_binned and
#                         essential_edge_score
#   essential_genes_sym - vector of essential gene symbols
#
# group:  value of `n_binned` identifying the bin to test.
# method: only "binomial" is implemented; anything else is rejected up front
#         instead of failing later on undefined variables.
# Returns a list: essential_nodes, all_nodes, followed by the elements of the
# binom.test() result (statistic, parameter, p.value, ...).
pval_test <- function(group, method = "binomial") {
  method <- match.arg(method, choices = "binomial")

  # which() ignores rows whose bin is NA instead of yielding all-NA rows.
  sub_edges <- coex_el_sum[which(coex_el_sum$n_binned == group), ]
  nALL <- nrow(sub_edges)
  if (nALL == 0) {
    stop("no edges found for group '", group, "'", call. = FALSE)
  }

  # Distinct genes at either end of the bin's edges.
  sub_nodes <- unique(c(as.character(sub_edges$A), as.character(sub_edges$B)))
  N <- length(sub_nodes)
  nE <- sum(sub_nodes %in% essential_genes_sym) # number of essential genes
  nEE <- sum(sub_edges$essential_edge_score == 2) # number of EE edges observed

  p_EE_expected <- (nE * (nE - 1)) / (N * (N - 1))
  testresult <- binom.test(nEE, nALL, p_EE_expected, alternative = "greater")

  c(essential_nodes = nE, all_nodes = N, testresult)
}
# Apply the node-derived edge test to every distinct tissue bin.
result <- sapply(
  unique(coex_el_sum_grouped$n_binned),
  function(bin) pval_test(bin)
)
#=========
# new implementation
# Node-level enrichment test: are essential genes over-represented among the
# genes that take part in the edges of one tissue bin?
#
# Reads two objects from the calling environment:
#   coex_el_sum         - edge table with columns A, B and n_binned
#   essential_genes_sym - vector of essential gene symbols
#
# group:  value of `n_binned` identifying the bin to test.
# method: "binomial"  - binom.test() against the essential share of the gene
#                       universe;
#         "hypergeom" - upper-tail hypergeometric probability P(X >= nE).
# Returns c(essential_nodes, all_nodes, <test result>): a list containing the
# binom.test() elements for "binomial", a numeric vector whose third element
# is the p-value for "hypergeom".
pval_test_node <- function(group, method = "binomial") {
  print(group)
  method <- match.arg(method, choices = c("binomial", "hypergeom"))

  # Gene universe: factor levels when A/B are factors (as before), otherwise
  # the distinct values, so character columns no longer give an empty universe.
  gene_universe <- function(x) {
    if (is.factor(x)) levels(x) else unique(as.character(x))
  }
  all_genes <- unique(c(gene_universe(coex_el_sum$A), gene_universe(coex_el_sum$B)))
  all_essential_genes <- sum(all_genes %in% essential_genes_sym)
  p_essential <- all_essential_genes / length(all_genes)

  # Distinct genes at either end of the bin's edges (shared by both methods).
  sub_edges <- coex_el_sum[which(coex_el_sum$n_binned == group), ]
  sub_nodes <- unique(c(as.character(sub_edges$A), as.character(sub_edges$B)))
  N <- length(sub_nodes)
  nE <- sum(sub_nodes %in% essential_genes_sym) # number of essential genes

  testresult <- switch(
    method,
    binomial = binom.test(nE, N, p_essential, alternative = "greater"),
    # phyper(..., lower.tail = FALSE) is P(X > q); q = nE - 1 includes the
    # observed count, matching the "greater" alternative of binom.test().
    hypergeom = phyper(
      q = nE - 1,
      m = all_essential_genes,
      n = length(all_genes) - all_essential_genes,
      k = N,
      lower.tail = FALSE
    )
  )

  c(essential_nodes = nE, all_nodes = N, testresult)
}
# Node-level enrichment for every distinct tissue bin, once with the default
# binomial method and once with the hypergeometric method.
tissue_bins <- unique(coex_el_sum_grouped$n_binned)
result_node_binom <- sapply(
  tissue_bins,
  function(bin) pval_test_node(bin)
)
result_node_phyper <- sapply(
  tissue_bins,
  function(bin) pval_test_node(bin, method = "hypergeom")
)
```