-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGene_ontology.r
106 lines (91 loc) · 6.47 KB
/
Gene_ontology.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Author: Iris Sammarco
# Date: 20/02/2021
# Description: performs Gene Ontology (GO) enrichment analysis using custom annotation files and plot the results
library(clusterProfiler)
library(ggplot2)
setwd("") #set working directory
#Import the custom GO files
#The files look like this:
#GO:0055114 FvH4_1g00020.t10 Biological Process:oxidation-reduction process
#GO:0055114 FvH4_1g00020.t11 Biological Process:oxidation-reduction process
#GO:0055114 FvH4_1g00020.t12 Biological Process:oxidation-reduction process
Fragaria_BP <- read.table("BP_genes2go.txt", sep = "\t", header = FALSE) #this is your custom annotation file containing all the GO terms for your species in a 3 tabs separated file: 1 column with the GO terms, 2nd with the Gene ID and 3rd with the GO term. I would reccommend to make 3 files, 1 for each GO category (BP, CC, MF), since it seems that with custom annotation it can't distinguish them and the results may be biased
Fragaria_CC <- read.table("CC_genes2go.txt", sep = "\t", header = FALSE) #same but for cellular component
Fragaria_MF <- read.table("MF_genes2go.txt", sep = "\t", header = FALSE) #molecular function
#Extract the TERM2GENE and TERM2NAME info (you'll need it later, for the enricher() function)
term2gene_BP <- Fragaria_BP[,c(1,2)] #extract first 2 columns (GO term and gene ID)
term2name_BP <- Fragaria_BP[,c(1,3)] #extract Go term and function
term2gene_CC <- Fragaria_CC[,c(1,2)] #extract first 2 columns (GO term and gene ID)
term2name_CC <- Fragaria_CC[,c(1,3)] #extract Go term and function
term2gene_MF <- Fragaria_MF[,c(1,2)] #extract first 2 columns (GO term and gene ID)
term2name_MF <- Fragaria_MF[,c(1,3)] #extract Go term and function
#Import the DMR-related genes: extract the gene ID from your annotated unionbed files. The file looks like this:
#FvH4_1g00010.t1
#FvH4_1g00010.t1
#FvH4_1g00010.t1
#FvH4_1g00010.t1
#I have 2 conditions (FF is field and GG is garden):
CpG_FF_genes <- read.table("CpG_FF_gene_t.txt", header = FALSE)
CpG_FF_genes <- as.vector(CpG_FF_genes[,1])
CpG_GG_genes <- read.table("CpG_GG_gene_t.txt", header = FALSE)
CpG_GG_genes <- as.vector(CpG_GG_genes[,1])
intersect(term2gene_BP$V2, CpG_FF_genes) #try to see if your dmr-related genes actually intersect with the universe of your species genes
#### CpG ####
#BP
CpG_FF_genes_GO_BP <- enricher(gene= CpG_FF_genes, pvalueCutoff = 0.05, pAdjustMethod = "fdr", TERM2GENE= term2gene_BP, TERM2NAME = term2name_BP) #run the enrichment function
head(CpG_FF_genes_GO_BP)
summary(head(CpG_FF_genes_GO_BP))
#dotplot(CpG_FF_genes_GO_BP, showCategory=30)
#barplot(CpG_FF_genes_GO_BP)
df_CpG_FF_genes_GO_BP <- CpG_FF_genes_GO_BP[,c(2,3,6,9)] # extract the columns of interest (e.g. function, adjusted p-value, gene count, gene ratio ..)
write.csv(df_CpG_FF_genes_GO_BP, 'CpG_FF_genes_GO_BP.csv', col.names = TRUE) #write them down to a file; I then checked them manually and selected for example only the top 10 categories (ordered by gene ratio)
#same for garden:
CpG_GG_genes_GO_BP <- enricher(gene= CpG_GG_genes, pvalueCutoff = 0.05, pAdjustMethod = "fdr", TERM2GENE= term2gene_BP, TERM2NAME = term2name_BP)
head(CpG_GG_genes_GO_BP)
df_CpG_GG_genes_GO_BP <- CpG_GG_genes_GO_BP[,c(2,3,6,9)] #function, p-value, count
write.csv(df_CpG_GG_genes_GO_BP, 'CpG_GG_genes_GO_BP.csv', col.names = TRUE)
#CC
CpG_FF_genes_GO_CC <- enricher(gene= CpG_FF_genes, pvalueCutoff = 0.05, pAdjustMethod = "fdr", TERM2GENE= term2gene_CC, TERM2NAME = term2name_CC)
head(CpG_FF_genes_GO_CC)
df_CpG_FF_genes_GO_CC <- CpG_FF_genes_GO_CC[,c(2,3,6,9)] #function, p-value, count
write.csv(df_CpG_FF_genes_GO_CC, 'CpG_FF_genes_GO_CC.csv', col.names = TRUE)
CpG_GG_genes_GO_CC <- enricher(gene= CpG_GG_genes, pvalueCutoff = 0.05, pAdjustMethod = "fdr", TERM2GENE= term2gene_CC, TERM2NAME = term2name_CC)
head(CpG_GG_genes_GO_CC)
df_CpG_GG_genes_GO_CC <- CpG_GG_genes_GO_CC[,c(2,3,6,9)] #function, p-value, count
write.csv(df_CpG_GG_genes_GO_CC, 'CpG_GG_genes_GO_CC.csv', col.names = TRUE)
#MF
CpG_FF_genes_GO_MF <- enricher(gene= CpG_FF_genes, pvalueCutoff = 0.05, pAdjustMethod = "fdr", TERM2GENE= term2gene_MF, TERM2NAME = term2name_MF)
head(CpG_FF_genes_GO_MF)
summary(head(CpG_FF_genes_GO_MF))
df_CpG_FF_genes_GO_MF <- CpG_FF_genes_GO_MF[,c(2,3,6,9)] #function, p-value, count
write.csv(df_CpG_FF_genes_GO_MF, 'CpG_FF_genes_GO_MF.csv', col.names = TRUE)
CpG_GG_genes_GO_MF <- enricher(gene= CpG_GG_genes, pvalueCutoff = 0.05, pAdjustMethod = "fdr", TERM2GENE= term2gene_MF, TERM2NAME = term2name_MF)
head(CpG_GG_genes_GO_MF)
df_CpG_GG_genes_GO_MF <- CpG_GG_genes_GO_MF[,c(2,3,6,9)] #function, p-value, count
write.csv(df_CpG_GG_genes_GO_MF, 'CpG_GG_genes_GO_MF.csv', col.names = TRUE)
#plot: barplot with gene ratio (or gene count or whatever you want) and GO terms; you can plot the BP, CC and MF together if you want, or separate them in different plots. Since I had many enriched terms, I decided to plot only the BP terms. You can add other columns of course, here at the end I'm using only GeneRatio% (it's the GeneRatio*100, Condition (Field or Garden) and if you put together the 3 GO catgories you'll use the GO_category probably):
#Read the file, it looks like this:
# Description pvalue Count Condition GeneRatio GeneRatio% GO_category
#GO:0007165 signal transduction 8.79E-04 148 Field 0.029892951 2.9892951 Biological Process
#GO:0006281 DNA repair 2.09E-05 80 Field 0.016158352 1.6158352 Biological Process
#GO:0007165 signal transduction 7.99E-06 150 Garden 0.031152648 3.1152648 Biological Process
#GO:0006281 DNA repair 2.78E-06 80 Garden 0.016614746 1.6614746 Biological Process
CpG_top10 = read.csv2('CpG_GO_top10_GeneRatio.csv', sep= ',', header = TRUE)
CpG_top10$Description <- as.factor(CpG_top10$Description)
CpG_top10$Condition <- factor(CpG_top10$Condition, levels = c('Field', 'Garden'), ordered = TRUE)
CpG_top10$GO_category <- as.character(CpG_top10$GO_category)
CpG_top10$GeneRatio. <- as.numeric(CpG_top10$GeneRatio.)
darkcyan_t <- adjustcolor( "darkcyan", alpha.f = 0.4)
png("CpG_GO_top10.png", width = 600, height = 600)
ggplot(data=CpG_top10, aes(x = Description)) +
geom_bar(stat="identity", aes(y = GeneRatio., fill = Condition)) +
xlab("") +
ylab("Gene ratio") +
scale_fill_manual("Condition", values = c("darkcyan", "#FF8C00")) +
ggtitle("CpG, GO enrichment") +
coord_flip() +
theme(axis.title.x = element_text(size = 17), axis.text.x = element_text(size = 14)) +
theme(axis.text.y = element_text(size = 16)) +
theme(legend.title = element_blank(), legend.text = element_text(size = 16)) +
theme(plot.title = element_text(size=20))
dev.off()