-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathenrichmentAnalysis.py
153 lines (116 loc) · 5.86 KB
/
enrichmentAnalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 17 13:49:08 2023
@author: Ozan Ozisik
"""
from gprofiler import GProfiler
import os
def performEnrichmentAnalysis(geneGroupsDict, enrichmentFolder, summaryFolder, organism='hsapiens', sources=['GO:BP', 'GO:CC'], gProfiler_base_url = 'https://biit.cs.ut.ee/gprofiler', createOrsumInput=True):
'''
Parameters
----------
geneGroupsDict : dict
Dictionary mapping gene group names to gene lists.
enrichmentFolder : string
Enrichment folder path.
summaryFolder : string
Summary folder path, this will be used for orsum.
organism : string, optional
Organism name for g:Profiler. The default is 'hsapiens'.
sources : list, optional
List of source names for g:Profiler. The default is ['GO:BP', 'GO:CC'].
gProfiler_base_url : string, optional
The URL for the g:Profiler request. Should be changed if an older g:Profiler version is used. The default is the most recent version.
createOrsumInput : bool, optional
Whether enrichment ID files to use with orsum will be created. The default is True.
Returns
-------
None.
'''
if createOrsumInput:
if not os.path.exists(summaryFolder):
os.makedirs(summaryFolder)
print('Enrichment summary folder \"'+summaryFolder+'\" is created.')
gp = GProfiler(return_dataframe=True, base_url=gProfiler_base_url)
dfEnrichmentAll=gp.profile(organism=organism,
sources=sources,
query=geneGroupsDict,
no_evidences=False,#adds the intersecions and evidences columns
all_results=True
)
dfEnrichmentAll.to_csv(enrichmentFolder+os.sep+'enrichmentAllIncludingInsignificant.csv', sep='\t', index=False)
dfEnrichmentSignificant=dfEnrichmentAll[dfEnrichmentAll['significant']==True]
for group in geneGroupsDict.keys():
dfEnrichmentGroup=dfEnrichmentSignificant[dfEnrichmentSignificant['query']==group]
dfEnrichmentGroup=dfEnrichmentGroup.sort_values(by=['source', 'p_value'], ascending=[True, True])
dfEnrichmentGroup.to_csv(enrichmentFolder+os.sep+'enrichment'+ group +'.csv', sep='\t', index=False)
for source in sources:
dfEnrichmentGroupSource=dfEnrichmentGroup[dfEnrichmentGroup['source']==source]
dfEnrichmentGroupSource=dfEnrichmentGroupSource.sort_values(by='p_value', ascending=True)
dfEnrichmentGroupSource.to_csv(enrichmentFolder+os.sep+'enrichment'+ group + '-'+ source.replace(':','')+'.csv', sep='\t', index=False)
if createOrsumInput:
dfEnrichmentGroupSource['native'].to_csv(summaryFolder+os.sep+'enrichmentIDs'+ group + '-'+ source.replace(':','')+'.csv', index=False, header=None)
def applyOrsum(gmt, source, geneGroupsDict, summaryFolder, outputFolder=None, maxRepSize=None, maxTermSize=None, minTermSize=None, numberOfTermsToPlot=None):
'''
Parameters
----------
gmt : string
GMT file path.
source : string
Source name for g:Profiler, e.g. GO:CC.
geneGroupsDict : dict
Dictionary mapping gene group names to gene lists.
summaryFolder : string
Summary folder path, contains enrichment ID files for orsum.
outputFolder : string, optional
Output folder for orsum, will be created under summary folder. See orsum documentation.
maxRepSize : integer, optional
See orsum documentation. The default is None, orsum default will be used.
maxTermSize : integer, optional
See orsum documentation. The default is None, orsum default will be used.
minTermSize : integer, optional
See orsum documentation. The default is None, orsum default will be used.
numberOfTermsToPlot : integer, optional
See orsum documentation. The default is None, orsum default will be used.
Returns
-------
None.
'''
noEnrichmentGroup=set()
command='orsum.py '
command=command+'--gmt \"'+gmt+'\" '
command=command+'--files '
for group in geneGroupsDict.keys():
filePath=summaryFolder+os.sep+'enrichmentIDs'+ group + '-'+ source.replace(':','')+'.csv'
if os.stat(filePath).st_size == 0: # in a previous version orsum used to exit when one enrichment file was empty, now it is fixed but we can keep this code.
noEnrichmentGroup.add(group)
else:
command=command+'\"'+filePath+'\" '
command=command+'--fileAliases '
for group in geneGroupsDict.keys():
if group not in noEnrichmentGroup:
command=command+'\"'+group+'\" '
if outputFolder is not None:
command=command+'--outputFolder \"'+summaryFolder+os.sep+outputFolder+'\" '
if maxRepSize is not None:
command=command+'--maxRepSize ' + str(maxRepSize) + ' '
if maxTermSize is not None:
command=command+'--maxTermSize ' + str(maxTermSize) + ' '
if minTermSize is not None:
command=command+'--minTermSize ' + str(minTermSize) + ' '
if numberOfTermsToPlot is not None:
command=command+'--numberOfTermsToPlot '+str(numberOfTermsToPlot)
print('orsum command:')
print(command)
os.system(command)
'''
## This is an example command that the code above creates
os.system('orsum.py --gmt "Data/Other/hsapiens.GO:CC.name.gmt" --files '+
'"Results/Enrichment/Summary/enrichmentIDsDoGs-GO:CC.csv" '+
'"Results/Enrichment/Summary/enrichmentIDsCoGs-GO:CC.csv" '+
'"Results/Enrichment/Summary/enrichmentIDsPoGs-GO:CC.csv" '+
'--fileAliases DoGs CoGs PoGs '+
'--outputFolder "Results/Enrichment/Summary/orsumGOCC" '+
''--numberOfTermsToPlot 50)
'''