-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathresnik_isobase.py
196 lines (175 loc) · 7.93 KB
/
resnik_isobase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#!/usr/bin/env/ python
import sys
import fastsemsim
#sys.path.append('/extra/wayne0/preserve/resnik/fastsemsim-code/fastsemsim/')
try:
import fastsemsim.ontology.Ontology
import fastsemsim.ac.AnnotationCorpus
import fastsemsim.semsim.ObjSetSemSim
import fastsemsim.semsim as SemSim
except ImportError:
print('Fastsemsim is not installed. Please install the fastsemsim package from SourceForge.')
sys.exit(-1)
from collections import defaultdict
from decimal import Decimal
from subprocess import call
import sys
import os
import argparse
# List of constants we will be using
GENE_ONTOLOGY_FILE = "__gene_ontology_ext.obo"
ONTOLOGY_CONVERT = {"BP" : "Process", "MF" : "Function", "CC" : "Component"}
ONTOLOGY_ROOT = {"BP" : 8150, "MF" : 3674, "CC" : 5575}
TAXON_CODES = {"Athaliana","CElegans","DMelanogaster","HSapiens", "MMusculus", "RNorvegicus",
"SCerevisiae","SPombe"}
TAXON_FILES = {key : "taxons/" + key + ".txt"
for key in TAXON_CODES }
def geneList(fileName):
with open(fileName,'r') as f:
# Skips first 4 lines of a LEDA graph file
for i in range(4):
f.readline()
count = int(f.readline().strip())
return [f.readline().strip()[2:-2] for i in range(count)]
def MNE(geneListA, commonA, geneListB, commonB, ac):
count = dict()
total = 0
for geneA, geneB in zip(geneListA, geneListB):
total += 2
a = getGOterms(ac, commonA, geneA)
b = getGOterms(ac, commonB, geneB)
for termA in a:
try:
count[termA] += 1
except KeyError:
count[termA] = 1
for termB in b:
try:
count[termB] += 1
except:
count[termB] = 1
frac = lambda x : Decimal(count[x]) / Decimal(total)
result = sum(map(lambda x : frac(x) * frac(x).log10() , count)) * Decimal(len(count)).log10()
return result
def getAlignment(filename):
with open(filename,'r') as f:
return map(int, f.readline().strip().split())
def sortGAF(filename, aspects, output):
sortString = "{{if ({}) print}}"
arg = " || ".join('$8=="{}"'.format(aspect) for aspect in aspects)
with open(output,'w') as f:
call(["awk", "-F", "\t", sortString.format(arg), filename], stdout=f)
def initParser():
parser = argparse.ArgumentParser()
parser.add_argument("-g1", "--graph1", required=True, type=str,
help="graph1, which graph2 is aligned to")
parser.add_argument("-g2", "--graph2", required=True, type=str,
help="graph2")
parser.add_argument("-t", "--taxon", type=str,
help="Taxonomies to include, separated by a '-'")
parser.add_argument("-so", "--sana-out", type=str,
help="Alignment file")
parser.add_argument("-a", "--alignment-file", type=str,
help="Two column alignment file")
parser.add_argument("--mode", default="max", type=str,
choices=["BMA","avg","max"], help="Evaluation mode")
parser.add_argument("-ec", "--evidence-codes", type=str, help="Evidence Codes")
parser.add_argument("-i", "--include-ec", action="store_true",
help="Include evidence codes, default set to exclude")
parser.add_argument("-oc", "--ontology-root", default="BP-MF-CC", type=str,
help="Ontology root, defaults to all. Separate by '-'")
parser.add_argument("-o", "--output", type=str,
help="Output file")
parser.add_argument("-v", "--verbose", action="store_true", help="Verbose")
parser.add_argument("--semsim", default="Resnik", type=str,
choices=[x for x in SemSim.term_SemSim_measures],
help="Method of evaluating similarity")
return parser
def getTaxons(name):
with open(TAXON_FILES[name], mode='r') as f:
return [line.strip() for line in f]
if __name__ == "__main__":
acSourceType = "ncbi"
parser = initParser()
args = parser.parse_args()
ontologyRoots = {ONTOLOGY_ROOT[root] for root in args.ontology_root.split("-")}
evidenceCodes = set(args.evidence_codes.split("-")) if args.evidence_codes else {}
taxonCodes = []
if args.taxon:
for taxon in args.taxon.split("-"):
taxonCodes.extend(getTaxons(taxon))
else:
taxon1 = os.path.basename(args.graph1)[:-3]
taxon2 = os.path.basename(args.graph2)[:-3]
taxonCodes.extend(getTaxons(taxon1) if taxon1 in TAXON_CODES else [])
taxonCodes.extend(getTaxons(taxon2) if taxon2 in TAXON_CODES else [])
taxonCodes = set(taxonCodes)
taxonInclude = True if len(taxonCodes) else False
# Display all parameters
if args.verbose:
verboseFormat = "{:20s} : {}"
print(verboseFormat.format("Graph 1", args.graph1))
print(verboseFormat.format("Graph 2", args.graph2))
print(verboseFormat.format("Annotation Corpus", "gene2go"))
print(verboseFormat.format("Alignment file", args.alignment_file))
print(verboseFormat.format("Ontology file", GENE_ONTOLOGY_FILE))
print(verboseFormat.format("Include taxons", taxonCodes))
choice = "{} evid. code".format(("Exclude","Include")[args.include_ec])
print(verboseFormat.format(choice, evidenceCodes))
print(verboseFormat.format("Include ontologies", ontologyCodes))
print(verboseFormat.format("Output File", args.output))
print(verboseFormat.format("Semantic Similarity", args.semsim))
print(verboseFormat.format("Mode", args.mode))
acFile = "gene2go"
acParams = {}
acParams["filter"] = {} # filter section to remove undesired annotations
acParams["filter"]["EC"] = {"EC" : evidenceCodes, "inclusive" : args.include_ec}
acParams["filter"]["taxonomy"] = {"taxonomy" : taxonCodes, "inclusive" : taxonInclude}
if args.sana_out:
geneListA = geneList(args.graph1)
geneListB = geneList(args.graph2)
alignment = getAlignment(args.sana_out)
compareList = [geneListB[i] for i in alignment]
if args.alignment_file:
geneListA = []
compareList = []
with open(args.alignment_file, mode='r') as f:
for line in f:
geneA,geneB = line.strip().split()
geneListA.append(geneA)
compareList.append(geneB)
# print (geneA + " - " + geneB)
if not (args.sana_out or args.alignment_file):
print("No alignment file")
sys.exit(-2);
ontology = fastsemsim.load_ontology(source_file = GENE_ONTOLOGY_FILE,ontology_type = 'GeneOntology', file_type = 'obo')
# ontology = fastsemsim.load_ontology(ontology_type = 'GeneOntology')
ac_source_file = 'all_isobase.go'
ac_source_file_type = 'plain'
ac = fastsemsim.load_ac(ontology, source_file=ac_source_file, file_type=ac_source_file_type, species=None)
# ac.parse(acFile, acSourceType, acParams)
# ac.isConsistent()
# semsim_type::: 'obj', 'term', 'termset', 'objset'
semsim_type='obj'
semsim_measure='Resnik'
mixing_strategy='max'
scoreFunction = fastsemsim.init_semsim(ontology = ontology, ac = ac, semsim_type = semsim_type, semsim_measure = args.semsim, mixing_strategy = args.mode)
if args.output:
with open(args.output,'w') as f:
for geneA,geneB in zip(geneListA, compareList):
scores = [scoreFunction.SemSim(geneA, geneB, root)
for root in ontologyRoots]
try:
maxScore = max(score for score in scores if score != None)
except:
maxScore = None
f.write("{}\t{}\t{}\t{}\r\n".format(args.mode,geneA,geneB,maxScore))
else:
for geneA,geneB in zip(geneListA, compareList):
scores = [scoreFunction.SemSim(geneA, geneB, root)
for root in ontologyRoots]
try:
score = max(x for x in scores if x != None)
except:
score = None
print(args.mode,geneA,geneB,score)