-
Notifications
You must be signed in to change notification settings - Fork 21
/
LilGimTestQuestion.py
176 lines (149 loc) · 7.26 KB
/
LilGimTestQuestion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# This script will return X that are similar to Y based on high Jaccard index of common one-hop nodes Z (X<->Z<->Y)
import os
import sys
import argparse
# PyCharm doesn't play well with relative imports + python console + terminal
try:
from code.reasoningtool import ReasoningUtilities as RU
except ImportError:
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import ReasoningUtilities as RU
import FormatOutput
import networkx as nx
import QueryLilGIM
import CustomExceptions
import ast
class LilGim:
def __init__(self):
None
@staticmethod
def answer(tissue_id, input_protein_list, use_json=False, num_show=20, rev=True):
# Initialize the response class
response = FormatOutput.FormatResponse(6)
# Make sure everything exists in the graph
if not RU.node_exists_with_property(tissue_id, "id"):
tissue_id = RU.get_node_property(tissue_id, "id", node_label="anatomical_entity")
for i in range(len(input_protein_list)):
id = input_protein_list[i]
if not RU.node_exists_with_property(id, "id"):
input_protein_list[i] = RU.get_node_property(id, "id", node_label="protein")
# Initialize the QueryLilGim class
q = QueryLilGIM.QueryLilGIM()
# get the description
tissue_description = RU.get_node_property(tissue_id, 'name', node_label="anatomical_entity")
# Get the correlated proteins
try:
correlated_proteins_dict = q.query_neighbor_genes_for_gene_set_in_a_given_anatomy(tissue_id, tuple(input_protein_list))
#correlated_proteins_dict = {'UniProtKB:Q99618': 0.4276333333333333, 'UniProtKB:Q92698': 0.464, 'UniProtKB:P56282': 0.5810000000000001, 'UniProtKB:P49454': 0.4441, 'UniProtKB:P49642': 0.5188333333333334, 'UniProtKB:Q9BZD4': 0.5042666666666668, 'UniProtKB:P38398': 0.4464, 'UniProtKB:Q9BXL8': 0.5009, 'UniProtKB:P42166': 0.4263000000000001, 'UniProtKB:Q96CS2': 0.5844333333333332, 'UniProtKB:Q9BQP7': 0.4903333333333333, 'UniProtKB:O95997': 0.4743333333333333, 'UniProtKB:Q9H4K1': 0.4709, 'UniProtKB:Q9H967': 0.5646666666666667, 'UniProtKB:Q12834': 0.4478, 'UniProtKB:Q71F23': 0.4361, 'UniProtKB:Q9UQ84': 0.4800666666666666, 'UniProtKB:Q9NSP4': 0.4347}
except:
error_message = "Lil'GIM is experiencing a problem."
error_code = "LilGIMerror"
response.add_error_message(error_code, error_message)
response.print()
return 1
# as a list of tuples
correlated_proteins_tupes = []
for k, v in correlated_proteins_dict.items():
correlated_proteins_tupes.append((k, v))
# sort by freq
correlated_proteins_tupes_sorted = sorted(correlated_proteins_tupes, key=lambda x: x[1], reverse=rev)
correlated_proteins_tupes_sorted = correlated_proteins_tupes_sorted[0:num_show]
correlated_proteins_tupes = correlated_proteins_tupes_sorted
# return the results
if not use_json:
try:
protein_descriptions = RU.get_node_property(input_protein_list[0], "name", node_label="protein", name_type="id")
except:
protein_descriptions = input_protein_list[0]
for id in input_protein_list[1:-1]:
protein_descriptions += ", "
try:
protein_descriptions += RU.get_node_property(id, "name", node_label="protein", name_type="id")
except:
protein_descriptions += id
if len(input_protein_list) > 1:
try:
protein_descriptions += ", and %s" % RU.get_node_property(input_protein_list[-1], "name", node_label="protein", name_type="id")
except:
protein_descriptions += ", and %s" % input_protein_list[-1]
if rev:
to_print = "In the tissue: %s, the proteins that correlate most with %s" % (tissue_description, protein_descriptions)
else:
to_print = "In the tissue: %s, the proteins that correlate least with %s" % (tissue_description, protein_descriptions)
to_print += " according to Lil'GIM, are:\n"
for id, val in correlated_proteins_tupes_sorted:
try:
to_print += "protein: %s\t correlation %f\n" % (RU.get_node_property(id, "name", node_label="protein", name_type="id"), val)
except:
to_print += "protein: %s\t correlation %f\n" % (id, val)
print(to_print)
else:
# otherwise, you want a JSON output
protein_descriptions = []
is_in_KG_list = []
for protein, corr in correlated_proteins_tupes:
try:
description = RU.get_node_property(protein, "name", node_label="protein", name_type="id")
protein_descriptions.append(description)
is_in_KG_list.append(True)
except:
protein_description = protein
protein_descriptions.append(protein_description)
is_in_KG_list.append(False)
# just get the ones that are actually in the KG. TODO: do something with the ones that are not in the KG
correlated_proteins_tupes_in_KG = []
for i in range(len(correlated_proteins_tupes)):
if is_in_KG_list[i]:
correlated_proteins_tupes_in_KG.append(correlated_proteins_tupes[i])
# Return the results
full_g = RU.get_graph_from_nodes([id for id, val in correlated_proteins_tupes_in_KG], node_property_label="id")
id2node = dict()
for nx_id, node in full_g.nodes(data=True):
id2node[node['properties']['id']] = node
for id, corr in correlated_proteins_tupes_in_KG:
to_print = "In the tissue: %s, the protein %s has correlation %f with the given list of proteins." %(tissue_description, RU.get_node_property(id, "name", node_label="protein", name_type="id"), corr)
response.add_subgraph([(id, id2node[id])], [], to_print, corr)
response.print()
@staticmethod
def describe():
output = "Answers questions of the form: 'What proteins correlate with [$protein1, $protein2,...,$proteinK?] in blood?'" + "\n"
# TODO: subsample disease nodes
return output
def main():
parser = argparse.ArgumentParser(description="Answers questions of the form: 'What proteins correlate with [$protein1, $protein2,...,$proteinK?] in blood?'",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-t', '--tissue', type=str, help="Tissue id/name", default="UBERON:0002384")
parser.add_argument('-r', '--reverse', action='store_true', help="Include flag if you want the least correlations.")
parser.add_argument('-p', '--proteins', type=str, help="List of proteins.", default="[\'UniProtKB:P12004\']")
parser.add_argument('-j', '--json', action='store_true', help='Flag specifying that results should be printed in JSON format (to stdout)', default=False)
parser.add_argument('--describe', action='store_true', help='Print a description of the question to stdout and quit', default=False)
parser.add_argument('--num_show', type=int, help='Maximum number of results to return', default=20)
# Parse and check args
args = parser.parse_args()
tissue_id = args.tissue
is_reverse = args.reverse
proteins = args.proteins
use_json = args.json
describe_flag = args.describe
num_show = args.num_show
# Convert the string to an actual list
#print(proteins)
proteins_preserved = proteins
try:
proteins = proteins.replace(",", "','").replace("[", "['").replace("]", "']")
protein_list = ast.literal_eval(proteins)
protein_list_strip = []
for protein in protein_list:
protein_list_strip.append(protein.strip())
protein_list = protein_list_strip
except:
protein_list = eval(proteins_preserved)
# Initialize the question class
Q = LilGim()
if describe_flag:
res = Q.describe()
print(res)
else:
Q.answer(tissue_id, protein_list, use_json=use_json, num_show=num_show, rev=not(is_reverse))
if __name__ == "__main__":
main()