-
Notifications
You must be signed in to change notification settings - Fork 21
/
MarkovLearning.py
208 lines (177 loc) · 8.31 KB
/
MarkovLearning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import warnings
import numpy as np
# Silence all warnings for this process. The stdlib module is used directly:
# `np.warnings` was only an incidental re-export of it and is no longer
# available in recent NumPy releases (attribute access raises AttributeError).
warnings.filterwarnings('ignore')
import cypher
from collections import namedtuple
from neo4j.v1 import GraphDatabase, basic_auth
# import Q1Utils
import ReasoningUtilities as RU
import sys
import os
# Must run before the RTXConfiguration import below so that module can be found.
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../")  # code directory
from RTXConfiguration import RTXConfiguration
rtxConfig = RTXConfiguration()
# Connection information for the neo4j server, populated with orangeboard
# NOTE: the driver and session are opened at import time and shared by the whole module.
driver = GraphDatabase.driver(rtxConfig.neo4j_bolt, auth=basic_auth(rtxConfig.neo4j_username, rtxConfig.neo4j_password))
session = driver.session()
# Connection information for the ipython-cypher package
# NOTE(review): the credentials are embedded in this URL string; avoid logging it.
connection = "http://" + rtxConfig.neo4j_username + ":" + rtxConfig.neo4j_password + "@" + rtxConfig.neo4j_database
# Option values handed to ipython-cypher as its `config` object.
DEFAULT_CONFIGURABLE = {
    "auto_limit": 0,
    "style": "DEFAULT",
    "short_errors": True,
    "data_contents": True,
    "display_limit": 0,
    "auto_pandas": False,
    "auto_html": False,
    "auto_networkx": False,
    "rest": False,
    "feedback": False,  # turn off verbosity in ipython-cypher
    "uri": connection,
}
# One namedtuple field per option; iterating the dict yields its keys in the
# same order as .keys(), and namedtuple accepts the names as a sequence.
DefaultConfigurable = namedtuple("DefaultConfigurable", list(DEFAULT_CONFIGURABLE))
# Immutable configuration instance populated from the dict above.
defaults = DefaultConfigurable(**DEFAULT_CONFIGURABLE)
# state space is a tuple (relationship_type, node_label), first order markov chain
def initialize_Markov_chain(connection, config):
    """
    Builds the state space of the Markov chain and the lookup table that maps a pair of states to its
    (row, column) position in a transition matrix. The transition matrix itself is NOT created here
    (see train()).
    :param connection: ipython-cypher connection string (eg: http://username:password@<neo4j_database>)
    :param config: ipython-cypher configuration named tuple
    :return: (state_space, quad_to_matrix_index) where state_space is a list of
             (relationship_type, node_label) tuples and quad_to_matrix_index maps the 4-tuple
             state1 + state2 to (index of state1, index of state2) in state_space
    """
    # Every distinct relationship type present in the graph
    relationship_types = [
        row[0]
        for row in cypher.run("MATCH ()-[r]-() RETURN DISTINCT type(r)", conn=connection, config=config)
    ]
    # The query takes the label at index 1 (the second label) of every node
    node_labels = [
        row[0]
        for row in cypher.run("MATCH (n) RETURN DISTINCT labels(n)[1]", conn=connection, config=config)
    ]

    # Markov chain will have states = (relationship_label, node_label) since this is a multigraph
    state_space = [
        (relationship_type, node_label)
        for relationship_type in relationship_types
        for node_label in node_labels
    ]

    # Position of each state, computed once. A list.index() call per pair would rescan
    # state_space every time and make this step cubic in the number of states.
    # setdefault keeps the first occurrence, exactly like list.index() does.
    position = {}
    for index, state in enumerate(state_space):
        position.setdefault(state, index)

    quad_to_matrix_index = {
        state1 + state2: (position[state1], position[state2])
        for state1 in state_space
        for state2 in state_space
    }
    return state_space, quad_to_matrix_index
# state_space, quad_to_matrix_index = initialize_Markov_chain(connection, config)
# Run this on each training example, then normalize
def train(state_space, quad_to_matrix_index, obs_dict, type='ML', pseudo_count=0.001):
    """
    Trains a Markov chain on a set of observed paths: counts the state-to-state transitions seen in
    every path, then normalizes each row of the count matrix into probabilities.
    :param state_space: state space of the Markov chain (list of (relationship_type, node_label) tuples)
    :param quad_to_matrix_index: dictionary mapping state1 + state2 (a 4-tuple) to the (row, column)
           position of that transition in the transition matrix
    :param obs_dict: dictionary whose values are (path_names, path_types) pairs; only path_types is
           used. Each path alternates node label / relationship type and starts with a node label,
           so the (relationship, node) states start at index 1 and recur every 2 entries.
    :param type: kind of training to perform (ML=Maximum likelihood, L=Laplace). The name shadows
           the builtin but is kept because callers pass it by keyword.
    :param pseudo_count: value added to every cell before normalizing when type == 'L'
    :return: trans_mat, a len(state_space) x len(state_space) numpy array. Rows whose sum is not
             positive are left unnormalized.
    :raises Exception: if type is neither 'ML' nor 'L'
    :raises KeyError: if a path contains a transition that is not in quad_to_matrix_index
    """
    # Fail fast, before doing any counting
    if type not in ('ML', 'L'):
        raise Exception("Unknown training type:" + str(type))

    trans_mat = np.zeros((len(state_space), len(state_space)))
    for _path_names, path_types in obs_dict.values():
        for path in path_types:
            # (relationship, node) states sit at the odd offsets 1, 3, 5, ...
            states = [tuple(path[start:start + 2]) for start in range(1, len(path) - 1, 2)]
            # Count every consecutive pair of states
            for current, following in zip(states, states[1:]):
                (i, j) = quad_to_matrix_index[current + following]
                trans_mat[i, j] += 1

    if type == 'L':
        trans_mat += pseudo_count  # add a pseudo-count

    # Then normalize the thing: divide each row with a positive sum by that sum
    row_sums = trans_mat.sum(axis=1)
    positive = row_sums > 0
    trans_mat[positive] /= row_sums[positive, np.newaxis]
    return trans_mat
# trained = train(state_space, quad_to_matrix_index, paths_dict, type='L')
def path_probability(trans_mat, quad_to_matrix_index, path):
    """
    Multiplies together the transition probabilities along a path.
    :param trans_mat: trained transition matrix, indexed as trans_mat[row, column]
    :param quad_to_matrix_index: dictionary mapping state1 + state2 to its (row, column) position
    :param path: input path of neo4j types, alternating node label / relationship type and
           starting with a node label
    :return: product of the transition probabilities of consecutive (relationship, node) states;
             1 when the path holds fewer than two such states
    """
    # The (relationship, node) states start at index 1 and recur every 2 entries
    states = [tuple(path[offset:offset + 2]) for offset in range(1, len(path) - 1, 2)]
    probability = 1
    for current, following in zip(states, states[1:]):
        row, column = quad_to_matrix_index[current + following]
        probability *= trans_mat[row, column]
    return probability
# path_probability(trained, quad_to_matrix_index, paths_dict[omim][1][0])
def trained_MC():
    """
    Trains the Markov chain (Laplace training) on the paths found between a fixed set of known
    OMIM -> DOID pairs.
    :return: trained transition matrix, dictionary to keep track of indices
    """
    # Known OMIM identifier -> DOID identifier pairs used as training examples
    known_solutions = {
        'OMIM:249100': 'DOID:2841',
        'OMIM:134610': 'DOID:2841',
        'OMIM:613985': 'DOID:12365',
        'OMIM:205400': 'DOID:12365',
        'OMIM:219700': 'DOID:1498',
        'OMIM:143890': 'DOID:9352',  # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5468445/
        'OMIM:603903': 'DOID:12365',
    }

    # For every pair, fetch the paths (up to length 5) linking the two nodes
    paths_dict = {}
    for omim, doid in known_solutions.items():
        # path_name, path_type = Q1Utils.interleave_nodes_and_relationships(session, omim, doid, max_path_len=5)
        path_name, path_type = RU.interleave_nodes_and_relationships(
            session, omim, "disease", doid, "disease", max_path_len=5
        )
        paths_dict[omim] = (path_name, path_type)

    # The module-level `defaults` named tuple is the ipython-cypher configuration
    state_space, quad_to_matrix_index = initialize_Markov_chain(connection, defaults)
    return train(state_space, quad_to_matrix_index, paths_dict, type='L'), quad_to_matrix_index
def test():
    """
    Smoke test: trains on a single hand-written path and checks that, after maximum-likelihood
    training, that same path has probability exactly 1.
    initialize_Markov_chain() queries the database, so this needs the configured neo4j instance.
    """
    omim = "test"
    # One path, as node/relationship names ...
    name_path = [
        'OMIM:249100', 'affects',
        'O15553', 'participates_in',
        'R-HSA-168643', 'participates_in',
        'P01584', 'gene_associated_with_condition',
        'DOID:2841',
    ]
    # ... and the same path as node labels / relationship types
    type_path = [
        'disease', 'affects',
        'protein', 'participates_in',
        'pathway', 'participates_in',
        'protein', 'gene_associated_with_condition',
        'disease',
    ]
    paths_dict = {omim: ([name_path], [type_path])}

    # modified by Deqing: `config` is not defined at this point, so None is passed as the configuration
    state_space, quad_to_matrix_index = initialize_Markov_chain(connection, None)

    # Laplace training is only exercised here; its result is not checked because the expected
    # value (about 0.851746) depends on the priors and the original assertion was disabled.
    train(state_space, quad_to_matrix_index, paths_dict, type='L')

    # With maximum likelihood every observed transition is the only one in its row,
    # so this should always == 1
    trained = train(state_space, quad_to_matrix_index, paths_dict, type='ML')
    assert path_probability(trained, quad_to_matrix_index, paths_dict[omim][1][0]) == 1.0