-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathNMTF_link.py
174 lines (146 loc) · 7.37 KB
/
NMTF_link.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import warnings
from scripts import Network
import numpy as np
import matplotlib
from utils.utils import * # EvaluationMetric, StopCriterion
import pylab as plt
import statistics
import os
import yaml
import multiprocessing
from scripts.processNetwork import runNetworkRE, runNetworkMM
warnings.filterwarnings('ignore')
if __name__ == '__main__':
    # Script entry point: select the plotting backend and resolve the paths
    # given on the command line. dirname_1 and dirname_2 are module-level
    # names read by the functions and the script sections further down.

    # `sys` is imported explicitly here; the file never imported it itself
    # and otherwise depends on the wildcard import from utils.utils
    # happening to expose it.
    import sys

    # Select the 'agg' backend before any figure is created.
    matplotlib.use('agg')
    # current directory
    current = os.getcwd()

    # Fail with a readable usage message instead of a tuple-unpacking error.
    if len(sys.argv) != 3:
        sys.exit(f"usage: {sys.argv[0]} <data_and_settings_directory> <settings_file_name>")

    # directory of your data and setting file, name of your setting file
    _, filename_1, filename_2 = sys.argv
    # Full path of the settings file.
    dirname_1 = os.path.join(filename_1, filename_2)
    # Directory holding the data (and the results/ sub-directory).
    dirname_2 = filename_1  # os.path.join(current, filename_1)
def plot_iteration(max_it, met_val):
    """Plot the metric curve of one run on the current pylab figure.

    Args:
        max_it: Maximum number of iterations of the run. Kept for interface
            compatibility; the x positions are derived from ``met_val``.
        met_val: Sequence of metric values, plotted at iterations
            1, 11, 21, ... (one sample every 10 iterations).
    """
    # The previous np.arange(1, max_it, 10) yields one element more than
    # max_it // 10 samples whenever max_it % 10 >= 2, which made plt.plot fail
    # on mismatched lengths. Building the axis from the data keeps the same
    # x values and always matches.
    X = 1 + 10 * np.arange(len(met_val))
    plt.plot(X, met_val)
def complete_plot(m):
    """Label the axes of the current pylab figure for evaluation metric ``m``.

    The x axis is always labelled 'Iteration'. The y label depends on the
    metric; APS and AUROC additionally get their y range fixed to [0, 1].
    Any other value leaves the y axis untouched.
    """
    plt.xlabel('Iteration')

    # (metric, y label, y limits or None when the range is left automatic)
    axis_settings = (
        (EvaluationMetric.APS, 'Average Precision Score (APS)', (0, 1)),
        (EvaluationMetric.AUROC, 'Area Under ROC Curve', (0, 1)),
        (EvaluationMetric.RMSE, 'RMSE', None),
    )
    for candidate, y_label, y_limits in axis_settings:
        if m == candidate:
            plt.ylabel(y_label)
            if y_limits is not None:
                plt.ylim(*y_limits)
            break
def predict(num_iterations, th, network=None, rng=None):
    """Train the network and write the newly predicted associations to disk.

    The network is updated ``num_iterations`` times. The main association
    matrix is then rebuilt as ``G_left . S . G_right^T`` and every entry whose
    value exceeds the original matrix by more than ``th`` is written to
    ``<dirname_2>/results/results.txt`` as
    ``"<left term> <right term> <score>"``, one per line, in row-major order.

    Args:
        num_iterations: Number of update steps. Non-integer values (such as
            a median of an even number of runs) are truncated with ``int``.
        th: Score threshold; only differences strictly greater than it are
            reported.
        network: Network to train. When ``None`` a new one is built from the
            module-level ``dirname_1`` / ``dirname_2`` with ``mask=0``.
        rng: NumPy random generator handed to a newly built network. When
            ``None`` a fresh ``np.random.default_rng()`` is created per call.
            Unused when ``network`` is supplied.
    """
    if network is None:
        # Create the generator per call: a generator used as a default
        # argument value is built once at definition time and then shared.
        if rng is None:
            rng = np.random.default_rng()
        network = Network(dirname_1, dirname_2, True, rng, mask=0)

    # range() rejects floats, and statistics.median can return one.
    for i in range(int(num_iterations)):
        network.update()
        print("iteration "+str(i)+", error =", network.get_error())

    main = network.get_main()
    rebuilt_association_matrix = np.linalg.multi_dot(
        [main.G_left, main.S, main.G_right.transpose()])
    new_relations_matrix = rebuilt_association_matrix - main.original_matrix

    left_terms = main.left_sorted_term_list
    right_terms = main.right_sorted_term_list

    # The results directory is not guaranteed to exist yet.
    os.makedirs(f"{dirname_2}/results", exist_ok=True)
    with open(f"{dirname_2}/results/results.txt", "w") as outF:
        # argwhere lists the qualifying (row, column) pairs in row-major
        # order, i.e. the same order as the former nested index loops.
        for i, j in np.argwhere(np.asarray(new_relations_matrix) > th).tolist():
            line = left_terms[i] + " " + right_terms[j] + " " + str(new_relations_matrix[i, j])
            outF.write(line)
            outF.write("\n")
if __name__ == '__main__':
    # Load the run settings from the YAML file and validate them, falling
    # back to the defaults from utils.utils when a value is missing or invalid.

    # NOTE(review): the settings read below are plain scalars, so
    # yaml.safe_load would presumably suffice; FullLoader is kept so the set
    # of accepted documents does not change.
    with open(dirname_1, 'r') as f:
        graph_topology = yaml.load(f, Loader=yaml.FullLoader)

    # Both settings are mandatory: a missing key or unknown value aborts here.
    metric = EvaluationMetric(graph_topology["metric"].upper())
    stop_criterion = StopCriterion(graph_topology["stop.criterion"].upper())

    # A missing key yields None, and int(None) raises TypeError, so both
    # TypeError and ValueError lead to the default.
    max_iter_value = graph_topology.get("number.of.iterations")
    try:
        max_iter = int(max_iter_value)
        # Non-positive counts are rejected as well as counts above MAX_ITER.
        if max_iter < 1 or max_iter > MAX_ITER:
            raise ValueError()
    except (TypeError, ValueError):
        max_iter = MAX_ITER
        print(f"Invalid number of iteration {max_iter_value}, set default value {MAX_ITER}")

    threshold = graph_topology.get("score.threshold")
    try:
        threshold = float(threshold)
        if not (0 < threshold < 1):
            raise ValueError()
    except (TypeError, ValueError):
        print(f"Invalid threshold {threshold}, set default value {default_threshold}")
        threshold = default_threshold

    # A dict lookup never raises ValueError, so the old handler could not
    # fire; .get keeps `initialization` bound for the summary print below.
    initialization = graph_topology.get("initialization")
    if initialization is None:
        print("No initialization method given")

    print("\nmetric :", metric.value)
    print("initialization : ", initialization)
    print(f"number of iterations : {max_iter}")
    print("stop criterion : ", stop_criterion.value)
    print("threshold : ", threshold)
def _select_stop_iteration(worker, settings_path, data_dir, eval_metric, n_iter, plot_name):
    """Run the validation runs in parallel and pick the stop iteration.

    ``N_ITERATIONS`` processes are started, each calling ``worker`` with its
    own random stream. Each run stores one integer (its stop iteration) in a
    shared array and its metric samples in a second shared array. The curves
    are plotted, the median stop iteration is marked, and the figure is saved
    as ``<data_dir>/results/<plot_name>``.

    Args:
        worker: Process target, called as
            ``worker([settings_path, data_dir, rng, eval_metric, n_iter],
            results, metrics, run_index)``.
        settings_path: Path of the settings file.
        data_dir: Data directory; the figure goes into its ``results`` folder.
        eval_metric: Evaluation metric, forwarded to the worker and used to
            label the plot.
        n_iter: Maximum number of iterations of every run.
        plot_name: File name of the saved figure.

    Returns:
        Tuple ``(stop_values, stop_iteration, final_rng)``: the per-run stop
        iterations of the successful runs, their median truncated to an int,
        and the spare random generator reserved for the final prediction run.

    Raises:
        RuntimeError: If every validation process exits with a non-zero code.
    """
    samples_per_run = n_iter // 10

    # Spawn N_ITERATIONS + 1 child SeedSequences: one per validation process
    # plus one for the final run. Spawned children are completely
    # independent; see
    # https://albertcthomas.github.io/good-practices-random-number-generators/
    # and https://numpy.org/doc/1.18/reference/random/parallel.html
    # SeedSequence implements an algorithm that guarantees, with high
    # probability, that two seeds generated close to each other are very
    # different: it avoids these problems by using successions of integer
    # hashes with good avalanche properties.
    child_seeds = np.random.SeedSequence().spawn(N_ITERATIONS + 1)
    streams = [np.random.default_rng(seed) for seed in child_seeds]

    # Shared buffers the workers write into (same initial contents as before).
    results = multiprocessing.Array('i', range(N_ITERATIONS))
    metrics_arr = multiprocessing.Array('d', range(N_ITERATIONS * samples_per_run))

    processes = []
    for run in range(N_ITERATIONS):
        proc = multiprocessing.Process(
            target=worker,
            args=([settings_path, data_dir, streams[run], eval_metric, n_iter], results, metrics_arr, run))
        proc.start()
        processes.append(proc)

    stop_values = []
    for run, proc in enumerate(processes):
        proc.join()
        if proc.exitcode != 0:
            # A crashed worker may have left only the placeholder values in
            # the shared arrays; do not let them influence the median or plot.
            print(f"Validation run {run} failed with exit code {proc.exitcode}, run ignored")
            continue
        stop_values.append(results[run])
        plot_iteration(n_iter, metrics_arr[run * samples_per_run:(run + 1) * samples_per_run])
    if not stop_values:
        raise RuntimeError("All validation runs failed, no stop iteration can be selected")

    complete_plot(eval_metric)
    # statistics.median averages the two middle values for an even number of
    # runs; truncate so the result is usable as an iteration count.
    stop_iteration = int(statistics.median(stop_values))
    plt.axvline(x=stop_iteration, color='k', label='selected stop iteration', linestyle='dashed')
    plt.legend(loc=4)
    # The results directory is not guaranteed to exist yet.
    os.makedirs(f'{data_dir}/results', exist_ok=True)
    plt.savefig(f'{data_dir}/results/{plot_name}')
    plt.close("all")
    return stop_values, stop_iteration, streams[N_ITERATIONS]


if __name__ == '__main__':
    # Dispatch on the configured stop criterion, then run the final
    # prediction with mask=0.
    plot_file_name = f'{metric.value}_{initialization}_{stop_criterion.value}.png'

    if stop_criterion == StopCriterion.MAXIMUM_METRIC:
        # Each validation run reports its best-performing iteration.
        _, res_best_iter, final_rng = _select_stop_iteration(
            runNetworkMM, dirname_1, dirname_2, metric, max_iter, plot_file_name)
        predict(res_best_iter, threshold, rng=final_rng)

    elif stop_criterion == StopCriterion.RELATIVE_ERROR:
        best_epsilon_arr, res_best_epsilon, final_rng = _select_stop_iteration(
            runNetworkRE, dirname_1, dirname_2, metric, max_iter, plot_file_name)
        print("best_epsilon_arr: "+str(best_epsilon_arr))
        print("\nFinal run without masking, stop at iteration: " + str(res_best_epsilon))
        predict(res_best_epsilon, threshold, rng=final_rng)

    elif stop_criterion == StopCriterion.MAXIMUM_ITERATIONS:
        # No validation phase: train one network for max_iter iterations.
        network = Network(dirname_1, dirname_2, True, np.random.default_rng(), mask=0)
        initial_error = network.get_error()
        print("initial error: {}".format(initial_error))
        print("\nUnique run of the algorithm without masking")
        predict(max_iter, threshold, network=network)
#%%