-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCSLS_.py
131 lines (117 loc) · 3.86 KB
/
CSLS_.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import gc
import time
import multiprocessing
import numpy as np
# NOTE(review): module-level constant that no function visible in this file
# references; its purpose cannot be determined from here - confirm whether it
# is still needed by other modules.
g = 1000000000
def div_list(ls, n):
    """Split ``ls`` into ``n`` contiguous chunks of near-equal length.

    The first ``len(ls) % n`` chunks receive one extra element.  When ``n`` is
    not positive, ``ls`` is empty, or ``n`` exceeds ``len(ls)``, the whole
    sequence is returned as a single chunk.  When ``n`` equals ``len(ls)``
    every element is wrapped in its own one-element list.

    Returns:
        A list of chunks (slices of ``ls``, or one-element lists in the
        ``n == len(ls)`` case).
    """
    total = len(ls)
    # Degenerate requests: hand back the input as one chunk.
    if n <= 0 or total == 0 or n > total:
        return [ls]
    if n == total:
        return [[item] for item in ls]

    base, extra = divmod(total, n)
    chunks = []
    start = 0
    for idx in range(n):
        # The leading ``extra`` chunks absorb the remainder, one element each.
        size = base + 1 if idx < extra else base
        chunks.append(ls[start:start + size])
        start += size
    return chunks
def cal_rank_by_sim_mat(task, sim, top_k, accurate):
    """Rank the candidates of each row of ``sim`` and accumulate hit metrics.

    Args:
        task: Sequence of reference indices; ``task[i]`` is the column of
            ``sim`` that is looked up in the ranking of row ``i``.
        sim: 2-D array of scores with one row per entry of ``task``; larger
            scores rank first.
        top_k: Sequence of cut-offs for the hits@k counters.
        accurate: If true, every row is fully sorted so all ranks are exact.
            Otherwise rows are only partitioned at the ``top_k`` cut-offs
            (plus position 0): the hits@k counters and the top-ranked column
            are exact, but the accumulated rank sums are only approximate.

    Returns:
        A tuple ``(mean, mrr, num, prec_set)`` holding the sum of 1-based
        ranks, the sum of reciprocal ranks, the list of hit counts per
        ``top_k`` entry, and the set of ``(reference, top-ranked column)``
        pairs.
    """
    mean = 0
    mrr = 0
    num = [0] * len(top_k)
    prec_set = set()

    kth = None
    if not accurate:
        last_col = max(sim.shape[1] - 1, 0)
        cutoffs = np.asarray(top_k, dtype=np.int64).ravel() - 1
        # Position 0 is always included so that rank[0] really is the best
        # candidate even when 1 is not among top_k; cut-offs beyond the row
        # length are clamped instead of making argpartition raise.
        kth = np.unique(np.clip(np.append(cutoffs, 0), 0, last_col))

    for i, ref in enumerate(task):
        scores = -sim[i, :]
        if accurate:
            rank = scores.argsort()
        else:
            rank = np.argpartition(scores, kth)
        prec_set.add((ref, rank[0]))

        positions = np.flatnonzero(rank == ref)
        assert positions.size > 0, "reference index missing from ranking"
        rank_index = positions[0]

        mean += (rank_index + 1)
        mrr += 1 / (rank_index + 1)
        for j, cutoff in enumerate(top_k):
            if rank_index < cutoff:
                num[j] += 1
    return mean, mrr, num, prec_set
def eval_alignment_by_sim_mat(embed1, embed2, top_k, nums_threads, csls=0, accurate=False, output=True):
    """Evaluate the alignment between two embedding matrices.

    The score matrix is obtained from ``sim_handler(embed1, embed2, csls,
    nums_threads)``.  Its rows are split into chunks with ``div_list`` and each
    chunk is ranked by ``cal_rank_by_sim_mat`` in a worker process; row ``i``
    is evaluated against reference index ``i``.

    Args:
        embed1: First embedding matrix, passed to ``sim_handler``.
        embed2: Second embedding matrix, passed to ``sim_handler``.
        top_k: Sequence of cut-offs for the hits@k percentages.
        nums_threads: Requested number of row chunks / worker processes.
        csls: Neighbourhood size forwarded to ``sim_handler`` as ``k``.
        accurate: Forwarded to ``cal_rank_by_sim_mat``; also selects which
            summary line is printed.
        output: If true, print a one-line summary of the results.

    Returns:
        A tuple ``(t_prec_set, acc, t_mrr)``: the union of the per-chunk
        ``(reference, top-ranked column)`` pairs, the hits@k values as
        percentages rounded to two decimals, and the reciprocal-rank sum
        divided by the number of rows.
    """
    t = time.time()
    sim_mat = sim_handler(embed1, embed2, csls, nums_threads)
    ref_num = sim_mat.shape[0]
    # Explicit numpy accumulator instead of relying on ``list += ndarray``
    # being coerced into an element-wise addition.
    t_num = np.zeros(len(top_k), dtype=int)
    t_mean = 0
    t_mrr = 0
    t_prec_set = set()
    tasks = div_list(np.arange(ref_num), nums_threads)

    # The context manager guarantees the workers are torn down even when
    # submitting a task or collecting a result raises.
    with multiprocessing.Pool(processes=len(tasks)) as pool:
        pending = [
            pool.apply_async(cal_rank_by_sim_mat, (task, sim_mat[task, :], top_k, accurate))
            for task in tasks
        ]
        pool.close()
        pool.join()
        # Results must be fetched before leaving the block: exiting the
        # context terminates the pool.
        results = [res.get() for res in pending]

    for mean, mrr, num, prec_set in results:
        t_mean += mean
        t_mrr += mrr
        t_num += np.array(num)
        t_prec_set |= prec_set
    assert len(t_prec_set) == ref_num

    acc = np.array(t_num) / ref_num * 100
    for i in range(len(acc)):
        acc[i] = round(acc[i], 2)
    t_mean /= ref_num
    t_mrr /= ref_num

    if output:
        if accurate:
            print("accurate results: hits@{} = {}, mr = {:.3f}, mrr = {:.3f}, time = {:.3f} s ".format(top_k, acc, t_mean,
                                                                                                       t_mrr,
                                                                                                       time.time() - t))
        else:
            print("hits@{} = {}, time = {:.3f} s ".format(top_k, acc, time.time() - t))

    # Drop the (potentially very large) score matrix before returning.
    del sim_mat
    gc.collect()
    return t_prec_set, acc, t_mrr
def cal_csls_sim(sim_mat, k):
    """Return, for every row of ``sim_mat``, the mean of its ``k`` largest values.

    Args:
        sim_mat: 2-D array of scores.
        k: Number of largest entries per row to average; must be at least 1.
            If a row has fewer than ``k`` entries, all of them are averaged.

    Returns:
        1-D array with one mean per row of ``sim_mat``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {}".format(k))
    k = min(k, sim_mat.shape[1])
    # Partitioning the negated matrix at index k - 1 guarantees that the first
    # k columns hold the k largest scores of each row (in arbitrary order).
    # Partitioning at k + 1 would only guarantee that the first k + 2 columns
    # hold the k + 2 largest, so slicing k of them could pick the wrong ones.
    partitioned = -np.partition(-sim_mat, k - 1, axis=1)
    nearest_k = partitioned[:, 0:k]
    return np.mean(nearest_k, axis=1)
def CSLS_sim(sim_mat1, k, nums_threads):
    """Apply ``cal_csls_sim`` to every row of ``sim_mat1`` using worker processes.

    The row indices are split into chunks with ``div_list``; each chunk of
    rows is handed to ``cal_csls_sim`` in a separate process and the per-chunk
    results are joined in row order.

    Args:
        sim_mat1: 2-D array of scores.
        k: Neighbourhood size forwarded to ``cal_csls_sim``.
        nums_threads: Requested number of row chunks / worker processes.

    Returns:
        1-D array with one value per row of ``sim_mat1``.
    """
    n_rows = sim_mat1.shape[0]
    tasks = div_list(np.arange(n_rows), nums_threads)

    # The context manager guarantees the workers are torn down even when
    # submitting a task or collecting a result raises.
    with multiprocessing.Pool(processes=len(tasks)) as pool:
        pending = [pool.apply_async(cal_csls_sim, (sim_mat1[task, :], k)) for task in tasks]
        pool.close()
        pool.join()
        # Results must be fetched before leaving the block: exiting the
        # context terminates the pool.
        parts = [res.get() for res in pending]

    # One concatenation instead of repeated np.append, which re-copies the
    # accumulated array for every chunk.
    sim_values = np.concatenate(parts)
    assert sim_values.shape[0] == n_rows
    return sim_values
def sim_handler(embed1, embed2, k, nums_threads):
    """Build the score matrix between two embedding matrices.

    The raw scores are ``embed1 @ embed2.T``.  When ``k`` is not positive the
    raw matrix is returned unchanged (after printing a notice).  Otherwise the
    per-row and per-column values produced by ``CSLS_sim`` are subtracted from
    twice the raw scores.

    Args:
        embed1: Left-hand embedding matrix.
        embed2: Right-hand embedding matrix.
        k: Neighbourhood size forwarded to ``CSLS_sim``.
        nums_threads: Worker count forwarded to ``CSLS_sim``.

    Returns:
        2-D array with one row per row of ``embed1`` and one column per row of
        ``embed2``.
    """
    similarity = np.matmul(embed1, embed2.T)
    if k <= 0:
        print("k = 0")
        return similarity

    row_term = CSLS_sim(similarity, k, nums_threads)
    col_term = CSLS_sim(similarity.T, k, nums_threads)
    # Subtract the row term on the transposed matrix (so it broadcasts along
    # the last axis), flip back, then subtract the column term.
    adjusted = (2 * similarity.T - row_term).T - col_term

    # Release the raw matrix before handing back the adjusted one.
    del similarity
    gc.collect()
    return adjusted