-
Notifications
You must be signed in to change notification settings - Fork 1
/
importance.py
35 lines (29 loc) · 1.03 KB
/
importance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import numpy as np
from cleantext import clean
# Names exported by a star-import of this module.
__all__ = ["read_vectors", "importance", "trim"]
def read_vectors(vfile):
    """Parse a text table of word vectors into a dictionary.

    The first entry of ``vfile`` is a header holding exactly two integers;
    the second one is the vector length. Every following non-blank entry is
    a word followed by its whitespace-separated float components.

    Args:
        vfile: Indexable sequence of text lines (header first).

    Returns:
        A ``(vecdict, vec_len)`` tuple, where ``vecdict`` maps each word to a
        1-D float ``numpy`` array and ``vec_len`` is the length announced in
        the header.

    Raises:
        IndexError: If ``vfile`` is empty.
        ValueError: If the header is not two integers or a component is not
            a valid float.
    """
    # The first header field is parsed (so a malformed header still fails
    # loudly) but its value is not needed afterwards.
    _, vec_len = map(int, vfile[0].split())

    vecdict = {}
    for line in vfile[1:]:
        tokens = line.split()
        if not tokens:
            # Blank lines (e.g. a trailing newline) carry no vector; skip
            # them instead of crashing on tokens[0].
            continue
        vecdict[tokens[0]] = np.array([float(tok) for tok in tokens[1:]], dtype=float)
    return vecdict, vec_len
def importance(line, vecdict, vec_len):
    """Score each word of ``line`` by cosine similarity to the line's vector sum.

    The vectors of all words in ``line`` are added up, and every word is
    scored by the cosine of the angle between its own vector and that sum.

    Words missing from ``vecdict`` are treated as zero vectors: they do not
    contribute to the sum and receive a score of ``0.0``. A score of ``0.0``
    is also used whenever a norm is zero (all words unknown, or the vectors
    cancel out), so the result never contains NaN. ``vecdict`` is not
    modified.

    Args:
        line: Whitespace-separated text.
        vecdict: Mapping from word to a 1-D ``numpy`` array of length
            ``vec_len``.
        vec_len: Length of the vectors in ``vecdict``.

    Returns:
        A list of ``(word, score)`` tuples, one per word of ``line`` in the
        original order (repeated words appear repeatedly).
    """
    words = line.split()

    # Shared stand-in for out-of-vocabulary words. It is only read, never
    # stored in ``vecdict``, so the caller's dictionary is left untouched.
    missing = np.zeros(vec_len)
    vectors = [vecdict.get(word, missing) for word in words]

    total = np.zeros(vec_len)
    for vec in vectors:
        total += vec
    total_norm = np.linalg.norm(total)

    scores = []
    for vec in vectors:
        denom = np.linalg.norm(vec) * total_norm
        if denom > 0:
            scores.append(np.dot(vec, total) / denom)
        else:
            # Zero-length word vector or zero-length sum: the cosine is
            # undefined, report "no importance" instead of NaN.
            scores.append(np.float64(0.0))
    return list(zip(words, scores))
def trim(line, vecdict, vec_len, threshold=0.6):
    """Keep only the words of ``line`` whose importance is close to the maximum.

    ``line`` is first passed through ``clean`` (called with no extra
    arguments), then scored with ``importance``. A word is kept when its
    score divided by the highest score in the line exceeds ``threshold``.

    Args:
        line: Raw input text.
        vecdict: Mapping from word to vector, forwarded to ``importance``.
        vec_len: Vector length, forwarded to ``importance``.
        threshold: Minimum ratio to the best score for a word to be kept.

    Returns:
        The kept words joined by single spaces, in their original order, or
        ``""`` when the line has no words or no word has a positive score.
    """
    scored = importance(clean(line), vecdict, vec_len)

    # NaN scores (words without a usable vector) must not take part in the
    # maximum: max() over NaN depends on element order and could turn the
    # reference value itself into NaN, discarding every word.
    valid = [score for _, score in scored if not np.isnan(score)]
    if not valid:
        return ""

    max_val = max(valid)
    if max_val <= 0:
        # No word points along the summed vector; nothing is "important",
        # and dividing by zero below would be meaningless.
        return ""

    # A NaN score compares False against the threshold and is dropped here.
    return " ".join(word for word, score in scored if score / max_val > threshold)