-
Notifications
You must be signed in to change notification settings - Fork 0
/
utility.py
66 lines (62 loc) · 1.79 KB
/
utility.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import math
import numpy as np
def build_vocabulary_map(vocabulary_path):
    """Load a vocabulary file into a word -> index map and an entry list.

    Every non-blank line is split on whitespace; the first field is the
    word and the second is parsed with ``float``. Any further fields are
    ignored. Blank lines are skipped.

    Args:
        vocabulary_path: Path of the vocabulary file to read.

    Returns:
        A ``(vocab_map, t)`` pair. ``t`` is a list of ``(value, word)``
        tuples in file order and ``vocab_map`` maps each word to the
        zero-based position of its entry in ``t``. If a word occurs more
        than once, ``vocab_map`` keeps the position of its last occurrence.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
        ValueError: If the second field cannot be parsed as a float.
    """
    print("Building vocabulary map...")
    vocab_map = {}
    t = []
    with open(vocabulary_path, "r") as f:
        for raw_line in f:
            fields = raw_line.split()
            if not fields:
                # A blank line is not end-of-file: skip it instead of
                # stopping, so entries after it are not silently dropped.
                continue
            word = fields[0]
            # len(t) is the position this entry is about to occupy in t.
            vocab_map[word] = len(t)
            t.append((float(fields[1]), word))
    print("[+] Number of words:", len(vocab_map))
    return vocab_map, t
def read_documents(document_path):
    """Read a corpus file and return its lines, one document per line.

    Args:
        document_path: Path of the text file to read.

    Returns:
        The list produced by ``readlines()``: every line of the file in
        order, each keeping its trailing newline when it has one.
    """
    print("Reading documents...")
    # The context manager closes the file even if reading raises.
    with open(document_path, "r") as f:
        documents = f.readlines()
    print("[+] Number of documents:", len(documents))
    return documents
def compute_normalized_entropy(W, M, N, t, min_prob=0.0001):
    """Compute a normalized entropy value for each of the M rows of W.

    For row ``i`` the result is::

        -1 / log(N) * sum_j p_ij * log(p_ij),   p_ij = W[i][j] / t[i][0]

    where only terms with ``p_ij > min_prob`` contribute to the sum.

    Args:
        W: Object indexable as ``W[i][j]`` for ``i < M`` and ``j < N``,
            holding values convertible with ``float``.
        M: Number of rows of ``W`` to process.
        N: Number of columns of ``W`` to process.
        t: Sequence where ``t[i][0]`` is the divisor used for row ``i``.
        min_prob: Terms with ``p_ij`` at or below this cutoff are ignored.
            Must be >= 0 so that ``log`` is never taken of zero. Defaults
            to the previously hard-coded 0.0001.

    Returns:
        A list of M floats. Rows whose divisor ``t[i][0]`` is zero, and all
        rows when ``N <= 1`` (where the ``1 / log(N)`` normalizer is
        undefined), are reported as 0.0 instead of raising.
    """
    if N <= 1:
        # log(1) == 0 would make the normalizer a division by zero.
        return [0.0] * M

    scale = -1.0 / math.log(N)
    entropies = []
    for i in range(M):
        total = t[i][0]
        if total == 0:
            # No mass to distribute: every p_ij is undefined, so there is
            # nothing to sum.
            entropies.append(0.0)
            continue
        row = W[i]
        acc = 0.0
        for j in range(N):
            p = float(row[j]) / total
            if p > min_prob:
                acc += p * math.log(p)
        entropies.append(scale * acc)
    return entropies
def build_document_matrix(vocab_map, documents, t):
    """Build the entropy-weighted, length-normalized word-by-document matrix.

    Steps:
      1. Count, for every document ``j``, how often each vocabulary word
         ``i`` occurs (``c_ij``); words missing from ``vocab_map`` are
         ignored and do not count towards the document length.
      2. Compute one entropy value per word with
         ``compute_normalized_entropy`` and save the values with
         ``np.save('word_entropy', ...)`` (NumPy appends ``.npy``).
      3. Rescale each count to ``(1 - e_i) * c_ij / n_j`` where ``n_j`` is
         the number of in-vocabulary words of document ``j``.

    Args:
        vocab_map: Mapping from word to row index of the matrix.
        documents: Sequence of strings; each is split on whitespace.
        t: Per-word entries forwarded to ``compute_normalized_entropy``.

    Returns:
        A ``numpy.ndarray`` of shape ``(len(vocab_map), len(documents))``.
        Columns of documents that contain no vocabulary word are all zeros.
    """
    print("Building matrix W...")
    M = len(vocab_map)
    N = len(documents)
    W = np.zeros((M, N))
    document_length = []

    print("Computing c_ij")
    for j, doc in enumerate(documents):
        in_vocab_count = 0
        for word in doc.split():
            row = vocab_map.get(word)
            if row is not None:
                W[row, j] += 1
                in_vocab_count += 1
        document_length.append(in_vocab_count)

    # Weight every word by (1 - entropy).
    print("Compute entropy...")
    e = compute_normalized_entropy(W, M, N, t)
    np.save('word_entropy', np.array(e))

    print("Normalizing...")
    weights = 1.0 - np.asarray(e, dtype=float)
    lengths = np.asarray(document_length, dtype=float)
    # A document with no in-vocabulary words has length 0 and an all-zero
    # column; dividing by 0 would turn that column into NaN, so divide by 1
    # there and keep the zeros.
    safe_lengths = np.where(lengths > 0, lengths, 1.0)
    # Same per-element order as (1 - e[i]) * W[i][j] / document_length[j]:
    # multiply rows by their weight first, then divide columns by length.
    W *= weights[:, None]
    W /= safe_lengths
    print("[+] Completed")
    return W