-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplot_word_distribution.py
57 lines (50 loc) · 1.95 KB
/
plot_word_distribution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from utility import *
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
from matplotlib.font_manager import FontProperties
# Load the precomputed factor arrays from disk.
# NOTE(review): presumably the left singular vectors and singular values of
# an SVD computed elsewhere -- confirm against whatever writes these files.
U = np.load('U.npy')[:, :150]  # keep only the first 150 columns
sigma = np.load('sigma.npy')

# 150x150 matrix with the first 150 entries of sigma on its main diagonal
# and zeros everywhere else.
S = np.zeros((150, 150))
S[np.diag_indices(150)] = sigma[:150]
# One entry per line of the cluster file: the line is split on whitespace and
# everything after the first token is kept as that line's word list.
# NOTE(review): the first token is presumably a cluster label -- confirm.
with open("./word_cluster.txt", "r") as cluster_file:
    word_classes = [line.split()[1:] for line in cluster_file]
# build_vocabulary_map is provided by the star import from `utility`.
# vocab_map is indexed by word below; the second return value is not used.
vocab_map, t = build_vocabulary_map('words.txt')

# For each of the five hand-picked clusters, collect
#   indices[k] -- vocab_map lookups for every word in the cluster
#   word_n[k]  -- the words themselves (a copy of the cluster's word list)
indices = []
word_n = []
for cluster_id in (6, 7, 25, 51, 79):
    cluster_words = word_classes[cluster_id]
    indices.append([vocab_map[word] for word in cluster_words])
    word_n.append(list(cluster_words))
# Scale the columns of U by the diagonal matrix S, then pick columns 5 and 6
# (zero-based) as the two coordinates used for the scatter plot.
word_vector = U @ S
word_vector_first = word_vector[:, 5].T
word_vector_second = word_vector[:, 6].T
# plt.plot([word_vector[0][i] for i in indices], [word_vector[1][i] for i in indices], marker, label="marker='{0}'".format(marker))
# Scatter-plot each selected cluster and label a few of its words.

# NOTE(review): absolute, machine-specific font path. The script only renders
# the labels correctly where this file exists -- consider making it
# configurable.
font = FontProperties(fname=r"/Users/kevin/Downloads/msj.ttf", size=12)
fontdict = {'fontsize': 5}  # NOTE(review): not used anywhere in this script

# At most this many words are labelled per cluster.
MAX_LABELS_PER_CLUSTER = 5

fig, ax = plt.subplots()
for cluster_indices, cluster_words in zip(indices, word_n):
    # Coordinates of every word in this cluster along the two chosen columns.
    a = [word_vector_first[x] for x in cluster_indices]
    b = [word_vector_second[x] for x in cluster_indices]
    ax.scatter(a, b, alpha=0.6)
    # Slicing (rather than a fixed range(5)) keeps clusters with fewer than
    # MAX_LABELS_PER_CLUSTER words from raising IndexError.
    labelled = cluster_words[:MAX_LABELS_PER_CLUSTER]
    for word, x_pos, y_pos in zip(labelled, a, b):
        ax.annotate(word, (x_pos, y_pos), fontproperties=font)

ax.set_title("Visualize 5 clusters of words in 2D")
plt.show()