data_controller.py
import codecs
import pickle

import numpy as np

# Paths to the pickled 200k-word vocabularies and their embedding matrices.
fasttext_200k_vocab = 'data/ft.wiki.en.300.vocab'
fasttext_200k_vectors = 'data/ft.wiki.en.300.vectors'
glove_200k_vocab = 'data/glove_200k.vocab'
glove_200k_vectors = 'data/glove_200k.vec'
cbow_200k_vocab = 'data/w2v_cbow_200k.vocab'
cbow_200k_vectors = 'data/w2v_cbow_200k.vec'

# Post-specialization augmentation dictionary and word-similarity benchmarks.
augmentations_postspec = 'data/augmentations_postspec.vocab'
simlex_999_path = 'data/simlex999/SimLex-999.txt'
wordsim_path = 'data/wsim353/wsim.txt'

def load_vocab_binary(path, inverse=False):
    """Load a pickled vocabulary; optionally also return the inverse mapping."""
    with open(path, "rb") as f:
        vocab = pickle.load(f)
    if inverse:
        vocab_inv = {v: k for k, v in vocab.items()}
        return vocab, vocab_inv
    return vocab

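# Example (hedged sketch): with inverse=True the loader also returns an
# index -> word map, assuming the pickled vocab maps word -> row index:
#   vocab, vocab_inv = load_vocab_binary(fasttext_200k_vocab, inverse=True)
#   first_word = vocab_inv[0]
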
def load_vectors_binary(path, normalize=False):
    """Load an embedding matrix; optionally also return an L2-row-normalized copy."""
    vecs = np.load(path, allow_pickle=True)
    if normalize:
        vecs_norm = vecs / np.linalg.norm(vecs, ord=2, axis=1, keepdims=True)
        return vecs, vecs_norm
    return vecs

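# Example (hedged sketch): with row-normalized vectors, cosine similarity
# reduces to a plain dot product of the two rows:
#   _, vecs_norm = load_vectors_binary(fasttext_200k_vectors, normalize=True)
#   cos_sim = float(vecs_norm[i] @ vecs_norm[j])  # i, j: hypothetical row indices
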
def load_binary_embeddings(vocab_path, vecs_path, inverse=False, normalize=False):
    """Load a (vocabulary, embedding matrix) pair from their binary files."""
    vocab = load_vocab_binary(vocab_path, inverse)
    vecs = load_vectors_binary(vecs_path, normalize)
    return vocab, vecs

def load_augmentations(augmentations_path):
    """Load the pickled post-specialization augmentation dictionary."""
    with open(augmentations_path, 'rb') as handle:
        augmentations = pickle.load(handle)
    return augmentations

def load_simlex(path):
    """Load SimLex-999 as (word1, word2, score) triples."""
    with codecs.open(path, "r", encoding='utf8', errors='replace') as f:
        simlex_data = [line.strip() for line in f.readlines()]
    # The standard SimLex-999.txt distribution starts with a header row; skip it.
    if simlex_data and simlex_data[0].startswith("word1"):
        simlex_data = simlex_data[1:]
    simlex = [(parts[0].lower(), parts[1].lower(), float(parts[3]))
              for parts in (line.split("\t") for line in simlex_data)]
    return simlex

def load_wordsim(path):
    """Load WordSim-353 as (word1, word2, score) triples (columns 1-3 of the file)."""
    with codecs.open(path, "r", encoding='utf8', errors='replace') as f:
        wordsim_data = [line.strip() for line in f.readlines()]
    wordsim = [(parts[1].lower(), parts[2].lower(), float(parts[3]))
               for parts in (line.split("\t") for line in wordsim_data)]
    return wordsim

def load_lex_by_start():
    """Load both benchmarks at startup and collect the words they cover."""
    simlex_data = load_simlex(simlex_999_path)
    wordsim_data = load_wordsim(wordsim_path)
    simlex_vocab, wordsim_vocab = [], []
    for w1, w2, _ in simlex_data:
        simlex_vocab.extend((w1, w2))
    for w1, w2, _ in wordsim_data:
        wordsim_vocab.extend((w1, w2))
    return simlex_vocab, simlex_data, wordsim_vocab, wordsim_data

def load_fasttext():
    fasttext_vocab, fasttext_vectors = load_binary_embeddings(
        fasttext_200k_vocab, fasttext_200k_vectors, inverse=False, normalize=False)
    return fasttext_vocab, fasttext_vectors


def load_glove():
    glove_vocab, glove_vectors = load_binary_embeddings(
        glove_200k_vocab, glove_200k_vectors, inverse=False, normalize=False)
    return glove_vocab, glove_vectors


def load_cbow():
    cbow_vocab, cbow_vectors = load_binary_embeddings(
        cbow_200k_vocab, cbow_200k_vectors, inverse=False, normalize=False)
    return cbow_vocab, cbow_vectors

def load_embeddings_by_start():
    """Load all three embedding spaces at startup."""
    fasttext_vocab, fasttext_vectors = load_fasttext()
    print("Loaded fastText word embeddings.")
    glove_vocab, glove_vectors = load_glove()
    print("Loaded GloVe word embeddings.")
    cbow_vocab, cbow_vectors = load_cbow()
    print("Loaded CBOW word embeddings.")
    return fasttext_vocab, fasttext_vectors, glove_vocab, glove_vectors, cbow_vocab, cbow_vectors

# Resources loaded eagerly at import time.
fasttext_vocab, fasttext_vectors, glove_vocab, glove_vectors, cbow_vocab, cbow_vectors = load_embeddings_by_start()
simlex_vocab, simlex_data, wordsim_vocab, wordsim_data = load_lex_by_start()
augmentations = load_augmentations(augmentations_postspec)
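
# --- Usage sketch (illustrative addition, not part of the original pipeline) ---
# A minimal example of combining the resources loaded above for a word-similarity
# evaluation: cosine similarity against SimLex-999 gold scores via Spearman's rho.
# Assumes each pickled vocab maps word -> row index and that scipy is installed.
if __name__ == '__main__':
    from scipy.stats import spearmanr

    predicted, gold = [], []
    for w1, w2, score in simlex_data:
        if w1 in fasttext_vocab and w2 in fasttext_vocab:
            v1 = fasttext_vectors[fasttext_vocab[w1]]
            v2 = fasttext_vectors[fasttext_vocab[w2]]
            predicted.append(float(v1 @ v2 / (np.linalg.norm(v1) * np.linalg.norm(v2))))
            gold.append(score)
    print("SimLex-999 Spearman correlation:", spearmanr(predicted, gold).correlation)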