-
Notifications
You must be signed in to change notification settings - Fork 0
/
embeddingholder.py
100 lines (76 loc) · 2.8 KB
/
embeddingholder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import numpy as np
class EmbeddingHolder:
    """
    Load pretrained GloVe embeddings and make them accessible.

    Two extra symbols are appended to the vocabulary and the embedding
    matrix: an OOV symbol (random vector) and a PADDING symbol (zero
    vector). Lookups of unknown words resolve to the OOV row.
    """

    OOV = '@@OOV@@'
    PADDING = '@@PADDING@@'

    def __init__(self, path):
        """
        Read previously stored binary word embeddings and vocabulary.

        @param path base path (no extension): '<path>.npy' must hold the
                    embedding matrix and '<path>.vocab' one word per line,
                    in the same row order as the matrix.
        """
        wv = np.load(path + '.npy')
        # Context manager closes the vocab file deterministically
        # (the original left the handle open).
        with open(path + '.vocab', 'r') as vocab_file:
            vocab = [w.rstrip('\n') for w in vocab_file]
        # word -> row index of its embedding
        words = {word: idx for idx, word in enumerate(vocab)}

        amount = wv.shape[0]
        self.dimen = wv.shape[1]

        # Append OOV and PADDING as the last two rows of the matrix.
        words[self.OOV] = amount
        self.oov_index = amount
        words[self.PADDING] = amount + 1
        unk = np.random.random_sample((self.dimen,))  # random OOV vector
        padding = np.zeros(self.dimen)                # zero padding vector
        wv = np.vstack((wv, unk, padding))

        self.words = words
        self.embeddings = wv

    def embedding_matrix(self):
        """
        Get the embedding matrix of the form:
        #vocab X #dimen i.e. every row represents one word
        """
        return self.embeddings

    def dim(self):
        """
        Get the dimension of the embeddings
        """
        return self.dimen

    def word_index(self, word):
        """
        Get the index of the given word within the embedding matrix.
        Unknown words map to the OOV row.
        """
        return self.words.get(word, self.oov_index)

    def padding(self):
        """
        Get the index of the Padding symbol.
        """
        return self.word_index(self.PADDING)

    def reverse(self):
        """
        Get the reversed dictionary to look up words from indices.
        """
        return dict((v, k) for k, v in self.words.items())

    def replace_unk(self, words):
        """
        Mark tokens that are not covered by the embeddings by appending
        the suffix '<UNK>' to them; known tokens are returned unchanged.
        (Note: the token is suffixed, not replaced.)
        """
        return [w if w in self.words else w + '<UNK>' for w in words]

    def add_unknowns_from(self, other):
        """
        Add all words from another EmbeddingHolder that are not known to
        this instance. Already known words are untouched.
        E.g. to increase the embeddings with new vocabulary from the test set.

        NOTE: only self.words is extended here; the returned matrix of new
        embedding rows is NOT stacked onto self.embeddings — the caller is
        responsible for doing that if consistency is required.

        @param other EmbeddingHolder containing new words
        @return matrix with one row per newly added word, in the same order
                as the indices assigned to those words.
        """
        # find words present in `other` but not in this instance
        # (np.setdiff1d returns them sorted, so index order is deterministic)
        words_this = list(self.words.keys())
        words_other = list(other.words.keys())
        new_words = np.setdiff1d(words_other, words_this)

        # matrix of new embeddings, row order matching `new_words`
        wv = np.asmatrix([other.embedding_matrix()[other.word_index(new_words[i])] for i in range(len(new_words))])

        # register the new words at the next free indices
        last_idx = len(self.words)
        for w in new_words:
            self.words[w] = last_idx
            last_idx += 1
        print('Added', len(new_words), 'vocabs.')
        return wv