w2v.py
from gensim.models import word2vec, KeyedVectors
from os.path import join, exists, split
import os
import numpy as np

def train_word2vec(sentence_matrix, vocabulary_inv,
                   num_features=300, min_word_count=1, context=10):
    """
    Train a Word2Vec model, or load a cached one if it already exists on disk.
    Returns initial weights for the embedding layer.

    Inputs:
    sentence_matrix  # int matrix: num_sentences x max_sentence_len
    vocabulary_inv   # list (or dict {int: str}) mapping word index -> word
    num_features     # word vector dimensionality
    min_word_count   # minimum word count
    context          # context window size
    """
    model_dir = '../embedding_weights'
    model_name = "{:d}features_{:d}minwords_{:d}context".format(num_features, min_word_count, context)
    model_name = join(model_dir, model_name)
    if exists(model_name):
        embedding_model = word2vec.Word2Vec.load(model_name)
        print("Loading existing Word2Vec model '%s'" % split(model_name)[-1])
    else:
        # Set values for various parameters
        num_workers = 2      # Number of threads to run in parallel
        downsampling = 1e-3  # Downsample setting for frequent words

        # Initialize and train the model (gensim < 4.0 API; in gensim >= 4.0
        # the `size` argument is called `vector_size`)
        print("Training Word2Vec model...")
        sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix]
        embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                            size=num_features,
                                            min_count=min_word_count,
                                            window=context,
                                            sample=downsampling)
        # If we don't plan to train the model any further, calling init_sims
        # makes the model much more memory-efficient (deprecated in
        # gensim >= 4.0, where vectors are normalized on demand)
        embedding_model.init_sims(replace=True)

        # Save the model for later use; it can be reloaded with Word2Vec.load()
        if not exists(model_dir):
            os.mkdir(model_dir)
        print("Saving Word2Vec model '%s'" % split(model_name)[-1])
        embedding_model.save(model_name)
    # Vectors for known words; unknown words get small random vectors,
    # following the common uniform [-0.25, 0.25] initialization
    embedding_weights = [np.array([embedding_model.wv[w] if w in embedding_model.wv
                                   else np.random.uniform(-0.25, 0.25, embedding_model.vector_size)
                                   for w in vocabulary_inv])]
return embedding_weights
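
# The weights come back wrapped in a one-element list, the format the classic
# Keras Embedding layer expects for its `weights` argument. A minimal usage
# sketch (assuming Keras is the consumer; the model-building code lives
# elsewhere in this project):
#
#   from keras.layers import Embedding
#   embedding_layer = Embedding(input_dim=len(vocabulary_inv),
#                               output_dim=num_features,
#                               weights=train_word2vec(x, vocabulary_inv))
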
def load_word2vec(params):
    """
    Load a pretrained Word2Vec/GloVe model.
    Returns initial weights for the embedding layer.

    Inputs (attributes of params):
    model_type      # 'GoogleNews' / 'glove'
    vocabulary_inv  # list (or dict {int: str}) mapping word index -> word
    num_features    # word vector dimensionality
    """
    model_dir = '../embedding_weights'
    if params.model_type == 'GoogleNews':
        model_name = join(model_dir, 'GoogleNews-vectors-negative300.bin.gz')
        assert params.num_features == 300
        assert exists(model_name)
        print('Loading existing Word2Vec model (GoogleNews-300)')
        # In gensim >= 1.0 this loader lives on KeyedVectors, not Word2Vec
        embedding_model = KeyedVectors.load_word2vec_format(model_name, binary=True)
    elif params.model_type == 'glove':
        model_name = join(model_dir, 'glove.6B.%dd.txt' % params.num_features)
        assert exists(model_name)
        print('Loading existing Word2Vec model (Glove.6B.%dd)' % params.num_features)
        # Dictionary mapping each word to its vector
        embedding_model = {}
        for line in open(model_name, 'r', encoding='utf-8'):
            tmp = line.strip().split()
            word, vec = tmp[0], list(map(float, tmp[1:]))
            assert len(vec) == params.num_features
            if word not in embedding_model:
                embedding_model[word] = vec
        assert len(embedding_model) == 400000  # glove.6B covers a 400k-word vocabulary
    else:
        raise ValueError('Unknown pretrained model type: %s!' % params.model_type)

    # Vectors for known words; unknown words get small random vectors
    embedding_weights = [embedding_model[w] if w in embedding_model
                         else np.random.uniform(-0.25, 0.25, params.num_features)
                         for w in params.vocabulary_inv]
    embedding_weights = np.array(embedding_weights).astype('float32')
    return embedding_weights
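
# load_word2vec expects a simple params object exposing model_type,
# num_features and vocabulary_inv (field names taken from the attribute
# accesses above; the project's real config object may differ). A minimal
# sketch of constructing one:
#
#   from types import SimpleNamespace
#   params = SimpleNamespace(model_type='glove', num_features=300,
#                            vocabulary_inv=vocabulary_inv)
#   weights = load_word2vec(params)
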
if __name__ == '__main__':
    import data_helpers

    print("Loading data...")
    x, _, _, vocabulary_inv = data_helpers.load_data()
    w = train_word2vec(x, vocabulary_inv)