# train_word2vec.py
# Create our own data set
from datetime import datetime
import json
import pickle
import nltk
import re
# Read the json file
# def read_data(file_name=''):
#     data = []
#     print('Reading data start', datetime.now())
#     # file_name = 'Electronics_5.json'  # 13 secs loading time
#     # file_name = 'Digital_Music_5.json'  # 1 sec loading time
#
#     f = open(file_name, 'r')
#     for line in f.readlines():
#         tmp = json.loads(line)
#         data.append([tmp['reviewText'], tmp['overall']])
#     f.close()
#     print('Reading data finish', datetime.now())
#     return data
# file_name = 'data/books.pkl'
# data = read_data(file_name)
# with open('data/books.pkl', 'bw') as f:
#     pickle.dump(data, f)
# del data
# Split the reviews into tokenized sentences for training word2vec
file_name = 'data/books.pkl'
with open(file_name, 'br') as f:
    data = pickle.load(f)
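# data is a list of [reviewText, overall] pairs, as produced by the
# commented-out read_data() step above.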
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def clean_str(s):
    # Keep alphanumerics and basic punctuation, split off contractions,
    # and pad the remaining punctuation with spaces.
    s = re.sub(r"[^A-Za-z0-9:(),!?\'\`]", " ", s)
    s = re.sub(r" : ", ":", s)
    s = re.sub(r"\'s", " \'s", s)
    s = re.sub(r"\'ve", " \'ve", s)
    s = re.sub(r"n\'t", " n\'t", s)
    s = re.sub(r"\'re", " \'re", s)
    s = re.sub(r"\'d", " \'d", s)
    s = re.sub(r"\'ll", " \'ll", s)
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\(", " ( ", s)
    s = re.sub(r"\)", " ) ", s)
    s = re.sub(r"\?", " ? ", s)
    s = re.sub(r"\s{2,}", " ", s)
    return s.strip().lower()
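

# A minimal sanity check of clean_str (a sketch; the sample sentence below is
# illustrative only, not drawn from the review data):
_example = "I couldn't put it down (really)!"
print('clean_str example:', clean_str(_example))
# expected: "i could n't put it down ( really ) !"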
def sentence_to_word_list(a_review):
    # Clean each whitespace-delimited token with clean_str and collect the
    # resulting words.
    tmp = a_review.split()
    words = []
    for word in tmp:
        words.extend(clean_str(word).split())
    return words
def split_sentences(review, __remove_stopwords=False):
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        tmp = sentence_to_word_list(raw_sentence)
        if len(tmp) > 0:
            sentences.append(tmp)
    return sentences
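

# A quick illustration of the structure split_sentences produces: the
# list-of-token-lists format that word2vec consumes (the sample review text
# is an assumption, not taken from the data set):
print('split_sentences example:',
      split_sentences("Great book. I couldn't put it down!"))
# expected: [['great', 'book'], ['i', 'could', "n't", 'put', 'it', 'down', '!']]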
import multiprocessing


def work(idx):
    # Tokenize one batch of reviews from the global `data` list and pickle
    # the resulting sentences to their own file.
    __i = idx
    print('Starting thread', str(__i + 1))
    sentences = []
    count = 0
    batch = 2000000
    for item in range(batch * __i, min(batch * (__i + 1), len(data))):
        if count % 10000 == 1:
            print('Thread', str(__i + 1), 'at', count, 'out of',
                  str(min(batch * (__i + 1), len(data)) - batch * __i))
        count += 1
        sent = data[item][0]
        sentences.extend(split_sentences(sent))
    with open('data/book_sents_' + str(__i) + '.pkl', 'bw') as f:
        pickle.dump(sentences, f)
    del sentences
    print('Finish thread', str(__i + 1))
# try:
#     pool = multiprocessing.Pool(5)
#     total_tasks = 5
#     tasks = range(total_tasks)
#     results = pool.map_async(work, tasks)
#     pool.close()
#     pool.join()
# except:
#     print("Error: unable to start thread")
from gensim.models.word2vec import Word2Vec
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
num_features = 300 # Word vector dimensionality
min_word_count = 20 # Minimum word count
num_workers = 12 # Number of threads to run in parallel
context = 5 # Context window size
downsampling = 1e-3 # Downsample setting for frequent words
# Initialize and train the model (this will take some time)
i = 0
print('loading', str(i), 'th file')
with open('data/book_sents_' + str(i) + '.pkl', 'br') as f:
    sentences = pickle.load(f)
print("Training CBOW model...")
model = Word2Vec(sentences, workers=num_workers,
                 size=num_features, min_count=min_word_count,
                 window=context, sample=downsampling, sg=0)
del sentences
print('finish', str(i), 'th file')
for i in range(1, 5):
    print('loading', str(i), 'th file')
    with open('data/book_sents_' + str(i) + '.pkl', 'br') as f:
        sentences = pickle.load(f)
    print("Training CBOW model...")
    model.build_vocab(sentences, update=True)
    # Recent gensim releases require an explicit epochs count for train();
    # model.epochs mirrors the value set at construction.
    model.train(sentences, total_examples=model.corpus_count,
                epochs=model.epochs)
    print('finish', str(i), 'th file')
    del sentences
with open('data/book_cbow', 'bw') as f:
    pickle.dump(model, f)
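
# A quick, hedged check of the trained vectors (a sketch; 'book' is just a
# plausible probe word for a book-review corpus and is assumed to have
# survived the min_count filter):
probe = 'book'
if probe in model.wv.vocab:  # pre-4.0 gensim vocabulary dict, matching the size= API used above
    print(probe, '->', model.wv.most_similar(probe, topn=5))

# To reuse the model later, unpickle it the same way it was saved:
# with open('data/book_cbow', 'br') as f:
#     model = pickle.load(f)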