# senti_train.py
# from __future__ import absolute_import, division, print_function, unicode_literals
import os
import pickle
from os import listdir
from string import punctuation

import numpy as np

# Run on CPU only: hide GPUs before Keras/TensorFlow is imported so the
# setting is reliably picked up when the backend initialises its devices.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

BATCH_SIZE = 64
NUM_EPOCHS = 1


# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, "r", encoding="utf8")
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


# turn a doc into clean tokens
def clean_doc(doc, vocab):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans("", "", punctuation)
    tokens = [w.translate(table) for w in tokens]
    # filter out tokens not in vocab
    tokens = [w for w in tokens if w in vocab]
    tokens = " ".join(tokens)
    return tokens


# load all docs in a directory
def process_docs(directory, vocab):
    documents = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # create the full path of the file to open
        path = os.path.join(directory, filename)
        # load the doc
        doc = load_doc(path)
        # clean doc
        tokens = clean_doc(doc, vocab)
        # add to list
        documents.append(tokens)
    return documents


def get_data(data_dir, dataset_id, vocab, tokenizer, is_train=True):
    if is_train:
        positive_docs = process_docs(os.path.join(data_dir, "train/pos"), vocab)
        negative_docs = process_docs(os.path.join(data_dir, "train/neg"), vocab)
        num_rows = len(positive_docs)
        num_divisions = 5
        division_len = num_rows // num_divisions
        start_ind = division_len * (dataset_id - 1)
        end_ind = division_len * dataset_id
        positive_docs = positive_docs[start_ind:end_ind]
        num_rows = len(negative_docs)
        num_divisions = 5
        division_len = num_rows // num_divisions
        start_ind = division_len * (dataset_id - 1)
        end_ind = division_len * dataset_id
        negative_docs = negative_docs[start_ind:end_ind]
    else:
        positive_docs = process_docs(os.path.join(data_dir, "test/pos"), vocab)
        negative_docs = process_docs(os.path.join(data_dir, "test/neg"), vocab)
    docs = negative_docs + positive_docs
    positive_docs_len = len(positive_docs)
    negative_docs_len = len(negative_docs)
    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(docs)
    max_length = 80
    X = pad_sequences(encoded_docs, maxlen=max_length, padding="post")
    # define training labels
    y = np.array(
        [0 for _ in range(negative_docs_len)] + [1 for _ in range(positive_docs_len)],
    )
    return X, y
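

# Worked example of the sharding above (illustrative numbers, not taken from
# any particular dataset): with dataset_id = 2 and 1000 positive training docs,
# division_len = 1000 // 5 = 200, so positive_docs[200:400] is kept (the second
# of five equal shards), and the negatives are sliced the same way. Each shard
# stands in for the training data held locally by one device.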


def train_on_device(data_dir, dataset_id, model_path, ckpt_path, weight_updates_path):
    """Return (num_batches, weight_updates_path) after training on local data."""
    # Store pre-trained model and weights
    old_model = load_model(model_path)
    old_model.load_weights(ckpt_path)
    # Initialize model and checkpoint, which are obtained from server
    device_model = load_model(model_path)
    device_model.load_weights(ckpt_path)
    # print(device_model.summary())
    # load the vocabulary
    vocab_filename = os.path.join(data_dir, "vocab.txt")
    vocab = load_doc(vocab_filename)
    vocab = vocab.split()
    vocab = set(vocab)
    # load the fitted tokenizer from file
    tokenizer_filename = os.path.join(data_dir, "tokenizer.pickle")
    with open(tokenizer_filename, "rb") as handle:
        tokenizer = pickle.load(handle)
    # Get training data present on device
    X_train, y_train = get_data(data_dir, dataset_id, vocab, tokenizer, is_train=True)
    X_test, y_test = get_data(data_dir, dataset_id, vocab, tokenizer, is_train=False)
    scores = device_model.evaluate(X_test, y_test, verbose=0)
    print("Test Accuracy (before training): %.2f%%" % (scores[1] * 100))
    # Train model
    device_model.fit(
        X_train, y_train, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
    )
    scores = device_model.evaluate(X_test, y_test, verbose=0)
    print("Test Accuracy (after training): %.2f%%" % (scores[1] * 100))
    # Load model to store weight updates
    weight_updates = load_model(model_path)
    # Number of batches trained on device
    num_batches = X_train.shape[0] // BATCH_SIZE
    # Calculate weight updates: scale each layer's weight delta by the number of
    # local batches. Layer weights are lists of arrays with differing shapes, so
    # subtract them pairwise rather than through a single np.asarray call.
    for i in range(len(device_model.layers)):
        # Pre-trained weights
        old_layer_weights = old_model.layers[i].get_weights()
        # Post-trained weights
        new_layer_weights = device_model.layers[i].get_weights()
        # Weight updates calculation
        weight_updates.layers[i].set_weights(
            [num_batches * (new_w - old_w)
             for new_w, old_w in zip(new_layer_weights, old_layer_weights)],
        )
        # print("old weights: ", old_layer_weights)
        # print("new weights: ", new_layer_weights)
        # print("weight updates: ", weight_updates.layers[i].get_weights())
    # Save weight updates
    weight_updates.save_weights(weight_updates_path)
    return (num_batches, weight_updates_path)
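

# Minimal usage sketch (not part of the original training code): shows how
# train_on_device would be invoked for one simulated device. All paths below
# are hypothetical placeholders; data_dir is assumed to contain vocab.txt,
# tokenizer.pickle, and train/{pos,neg} plus test/{pos,neg} folders of text files.
if __name__ == "__main__":
    num_batches, updates_path = train_on_device(
        data_dir="data",                     # hypothetical data directory
        dataset_id=1,                        # which of the five local shards to train on
        model_path="model.h5",               # hypothetical saved global model
        ckpt_path="global_weights.h5",       # hypothetical global weights checkpoint
        weight_updates_path="updates_1.h5",  # where this device's weight updates are written
    )
    print("Trained %d batches; weight updates saved to %s" % (num_batches, updates_path))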