import numpy as np
import os
import subprocess
import tempfile
class FM:
# modified from https://github.com/jfloff/pywFM/blob/master/pywFM/__init__.py
""" Class that wraps `libFM` parameters. For more information read
[libFM manual](http://www.libfm.org/libfm-1.42.manual.pdf)
Parameters
----------
task : string, MANDATORY
regression: for regression
classification: for binary classification
num_iter: int, optional
Number of iterations
Defaults to 100
init_stdev : double, optional
Standard deviation for initialization of 2-way factors
Defaults to 0.1
k0 : bool, optional
Use bias.
Defaults to True
k1 : bool, optional
Use 1-way interactions.
Defaults to True
k2 : int, optional
Dimensionality of 2-way interactions.
Defaults to 8
learning_method: string, optional
sgd: parameter learning with SGD
        sgda: parameter learning with adaptive SGD
als: parameter learning with ALS
mcmc: parameter learning with MCMC
Defaults to 'mcmc'
learn_rate: double, optional
Learning rate for SGD
Defaults to 0.1
r0_regularization: int, optional
bias regularization for SGD and ALS
Defaults to 0
r1_regularization: int, optional
1-way regularization for SGD and ALS
Defaults to 0
r2_regularization: int, optional
2-way regularization for SGD and ALS
Defaults to 0
rlog: bool, optional
Enable/disable rlog output
Defaults to True.
verbose: bool, optional
        How much information to print
Defaults to False.
seed: int, optional
seed used to reproduce the results
Defaults to None.
silent: bool, optional
        Completely silences all libFM output
Defaults to False.
    temp_path: string, optional
        Sets path for libFM temporary files. Useful when dealing with large data.
        Defaults to None (default NamedTemporaryFile behaviour)
    normalize_items: optional
        Passed through to libFM's `-normalize` flag when set
        Defaults to None
    """
"""
    ### unused libFM flags
cache_size: cache size for data storage (only applicable if data is in binary format), default=infty
datafile is text so we don't need this parameter
relation: BS - filenames for the relations, default=''
not dealing with BS extensions since they are only used for binary files
"""
def __init__(self,
task,
num_iter=100,
init_stdev=0.1,
k0=True,
k1=True,
k2=8,
learning_method='mcmc',
learn_rate=0.1,
r0_regularization=0,
r1_regularization=0,
r2_regularization=0,
rlog=True,
verbose=False,
seed=None,
silent=False,
temp_path=None,
normalize_items=None):
# gets first letter of either regression or classification
self.__task = task[0]
self.__num_iter = num_iter
self.__init_stdev = init_stdev
self.__dim = "%d,%d,%d" % (int(k0), int(k1), k2)
self.__learning_method = learning_method
self.__learn_rate = learn_rate
self.__regularization = "%.5f,%.5f,%.5f" % (r0_regularization, r1_regularization, r2_regularization)
self.__rlog = rlog
self.__verbose = int(verbose)
self.__seed = int(seed) if seed else None
self.__silent = silent
self.__temp_path = temp_path
self.__normalize_items = normalize_items
# gets libfm path
self.__libfm_path = os.environ.get('LIBFM_PATH')
if self.__libfm_path is None:
raise OSError("`LIBFM_PATH` is not set. \n"
"Did you forget to run: export LIBFM_PATH=/path/to/instal/libfm/bin/?\n"
"Please install libFM and set the path variable "
"(https://github.com/jfloff/pywFM#installing).")
# #ShameShame
# Once upon a time, there was a bug in libFM that allowed any type of
# learning_method to save the model. I @jfloff built this package at
# that time, and did not find anything that showed me that MCMC couldn't
# use save_model flag. Nowadays only SGD and ALS can use this parameter.
# Hence, we need to reset the repo to this specific commit pre-fix, so
# we can use MCMC with save_model flag.
# Can we contribute to main libFM repo so this is possible again??
# GITHASH = '91f8504a15120ef6815d6e10cc7dee42eebaab0f'
# c_githash = subprocess.check_output(['git', '--git-dir', os.path.join(self.__libfm_path, "..", ".git"), 'rev-parse', 'HEAD']).strip()
# if c_githash.decode("utf-8") != GITHASH:
# raise OSError("libFM is not checked out to the correct commit."
# "(https://github.com/jfloff/pywFM#installing).")
self.train_fd = None
self.test_fd = None
def save_files(self, x_train, y_train, x_test, y_test, filename=None):
from sklearn.datasets import dump_svmlight_file
import time
TMP_SUFFIX = '.pywfm'
filename = time.strftime("%Y%m%d-%H%M%S") if filename is None else filename
self.train_fd = os.path.join(self.__temp_path, 'train'+filename+TMP_SUFFIX)
self.test_fd = os.path.join(self.__temp_path, 'test'+filename+TMP_SUFFIX)
# converts train and test data to libSVM format
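        # Each line of the dumped files is in the sparse libSVM/libFM text format,
        # "<target> <index>:<value> ...". For example, a one-hot encoded (user, item)
        # pair with rating 4 might look roughly like "4 3:1 1542:1" (indices illustrative).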
        if self.__verbose: print("dumping training data")
dump_svmlight_file(x_train, y_train, self.train_fd)
        if self.__verbose: print("dumping testing data")
dump_svmlight_file(x_test, y_test, self.test_fd)
return filename
def run(self, x_validation_set=None, y_validation_set=None, meta=None):
"""Run factorization machine model against train and test data
Parameters
----------
x_train : {array-like, matrix}, shape = [n_train, n_features]
Training data
y_train : numpy array of shape [n_train]
Target values
x_test: {array-like, matrix}, shape = [n_test, n_features]
Testing data
y_test : numpy array of shape [n_test]
Testing target values
x_validation_set: optional, {array-like, matrix}, shape = [n_train, n_features]
Validation data (only for SGDA)
y_validation_set: optional, numpy array of shape [n_train]
Validation target data (only for SGDA)
meta: optional, numpy array of shape [n_features]
Grouping input variables
Return
-------
Returns `namedtuple` with the following properties:
predictions: array [n_samples of x_test]
Predicted target values per element in x_test.
global_bias: float
If k0 is True, returns the model's global bias w0
weights: array [n_features]
If k1 is True, returns the model's weights for each features Wj
pairwise_interactions: numpy matrix [n_features x k2]
Matrix with pairwise interactions Vj,f
rlog: pandas dataframe [nrow = num_iter]
`pandas` DataFrame with measurements about each iteration
"""
        assert self.train_fd is not None and self.test_fd is not None, \
            "train/test files not set: call save_files() first or assign train_fd/test_fd"
TMP_SUFFIX = '.pywfm'
out_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path)
model_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path)
# builds arguments array
args = [os.path.join(self.__libfm_path, "libFM"),
'-task', "%s" % self.__task,
'-train', "%s" % self.train_fd,
'-test', "%s" % self.test_fd,
'-dim', "%s" % self.__dim,
'-init_stdev', "%g" % self.__init_stdev,
'-iter', "%d" % self.__num_iter,
'-method', "%s" % self.__learning_method,
'-out', "%s" % out_fd.name,
'-verbosity', "%d" % self.__verbose,
'-save_model', "%s" % model_fd.name]
if self.__normalize_items is not None:
args.extend(['-normalize', "%s" % self.__normalize_items])
# appends rlog if true
rlog_fd = None
if self.__rlog:
rlog_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path)
args.extend(['-rlog', "%s" % rlog_fd.name])
# appends seed if given
if self.__seed:
args.extend(['-seed', "%d" % self.__seed])
# appends arguments that only work for certain learning methods
if self.__learning_method in ['sgd', 'sgda']:
args.extend(['-learn_rate', "%.5f" % self.__learn_rate])
if self.__learning_method in ['sgd', 'sgda', 'als']:
args.extend(['-regular', "%s" % self.__regularization])
# adds validation if sgda
        # if no validation set is given, libFM would throw an error, so validation is skipped in that case
validation_fd = None
        if self.__learning_method == 'sgda' and (x_validation_set is not None and y_validation_set is not None):
            # dump_svmlight_file is otherwise only imported inside save_files
            from sklearn.datasets import dump_svmlight_file
            validation_fd = tempfile.NamedTemporaryFile(suffix=TMP_SUFFIX, dir=self.__temp_path)
            dump_svmlight_file(x_validation_set, y_validation_set, validation_fd.name)
            args.extend(['-validation', "%s" % validation_fd.name])
# if meta data is given
meta_fd = None
if meta is not None:
            # NamedTemporaryFile takes `mode` (not `text`); open in text mode so strings can be written
            meta_fd = tempfile.NamedTemporaryFile(mode='w', suffix=TMP_SUFFIX, dir=self.__temp_path)
            # write one group id per line
            for group_id in meta:
                meta_fd.write("%s\n" % group_id)
            args.extend(['-meta', "%s" % meta_fd.name])
            # flush so libFM sees the data on disk when it opens the file by name
            meta_fd.flush()
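            # (each line of the meta file is the group id of the corresponding feature
            #  column, e.g. "0\n0\n1\n1" groups the first two columns together and the
            #  next two together; see the libFM manual's description of -meta)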
# if silent redirects all output
stdout = None
if self.__silent:
stdout = open(os.devnull, 'wb')
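        # For reference, the assembled base command resembles the following (temp-file
        # names illustrative; optional flags such as -rlog, -seed, -learn_rate and
        # -regular are appended above as configured):
        #   libFM -task r -train train<ts>.pywfm -test test<ts>.pywfm \
        #         -dim 1,1,8 -init_stdev 0.1 -iter 100 -method mcmc \
        #         -out /tmp/<out>.pywfm -verbosity 0 -save_model /tmp/<model>.pywfm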
# call libfm with parsed arguments
        # had an unknown bug with the "-dim" option when args was passed as a list.
        # At the time was forced to concatenate with `args = ' '.join(args)`,
        # but it looks like it's working now; needs further tests
        print(' '.join(args))
subprocess.call(args, shell=False, stdout=stdout)
# reads output file
preds = [float(p) for p in out_fd.read().decode("utf-8").split('\n') if p]
# "hidden" feature that allows users to save the model
# We use this to get the feature weights
# https://github.com/srendle/libfm/commit/19db0d1e36490290dadb530a56a5ae314b68da5d
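        # The saved model file is plain text with three sections, each introduced by a
        # header line; roughly:
        #   #global bias W0
        #   <w0>
        #   #unary interactions Wj
        #   <one weight per feature>
        #   #pairwise interactions Vj,f
        #   <one row of k2 space-separated factors per feature>
        # The loop below keys off those header lines.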
import numpy as np
global_bias = None
weights = []
pairwise_interactions = []
# if 0 its global bias; if 1, weights; if 2, pairwise interactions
out_iter = 0
for line in model_fd.read().decode("utf-8").splitlines():
# checks which line is starting with #
if line.startswith('#'):
if "#global bias W0" in line:
out_iter = 0
elif "#unary interactions Wj" in line:
out_iter = 1
elif "#pairwise interactions Vj,f" in line:
out_iter = 2
else:
# check context get in previous step and adds accordingly
if out_iter == 0:
global_bias = float(line)
elif out_iter == 1:
weights.append(float(line))
elif out_iter == 2:
try:
pairwise_interactions.append([float(x) for x in line.split(' ')])
except ValueError as e:
pairwise_interactions.append(0.0) #Case: no pairwise interactions used
pairwise_interactions = np.matrix(pairwise_interactions)
# parses rlog into dataframe
if self.__rlog:
import pandas as pd
rlog_fd.seek(0)
print(os.stat(rlog_fd.name).st_size)
rlog = pd.read_csv(rlog_fd.name, sep='\t')
rlog_fd.close()
else:
rlog = None
if self.__learning_method == 'sgda' and (x_validation_set is not None and y_validation_set is not None):
validation_fd.close()
if meta is not None:
meta_fd.close()
# removes temporary output file after using
model_fd.close()
out_fd.close()
# return as named collection for multiple output
import collections
fm = collections.namedtuple('model', ['predictions',
'global_bias',
'weights',
'pairwise_interactions',
'rlog'])
return fm(preds, global_bias, weights, pairwise_interactions, rlog)
def train_model(latent_dim, REG, SS, temp_path,
train_fd, test_fd):
    print('REG =', REG, 'SS =', SS)
# initializing factorization machine
    # LIBFM_PATH is expected to already be set in os.environ by the caller (see __main__);
    # os.system('export ...') would only affect a child shell, not this Python process.
fm = FM(task='regression', num_iter=128,
temp_path=temp_path, verbose=True, k2=latent_dim,
k1=True, k0=True, learning_method='sgd',
r0_regularization=REG, r1_regularization=REG,
r2_regularization=REG, learn_rate=SS)
fm.train_fd = train_fd
fm.test_fd = test_fd
model = fm.run()
return model
if __name__ == "__main__":
path = '/home/sarah/recsys/'
    # set LIBFM_PATH for this process; os.system('export ...') would not persist
    os.environ['LIBFM_PATH'] = path + 'libfm/bin/'
## LASTFM
datapath = path + 'data/lastfm/lastfm-dataset-1K'
train_fd = os.path.join(datapath, 'lfm1k-play-count.train.libfm')
test_fd = os.path.join(datapath, 'lfm1k-play-count.test.libfm')
filename = 'fm'
REG=0.08
SS=0.001
savetag = '_r={}_ss={}'.format(REG, SS)
for latent_dim in [16, 32, 64, 128, 256, 512]:
model = train_model(latent_dim=latent_dim, REG=REG, SS=SS,
temp_path=datapath, train_fd=train_fd, test_fd=test_fd)
# saving results
save_file_name = os.path.join(datapath, filename+'_res_k={}{}.npz'.format(latent_dim, savetag))
print('model predictions shape', np.array(model.predictions).shape)
np.savez(save_file_name, global_bias=np.array(model.global_bias), preds=model.predictions,
                 weights=np.array(model.weights), pairwise_interactions=np.array(model.pairwise_interactions))
print('result saved to:', save_file_name)
## ML 10M
datapath = path + 'data/ml-10M100K/'
train_fd = os.path.join(datapath, 'r1.train.libfm')
test_fd = os.path.join(datapath, 'r1.test.libfm')
filename = 'ml'
REG = 0.04
SS = 0.003
savetag = '_r={}_ss={}'.format(REG, SS)
    for latent_dim in [16, 32, 64, 128, 256, 512]:
model = train_model(latent_dim=latent_dim, REG=REG, SS=SS,
temp_path=datapath, train_fd=train_fd, test_fd=test_fd)
# saving results
save_file_name = os.path.join(datapath, filename+'_res_k={}{}.npz'.format(latent_dim, savetag))
print('model predictions shape', np.array(model.predictions).shape)
np.savez(save_file_name, global_bias=np.array(model.global_bias), preds=model.predictions,
                 weights=np.array(model.weights), pairwise_interactions=np.array(model.pairwise_interactions))
print('result saved to:', save_file_name)