-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathmembership_strong.py
208 lines (170 loc) · 8.19 KB
/
membership_strong.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
from __future__ import print_function
from __future__ import division
from sklearn.utils import check_random_state
from lasagne_wrapper.network import Network
from utils.evaluation import evaluate
from utils.data import load_data, shape_data, show_data_splits
from utils.membership import tolist, shape_datapools, select_model, compute_membership_fix
import numpy as np
import argparse
import os
'''
Hybrid music playlist continuation based on playlist-song membership.
We learn a binary classifier to decide if any playlist-song pair (represented
by feature vectors) is a good match or not. Once it is learned, such classifier
is used to populate a matrix of song-playlist scores describing how well a song
and a playlist fit together. Thus, a playlist can be extended by selecting
the songs with the highest scores.
This approach is "hybrid" in the usual sense in the recommender systems
literature, i.e., it combines content (given by the song features) and
collaborative information (given by playlist examples).
In this program we explore the so-called strong generalization setting. That is,
the membership model is trained on a playlists collection independent from the
playlists that will be extended.
'''
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Hybrid music playlist continuation based on playlist-song membership.')
    parser.add_argument('--model', type=str, help='path to the model specification file', metavar='')
    parser.add_argument('--dataset', type=str, help='path to the playlists dataset directory', metavar='')
    parser.add_argument('--msd', type=str, help='path to the MSD directory', metavar='')
    parser.add_argument('--fit', action='store_true', help='fit the song-to-playlist classifier with monitoring')
    parser.add_argument('--test', action='store_true', help='evaluate the playlist continuations')
    parser.add_argument('--ci', action='store_true', help='compute confidence intervals if True')
    parser.add_argument('--song_occ', type=int, help='test on songs observed song_occ times during training', nargs='+', metavar='')
    parser.add_argument('--metrics_file', type=str, help='file name to save metrics', metavar='')
    parser.add_argument('--seed', type=int, help='set random behavior', metavar='')
    args = parser.parse_args()

    # set random behavior (check_random_state handles seed=None gracefully)
    rng = check_random_state(args.seed)

    # load model configuration
    model = select_model(args.model)

    # prepare output directory for fit parameters and training logs
    data_name = os.path.basename(os.path.normpath(args.dataset))
    out_dir = os.path.join('params', 'membership', model.name + '_' + data_name + '_strong')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # load data: playlists, splits, features and artist info
    data = load_data(args.dataset, args.msd, model)
    playlists_coo, split_weak, split_strong, features, song2artist = data

    # playlists_coo are the playlists stored in coordinate format
    playlists_idx, songs_idx, position, idx2song = playlists_coo

    # split_weak provides a query/continuation split
    train_idx_cnt, test_idx_cnt = np.hstack(split_weak[:2]), split_weak[2]

    # per-fold accumulators, concatenated for the final evaluation
    cont_output_l, Y_cont_l, Y_query_l, train_occ_l = [], [], [], []

    for fold in range(5):
        print('\nRunning fold {}...'.format(fold))

        # split_strong defines a playlist-disjoint split; choose this fold
        fold_strong = split_strong[fold]
        train_idx_dsj, valid_idx_dsj, test_idx_dsj = fold_strong

        # define splits for this experiment:
        # - train the model on the playlist-disjoint training split
        # - validate the model on the playlist-disjoint validation split
        # - fit the model on the playlist-disjoint training+validation split
        # - extend only the playlist-disjoint test split
        train_idx = train_idx_dsj
        valid_idx = valid_idx_dsj
        fit_idx = np.hstack((train_idx_dsj, valid_idx_dsj))
        query_idx = np.intersect1d(test_idx_dsj, train_idx_cnt)
        cont_idx = np.intersect1d(test_idx_dsj, test_idx_cnt)

        # shape data pools for the membership model
        # NOTE(review): dp_fit is unpacked but never used in this script —
        # the fit branch below trains on dp_train/dp_valid; confirm intended
        dp_train, dp_valid, dp_fit = shape_datapools(
            playlists_idx=playlists_idx,
            songs_idx=songs_idx,
            position=position,
            features=features,
            idx2song=idx2song,
            train=train_idx,
            valid=valid_idx,
            fit=fit_idx,
            model=model,
            random_state=rng
        )

        # provide data information
        show_data_splits(playlists_idx, songs_idx, idx2song, song2artist,
                         train_idx, valid_idx, fit_idx, query_idx, cont_idx)

        if args.fit:
            #
            # fit the membership model while validating on withheld playlists
            #

            # define network
            print('\nCreating model ...')
            net = model.create_model(
                batch_size=model.BATCH_SIZE,
                pl_length=dp_train.out_size - 1,
                feat_dim=model.FEAT_DIM,
            )

            # initialize network
            my_net = Network(net)

            # compile training data (validation split doubles as "test" here)
            data = {'train': dp_train, 'valid': dp_valid, 'test': dp_valid}

            # set up per-fold output files
            params_file = os.path.join(out_dir, '{}_params{}.pkl'.format(model.name, fold))
            log_file = os.path.join(out_dir, '{}_log_train{}.pkl'.format(model.name, fold))

            # train model
            my_net.fit(
                data=data,
                training_strategy=model.train_strategy,
                dump_file=params_file,
                log_file=log_file
            )

        if args.test:
            #
            # extend the playlists in the query split and evaluate the
            # continuations by comparing them to actual withheld continuations
            #

            # define network (batch_size=num_songs, to evaluate all candidates)
            print('\nCreating model ...')
            net = model.create_model(
                batch_size=len(idx2song),
                pl_length=dp_train.out_size - 1,
                feat_dim=model.FEAT_DIM,
            )

            # initialize network
            my_net = Network(net)

            # load previously fit parameters for this fold
            params_file = os.path.join(out_dir, '{}_params{}.pkl'.format(model.name, fold))
            my_net.load(file_path=params_file)

            # shape withheld continuations for evaluation
            _, Y_cont = shape_data(
                playlists_idx, songs_idx, idx2song, features, subset=cont_idx
            )

            # songs in the "query" playlists need to be masked to make sure
            # that they are not recommended as continuations
            _, Y_query = shape_data(
                playlists_idx, songs_idx, idx2song, features, subset=query_idx
            )

            # get num of song occs. when model was fit for cold-start analysis
            _, Y_fit = shape_data(
                playlists_idx, songs_idx, idx2song, features, subset=fit_idx
            )
            train_occ = np.asarray(Y_fit.sum(axis=1)).flatten()

            # convert query playlists to list and get their row indices
            query_playlists = tolist(playlists_idx, songs_idx, position, idx2song, subset=query_idx)
            unique_pl_idx = np.unique(playlists_idx[query_idx])

            # predict song-playlist probabilities
            cont_output = compute_membership_fix(
                playlists=query_playlists,
                idx2song=idx2song,
                features=features,
                my_net=my_net,
                random_state=rng
            )

            # append arrays re-shaping for evaluation (playlists as rows)
            cont_output_l.append(cont_output.T)
            Y_cont_l.append(Y_cont.T.tocsr()[unique_pl_idx])
            Y_query_l.append(Y_query.T.tocsr()[unique_pl_idx])
            train_occ_l.append(train_occ)

    if args.test:
        # evaluate the continuations accumulated over all folds
        evaluate(
            scores=cont_output_l,
            targets=Y_cont_l,
            queries=Y_query_l,
            train_occ=train_occ_l,
            k_list=[10, 30, 100],
            ci=args.ci,
            song_occ=args.song_occ,
            metrics_file=args.metrics_file
        )