
Commit dd846d7

committed
clean version of code for embedding models with context selection
1 parent 9191b93 commit dd846d7

File tree

7 files changed: +1282 -0 lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
*/__pycache__
*.pyc
*.swp
*.swo

experiment/experiment.py

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import pickle

import numpy as np
from six.moves import xrange  # python2/3 compatible
import tensorflow as tf
import string
import scipy
import scipy.sparse as sparse
import os

# import code of this project
sys.path.insert(0, '../util/')
from util import config_to_name
sys.path.insert(0, '../model/')
from embedding import fit_emb
from embedding import evaluate_emb
from embedding import dense_array_feeder
from embedding import sparse_array_feeder

# structure of the data folder
# train/test data: data_path + dataset_name + '/splits/' + {'train0', 'test0'} + '.pkl'

data_path = os.environ['EMB_DATA_PATH']

def load_data(dataset):  # load ZIE data, drop attributes for ZIE
    # load data
    trfile = data_path + dataset + '/splits/train0.pkl'
    tsfile = data_path + dataset + '/splits/test0.pkl'

    if sys.version_info >= (3, 0):
        trainset = pickle.load(open(trfile, 'rb'), encoding='latin1')
        testset = pickle.load(open(tsfile, 'rb'), encoding='latin1')
    else:
        trainset = pickle.load(open(trfile, 'rb'))
        testset = pickle.load(open(tsfile, 'rb'))

    trainset = trainset['scores']
    testset = testset['scores']

    # remove rows that contain less than 3 non-zero values
    if isinstance(trainset, sparse.csr_matrix):
        flag = np.squeeze(trainset.sum(axis=1) >= 3)
        trainset = trainset[flag.nonzero()[0], :]
        flag = np.squeeze(testset.sum(axis=1) >= 3)
        testset = testset[flag.nonzero()[0], :]

    else:
        flag = np.sum(trainset > 0, axis=1) >= 3
        trainset = trainset[flag, :]
        flag = np.sum(testset > 0, axis=1) >= 3
        testset = testset[flag, :]

    print('Average number of movies per user is ', np.mean(np.sum(trainset > 0, axis=1)))
    print('Overall %d training reviews and %d test reviews' % (trainset.shape[0], testset.shape[0]))
    return trainset, testset


def embedding_experiment(config, dataset):
    np.random.seed(seed=27)

    # batch_feeder is a function that will be called as batch_feeder(trainset[i])
    if dataset in ['movie', 'subset_pa']:
        trainset, testset = load_data(dataset)
        batch_feeder = dense_array_feeder

    elif dataset in ['safeway_group_nov']:
        trainset, testset = load_data(dataset)
        batch_feeder = sparse_array_feeder

    # fit an emb model
    print('Training set has size: ', trainset.shape)
    emb_model, logg = fit_emb(trainset, batch_feeder, config, save_path=data_path + dataset + '/splits/')
    print('Training done!')

    print('Test set has size: ', testset.shape)
    test_llh = evaluate_emb(testset, batch_feeder, emb_model, config)
    print('Testing done!')

    # Save result
    print('Check result...')
    emb_vec = emb_model['alpha']
    print('Embedding matrix has shape ', emb_vec.shape)
    # Save wherever you want

    print('Done!')


if __name__ == '__main__':

    dataset = 'movie'
    max_iter = 500
    dist = 'binomial'  # N=3 for the binomial distribution
    nprint = 50

    config = dict(
        # the dimensionality of the embedding vectors
        K=50,
        # the embedding distribution
        dist=dist,
        # ratio of negative samples: if there are N0 zeros in one row, sample only (0.1 * N0) of these zeros;
        # this is equivalent to downweighting zero targets with weight 0.1
        neg_ratio=0.1,
        # number of optimization iterations
        max_iter=max_iter,
        # interval (in iterations) at which to print the objective, training log-likelihood,
        # validation log-likelihood, and debug values
        nprint=nprint,
        # weight for the regularization terms of the embedding vectors
        ar_sigma2=1,
        # uncomment the following line to use the base model
        #model='base',
        # uncomment the following line to use context selection. Only the prior 'fixed_bern' works for now
        model='context_select', prior='fixed_bern', nsample=30, hidden_size=[30, 15], histogram_size=40, nsample_test=1000, selsize=10,
    )

    print('The configuration is: ')
    print(config)

    embedding_experiment(config, dataset)
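For reference, the sketch below is not part of this commit: it shows how the commented-out base-model option in the config above could be run instead of context selection. It assumes EMB_DATA_PATH is exported and that the 'movie' split files exist as described at the top of the script; the name base_config is ours, and only config keys appearing in the listing above are used.

# hypothetical usage sketch, appended after the listing above for illustration
base_config = dict(
    K=50,            # embedding dimensionality
    dist='binomial',
    neg_ratio=0.1,
    max_iter=500,
    nprint=50,
    ar_sigma2=1,
    model='base',    # the commented-out base-model option from the config above
)
embedding_experiment(base_config, 'movie')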

model/conbernarray.py

Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,143 @@
import numpy as np
import tensorflow as tf


def logprob(logits, samples):
    """
    Calculate log probability

    Args:
        logits: tensor, ntarget x ncontext
        samples: tensor, ntarget x nsample x ncontext

    Return:
        logprob: tensor, ntarget x nsample, log probability of each sample
    """

    # scale up probabilities so that the maximum of the logits is at least -10.0,
    # and the minimum of the logits is at least -25.0

    # If ((-10) - max_logit) > 0, then add ((-10) - max_logit) to all logits.
    # The scaled-up probability vector gets a negligibly larger probability of
    # having more than one bit with value one.
    max_logits = tf.reduce_max(logits, axis=-1, keep_dims=True)
    diff = tf.clip_by_value(-10.0 - max_logits, clip_value_min=0.0, clip_value_max=np.inf)
    logits = logits + diff

    # clip the minimum logit value, so that min_mu = sigmoid(min_logit) is not zero.
    # Since the largest logit is at least -10.0, the clipping negligibly increases
    # the probability of getting value one at these bits.
    logits = tf.clip_by_value(logits, clip_value_min=-25.0, clip_value_max=np.inf)

    logp0 = - tf.nn.softplus(logits)

    # log probability without the constraint

    # \sum_i b_i * logit_i
    logits_sum = tf.reduce_sum(samples * tf.expand_dims(logits, 1), axis=-1)  # broadcast to samples

    # \sum_i log(1 - p_i)
    minusa_sum = tf.expand_dims(tf.reduce_sum(logp0, axis=-1), 1)
    # \sum_i [b_i * logit_i + log(1 - p_i)]
    logprob_unc = logits_sum + minusa_sum  # minusa_sum is broadcast to samples

    # log probability that at least one bit is 1

    # the probability is calculated as
    # log(1 - exp(\sum_i -log(1 + exp(logit_i)))) = log(1 - \prod_i (1 - p_i))
    accurate = tf.log(1 - tf.exp(tf.reduce_sum(logp0, axis=-1)))

    # when all logits are small, it can be approximated as
    # log(\sum_i exp(logit_i))
    approxim = tf.reduce_logsumexp(logits, axis=-1)

    # use the approximate calculation when the largest logit is very negative
    max_logit = tf.reduce_max(logits, axis=-1)
    logprob_non0 = tf.where(max_logit < -15.0, approxim, accurate)

    logprob_cbs = logprob_unc - tf.expand_dims(logprob_non0, 1)  # expand to samples

    check_point = tf.assert_less(tf.reduce_mean(logprob_cbs), 0.001,
                                 data=[tf.reduce_mean(logprob_cbs), tf.reduce_mean(logits), tf.reduce_mean(samples)])
    with tf.control_dependencies([check_point]):
        logprob_cbs = tf.identity(logprob_cbs)

    return logprob_cbs


def sample(logits, nsample):
    """
    Sample an array of Bernoulli random variables under the constraint that at least one bit is 1

    Args:
        logits: tf float32 array, the last dimension is treated as a Bernoulli array
        nsample: int, number of samples to draw per Bernoulli array
    Return:
        samples: tf float32 array, ntarget x nsample x ncontext
    """

    # scale up probabilities so that the maximum of the logits is at least -10.0,
    # and the minimum of the logits is at least -25.0

    # If ((-10) - max_logit) > 0, then add ((-10) - max_logit) to all logits.
    # The scaled-up probability vector gets a negligibly larger probability of
    # having more than one bit with value one.
    max_logits = tf.reduce_max(logits, axis=-1, keep_dims=True)
    diff = tf.clip_by_value(-10.0 - max_logits, clip_value_min=0.0, clip_value_max=np.inf)
    logits = logits + diff

    # clip the minimum logit value, so that min_mu = sigmoid(min_logit) is not zero.
    # Since the largest logit is at least -10.0, the clipping negligibly increases
    # the probability of getting value one at these bits.
    logits = tf.clip_by_value(logits, clip_value_min=-25.0, clip_value_max=np.inf)

    # sample Bernoulli bits freely
    logit_shape = tf.shape(logits)
    sample_shape = [logit_shape[0], nsample, logit_shape[1]]

    prob = tf.sigmoid(logits)
    # Bernoulli sampling via uniform random numbers
    samples = tf.ceil(tf.subtract(tf.expand_dims(prob, 1), tf.random_uniform(sample_shape)))

    # calculate log p(b_{1:i-1} = 0, b_i = 1)

    # log(1 - mu_i)
    logp0 = - tf.nn.softplus(logits)
    # [0, log(1 - mu_1), log((1 - mu_1)(1 - mu_2)), ...]
    cum_logp0 = tf.cumsum(logp0, axis=-1, exclusive=True)
    # [log(mu_1), log((1 - mu_1)mu_2), log((1 - mu_1)(1 - mu_2)mu_3), ...]
    log_p001 = cum_logp0 - tf.nn.softplus(- logits)

    # calculate the probability p(at least one of b_{1:i-1} is 1)

    max_log = tf.reduce_max(log_p001, axis=-1, keep_dims=True)
    # [mu_1, (1 - mu_1)mu_2, (1 - mu_1)(1 - mu_2)mu_3, ...]
    p001 = tf.exp(log_p001 - max_log)
    # [0, mu_1, 1 - (1 - mu_1)(1 - mu_2), ...]
    pvalid = tf.cumsum(p001, axis=1, exclusive=True)  # would need a cumulative log-sum-exp here, but this is fine
    log_pvalid = tf.log(pvalid) + max_log  # log(0) = -Inf

    # sample from Bernoulli with p001 and pvalid to get the sample mask

    first_one_prob = tf.sigmoid(log_p001 - log_pvalid)  # probability that the first one-valued bit is at this position
    first_one_bits = tf.cast(tf.greater(tf.expand_dims(first_one_prob, 1), tf.random_uniform(sample_shape)), tf.int32)

    # cumsum twice (reversed), so the last one bit keeps value 1; bits preceding it
    # become greater than 1, and bits succeeding it become zero
    cum2_bits = tf.cumsum(tf.cumsum(first_one_bits, axis=2, reverse=True), axis=2, reverse=True)
    trunc_flag = tf.cast(tf.equal(cum2_bits, 1), tf.float32)
    trunc_mask = tf.cumsum(trunc_flag, exclusive=True, axis=2)  # mask for bits after trunc_flag

    # if the i-th bit comes from p001, then set all preceding bits to 0, set bit i to 1, and leave the following bits as freely sampled
    samples = samples * trunc_mask + trunc_flag

    check_point = tf.assert_greater(tf.reduce_mean(samples), 0.0, data=[tf.reduce_mean(logits), tf.reduce_mean(samples)])
    with tf.control_dependencies([check_point]):
        samples = tf.identity(samples)

    return samples
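As a sanity check on the formula used in logprob() above, namely log p(b | at least one bit is 1) = sum_i [b_i * logit_i + log(1 - p_i)] - log(1 - prod_i (1 - p_i)), here is a small NumPy sketch. It is not part of the commit and the function name logprob_cbs_np is hypothetical; it enumerates every non-zero bit array for a tiny logit vector and checks that their conditional probabilities sum to one.

import itertools
import numpy as np

def logprob_cbs_np(logits, b):
    # log p(b | at least one bit is 1) for a single Bernoulli array, mirroring logprob() above
    logp0 = -np.logaddexp(0.0, logits)                # log(1 - p_i) = -softplus(logit_i)
    logprob_unc = np.sum(b * logits) + np.sum(logp0)  # unconstrained log p(b)
    logprob_non0 = np.log1p(-np.exp(np.sum(logp0)))   # log(1 - prod_i (1 - p_i))
    return logprob_unc - logprob_non0

logits = np.array([-1.0, 0.5, -2.0])
total = sum(np.exp(logprob_cbs_np(logits, np.array(b, dtype=float)))
            for b in itertools.product([0, 1], repeat=3) if any(b))
print(total)  # should be ~1.0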

0 commit comments
