
Commit ae3ef36

add demo file
1 parent dd846d7 commit ae3ef36

6 files changed, +175 -37 lines changed

experiment/demo.py

+104
@@ -0,0 +1,104 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import pickle

import numpy as np
from six.moves import xrange  # python2/3 compatible
import tensorflow as tf
import string
import scipy
import scipy.sparse as sparse
import os

# import code of this project
sys.path.insert(0, '../util/')
from util import config_to_name
sys.path.insert(0, '../model/')
from embedding import fit_emb
from embedding import evaluate_emb
from embedding import dense_array_feeder
from embedding import sparse_array_feeder
from random_data import rand_data


def embedding_experiment(config, dataset):
    np.random.seed(seed=27)

    ## Step 1: load data
    print('Generating a dataset ...')

    data = rand_data()  # the dataset generated by rand_data has two fields, but only 'scores' is needed here

    trainset = data['trainset']['scores']
    testset = data['testset']['scores']

    """
    trainset: a sparse matrix; each ij entry is the rating of movie j given by person i, or the count of item j in basket i
    testset: [same structure as trainset]
    """

    # one can always redefine zie.generate_batch(reviews, rind) to use another format of trainset and testset

    print('The training set has %d rows and %d columns, and the test set has %d rows' %
          (trainset.shape[0], trainset.shape[1], testset.shape[0]))

    # batch_feeder is a function that will be executed as batch_feeder(trainset[i]);
    # its output will be fed into tf placeholders
    batch_feeder = sparse_array_feeder

    # fit an emb model
    print('Training set has size: ', trainset.shape)
    emb_model, logg = fit_emb(trainset, batch_feeder, config)
    print('Training done!')

    print('Test set has size: ', testset.shape)
    test_llh = evaluate_emb(testset, batch_feeder, emb_model, config)
    print('Testing done!')

    # Save result
    print('Check result...')
    emb_vec = emb_model['alpha']
    print('Embedding matrix has shape ', emb_vec.shape)
    # Save wherever you want

    print('Done!')


if __name__ == '__main__':

    dataset = 'random'
    dist = 'poisson'
    max_iter = 500
    nprint = 100

    config = dict(
        # the dimensionality of the embedding vectors
        K=50,
        # the embedding distribution, 'poisson' or 'binomial' (N=3)
        dist=dist,
        # ratio of negative samples: if there are N0 zeros in one row, only sample (0.1 * N0) of these zeros;
        # this is equivalent to downweighting zero targets with weight 0.1
        neg_ratio=0.1,
        # number of optimization iterations
        max_iter=max_iter,
        # print the objective, training log-likelihood, validation log-likelihood, and debug values every nprint iterations
        nprint=nprint,
        # weight for the regularization terms of the embedding vectors
        ar_sigma2=1,
        # uncomment the following line to use the base model
        #model='base',
        # the following line selects the context selection model; only the prior 'fixed_bern' works for now
        model='context_select', prior='fixed_bern', nsample=30, hidden_size=[30, 15], histogram_size=40, nsample_test=1000, selsize=10,
    )

    print('The configuration is: ')
    print(config)

    embedding_experiment(config, dataset)
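
A quick illustration of the batch_feeder contract used above (my own sketch, not part of this commit): sparse_array_feeder takes one row of the sparse score matrix and returns the indices and values of its non-zero entries, which fit_emb then feeds into the TensorFlow placeholders.

import numpy as np
from scipy import sparse

def sparse_array_feeder(batch):              # same two lines as in model/embedding.py
    _, nz_ind, values = sparse.find(batch)
    return nz_ind, values

row = sparse.csr_matrix(np.array([[0., 3., 0., 1., 0.]]))   # one user's (or basket's) row
nz_ind, values = sparse_array_feeder(row)
print(nz_ind)    # [1 3]   -- the non-zero columns
print(values)    # [3. 1.] -- their scores
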

experiment/random_data.py

+46
@@ -0,0 +1,46 @@
import numpy as np
from scipy import sparse
import pickle

'''
Generate a random dataset

input : none
output: dict with two fields:
    trainset: dict with two fields
        scores: a sparse matrix; each ij entry is the rating of movie j given by person i, or the count of item j in basket i
        atts  : a matrix; each row is a feature vector extracted from person i, or basket i
    testset : [same structure as trainset]
'''


def rand_data():
    n_rows = 200
    n_columns = 50
    n_feat = 5

    np.random.seed(27)
    # allocate more rows than necessary, to make sure each kept row has at least 2 non-zero entries
    score_mat = np.random.rand(n_rows * 2, n_columns)

    score_mat[score_mat < 0.88] = 0
    score_mat[np.logical_and(0.96 <= score_mat, score_mat < 1)] = 3
    score_mat[np.logical_and(0.92 <= score_mat, score_mat < 0.96)] = 2
    score_mat[np.logical_and(0.88 <= score_mat, score_mat < 0.92)] = 1

    row_sum = np.sum(score_mat > 0, axis=1)
    score_mat = score_mat[row_sum >= 2, ]
    score_mat = score_mat[0 : n_rows, ]

    feature = np.random.rand(n_rows, n_feat)

    # integer division keeps the slice bounds as ints under Python 3
    trainset = dict(scores=sparse.csr_matrix(score_mat[0:(n_rows // 2)]), atts=feature[0:(n_rows // 2)])
    testset = dict(scores=sparse.csr_matrix(score_mat[(n_rows // 2):]), atts=feature[(n_rows // 2):])

    return dict(trainset=trainset, testset=testset)
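
A small sanity check of what rand_data returns (a sketch, assuming the thresholds above): about 88% of entries are zero, the rest take values 1-3, every kept row has at least two non-zeros, and the 200 rows are split evenly between train and test.

import numpy as np
from random_data import rand_data

data = rand_data()
train = data['trainset']['scores']
test = data['testset']['scores']

print(train.shape, test.shape)          # (100, 50) (100, 50)
print(np.unique(train.toarray()))       # [0. 1. 2. 3.]
print((train > 0).sum(axis=1).min())    # at least 2 non-zero entries in every row
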

model/conbernarray.py

+6 -6

@@ -57,9 +57,9 @@ def logprob(logits, samples):
 
     logprob_cbs = logprob_unc - tf.expand_dims(logprob_non0, 1)  # expand to samples
 
-    check_point = tf.assert_less(tf.reduce_mean(logprob_cbs), 0.001, data=[tf.reduce_mean(logprob_cbs), tf.reduce_mean(logits), tf.reduce_mean(samples)])
-    with tf.control_dependencies([check_point]):
-        logprob_cbs = tf.identity(logprob_cbs)
+    #check_point = tf.assert_less(tf.reduce_mean(logprob_cbs), 0.001, data=[tf.reduce_mean(logprob_cbs), tf.reduce_mean(logits), tf.reduce_mean(samples)])
+    #with tf.control_dependencies([check_point]):
+    #    logprob_cbs = tf.identity(logprob_cbs)
 
 
     return logprob_cbs

@@ -135,9 +135,9 @@ def sample(logits, nsample):
     samples = samples * trunc_mask + trunc_flag
 
 
-    check_point = tf.assert_greater(tf.reduce_mean(samples), 0.0, data=[tf.reduce_mean(logits), tf.reduce_mean(samples)])
-    with tf.control_dependencies([check_point]):
-        samples = tf.identity(samples)
+    #check_point = tf.assert_greater(tf.reduce_mean(samples), 0.0, data=[tf.reduce_mean(logits), tf.reduce_mean(samples)])
+    #with tf.control_dependencies([check_point]):
+    #    samples = tf.identity(samples)
 
     return samples
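
The two sanity checks above are commented out rather than deleted. If they ever need to come back for debugging, the usual TF 1.x pattern (a sketch, not part of this commit) is to gate the tensor on the assert op so the check actually executes in the graph:

import tensorflow as tf

def with_check(tensor, check_op):
    # an assert op only runs if something downstream depends on it,
    # so re-wrap the tensor with a control dependency before returning it
    with tf.control_dependencies([check_op]):
        return tf.identity(tensor)

# e.g. re-enabling the first check inside logprob():
# logprob_cbs = with_check(logprob_cbs,
#     tf.assert_less(tf.reduce_mean(logprob_cbs), 0.001))
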

model/embedding.py

+2 -4

@@ -64,7 +64,7 @@ def get_model(model_param, session, config):
 
 
 
-def fit_emb(reviews, batch_feeder, config, save_path):
+def fit_emb(reviews, batch_feeder, config):
 
     do_log_save = False
     do_profiling = False

@@ -171,8 +171,6 @@ def fit_emb(reviews, batch_feeder, config, save_path):
 
 
             model = get_model(model_param, session, config)
-            save_file = save_path + ('iter%d' % step) + config_to_name(config) + '.pkl'
-            pickle.dump(dict(model=model, logg=train_logg), open(save_file, "wb"))
 
             if do_log_save:
                 tf.train.Saver().save(session, log_save_path, step)

@@ -232,7 +230,7 @@ def evaluate_emb(reviews, batch_feeder, model, config):
     return dict(pos_llh=pos_llh, neg_llh=neg_llh)
 
 def sparse_array_feeder(batch):
-    _, nz_ind, values = scipy.sparse.find(batch)
+    _, nz_ind, values = sparse.find(batch)
     return nz_ind, values
 
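With the save_path argument removed, fit_emb no longer pickles checkpoints during training; the caller now decides where to save the returned model (demo.py's "# Save wherever you want"). A minimal sketch using the helpers already imported in demo.py; the file name below is only an example:

import pickle

emb_model, logg = fit_emb(trainset, batch_feeder, config)

save_file = 'emb_' + config_to_name(config) + '.pkl'   # any path/name works
with open(save_file, 'wb') as f:
    pickle.dump(dict(model=emb_model, logg=logg), f)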

model/graph_builder.py

+3 -8

@@ -37,13 +37,6 @@ def sample_negatives(self, context, config):
         cat_dist = tf.contrib.distributions.Categorical(probs=prob)
         sample = cat_dist.sample(nneg)
 
-        # sanity check
-        context_zero = tf.assert_equal(tf.gather(prob, context), 0.0)
-        other_prob = tf.assert_equal(tf.reduce_sum(tf.cast(tf.abs(prob - (1.0 / normalizer)) < 1e-6, tf.int32)), \
-                                     movie_size - ncontext)
-        with tf.control_dependencies([context_zero, other_prob]):
-            sample = tf.identity(sample)
-
         return sample
 
     def log_dist_prob(self, target, target_label, emb_score, config, zero_labels=False):

@@ -198,7 +191,9 @@ def calculate_bernoulli_logpb(self, target, context, is_same_set, comb, comb_bin
     def calculate_noisy_elbo(self, target, target_label, context, context_label, is_same_set, training, config):
 
         if is_same_set:
-            with tf.control_dependencies([tf.assert_greater(tf.shape(context)[0], 2)]):
+            # when is_same_set, the variable "context" contains the index of the target item as well as the indices of its context items;
+            # it needs to have at least 2 elements, otherwise the target item has no context items and such rows should be removed.
+            with tf.control_dependencies([tf.assert_greater(tf.shape(context)[0], 1)]):
                 context = tf.identity(context)
 
         # generate configurations
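
The sanity check removed from sample_negatives was verifying the shape of the negative-sampling distribution: probability 0 on the context items and a uniform 1/normalizer on everything else. A small numpy sketch of that distribution (hypothetical sizes, not the repo's code):

import numpy as np

movie_size = 10
context = np.array([2, 5, 7])        # items already in the context
prob = np.ones(movie_size)
prob[context] = 0.0
prob = prob / prob.sum()             # uniform over the non-context items

nneg = 4
negatives = np.random.choice(movie_size, size=nneg, p=prob)
print(negatives)                     # never 2, 5, or 7
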

model/inference_network.py

+14 -19

@@ -157,28 +157,23 @@ def build_network(self, target_label, context_scores, b_logit, is_same_set, nsam
 
         logits = logits + b_logit
 
-        if is_same_set:
-            self.debug_var.append(h_fc1)
-            self.debug_var.append(h_fc2)
-            self.debug_var.append(logits)
-
-        # assert logits is not nan or -inf
         if is_same_set:  # set the diagonal to be negative so that 1) the diagonal element of a sample is always 0 and 2) it has no gradient
             logits = tf.matrix_set_diag(logits, tf.ones([ntarget]) * (-50.0))
 
-        check_point = tf.assert_greater(tf.reduce_mean(logits), -10000.0, data=[tf.reduce_mean(logits),
-                                                                                tf.reduce_mean(context_scores),
-                                                                                tf.reduce_mean(b_logit),
-                                                                                tf.reduce_mean(feat),
-                                                                                tf.reduce_mean(self.W_fc1),
-                                                                                tf.reduce_mean(self.W_fc2),
-                                                                                tf.reduce_mean(self.W_fc3),
-                                                                                tf.reduce_mean(self.b_fc1),
-                                                                                tf.reduce_mean(self.b_fc2),
-                                                                                tf.reduce_mean(self.b_fc3)
-                                                                                ])
-        with tf.control_dependencies([check_point]):
-            logits = tf.identity(logits)
+        # assert logits is not nan or -inf
+        #check_point = tf.assert_greater(tf.reduce_mean(logits), -10000.0, data=[tf.reduce_mean(logits),
+        #                                                                        tf.reduce_mean(context_scores),
+        #                                                                        tf.reduce_mean(b_logit),
+        #                                                                        tf.reduce_mean(feat),
+        #                                                                        tf.reduce_mean(self.W_fc1),
+        #                                                                        tf.reduce_mean(self.W_fc2),
+        #                                                                        tf.reduce_mean(self.W_fc3),
+        #                                                                        tf.reduce_mean(self.b_fc1),
+        #                                                                        tf.reduce_mean(self.b_fc2),
+        #                                                                        tf.reduce_mean(self.b_fc3)
+        #                                                                        ])
+        #with tf.control_dependencies([check_point]):
+        #    logits = tf.identity(logits)
 
         samples = cba.sample(logits, nsample)
         logprob = cba.logprob(logits, samples)
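
A back-of-the-envelope check on the -50.0 diagonal (my own sketch, assuming the usual sigmoid link for these logits): a logit of -50 corresponds to a selection probability of roughly 2e-22, so a diagonal entry is effectively never sampled as 1 and, being set to a constant, it contributes no gradient.

import numpy as np

p = 1.0 / (1.0 + np.exp(50.0))   # sigmoid(-50)
print(p)                         # ~1.9e-22
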
