
Commit a23af83

Important commit!!

1. dataloader.py: index the splits by name ('train', 'val', 'test') instead of 0, 1, 2 (easier to understand).
2. Use the allow_growth GPU option in the session configuration (in eval.py and train.py).
3. eval.py: use a better way to merge the eval options with the saved options in infos.
4. Add an att_hid_size option, so the hidden size of the attention in show-attend-tell can be changed.
5. All 3 models: fix the beam search code (I had introduced a bug).
6. All 3 models: allow different optimization settings.
7. ShowAttendTellModel: use fc7 to initialize the state; the old version is kept in ShowAttendTellModel_old.py.
8. ShowAttendTellModel: fix a huge bug!!!!!! (the initial state was used as the state input at every time step; a minimal sketch of the fix follows the change summary below).
9. ShowAttendTellModel: shorten the code by moving reusable parts into a function.
1 parent 95e9e5f commit a23af83

12 files changed: +556 −108 lines
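Item 8 of the commit message is the critical fix. As a purely illustrative sketch (toy numpy code, not the repository's TensorFlow ShowAttendTellModel), this is the difference between feeding the initial state at every time step and threading the returned state forward:

```python
# Toy sketch only -- plain numpy, not the repo's TensorFlow code.
import numpy as np

def rnn_cell(x, h, Wx, Wh):
    # one vanilla RNN step: new hidden state from current input and previous state
    return np.tanh(np.dot(x, Wx) + np.dot(h, Wh))

rng = np.random.RandomState(0)
Wx, Wh = rng.randn(4, 8), rng.randn(8, 8)
inputs = [rng.randn(1, 4) for _ in range(5)]
h0 = np.zeros((1, 8))

# Buggy unroll (what item 8 describes): every step is fed the *initial* state,
# so nothing is ever propagated through time.
buggy_states = [rnn_cell(x, h0, Wx, Wh) for x in inputs]

# Fixed unroll: the state returned at step t is the state input at step t+1.
h, fixed_states = h0, []
for x in inputs:
    h = rnn_cell(x, h, Wx, Wh)
    fixed_states.append(h)
```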

Diff for: README.md

+2 −3

@@ -20,6 +20,7 @@ Currently if you want to use my code, you need to train the model from scratch (
 - ~~sample_max~~
 - ~~eval on unseen images~~
 - eval on test
+- visualize attention map
 
 # Requirements
 Python 2.7

@@ -69,9 +70,7 @@ If you'd like to evaluate BLEU/METEOR/CIDEr scores during training in addition t
 
 ### Caption images after training
 
-In this case you want to run the evaluation script on a pretrained model checkpoint.
-I trained a decent one on the [MS COCO dataset](http://mscoco.org/) that you can run on your images.
-The pretrained checkpoint can be downloaded here: [pretrained checkpoint link](http://cs.stanford.edu/people/karpathy/neuraltalk2/checkpoint_v1.zip) (600MB). It's large because it contains the weights of a finetuned VGGNet. Now place all your images of interest into a folder, e.g. `blah`, and run
+Now place all your images of interest into a folder, e.g. `blah`, and run
 the eval script:
 
 ```bash

Diff for: dataloader.py

+10 −8

@@ -51,21 +51,23 @@ def __init__(self, opt):
         self.label_end_ix = self.h5_file['label_end_ix'][:]
 
         # separate out indexes for each of the provided splits
-        self.split_ix = [[],[],[]]
+        self.split_ix = {'train': [], 'val': [], 'test': []}
         for ix in range(len(self.info['images'])):
             img = self.info['images'][ix]
             if img['split'] == 'train':
-                self.split_ix[0].append(ix)
+                self.split_ix['train'].append(ix)
             elif img['split'] == 'val':
-                self.split_ix[1].append(ix)
+                self.split_ix['val'].append(ix)
             elif img['split'] == 'test':
-                self.split_ix[2].append(ix)
+                self.split_ix['test'].append(ix)
+            elif opt.train_only == 0: # restval
+                self.split_ix['train'].append(ix)
 
-        print('assigned %d images to split train' %len(self.split_ix[0]))
-        print('assigned %d images to split val' %len(self.split_ix[1]))
-        print('assigned %d images to split test' %len(self.split_ix[2]))
+        print('assigned %d images to split train' %len(self.split_ix['train']))
+        print('assigned %d images to split val' %len(self.split_ix['val']))
+        print('assigned %d images to split test' %len(self.split_ix['test']))
 
-        self.iterators = [0, 0, 0] # train, val, test
+        self.iterators = {'train': 0, 'val': 0, 'test': 0}
 
     def get_vocab_size(self):
         return self.vocab_size
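With the splits and iterators keyed by name, callers address the loader with 'train'/'val'/'test' strings instead of positional indices. A simplified, standalone sketch of the pattern (not the loader's actual get_batch, which also handles the h5 image data and wrap-around reporting):

```python
# Standalone sketch of the dict-keyed split/iterator pattern used above.
split_ix = {'train': [0, 1, 2, 3], 'val': [4, 5], 'test': [6]}
iterators = {'train': 0, 'val': 0, 'test': 0}

def next_index(split):
    """Return the next image index for the named split, wrapping around."""
    ix_list = split_ix[split]
    ri = iterators[split]
    iterators[split] = (ri + 1) % len(ix_list)  # advance and wrap
    return ix_list[ri]

print([next_index('train') for _ in range(6)])  # [0, 1, 2, 3, 0, 1]
```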

Diff for: dataloaderraw.py

+1 −1

@@ -62,7 +62,7 @@ def isImage(f):
         self.iterator = 0
 
     def get_batch(self, split, batch_size=None):
-        batch_size = batch_size or 5
+        batch_size = batch_size or self.batch_size
 
         # pick an index of the datapoint to load next
         img_batch = np.ndarray([batch_size, 224,224,3], dtype = 'float32')
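One note on the `batch_size = batch_size or self.batch_size` idiom in the hunk above: `or` falls back on any falsy value, so an explicit `batch_size=0` would also be replaced by the default, which is harmless here since a zero batch size is never meaningful. A tiny illustration:

```python
# Behaviour of the `or` fallback used in get_batch above.
def pick_batch_size(batch_size=None, default=16):
    return batch_size or default

assert pick_batch_size(None) == 16   # no value given -> use the default
assert pick_batch_size(8) == 8       # explicit value wins
assert pick_batch_size(0) == 16      # 0 is falsy, so it also falls back
```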

Diff for: eval.py

+25 −27

@@ -28,9 +28,9 @@
 parser.add_argument('--infos_path', type=str, default='',
                 help='path to infos to evaluate')
 # Basic options
-parser.add_argument('--batch_size', type=int, default=1,
+parser.add_argument('--batch_size', type=int, default=0,
                 help='if > 0 then overrule, otherwise load from checkpoint.')
-parser.add_argument('--num_images', type=int, default=100,
+parser.add_argument('--num_images', type=int, default=-1,
                 help='how many images to use when periodically evaluating the loss? (-1 = all)')
 parser.add_argument('--language_eval', type=int, default=0,
                 help='Evaluate language as well (1 = yes, 0 = no)? BLEU/CIDEr/METEOR/ROUGE_L? requires coco-caption code from Github.')

@@ -41,18 +41,6 @@
 parser.add_argument('--dump_path', type=int, default=0,
                 help='Write image paths along with predictions into vis json? (1=yes,0=no)')
 
-# Model settings
-parser.add_argument('--caption_model', type=str, default="show_tell",
-                help='show_tell, show_attend_tell, attention')
-parser.add_argument('--rnn_size', type=int, default=512,
-                help='size of the rnn in number of hidden nodes in each layer')
-parser.add_argument('--num_layers', type=int, default=1,
-                help='number of layers in the RNN')
-parser.add_argument('--rnn_type', type=str, default='lstm',
-                help='rnn, gru, or lstm')
-parser.add_argument('--input_encoding_size', type=int, default=512,
-                help='the encoding size of each token in the vocabulary, and the image.')
-
 # Sampling options
 parser.add_argument('--sample_max', type=int, default=1,
                 help='1 = sample argmax words. 0 = sample from distributions.')

@@ -91,10 +79,13 @@
 opt.input_json = infos['opt'].input_json
 if opt.batch_size == 0:
     opt.batch_size = infos['opt'].batch_size
-fetch = ["caption_model", "rnn_type", "rnn_size", "num_layers", "seq_length",
-         "input_encoding_size", "drop_prob_lm", "seq_per_img", "vocab_size", "cnn_model", "grad_clip"]
-for k in fetch:
-    vars(opt).update({k: vars(infos['opt'])[k]}) # copy over options from model
+ignore = ["id", "batch_size", "beam_size", "start_from"]
+for k in vars(infos['opt']).keys():
+    if k not in ignore:
+        if k in vars(opt):
+            assert vars(opt)[k] == vars(infos['opt'])[k], k + ' option not consistent'
+        else:
+            vars(opt).update({k: vars(infos['opt'])[k]}) # copy over options from model
 
 vocab = infos['vocab'] # ix -> word mapping
 
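The hunk above replaces the fixed `fetch` whitelist with the opposite approach: copy every saved option except a small run-specific ignore list, and assert that any option already set at the command line agrees with the checkpoint. Restated as a standalone helper (hypothetical name, not a function that exists in the repo):

```python
# Hypothetical helper restating the option-merging logic from the hunk above.
def merge_saved_options(opt, saved_opt, ignore=("id", "batch_size", "beam_size", "start_from")):
    for k, v in vars(saved_opt).items():
        if k in ignore:
            continue                      # run-specific, keep the command-line value
        if k in vars(opt):
            # options given on the command line must match the checkpoint
            assert vars(opt)[k] == v, k + ' option not consistent'
        else:
            vars(opt).update({k: v})      # copy over options from the model
    return opt
```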

@@ -110,16 +101,13 @@
 else:
     loader = DataLoaderRaw({'folder_path': opt.image_folder,
                             'coco_json': opt.coco_json,
-                            'language_eval': opt.language_eval,
-                            'split': opt.split,
-                            'num_images': opt.num_images,
                             'batch_size': opt.batch_size})
 
 # Evaluation fun(ction)
 def eval_split(sess, model, loader, eval_kwargs):
     verbose = eval_kwargs.get('verbose', True)
     num_images = eval_kwargs.get('num_images', -1)
-    split = eval_kwargs.get('split', 1)
+    split = eval_kwargs.get('split', 'test')
     language_eval = eval_kwargs.get('language_eval', 0)
     dataset = eval_kwargs.get('dataset', 'coco')
 

@@ -141,7 +129,7 @@ def eval_split(sess, model, loader, eval_kwargs):
         n = n + 1
     else:
         data = loader.get_batch(split, opt.batch_size)
-        n = n + loader.batch_size
+        n = n + opt.batch_size
 
     #evaluate loss if we have the labels
     loss = 0

@@ -193,7 +181,12 @@ def eval_split(sess, model, loader, eval_kwargs):
 
     # if we wrapped around the split or used up val imgs budget then bail
     ix0 = data['bounds']['it_pos_now']
-    ix1 = min(data['bounds']['it_max'], num_images)
+    ix1 = data['bounds']['it_max']
+    if num_images != -1:
+        ix1 = min(ix1, num_images)
+    for i in range(n - ix1):
+        predictions.pop()
+
     if verbose:
         print('evaluating validation preformance... %d/%d (%f)' %(ix0 - 1, ix1, loss))
 
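The `predictions.pop()` loop added above handles the case where the last batch overshoots the `num_images` budget: `n` counts in whole batches, so any surplus predictions from that batch are discarded. A small worked example of the bookkeeping:

```python
# Worked example of the truncation above: batch size 4, num_images budget 10.
predictions = list(range(12))  # three batches of 4 -> 12 accumulated predictions
n, ix1 = 12, 10                # n overshoots the budget ix1 by 2
for i in range(n - ix1):
    predictions.pop()          # drop the surplus predictions from the last batch
assert len(predictions) == 10
```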

@@ -211,8 +204,10 @@ def eval_split(sess, model, loader, eval_kwargs):
         sess.run(tf.assign(model.cnn_training, True))
     return loss_sum/loss_evals, predictions, lang_stats
 
-
-with tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=NUM_THREADS)) as sess:
+tf_config = tf.ConfigProto()
+tf_config.intra_op_parallelism_threads=NUM_THREADS
+tf_config.gpu_options.allow_growth = True
+with tf.Session(config=tf_config) as sess:
     # Initilize the variables
     sess.run(tf.global_variables_initializer())
     # Load the model checkpoint to evaluate

@@ -223,7 +218,10 @@ def eval_split(sess, model, loader, eval_kwargs):
     sess.run(tf.assign(model.sample_max, opt.sample_max == 1))
     sess.run(tf.assign(model.sample_temperature, opt.temperature))
 
-    loss, split_predictions, lang_stats = eval_split(sess, model, loader, {'num_images': opt.num_images, 'split': opt.split})
+    loss, split_predictions, lang_stats = eval_split(sess, model, loader,
+                                                     {'num_images': opt.num_images,
+                                                      'language_eval': opt.language_eval,
+                                                      'split': opt.split})
 
     print('loss: ', loss)
     if lang_stats:

Diff for: misc/AttentionModel.py

+8 −8

@@ -133,15 +133,15 @@ def build_model(self):
         grads = utils.clip_by_value(tf.gradients(self.cost, tvars), -self.opt.grad_clip, self.opt.grad_clip)
         #grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
         #        self.opt.grad_clip)
-        optimizer = tf.train.AdamOptimizer(self.lr)
+        optimizer = utils.get_optimizer(self.opt, self.lr)
         self.train_op = optimizer.apply_gradients(zip(grads, tvars))
 
         # Collect the cnn variables, and create the optimizer of cnn
         cnn_tvars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='cnn')
         cnn_grads = utils.clip_by_value(tf.gradients(self.cost, cnn_tvars), -self.opt.grad_clip, self.opt.grad_clip)
         #cnn_grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, cnn_tvars),
         #        self.opt.grad_clip)
-        cnn_optimizer = tf.train.AdamOptimizer(self.cnn_lr)
+        cnn_optimizer = utils.get_cnn_optimizer(self.opt, self.cnn_lr)
         self.cnn_train_op = cnn_optimizer.apply_gradients(zip(cnn_grads, cnn_tvars))
 
         tf.summary.scalar('training loss', self.cost)
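`utils.get_optimizer` and `utils.get_cnn_optimizer` are not part of the diffs shown on this page, so nothing beyond the `(opt, lr)` signature is visible; presumably they pick the optimizer class from a command-line option (item 6 of the commit message). A hedged sketch of what such a factory might look like, where the `opt.optim` name is an assumption:

```python
import tensorflow as tf

# Hypothetical sketch; the real utils.get_optimizer is not shown in this commit
# view and its option names may differ.
def get_optimizer(opt, lr):
    optim = getattr(opt, 'optim', 'adam')
    if optim == 'adam':
        return tf.train.AdamOptimizer(lr)
    elif optim == 'rmsprop':
        return tf.train.RMSPropOptimizer(lr)
    elif optim == 'sgd':
        return tf.train.GradientDescentOptimizer(lr)
    raise ValueError('unknown optimizer option: ' + optim)
```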
@@ -235,7 +235,7 @@ def decode(self, img, beam_size, sess, max_steps=30):
         # "state": RNN state when generating the last word of the candidate
         good_sentences = [] # store sentences already ended with <eos>
         cur_best_cand = [] # store current best candidates
-        highest_score = - np.inf # hightest log-likelihodd in good sentences
+        highest_score = 0.0 # hightest log-likelihodd in good sentences
 
         # Get the initial logit and state
         cand = {'indexes': [], 'score': 0}

@@ -275,11 +275,11 @@ def decode(self, img, beam_size, sess, max_steps=30):
             # move candidates end with <eos> to good_sentences or remove it
             cand_left = []
             for cand in cur_best_cand:
-                if len(good_sentences) > beam_size and - cand['score'] > highest_score:
+                if len(good_sentences) > beam_size and cand['score'] > highest_score:
                     continue # No need to expand that candidate
                 if cand['indexes'][-1] == 0: #end of sentence
                     good_sentences.append(cand)
-                    highest_score = max(highest_score, - cand['score'])
+                    highest_score = max(highest_score, cand['score'])
                 else:
                     cand_left.append(cand)
             cur_best_cand = cand_left

@@ -288,12 +288,12 @@ def decode(self, img, beam_size, sess, max_steps=30):
 
         # Add candidate left in cur_best_cand to good sentences
         for cand in cur_best_cand:
-            if len(good_sentences) > beam_size and - cand['score'] > highest_score:
+            if len(good_sentences) > beam_size and cand['score'] > highest_score:
                 continue
             if cand['indexes'][-1] != 0:
                 cand['indexes'].append(0)
             good_sentences.append(cand)
-            highest_score = max(highest_score, - cand['score'])
+            highest_score = max(highest_score, cand['score'])
 
         # Sort good sentences and return the final list
         good_sentences = sorted(good_sentences, key=lambda cand: cand['score'])

@@ -318,6 +318,6 @@ def get_probs_cont(self, prev_state, img, prev_word, sess):
         placeholders = [self.images, self.decoder_prev_word] + self.decoder_flattened_state
         feeded = [img, prev_word] + prev_state
 
-        probs, state = sess.run(m, {placeholders[i]: feeded[i] for i in xrange(len(pointer))})
+        probs, state = sess.run(m, {placeholders[i]: feeded[i] for i in xrange(len(placeholders))})
 
         return (probs, state)
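The sign fix in the `decode()` hunks above matters for pruning: assuming a candidate's `score` accumulates negative log-probability (lower is better, which the final ascending sort suggests), `highest_score` tracks the worst finished sentence, and once more than `beam_size` sentences have finished, any live candidate already worse than that can be skipped. Under that reading, the old stray minus signs inverted the test. A toy illustration of the pruning rule:

```python
# Toy illustration of the pruning test after the sign fix, assuming scores are
# accumulated negative log-probabilities (lower = better).
beam_size = 2
good_sentences = [{'indexes': [3, 0], 'score': 1.2},
                  {'indexes': [7, 0], 'score': 1.9},
                  {'indexes': [5, 0], 'score': 2.4}]
highest_score = max(c['score'] for c in good_sentences)  # worst finished so far

cand = {'indexes': [4, 9], 'score': 2.7}  # live candidate, already worse
if len(good_sentences) > beam_size and cand['score'] > highest_score:
    print('prune: no need to expand this candidate')
```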
