
Commit a23af83

Important commit!!

1. dataloader.py: index the splits by name ('train', 'val', 'test') instead of 0, 1, 2 (easier to understand).
2. Use the allow_growth GPU option in the session configuration (in eval.py and train.py).
3. eval.py: use a better way to merge the eval options with the saved options in infos.
4. Add an att_hid_size option, so the hidden size of the attention in show-attend-tell can be changed.
5. All 3 models: fix the beam search code (I had introduced a bug).
6. All 3 models: allow different optimization settings.
7. ShowAttendTellModel: use fc7 to initialize the state; the old version is kept in ShowAttendTellModel_old.py.
8. ShowAttendTellModel: fix a huge bug!!!!!! (the initial state was used as the state input at every time step; a minimal sketch of the fix follows the change summary below).
9. ShowAttendTellModel: shorten the code by moving reusable parts into a function.
1 parent 95e9e5f commit a23af83

12 files changed: +556 −108 lines
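Item 8 of the commit message is the critical fix. As a purely illustrative sketch (toy numpy code, not the repository's TensorFlow ShowAttendTellModel), this is the difference between feeding the initial state at every time step and threading the returned state forward:

```python
# Toy sketch only -- plain numpy, not the repo's TensorFlow code.
import numpy as np

def rnn_cell(x, h, Wx, Wh):
    # one vanilla RNN step: new hidden state from current input and previous state
    return np.tanh(np.dot(x, Wx) + np.dot(h, Wh))

rng = np.random.RandomState(0)
Wx, Wh = rng.randn(4, 8), rng.randn(8, 8)
inputs = [rng.randn(1, 4) for _ in range(5)]
h0 = np.zeros((1, 8))

# Buggy unroll (what item 8 describes): every step is fed the *initial* state,
# so nothing is ever propagated through time.
buggy_states = [rnn_cell(x, h0, Wx, Wh) for x in inputs]

# Fixed unroll: the state returned at step t is the state input at step t+1.
h, fixed_states = h0, []
for x in inputs:
    h = rnn_cell(x, h, Wx, Wh)
    fixed_states.append(h)
```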

Diff for: README.md

+2 −3

@@ -20,6 +20,7 @@ Currently if you want to use my code, you need to train the model from scratch (
 - ~~sample_max~~
 - ~~eval on unseen images~~
 - eval on test
+- visualize attention map
 
 # Requirements
 Python 2.7

@@ -69,9 +70,7 @@ If you'd like to evaluate BLEU/METEOR/CIDEr scores during training in addition t
 
 ### Caption images after training
 
-In this case you want to run the evaluation script on a pretrained model checkpoint.
-I trained a decent one on the [MS COCO dataset](http://mscoco.org/) that you can run on your images.
-The pretrained checkpoint can be downloaded here: [pretrained checkpoint link](http://cs.stanford.edu/people/karpathy/neuraltalk2/checkpoint_v1.zip) (600MB). It's large because it contains the weights of a finetuned VGGNet. Now place all your images of interest into a folder, e.g. `blah`, and run
+Now place all your images of interest into a folder, e.g. `blah`, and run
 the eval script:
 
 ```bash

Diff for: dataloader.py

+10 −8

@@ -51,21 +51,23 @@ def __init__(self, opt):
         self.label_end_ix = self.h5_file['label_end_ix'][:]
 
         # separate out indexes for each of the provided splits
-        self.split_ix = [[],[],[]]
+        self.split_ix = {'train': [], 'val': [], 'test': []}
         for ix in range(len(self.info['images'])):
             img = self.info['images'][ix]
             if img['split'] == 'train':
-                self.split_ix[0].append(ix)
+                self.split_ix['train'].append(ix)
             elif img['split'] == 'val':
-                self.split_ix[1].append(ix)
+                self.split_ix['val'].append(ix)
             elif img['split'] == 'test':
-                self.split_ix[2].append(ix)
+                self.split_ix['test'].append(ix)
+            elif opt.train_only == 0: # restval
+                self.split_ix['train'].append(ix)
 
-        print('assigned %d images to split train' %len(self.split_ix[0]))
-        print('assigned %d images to split val' %len(self.split_ix[1]))
-        print('assigned %d images to split test' %len(self.split_ix[2]))
+        print('assigned %d images to split train' %len(self.split_ix['train']))
+        print('assigned %d images to split val' %len(self.split_ix['val']))
+        print('assigned %d images to split test' %len(self.split_ix['test']))
 
-        self.iterators = [0, 0, 0] # train, val, test
+        self.iterators = {'train': 0, 'val': 0, 'test': 0}
 
     def get_vocab_size(self):
         return self.vocab_size
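With the splits and iterators keyed by name, callers address the loader with 'train'/'val'/'test' strings instead of positional indices. A simplified, standalone sketch of the pattern (not the loader's actual get_batch, which also handles the h5 image data and wrap-around reporting):

```python
# Standalone sketch of the dict-keyed split/iterator pattern used above.
split_ix = {'train': [0, 1, 2, 3], 'val': [4, 5], 'test': [6]}
iterators = {'train': 0, 'val': 0, 'test': 0}

def next_index(split):
    """Return the next image index for the named split, wrapping around."""
    ix_list = split_ix[split]
    ri = iterators[split]
    iterators[split] = (ri + 1) % len(ix_list)  # advance and wrap
    return ix_list[ri]

print([next_index('train') for _ in range(6)])  # [0, 1, 2, 3, 0, 1]
```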

Diff for: dataloaderraw.py

+1 −1

@@ -62,7 +62,7 @@ def isImage(f):
         self.iterator = 0
 
     def get_batch(self, split, batch_size=None):
-        batch_size = batch_size or 5
+        batch_size = batch_size or self.batch_size
 
         # pick an index of the datapoint to load next
         img_batch = np.ndarray([batch_size, 224,224,3], dtype = 'float32')
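One note on the `batch_size = batch_size or self.batch_size` idiom in the hunk above: `or` falls back on any falsy value, so an explicit `batch_size=0` would also be replaced by the default, which is harmless here since a zero batch size is never meaningful. A tiny illustration:

```python
# Behaviour of the `or` fallback used in get_batch above.
def pick_batch_size(batch_size=None, default=16):
    return batch_size or default

assert pick_batch_size(None) == 16   # no value given -> use the default
assert pick_batch_size(8) == 8       # explicit value wins
assert pick_batch_size(0) == 16      # 0 is falsy, so it also falls back
```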

Diff for: eval.py

+25 −27

@@ -28,9 +28,9 @@
 parser.add_argument('--infos_path', type=str, default='',
                 help='path to infos to evaluate')
 # Basic options
-parser.add_argument('--batch_size', type=int, default=1,
+parser.add_argument('--batch_size', type=int, default=0,
                 help='if > 0 then overrule, otherwise load from checkpoint.')
-parser.add_argument('--num_images', type=int, default=100,
+parser.add_argument('--num_images', type=int, default=-1,
                 help='how many images to use when periodically evaluating the loss? (-1 = all)')
 parser.add_argument('--language_eval', type=int, default=0,
                 help='Evaluate language as well (1 = yes, 0 = no)? BLEU/CIDEr/METEOR/ROUGE_L? requires coco-caption code from Github.')

@@ -41,18 +41,6 @@
 parser.add_argument('--dump_path', type=int, default=0,
                 help='Write image paths along with predictions into vis json? (1=yes,0=no)')
 
-# Model settings
-parser.add_argument('--caption_model', type=str, default="show_tell",
-                help='show_tell, show_attend_tell, attention')
-parser.add_argument('--rnn_size', type=int, default=512,
-                help='size of the rnn in number of hidden nodes in each layer')
-parser.add_argument('--num_layers', type=int, default=1,
-                help='number of layers in the RNN')
-parser.add_argument('--rnn_type', type=str, default='lstm',
-                help='rnn, gru, or lstm')
-parser.add_argument('--input_encoding_size', type=int, default=512,
-                help='the encoding size of each token in the vocabulary, and the image.')
-
 # Sampling options
 parser.add_argument('--sample_max', type=int, default=1,
                 help='1 = sample argmax words. 0 = sample from distributions.')

@@ -91,10 +79,13 @@
 opt.input_json = infos['opt'].input_json
 if opt.batch_size == 0:
     opt.batch_size = infos['opt'].batch_size
-fetch = ["caption_model", "rnn_type", "rnn_size", "num_layers", "seq_length",
-         "input_encoding_size", "drop_prob_lm", "seq_per_img", "vocab_size", "cnn_model", "grad_clip"]
-for k in fetch:
-    vars(opt).update({k: vars(infos['opt'])[k]}) # copy over options from model
+ignore = ["id", "batch_size", "beam_size", "start_from"]
+for k in vars(infos['opt']).keys():
+    if k not in ignore:
+        if k in vars(opt):
+            assert vars(opt)[k] == vars(infos['opt'])[k], k + ' option not consistent'
+        else:
+            vars(opt).update({k: vars(infos['opt'])[k]}) # copy over options from model
 
 vocab = infos['vocab'] # ix -> word mapping
 
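The hunk above replaces the fixed `fetch` whitelist with the opposite approach: copy every saved option except a small run-specific ignore list, and assert that any option already set at the command line agrees with the checkpoint. Restated as a standalone helper (hypothetical name, not a function that exists in the repo):

```python
# Hypothetical helper restating the option-merging logic from the hunk above.
def merge_saved_options(opt, saved_opt, ignore=("id", "batch_size", "beam_size", "start_from")):
    for k, v in vars(saved_opt).items():
        if k in ignore:
            continue                      # run-specific, keep the command-line value
        if k in vars(opt):
            # options given on the command line must match the checkpoint
            assert vars(opt)[k] == v, k + ' option not consistent'
        else:
            vars(opt).update({k: v})      # copy over options from the model
    return opt
```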

@@ -110,16 +101,13 @@
 else:
     loader = DataLoaderRaw({'folder_path': opt.image_folder,
                             'coco_json': opt.coco_json,
-                            'language_eval': opt.language_eval,
-                            'split': opt.split,
-                            'num_images': opt.num_images,
                             'batch_size': opt.batch_size})
 
 # Evaluation fun(ction)
 def eval_split(sess, model, loader, eval_kwargs):
     verbose = eval_kwargs.get('verbose', True)
     num_images = eval_kwargs.get('num_images', -1)
-    split = eval_kwargs.get('split', 1)
+    split = eval_kwargs.get('split', 'test')
     language_eval = eval_kwargs.get('language_eval', 0)
     dataset = eval_kwargs.get('dataset', 'coco')
 

@@ -141,7 +129,7 @@ def eval_split(sess, model, loader, eval_kwargs):
         n = n + 1
     else:
         data = loader.get_batch(split, opt.batch_size)
-        n = n + loader.batch_size
+        n = n + opt.batch_size
 
     #evaluate loss if we have the labels
     loss = 0

@@ -193,7 +181,12 @@ def eval_split(sess, model, loader, eval_kwargs):
 
     # if we wrapped around the split or used up val imgs budget then bail
     ix0 = data['bounds']['it_pos_now']
-    ix1 = min(data['bounds']['it_max'], num_images)
+    ix1 = data['bounds']['it_max']
+    if num_images != -1:
+        ix1 = min(ix1, num_images)
+    for i in range(n - ix1):
+        predictions.pop()
+
     if verbose:
         print('evaluating validation preformance... %d/%d (%f)' %(ix0 - 1, ix1, loss))
 
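The `predictions.pop()` loop added above handles the case where the last batch overshoots the `num_images` budget: `n` counts in whole batches, so any surplus predictions from that batch are discarded. A small worked example of the bookkeeping:

```python
# Worked example of the truncation above: batch size 4, num_images budget 10.
predictions = list(range(12))  # three batches of 4 -> 12 accumulated predictions
n, ix1 = 12, 10                # n overshoots the budget ix1 by 2
for i in range(n - ix1):
    predictions.pop()          # drop the surplus predictions from the last batch
assert len(predictions) == 10
```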

@@ -211,8 +204,10 @@ def eval_split(sess, model, loader, eval_kwargs):
         sess.run(tf.assign(model.cnn_training, True))
     return loss_sum/loss_evals, predictions, lang_stats
 
-
-with tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=NUM_THREADS)) as sess:
+tf_config = tf.ConfigProto()
+tf_config.intra_op_parallelism_threads=NUM_THREADS
+tf_config.gpu_options.allow_growth = True
+with tf.Session(config=tf_config) as sess:
     # Initilize the variables
     sess.run(tf.global_variables_initializer())
     # Load the model checkpoint to evaluate

@@ -223,7 +218,10 @@ def eval_split(sess, model, loader, eval_kwargs):
     sess.run(tf.assign(model.sample_max, opt.sample_max == 1))
     sess.run(tf.assign(model.sample_temperature, opt.temperature))
 
-    loss, split_predictions, lang_stats = eval_split(sess, model, loader, {'num_images': opt.num_images, 'split': opt.split})
+    loss, split_predictions, lang_stats = eval_split(sess, model, loader,
+                                                     {'num_images': opt.num_images,
+                                                      'language_eval': opt.language_eval,
+                                                      'split': opt.split})
 
     print('loss: ', loss)
     if lang_stats:

Diff for: misc/AttentionModel.py

+8 −8

@@ -133,15 +133,15 @@ def build_model(self):
         grads = utils.clip_by_value(tf.gradients(self.cost, tvars), -self.opt.grad_clip, self.opt.grad_clip)
         #grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
         #        self.opt.grad_clip)
-        optimizer = tf.train.AdamOptimizer(self.lr)
+        optimizer = utils.get_optimizer(self.opt, self.lr)
         self.train_op = optimizer.apply_gradients(zip(grads, tvars))
 
         # Collect the cnn variables, and create the optimizer of cnn
         cnn_tvars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='cnn')
         cnn_grads = utils.clip_by_value(tf.gradients(self.cost, cnn_tvars), -self.opt.grad_clip, self.opt.grad_clip)
         #cnn_grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, cnn_tvars),
         #        self.opt.grad_clip)
-        cnn_optimizer = tf.train.AdamOptimizer(self.cnn_lr)
+        cnn_optimizer = utils.get_cnn_optimizer(self.opt, self.cnn_lr)
         self.cnn_train_op = cnn_optimizer.apply_gradients(zip(cnn_grads, cnn_tvars))
 
         tf.summary.scalar('training loss', self.cost)
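`utils.get_optimizer` and `utils.get_cnn_optimizer` are not part of the diffs shown on this page, so nothing beyond the `(opt, lr)` signature is visible; presumably they pick the optimizer class from a command-line option (item 6 of the commit message). A hedged sketch of what such a factory might look like, where the `opt.optim` name is an assumption:

```python
import tensorflow as tf

# Hypothetical sketch; the real utils.get_optimizer is not shown in this commit
# view and its option names may differ.
def get_optimizer(opt, lr):
    optim = getattr(opt, 'optim', 'adam')
    if optim == 'adam':
        return tf.train.AdamOptimizer(lr)
    elif optim == 'rmsprop':
        return tf.train.RMSPropOptimizer(lr)
    elif optim == 'sgd':
        return tf.train.GradientDescentOptimizer(lr)
    raise ValueError('unknown optimizer option: ' + optim)
```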
@@ -235,7 +235,7 @@ def decode(self, img, beam_size, sess, max_steps=30):
         # "state": RNN state when generating the last word of the candidate
         good_sentences = [] # store sentences already ended with <eos>
         cur_best_cand = [] # store current best candidates
-        highest_score = - np.inf # hightest log-likelihodd in good sentences
+        highest_score = 0.0 # hightest log-likelihodd in good sentences
 
         # Get the initial logit and state
         cand = {'indexes': [], 'score': 0}

@@ -275,11 +275,11 @@ def decode(self, img, beam_size, sess, max_steps=30):
             # move candidates end with <eos> to good_sentences or remove it
             cand_left = []
             for cand in cur_best_cand:
-                if len(good_sentences) > beam_size and - cand['score'] > highest_score:
+                if len(good_sentences) > beam_size and cand['score'] > highest_score:
                     continue # No need to expand that candidate
                 if cand['indexes'][-1] == 0: #end of sentence
                     good_sentences.append(cand)
-                    highest_score = max(highest_score, - cand['score'])
+                    highest_score = max(highest_score, cand['score'])
                 else:
                     cand_left.append(cand)
             cur_best_cand = cand_left

@@ -288,12 +288,12 @@ def decode(self, img, beam_size, sess, max_steps=30):
 
         # Add candidate left in cur_best_cand to good sentences
         for cand in cur_best_cand:
-            if len(good_sentences) > beam_size and - cand['score'] > highest_score:
+            if len(good_sentences) > beam_size and cand['score'] > highest_score:
                 continue
             if cand['indexes'][-1] != 0:
                 cand['indexes'].append(0)
             good_sentences.append(cand)
-            highest_score = max(highest_score, - cand['score'])
+            highest_score = max(highest_score, cand['score'])
 
         # Sort good sentences and return the final list
         good_sentences = sorted(good_sentences, key=lambda cand: cand['score'])

@@ -318,6 +318,6 @@ def get_probs_cont(self, prev_state, img, prev_word, sess):
         placeholders = [self.images, self.decoder_prev_word] + self.decoder_flattened_state
         feeded = [img, prev_word] + prev_state
 
-        probs, state = sess.run(m, {placeholders[i]: feeded[i] for i in xrange(len(pointer))})
+        probs, state = sess.run(m, {placeholders[i]: feeded[i] for i in xrange(len(placeholders))})
 
         return (probs, state)
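The sign fix in the `decode()` hunks above matters for pruning: assuming a candidate's `score` accumulates negative log-probability (lower is better, which the final ascending sort suggests), `highest_score` tracks the worst finished sentence, and once more than `beam_size` sentences have finished, any live candidate already worse than that can be skipped. Under that reading, the old stray minus signs inverted the test. A toy illustration of the pruning rule:

```python
# Toy illustration of the pruning test after the sign fix, assuming scores are
# accumulated negative log-probabilities (lower = better).
beam_size = 2
good_sentences = [{'indexes': [3, 0], 'score': 1.2},
                  {'indexes': [7, 0], 'score': 1.9},
                  {'indexes': [5, 0], 'score': 2.4}]
highest_score = max(c['score'] for c in good_sentences)  # worst finished so far

cand = {'indexes': [4, 9], 'score': 2.7}  # live candidate, already worse
if len(good_sentences) > beam_size and cand['score'] > highest_score:
    print('prune: no need to expand this candidate')
```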
