Commit

Update dataloader;
1. Change the h5 feature file to individual npy (npz) files, one pair per image (see the sketch below).
2. Split prepro into prepro_feats and prepro_labels.
3. Add shuffling to the dataloader.
ruotianluo committed May 4, 2017
1 parent 397f1a8 commit 04a3d1b
Showing 8 changed files with 170 additions and 336 deletions.
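The first item above replaces the monolithic coco_fc.h5 / coco_att.h5 feature files with one pair of files per image, named by image id: the fc feature is written to <id>.npy and the attention feature to <id>.npz under the key 'feat', which is exactly what the new get_npy_data helper in dataloader.py reads back. A minimal round-trip sketch of that layout (directory names follow the new opts.py defaults; the feature shapes are placeholders, not taken from this commit):

import os
import numpy as np

fc_dir, att_dir = 'data/cocotalk_fc', 'data/cocotalk_att'
image_id = 391895  # arbitrary image id

# Placeholder arrays standing in for the ResNet outputs of prepro_feats_npy.py.
fc_feat = np.random.rand(2048).astype('float32')
att_feat = np.random.rand(14, 14, 2048).astype('float32')

for d in (fc_dir, att_dir):
    if not os.path.isdir(d):
        os.makedirs(d)

# Write one pair of files per image, keyed by image id.
np.save(os.path.join(fc_dir, str(image_id)), fc_feat)                     # -> 391895.npy
np.savez_compressed(os.path.join(att_dir, str(image_id)), feat=att_feat)  # -> 391895.npz

# Read back, the same way get_npy_data does in the new dataloader.py.
fc_loaded = np.load(os.path.join(fc_dir, str(image_id) + '.npy'))
att_loaded = np.load(os.path.join(att_dir, str(image_id) + '.npz'))['feat']
assert fc_loaded.shape == fc_feat.shape and att_loaded.shape == att_feat.shape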
93 changes: 46 additions & 47 deletions dataloader_pool.py → dataloader.py
@@ -7,17 +7,19 @@
import os
import numpy as np
import random
import skimage
import skimage.io
import scipy.misc
from multiprocessing.dummy import Process, Queue, Pool
from multiprocessing.dummy import Pool

def get_npy_data(ix, fc_file, att_file):
return (np.load(fc_file),
np.load(att_file)['feat'],
ix)

class DataLoader():

def reset_iterator(self, split):
self._prefetch_process[split].terminate()
self._prefetch_process[split].join()
self._prefetch_process[split] = BlobFetcher(split, self)
self._prefetch_process[split] = BlobFetcher(split, self, split=='train')
self.iterators[split] = 0

def get_vocab_size(self):
@@ -42,18 +44,11 @@ def __init__(self, opt):
print('vocab size is ', self.vocab_size)

# open the hdf5 file
print('DataLoader loading h5 file: ', opt.input_fc_h5, opt.input_att_h5, opt.input_label_h5)
self.h5_fc_file = h5py.File(self.opt.input_fc_h5, 'r')
self.h5_att_file = h5py.File(self.opt.input_att_h5, 'r')
print('DataLoader loading h5 file: ', opt.input_fc_dir, opt.input_att_dir, opt.input_label_h5)
self.h5_label_file = h5py.File(self.opt.input_label_h5, 'r', driver='core')


# extract image size from dataset
fc_size = self.h5_fc_file['fc'].shape
att_size = self.h5_att_file['att'].shape
assert fc_size[0] == att_size[0], 'fc and att same numer'
self.num_images = fc_size[0]
print('read %d image features' %(self.num_images))
self.input_fc_dir = self.opt.input_fc_dir
self.input_att_dir = self.opt.input_att_dir

# load in the sequence data
seq_size = self.h5_label_file['labels'].shape
@@ -63,6 +58,9 @@ def __init__(self, opt):
self.label_start_ix = self.h5_label_file['label_start_ix'][:]
self.label_end_ix = self.h5_label_file['label_end_ix'][:]

self.num_images = self.label_start_ix.shape[0]
print('read %d image features' %(self.num_images))

# separate out indexes for each of the provided splits
self.split_ix = {'train': [], 'val': [], 'test': []}
for ix in range(len(self.info['images'])):
@@ -84,7 +82,7 @@ def __init__(self, opt):

self._prefetch_process = {} # The three prefetch process
for split in self.iterators.keys():
self._prefetch_process[split] = BlobFetcher(split, self)
self._prefetch_process[split] = BlobFetcher(split, self, split=='train')
# Terminate the child process when the parent exists
def cleanup():
print('Terminating BlobFetcher')
@@ -98,8 +96,8 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
batch_size = batch_size or self.batch_size
seq_per_img = seq_per_img or self.seq_per_img

fc_batch = np.ndarray((batch_size * seq_per_img,) + self.h5_fc_file['fc'].shape[1:], dtype = 'float32')
att_batch = np.ndarray((batch_size * seq_per_img,) + self.h5_att_file['att'].shape[1:], dtype = 'float32')
fc_batch = [] # np.ndarray((batch_size * seq_per_img, self.opt.fc_feat_size), dtype = 'float32')
att_batch = [] # np.ndarray((batch_size * seq_per_img, 14, 14, self.opt.att_feat_size), dtype = 'float32')
label_batch = np.zeros([batch_size * seq_per_img, self.seq_length + 2], dtype = 'int')
mask_batch = np.zeros([batch_size * seq_per_img, self.seq_length + 2], dtype = 'float32')

@@ -112,10 +110,10 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
import time
t_start = time.time()
# fetch image
#tmp_fc, tmp_att, tmp_label, ix, tmp_wrapped = self._prefetch_process[split].get()
fc_batch[i * seq_per_img:(i+1) * seq_per_img], \
att_batch[i * seq_per_img:(i+1) * seq_per_img], \
tmp_fc, tmp_att,\
ix, tmp_wrapped = self._prefetch_process[split].get()
fc_batch += [tmp_fc] * seq_per_img
att_batch += [tmp_att] * seq_per_img

# fetch the sequence labels
ix1 = self.label_start_ix[ix] - 1 #label_start_ix starts from 1
@@ -128,7 +126,7 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
seq = np.zeros([seq_per_img, self.seq_length], dtype = 'int')
for q in range(seq_per_img):
ixl = random.randint(ix1,ix2)
seq[q, :] = self.dataloader.h5_label_file['labels'][ixl, :self.seq_length]
seq[q, :] = self.h5_label_file['labels'][ixl, :self.seq_length]
else:
ixl = random.randint(ix1, ix2 - seq_per_img + 1)
seq = self.h5_label_file['labels'][ixl: ixl + seq_per_img, :self.seq_length]
@@ -139,7 +137,7 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
wrapped = True

# Used for reward evaluation
gts.append(self.h5_label_file['labels'][self.label_start_ix[ix] - 1: self.label_end_ix[ix] - 1])
gts.append(self.h5_label_file['labels'][self.label_start_ix[ix] - 1: self.label_end_ix[ix]])

# record associated info as well
info_dict = {}
@@ -157,8 +155,8 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
#print('mask', time.time() - t_start)

data = {}
data['fc_feats'] = fc_batch
data['att_feats'] = att_batch
data['fc_feats'] = np.stack(fc_batch)
data['att_feats'] = np.stack(att_batch)
data['labels'] = label_batch
data['gts'] = gts
data['masks'] = mask_batch
@@ -169,60 +167,62 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):

class BlobFetcher():
"""Experimental class for prefetching blobs in a separate process."""
def __init__(self, split, dataloader):
def __init__(self, split, dataloader, if_shuffle=False):
"""
db is a list of tuples containing: imcrop_name, caption, bbox_feat of gt box, imname
"""
self.split = split
self.dataloader = dataloader
self.if_shuffle = if_shuffle

self.pool = Pool(4)
self.pool = Pool()
self.fifo = []

# Add more in the queue
def reset(self):
if len(self.fifo) == 0:
self.cur_idx = self.dataloader.iterators[self.split]
split_ix = self.dataloader.split_ix[self.split]
self.cur_split_ix = self.dataloader.split_ix[self.split][:] # copy
for i in xrange(512 - len(self.fifo)):
ix = split_ix[self.cur_idx]
if self.cur_idx + 1 >= len(split_ix):
ix = self.cur_split_ix[self.cur_idx]
if self.cur_idx + 1 >= len(self.cur_split_ix):
self.cur_idx = 0
if self.if_shuffle:
random.shuffle(self.cur_split_ix)
else:
self.cur_idx += 1
self.fifo.append(self.pool.apply_async(self._get_minibatch, (ix, )))
self.fifo.append(self.pool.apply_async(get_npy_data, \
(ix, \
os.path.join(self.dataloader.input_fc_dir, str(self.dataloader.info['images'][ix]['id']) + '.npy'),
os.path.join(self.dataloader.input_att_dir, str(self.dataloader.info['images'][ix]['id']) + '.npz')
)))

def terminate(self):
while len(self.fifo) > 0:
self.fifo.pop(0).get()
self.pool.terminate()
print(self.split, 'terminated')

def join(self):
self.pool.join()
print(self.split, 'joined')

def _get_next_minibatch_inds(self):
split_ix = self.dataloader.split_ix[self.split]
max_index = len(split_ix)
max_index = len(self.cur_split_ix)
wrapped = False

ri = self.dataloader.iterators[self.split]
ix = self.dataloader.split_ix[self.split][ri]

ri_next = ri + 1
if ri_next >= max_index:
ri_next = 0
self.dataloader.split_ix[self.split] = self.cur_split_ix[:] # copy
wrapped = True
self.dataloader.iterators[self.split] = ri_next
ix = split_ix[ri]

return ix, wrapped

def _get_minibatch(self, ix):
wrapped = False
if ix == self.dataloader.split_ix[self.split][-1]:
wrapped = True

return (self.dataloader.h5_fc_file['fc'][ix, :].astype('float32'),
self.dataloader.h5_att_file['att'][ix, :, :, :].astype('float32'),
ix,
wrapped)


def get(self):
if len(self.fifo) < 400:
self.reset()
@@ -231,6 +231,5 @@ def get(self):
tmp = self.fifo.pop(0).get()

assert tmp[2] == ix, "ix not equal"
assert tmp[3] == wrapped, "wrapped not equal"

return tmp
return tmp + (wrapped,)
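With this change the train-split prefetcher reshuffles its private copy of the split indices each time it wraps around, while get_batch still hands back a dict of stacked arrays. A hypothetical smoke test of the rewritten loader (the opt fields below mirror what the constructor appears to read; batch_size, seq_per_img and the exact field set are assumptions, since most of __init__ is elided from this diff):

import argparse
from dataloader import DataLoader

opt = argparse.Namespace(
    input_json='data/coco.json',          # info / vocab json
    input_fc_dir='data/cocotalk_fc',      # per-image <id>.npy fc features
    input_att_dir='data/cocotalk_att',    # per-image <id>.npz att features
    input_label_h5='data/coco_label.h5',  # caption labels
    batch_size=16,
    seq_per_img=5,
)

loader = DataLoader(opt)
data = loader.get_batch('train')
print(data['fc_feats'].shape)   # (batch_size * seq_per_img, fc feature dim)
print(data['att_feats'].shape)  # (batch_size * seq_per_img, 14, 14, att feature dim)
print(data['labels'].shape)     # (batch_size * seq_per_img, seq_length + 2)
print(data['masks'].shape)      # same shape as labels
loader.reset_iterator('train')  # tears down and re-creates the train BlobFetcher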
8 changes: 3 additions & 5 deletions eval.py
@@ -11,15 +11,13 @@

import opts
import models
from dataloader_pool import *
from dataloader import *
from dataloaderraw import *
import eval_utils
import argparse
import misc.utils as utils
import torch

NUM_THREADS = 2 #int(os.environ['OMP_NUM_THREADS'])

# Input arguments and options
parser = argparse.ArgumentParser()
# Input paths
@@ -54,9 +52,9 @@
parser.add_argument('--image_root', type=str, default='',
help='In case the image paths have to be preprended with a root path to an image folder')
# For evaluation on MSCOCO images from some split:
parser.add_argument('--input_fc_h5', type=str, default='',
parser.add_argument('--input_fc_dir', type=str, default='',
help='path to the h5file containing the preprocessed dataset')
parser.add_argument('--input_att_h5', type=str, default='',
parser.add_argument('--input_att_dir', type=str, default='',
help='path to the h5file containing the preprocessed dataset')
parser.add_argument('--input_label_h5', type=str, default='',
help='path to the h5file containing the preprocessed dataset')
12 changes: 7 additions & 5 deletions opts.py
@@ -5,10 +5,10 @@ def parse_opt():
# Data input settings
parser.add_argument('--input_json', type=str, default='data/coco.json',
help='path to the json file containing additional info and vocab')
parser.add_argument('--input_fc_h5', type=str, default='data/coco_fc.h5',
help='path to the h5file containing the preprocessed dataset')
parser.add_argument('--input_att_h5', type=str, default='data/coco_att.h5',
help='path to the h5file containing the preprocessed dataset')
parser.add_argument('--input_fc_dir', type=str, default='data/cocotalk_fc',
help='path to the directory containing the preprocessed fc feats')
parser.add_argument('--input_att_dir', type=str, default='data/cocotalk_att',
help='path to the directory containing the preprocessed att feats')
parser.add_argument('--input_label_h5', type=str, default='data/coco_label.h5',
help='path to the h5file containing the preprocessed dataset')
parser.add_argument('--start_from', type=str, default=None,
@@ -62,12 +62,14 @@ def parse_opt():
help='every how many iterations thereafter to drop LR?(in epoch)')
parser.add_argument('--learning_rate_decay_rate', type=float, default=0.8,
help='every how many iterations thereafter to drop LR?(in epoch)')
parser.add_argument('--optim_alpha', type=float, default=0.8,
parser.add_argument('--optim_alpha', type=float, default=0.9,
help='alpha for adam')
parser.add_argument('--optim_beta', type=float, default=0.999,
help='beta used for adam')
parser.add_argument('--optim_epsilon', type=float, default=1e-8,
help='epsilon that goes into denominator for smoothing')
parser.add_argument('--weight_decay', type=float, default=0,
help='weight_decay')

parser.add_argument('--scheduled_sampling_start', type=int, default=-1,
help='at what iteration to start decay gt probability')
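opts.py bumps the optim_alpha default from 0.8 to 0.9 and adds a weight_decay flag; the training script that consumes these options is not part of this diff. Purely as a hedged illustration, they would typically be wired into PyTorch's Adam along these lines (the model and learning rate below are stand-ins, not values from this commit):

import argparse
import torch.nn as nn
import torch.optim as optim

# Stand-in option values and model; the real ones live in the training script.
opt = argparse.Namespace(learning_rate=4e-4, optim_alpha=0.9, optim_beta=0.999,
                         optim_epsilon=1e-8, weight_decay=0)
model = nn.Linear(10, 10)

# optim_alpha / optim_beta map onto Adam's betas; weight_decay is the new flag.
optimizer = optim.Adam(model.parameters(),
                       lr=opt.learning_rate,
                       betas=(opt.optim_alpha, opt.optim_beta),
                       eps=opt.optim_epsilon,
                       weight_decay=opt.weight_decay)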
106 changes: 106 additions & 0 deletions scripts/prepro_feats_npy.py
@@ -0,0 +1,106 @@
"""
Preprocess a raw json dataset into hdf5/json files for use in data_loader.lua
Input: json file that has the form
[{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
example element in this list would look like
{'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
This script reads this json, does some basic preprocessing on the captions
(e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
Output: a json file and an hdf5 file
The hdf5 file contains several fields:
/images is (N,3,256,256) uint8 array of raw image data in RGB format
/labels is (M,max_length) uint32 array of encoded labels, zero padded
/label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the
first and last indices (in range 1..M) of labels for each image
/label_length stores the length of the sequence for each of the M sequences
The json file has a dict that contains:
- an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed
- an 'images' field that is a list holding auxiliary information for each image,
such as in particular the 'split' it was assigned to.
"""

import os
import json
import argparse
from random import shuffle, seed
import string
# non-standard dependencies:
import h5py
from six.moves import cPickle
import numpy as np
import torch
import torchvision.models as models
from torch.autograd import Variable
import skimage.io

from torchvision import transforms as trn
preprocess = trn.Compose([
#trn.ToTensor(),
trn.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

from misc.resnet_utils import myResnet
import misc.resnet as resnet

def main(params):
net = getattr(resnet, params['model'])()
net.load_state_dict(torch.load('/home-nfs/rluo/rluo/model/pytorch-resnet/'+params['model']+'.pth'))
my_resnet = myResnet(net)
my_resnet.cuda()
my_resnet.eval()

imgs = json.load(open(params['input_json'], 'r'))
imgs = imgs['images']
N = len(imgs)

seed(123) # make reproducible

dir_fc = params['ourput_dir']+'_fc'
dir_att = params['ourput_dir']+'_att'
if not os.path.isdir(dir_fc):
os.mkdir(dir_fc)
if not os.path.isdir(dir_att):
os.mkdir(dir_att)

for i,img in enumerate(imgs):
# load the image
I = skimage.io.imread(os.path.join(params['images_root'], img['filepath'], img['filename']))
# handle grayscale input images
if len(I.shape) == 2:
I = I[:,:,np.newaxis]
I = np.concatenate((I,I,I), axis=2)

I = I.astype('float32')/255.0
I = torch.from_numpy(I.transpose([2,0,1])).cuda()
I = Variable(preprocess(I), volatile=True)
tmp_fc, tmp_att = my_resnet(I, params['att_size'])
# write to pkl
np.save(os.path.join(dir_fc, str(img['cocoid'])), tmp_fc.data.cpu().float().numpy())
np.savez_compressed(os.path.join(dir_att, str(img['cocoid'])), feat=tmp_att.data.cpu().float().numpy())

if i % 1000 == 0:
print 'processing %d/%d (%.2f%% done)' % (i, N, i*100.0/N)
print 'wrote ', params['ourput_dir']

if __name__ == "__main__":

parser = argparse.ArgumentParser()

# input json
parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
parser.add_argument('--ourput_dir', default='data', help='output h5 file')

# options
parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json')
parser.add_argument('--att_size', default=14, type=int, help='14x14 or 7x7')
parser.add_argument('--model', default='resnet101', type=str, help='resnet101, resnet152')

args = parser.parse_args()
params = vars(args) # convert to ordinary dict
print 'parsed input parameters:'
print json.dumps(params, indent = 2)
main(params)
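A quick sanity check on what the script writes, assuming the default --ourput_dir of 'data' (giving data_fc/ and data_att/) and an image id that was actually processed; this reuses the same read path as get_npy_data in dataloader.py:

import os
import numpy as np

cocoid = 391895  # any cocoid present in the input json
fc = np.load(os.path.join('data_fc', str(cocoid) + '.npy'))
att = np.load(os.path.join('data_att', str(cocoid) + '.npz'))['feat']
print(fc.shape)   # pooled fc feature from myResnet
print(att.shape)  # att_size x att_size spatial feature map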