Commit

Update dataloader;
1. Change the h5 feature file to individual npy (npz) files, one pair per image (see the sketch below).
2. Split prepro into prepro_feats and prepro_labels.
3. Add shuffling to the dataloader.
ruotianluo committed May 4, 2017
1 parent 397f1a8 commit 04a3d1b
Showing 8 changed files with 170 additions and 336 deletions.
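The first item above replaces the monolithic coco_fc.h5 / coco_att.h5 feature files with one pair of files per image, named by image id: the fc feature is written to <id>.npy and the attention feature to <id>.npz under the key 'feat', which is exactly what the new get_npy_data helper in dataloader.py reads back. A minimal round-trip sketch of that layout (directory names follow the new opts.py defaults; the feature shapes are placeholders, not taken from this commit):

import os
import numpy as np

fc_dir, att_dir = 'data/cocotalk_fc', 'data/cocotalk_att'
image_id = 391895  # arbitrary image id

# Placeholder arrays standing in for the ResNet outputs of prepro_feats_npy.py.
fc_feat = np.random.rand(2048).astype('float32')
att_feat = np.random.rand(14, 14, 2048).astype('float32')

for d in (fc_dir, att_dir):
    if not os.path.isdir(d):
        os.makedirs(d)

# Write one pair of files per image, keyed by image id.
np.save(os.path.join(fc_dir, str(image_id)), fc_feat)                     # -> 391895.npy
np.savez_compressed(os.path.join(att_dir, str(image_id)), feat=att_feat)  # -> 391895.npz

# Read back, the same way get_npy_data does in the new dataloader.py.
fc_loaded = np.load(os.path.join(fc_dir, str(image_id) + '.npy'))
att_loaded = np.load(os.path.join(att_dir, str(image_id) + '.npz'))['feat']
assert fc_loaded.shape == fc_feat.shape and att_loaded.shape == att_feat.shape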
93 changes: 46 additions & 47 deletions dataloader_pool.py → dataloader.py
@@ -7,17 +7,19 @@
import os
import numpy as np
import random
import skimage
import skimage.io
import scipy.misc
from multiprocessing.dummy import Process, Queue, Pool
from multiprocessing.dummy import Pool

def get_npy_data(ix, fc_file, att_file):
return (np.load(fc_file),
np.load(att_file)['feat'],
ix)

class DataLoader():

def reset_iterator(self, split):
self._prefetch_process[split].terminate()
self._prefetch_process[split].join()
self._prefetch_process[split] = BlobFetcher(split, self)
self._prefetch_process[split] = BlobFetcher(split, self, split=='train')
self.iterators[split] = 0

def get_vocab_size(self):
@@ -42,18 +44,11 @@ def __init__(self, opt):
print('vocab size is ', self.vocab_size)

# open the hdf5 file
print('DataLoader loading h5 file: ', opt.input_fc_h5, opt.input_att_h5, opt.input_label_h5)
self.h5_fc_file = h5py.File(self.opt.input_fc_h5, 'r')
self.h5_att_file = h5py.File(self.opt.input_att_h5, 'r')
print('DataLoader loading h5 file: ', opt.input_fc_dir, opt.input_att_dir, opt.input_label_h5)
self.h5_label_file = h5py.File(self.opt.input_label_h5, 'r', driver='core')


# extract image size from dataset
fc_size = self.h5_fc_file['fc'].shape
att_size = self.h5_att_file['att'].shape
assert fc_size[0] == att_size[0], 'fc and att same numer'
self.num_images = fc_size[0]
print('read %d image features' %(self.num_images))
self.input_fc_dir = self.opt.input_fc_dir
self.input_att_dir = self.opt.input_att_dir

# load in the sequence data
seq_size = self.h5_label_file['labels'].shape
@@ -63,6 +58,9 @@ def __init__(self, opt):
self.label_start_ix = self.h5_label_file['label_start_ix'][:]
self.label_end_ix = self.h5_label_file['label_end_ix'][:]

self.num_images = self.label_start_ix.shape[0]
print('read %d image features' %(self.num_images))

# separate out indexes for each of the provided splits
self.split_ix = {'train': [], 'val': [], 'test': []}
for ix in range(len(self.info['images'])):
@@ -84,7 +82,7 @@ def __init__(self, opt):

self._prefetch_process = {} # The three prefetch process
for split in self.iterators.keys():
self._prefetch_process[split] = BlobFetcher(split, self)
self._prefetch_process[split] = BlobFetcher(split, self, split=='train')
# Terminate the child process when the parent exists
def cleanup():
print('Terminating BlobFetcher')
@@ -98,8 +96,8 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
batch_size = batch_size or self.batch_size
seq_per_img = seq_per_img or self.seq_per_img

fc_batch = np.ndarray((batch_size * seq_per_img,) + self.h5_fc_file['fc'].shape[1:], dtype = 'float32')
att_batch = np.ndarray((batch_size * seq_per_img,) + self.h5_att_file['att'].shape[1:], dtype = 'float32')
fc_batch = [] # np.ndarray((batch_size * seq_per_img, self.opt.fc_feat_size), dtype = 'float32')
att_batch = [] # np.ndarray((batch_size * seq_per_img, 14, 14, self.opt.att_feat_size), dtype = 'float32')
label_batch = np.zeros([batch_size * seq_per_img, self.seq_length + 2], dtype = 'int')
mask_batch = np.zeros([batch_size * seq_per_img, self.seq_length + 2], dtype = 'float32')

@@ -112,10 +110,10 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
import time
t_start = time.time()
# fetch image
#tmp_fc, tmp_att, tmp_label, ix, tmp_wrapped = self._prefetch_process[split].get()
fc_batch[i * seq_per_img:(i+1) * seq_per_img], \
att_batch[i * seq_per_img:(i+1) * seq_per_img], \
tmp_fc, tmp_att,\
ix, tmp_wrapped = self._prefetch_process[split].get()
fc_batch += [tmp_fc] * seq_per_img
att_batch += [tmp_att] * seq_per_img

# fetch the sequence labels
ix1 = self.label_start_ix[ix] - 1 #label_start_ix starts from 1
@@ -128,7 +126,7 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
seq = np.zeros([seq_per_img, self.seq_length], dtype = 'int')
for q in range(seq_per_img):
ixl = random.randint(ix1,ix2)
seq[q, :] = self.dataloader.h5_label_file['labels'][ixl, :self.seq_length]
seq[q, :] = self.h5_label_file['labels'][ixl, :self.seq_length]
else:
ixl = random.randint(ix1, ix2 - seq_per_img + 1)
seq = self.h5_label_file['labels'][ixl: ixl + seq_per_img, :self.seq_length]
@@ -139,7 +137,7 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
wrapped = True

# Used for reward evaluation
gts.append(self.h5_label_file['labels'][self.label_start_ix[ix] - 1: self.label_end_ix[ix] - 1])
gts.append(self.h5_label_file['labels'][self.label_start_ix[ix] - 1: self.label_end_ix[ix]])

# record associated info as well
info_dict = {}
@@ -157,8 +155,8 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):
#print('mask', time.time() - t_start)

data = {}
data['fc_feats'] = fc_batch
data['att_feats'] = att_batch
data['fc_feats'] = np.stack(fc_batch)
data['att_feats'] = np.stack(att_batch)
data['labels'] = label_batch
data['gts'] = gts
data['masks'] = mask_batch
@@ -169,60 +167,62 @@ def get_batch(self, split, batch_size=None, seq_per_img=None):

class BlobFetcher():
"""Experimental class for prefetching blobs in a separate process."""
def __init__(self, split, dataloader):
def __init__(self, split, dataloader, if_shuffle=False):
"""
db is a list of tuples containing: imcrop_name, caption, bbox_feat of gt box, imname
"""
self.split = split
self.dataloader = dataloader
self.if_shuffle = if_shuffle

self.pool = Pool(4)
self.pool = Pool()
self.fifo = []

# Add more in the queue
def reset(self):
if len(self.fifo) == 0:
self.cur_idx = self.dataloader.iterators[self.split]
split_ix = self.dataloader.split_ix[self.split]
self.cur_split_ix = self.dataloader.split_ix[self.split][:] # copy
for i in xrange(512 - len(self.fifo)):
ix = split_ix[self.cur_idx]
if self.cur_idx + 1 >= len(split_ix):
ix = self.cur_split_ix[self.cur_idx]
if self.cur_idx + 1 >= len(self.cur_split_ix):
self.cur_idx = 0
if self.if_shuffle:
random.shuffle(self.cur_split_ix)
else:
self.cur_idx += 1
self.fifo.append(self.pool.apply_async(self._get_minibatch, (ix, )))
self.fifo.append(self.pool.apply_async(get_npy_data, \
(ix, \
os.path.join(self.dataloader.input_fc_dir, str(self.dataloader.info['images'][ix]['id']) + '.npy'),
os.path.join(self.dataloader.input_att_dir, str(self.dataloader.info['images'][ix]['id']) + '.npz')
)))

def terminate(self):
while len(self.fifo) > 0:
self.fifo.pop(0).get()
self.pool.terminate()
print(self.split, 'terminated')

def join(self):
self.pool.join()
print(self.split, 'joined')

def _get_next_minibatch_inds(self):
split_ix = self.dataloader.split_ix[self.split]
max_index = len(split_ix)
max_index = len(self.cur_split_ix)
wrapped = False

ri = self.dataloader.iterators[self.split]
ix = self.dataloader.split_ix[self.split][ri]

ri_next = ri + 1
if ri_next >= max_index:
ri_next = 0
self.dataloader.split_ix[self.split] = self.cur_split_ix[:] # copy
wrapped = True
self.dataloader.iterators[self.split] = ri_next
ix = split_ix[ri]

return ix, wrapped

def _get_minibatch(self, ix):
wrapped = False
if ix == self.dataloader.split_ix[self.split][-1]:
wrapped = True

return (self.dataloader.h5_fc_file['fc'][ix, :].astype('float32'),
self.dataloader.h5_att_file['att'][ix, :, :, :].astype('float32'),
ix,
wrapped)


def get(self):
if len(self.fifo) < 400:
self.reset()
@@ -231,6 +231,5 @@ def get(self):
tmp = self.fifo.pop(0).get()

assert tmp[2] == ix, "ix not equal"
assert tmp[3] == wrapped, "wrapped not equal"

return tmp
return tmp + (wrapped,)
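With this change the train-split prefetcher reshuffles its private copy of the split indices each time it wraps around, while get_batch still hands back a dict of stacked arrays. A hypothetical smoke test of the rewritten loader (the opt fields below mirror what the constructor appears to read; batch_size, seq_per_img and the exact field set are assumptions, since most of __init__ is elided from this diff):

import argparse
from dataloader import DataLoader

opt = argparse.Namespace(
    input_json='data/coco.json',          # info / vocab json
    input_fc_dir='data/cocotalk_fc',      # per-image <id>.npy fc features
    input_att_dir='data/cocotalk_att',    # per-image <id>.npz att features
    input_label_h5='data/coco_label.h5',  # caption labels
    batch_size=16,
    seq_per_img=5,
)

loader = DataLoader(opt)
data = loader.get_batch('train')
print(data['fc_feats'].shape)   # (batch_size * seq_per_img, fc feature dim)
print(data['att_feats'].shape)  # (batch_size * seq_per_img, 14, 14, att feature dim)
print(data['labels'].shape)     # (batch_size * seq_per_img, seq_length + 2)
print(data['masks'].shape)      # same shape as labels
loader.reset_iterator('train')  # tears down and re-creates the train BlobFetcher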
8 changes: 3 additions & 5 deletions eval.py
@@ -11,15 +11,13 @@

import opts
import models
from dataloader_pool import *
from dataloader import *
from dataloaderraw import *
import eval_utils
import argparse
import misc.utils as utils
import torch

NUM_THREADS = 2 #int(os.environ['OMP_NUM_THREADS'])

# Input arguments and options
parser = argparse.ArgumentParser()
# Input paths
@@ -54,9 +52,9 @@
parser.add_argument('--image_root', type=str, default='',
help='In case the image paths have to be preprended with a root path to an image folder')
# For evaluation on MSCOCO images from some split:
parser.add_argument('--input_fc_h5', type=str, default='',
parser.add_argument('--input_fc_dir', type=str, default='',
help='path to the h5file containing the preprocessed dataset')
parser.add_argument('--input_att_h5', type=str, default='',
parser.add_argument('--input_att_dir', type=str, default='',
help='path to the h5file containing the preprocessed dataset')
parser.add_argument('--input_label_h5', type=str, default='',
help='path to the h5file containing the preprocessed dataset')
12 changes: 7 additions & 5 deletions opts.py
@@ -5,10 +5,10 @@ def parse_opt():
# Data input settings
parser.add_argument('--input_json', type=str, default='data/coco.json',
help='path to the json file containing additional info and vocab')
parser.add_argument('--input_fc_h5', type=str, default='data/coco_fc.h5',
help='path to the h5file containing the preprocessed dataset')
parser.add_argument('--input_att_h5', type=str, default='data/coco_att.h5',
help='path to the h5file containing the preprocessed dataset')
parser.add_argument('--input_fc_dir', type=str, default='data/cocotalk_fc',
help='path to the directory containing the preprocessed fc feats')
parser.add_argument('--input_att_dir', type=str, default='data/cocotalk_att',
help='path to the directory containing the preprocessed att feats')
parser.add_argument('--input_label_h5', type=str, default='data/coco_label.h5',
help='path to the h5file containing the preprocessed dataset')
parser.add_argument('--start_from', type=str, default=None,
@@ -62,12 +62,14 @@ def parse_opt():
help='every how many iterations thereafter to drop LR?(in epoch)')
parser.add_argument('--learning_rate_decay_rate', type=float, default=0.8,
help='every how many iterations thereafter to drop LR?(in epoch)')
parser.add_argument('--optim_alpha', type=float, default=0.8,
parser.add_argument('--optim_alpha', type=float, default=0.9,
help='alpha for adam')
parser.add_argument('--optim_beta', type=float, default=0.999,
help='beta used for adam')
parser.add_argument('--optim_epsilon', type=float, default=1e-8,
help='epsilon that goes into denominator for smoothing')
parser.add_argument('--weight_decay', type=float, default=0,
help='weight_decay')

parser.add_argument('--scheduled_sampling_start', type=int, default=-1,
help='at what iteration to start decay gt probability')
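opts.py bumps the optim_alpha default from 0.8 to 0.9 and adds a weight_decay flag; the training script that consumes these options is not part of this diff. Purely as a hedged illustration, they would typically be wired into PyTorch's Adam along these lines (the model and learning rate below are stand-ins, not values from this commit):

import argparse
import torch.nn as nn
import torch.optim as optim

# Stand-in option values and model; the real ones live in the training script.
opt = argparse.Namespace(learning_rate=4e-4, optim_alpha=0.9, optim_beta=0.999,
                         optim_epsilon=1e-8, weight_decay=0)
model = nn.Linear(10, 10)

# optim_alpha / optim_beta map onto Adam's betas; weight_decay is the new flag.
optimizer = optim.Adam(model.parameters(),
                       lr=opt.learning_rate,
                       betas=(opt.optim_alpha, opt.optim_beta),
                       eps=opt.optim_epsilon,
                       weight_decay=opt.weight_decay)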
106 changes: 106 additions & 0 deletions scripts/prepro_feats_npy.py
@@ -0,0 +1,106 @@
"""
Preprocess a raw json dataset into hdf5/json files for use in data_loader.lua
Input: json file that has the form
[{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
example element in this list would look like
{'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
This script reads this json, does some basic preprocessing on the captions
(e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
Output: a json file and an hdf5 file
The hdf5 file contains several fields:
/images is (N,3,256,256) uint8 array of raw image data in RGB format
/labels is (M,max_length) uint32 array of encoded labels, zero padded
/label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the
first and last indices (in range 1..M) of labels for each image
/label_length stores the length of the sequence for each of the M sequences
The json file has a dict that contains:
- an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed
- an 'images' field that is a list holding auxiliary information for each image,
such as in particular the 'split' it was assigned to.
"""

import os
import json
import argparse
from random import shuffle, seed
import string
# non-standard dependencies:
import h5py
from six.moves import cPickle
import numpy as np
import torch
import torchvision.models as models
from torch.autograd import Variable
import skimage.io

from torchvision import transforms as trn
preprocess = trn.Compose([
#trn.ToTensor(),
trn.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

from misc.resnet_utils import myResnet
import misc.resnet as resnet

def main(params):
net = getattr(resnet, params['model'])()
net.load_state_dict(torch.load('/home-nfs/rluo/rluo/model/pytorch-resnet/'+params['model']+'.pth'))
my_resnet = myResnet(net)
my_resnet.cuda()
my_resnet.eval()

imgs = json.load(open(params['input_json'], 'r'))
imgs = imgs['images']
N = len(imgs)

seed(123) # make reproducible

dir_fc = params['ourput_dir']+'_fc'
dir_att = params['ourput_dir']+'_att'
if not os.path.isdir(dir_fc):
os.mkdir(dir_fc)
if not os.path.isdir(dir_att):
os.mkdir(dir_att)

for i,img in enumerate(imgs):
# load the image
I = skimage.io.imread(os.path.join(params['images_root'], img['filepath'], img['filename']))
# handle grayscale input images
if len(I.shape) == 2:
I = I[:,:,np.newaxis]
I = np.concatenate((I,I,I), axis=2)

I = I.astype('float32')/255.0
I = torch.from_numpy(I.transpose([2,0,1])).cuda()
I = Variable(preprocess(I), volatile=True)
tmp_fc, tmp_att = my_resnet(I, params['att_size'])
# write to pkl
np.save(os.path.join(dir_fc, str(img['cocoid'])), tmp_fc.data.cpu().float().numpy())
np.savez_compressed(os.path.join(dir_att, str(img['cocoid'])), feat=tmp_att.data.cpu().float().numpy())

if i % 1000 == 0:
print 'processing %d/%d (%.2f%% done)' % (i, N, i*100.0/N)
print 'wrote ', params['ourput_dir']

if __name__ == "__main__":

parser = argparse.ArgumentParser()

# input json
parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
parser.add_argument('--ourput_dir', default='data', help='output h5 file')

# options
parser.add_argument('--images_root', default='', help='root location in which images are stored, to be prepended to file_path in input json')
parser.add_argument('--att_size', default=14, type=int, help='14x14 or 7x7')
parser.add_argument('--model', default='resnet101', type=str, help='resnet101, resnet152')

args = parser.parse_args()
params = vars(args) # convert to ordinary dict
print 'parsed input parameters:'
print json.dumps(params, indent = 2)
main(params)
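A quick sanity check on what the script writes, assuming the default --ourput_dir of 'data' (giving data_fc/ and data_att/) and an image id that was actually processed; this reuses the same read path as get_npy_data in dataloader.py:

import os
import numpy as np

cocoid = 391895  # any cocoid present in the input json
fc = np.load(os.path.join('data_fc', str(cocoid) + '.npy'))
att = np.load(os.path.join('data_att', str(cocoid) + '.npz'))['feat']
print(fc.shape)   # pooled fc feature from myResnet
print(att.shape)  # att_size x att_size spatial feature map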