From 76e7bf6b1058f2b824816b17321a595c0635e543 Mon Sep 17 00:00:00 2001
From: Artsiom
Date: Tue, 3 May 2016 21:35:42 +0800
Subject: [PATCH 1/2] Add a larger MJSynth dataset of cropped word images

---
 fuel/converters/__init__.py  |   4 +-
 fuel/converters/mjsynth.py   | 202 +++++++++++++++++++++++++++++++++++
 fuel/datasets/__init__.py    |   1 +
 fuel/datasets/mjsynth.py     |  23 ++++
 fuel/downloaders/__init__.py |   4 +-
 fuel/downloaders/mjsynth.py  |  19 ++++
 tests/test_mjsynth.py        |  47 ++++++++
 7 files changed, 298 insertions(+), 2 deletions(-)
 create mode 100644 fuel/converters/mjsynth.py
 create mode 100644 fuel/datasets/mjsynth.py
 create mode 100644 fuel/downloaders/mjsynth.py
 create mode 100644 tests/test_mjsynth.py

diff --git a/fuel/converters/__init__.py b/fuel/converters/__init__.py
index 8e50b3dfc..a9db67b64 100644
--- a/fuel/converters/__init__.py
+++ b/fuel/converters/__init__.py
@@ -22,6 +22,7 @@
 from fuel.converters import svhn
 from fuel.converters import ilsvrc2010
 from fuel.converters import youtube_audio
+from fuel.converters import mjsynth
 
 __version__ = '0.2'
 all_converters = (
@@ -36,4 +37,5 @@
     ('mnist', mnist.fill_subparser),
     ('svhn', svhn.fill_subparser),
     ('ilsvrc2010', ilsvrc2010.fill_subparser),
-    ('youtube_audio', youtube_audio.fill_subparser))
+    ('youtube_audio', youtube_audio.fill_subparser),
+    ('mjsynth', mjsynth.fill_subparser))
diff --git a/fuel/converters/mjsynth.py b/fuel/converters/mjsynth.py
new file mode 100644
index 000000000..17a8cb6a1
--- /dev/null
+++ b/fuel/converters/mjsynth.py
@@ -0,0 +1,202 @@
+import io
+import os
+import tarfile
+import linecache
+import shutil
+
+import h5py
+import numpy
+from PIL import Image
+
+from fuel.converters.base import progress_bar, check_exists
+from fuel.datasets.hdf5 import H5PYDataset
+
+DISTRIBUTION_FILE = 'mjsynth.tar.gz'
+TAR_PATH = 'mnt/ramdisk/max/90kDICT32px'
+TRAIN_SIZE = 7224586
+TEST_SIZE = 891924
+VAL_SIZE = 802731
+BAD = {'train': ['./2194/2/334_EFFLORESCENT_24742.jpg',
+                 './2128/2/369_REDACTED_63458.jpg',
+                 './2069/4/192_whittier_86389.jpg',
+                 './2025/2/364_SNORTERS_72304.jpg',
+                 './2013/2/370_refract_63890.jpg',
+                 './1881/4/225_Marbling_46673.jpg',
+                 './1863/4/223_Diligently_21672.jpg',
+                 './1817/2/363_actuating_904.jpg',
+                 './1730/2/361_HEREON_35880.jpg',
+                 './1696/4/211_Queened_61779.jpg',
+                 './1650/2/355_stony_74902.jpg',
+                 './1332/4/224_TETHERED_78397.jpg',
+                 './936/2/375_LOCALITIES_44992.jpg',
+                 './913/4/231_randoms_62372.jpg',
+                 './905/4/234_Postscripts_59142.jpg',
+                 './869/4/234_TRIASSIC_80582.jpg',
+                 './627/6/83_PATRIARCHATE_55931.jpg',
+                 './596/2/372_Ump_81662.jpg',
+                 './554/2/366_Teleconferences_77948.jpg',
+                 './495/6/81_MIDYEAR_48332.jpg',
+                 './429/4/208_Mainmasts_46140.jpg',
+                 './384/4/220_bolts_8596.jpg',
+                 './368/4/232_friar_30876.jpg',
+                 './275/6/96_hackle_34465.jpg',
+                 './264/2/362_FORETASTE_30276.jpg',
+                 './173/2/358_BURROWING_10395.jpg'],
+       'test': ['./2911/6/77_heretical_35885.jpg',
+                './2852/6/60_TOILSOME_79481.jpg',
+                './2749/6/101_Chided_13155.jpg'],
+       'val': ['./2557/2/351_DOWN_23492.jpg',
+               './2540/4/246_SQUAMOUS_73902.jpg',
+               './2489/4/221_snored_72290.jpg']}
+
+
+@check_exists(required_files=[DISTRIBUTION_FILE])
+def convert_mjsynth(directory, output_directory,
+                    output_filename='mjsynth.hdf5'):
+    """Converts the MJSynth dataset to HDF5.
+
+    Converts the MJSynth dataset to an HDF5 dataset compatible with
+    :class:`fuel.datasets.MJSynth`. The converted dataset is saved as
+    'mjsynth.hdf5'.
+
+    It assumes the existence of the following file:
+
+    * `mjsynth.tar.gz`
+
+    Parameters
+    ----------
+    directory : str
+        Directory in which input files reside.
+    output_directory : str
+        Directory in which to save the converted dataset.
+    output_filename : str, optional
+        Name of the saved dataset. Defaults to 'mjsynth.hdf5'.
+
+    Returns
+    -------
+    output_paths : tuple of str
+        Single-element tuple containing the path to the converted dataset.
+
+    """
+    num_examples = TRAIN_SIZE + TEST_SIZE + VAL_SIZE
+    output_path = os.path.join(output_directory, output_filename)
+    h5file = h5py.File(output_path, mode='w')
+    dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
+    hdf_features = h5file.create_dataset('features', (num_examples,),
+                                         dtype=dtype)
+    hdf_features_shapes = h5file.create_dataset('features_shapes',
+                                                (num_examples, 3),
+                                                dtype='int32')
+    dtype = h5py.special_dtype(vlen=numpy.dtype('S1'))
+    hdf_labels = h5file.create_dataset('targets', (num_examples,), dtype=dtype)
+    hdf_labels_shapes = h5file.create_dataset('targets_shapes',
+                                              (num_examples, 1),
+                                              dtype='int32')
+
+    # Attach shape annotations and scales
+    hdf_features.dims.create_scale(hdf_features_shapes, 'shapes')
+    hdf_features.dims[0].attach_scale(hdf_features_shapes)
+
+    hdf_labels.dims.create_scale(hdf_labels_shapes, 'shapes')
+    hdf_labels.dims[0].attach_scale(hdf_labels_shapes)
+
+    hdf_shapes_labels = h5file.create_dataset('features_shapes_labels',
+                                              (3,), dtype='S7')
+    hdf_shapes_labels[...] = ['channel'.encode('utf8'),
+                              'height'.encode('utf8'),
+                              'width'.encode('utf8')]
+    hdf_features.dims.create_scale(hdf_shapes_labels, 'shape_labels')
+    hdf_features.dims[0].attach_scale(hdf_shapes_labels)
+
+    hdf_shapes_labels = h5file.create_dataset('targets_shapes_labels',
+                                              (1,), dtype='S5')
+
+    hdf_shapes_labels[...] = ['index'.encode('utf8')]
+    hdf_labels.dims.create_scale(hdf_shapes_labels, 'shape_labels')
+    hdf_labels.dims[0].attach_scale(hdf_shapes_labels)
+
+    # Add axis annotations
+    hdf_features.dims[0].label = 'batch'
+    hdf_labels.dims[0].label = 'batch'
+
+    # Extract the temp files from the TAR archive
+    if not os.path.exists(os.path.join(output_directory, TAR_PATH)):
+        input_file = os.path.join(directory, DISTRIBUTION_FILE)
+        with progress_bar('tar', os.path.getsize(input_file),
+                          prefix='Extracting') as bar:
+            class ProgressFileObject(io.FileIO):
+                def read(self, size=-1):
+                    bar.update(self.tell())
+                    return io.FileIO.read(self, size)
+
+            tar_file = tarfile.open(fileobj=ProgressFileObject(input_file))
+            tar_file.extractall(path=output_directory)
+            tar_file.close()
+
+    # Convert
+    i = 0
+    for split, split_size in zip(['train', 'test', 'val'],
+                                 [TRAIN_SIZE, TEST_SIZE, VAL_SIZE]):
+        annotation_file = os.path.join(output_directory, TAR_PATH,
+                                       'annotation_%s.txt' % split)
+        # Convert from JPEG to NumPy arrays
+        with progress_bar(split, split_size) as bar:
+            for line in open(annotation_file):
+                # Parse the annotation line: image path and lexicon index
+                filename, word_index = line.split()
+                if filename not in BAD[split]:
+                    image = numpy.array(
+                        Image.open(
+                            os.path.join(output_directory, TAR_PATH,
+                                         filename[2:])))
+
+                    image = image.transpose(2, 0, 1)
+                    hdf_features[i] = image.flatten()
+                    hdf_features_shapes[i] = image.shape
+
+                    # Read the word from the lexicon, stripping the '\n'
+                    word = list(linecache.getline(
+                        os.path.join(output_directory, TAR_PATH,
+                                     'lexicon.txt'),
+                        int(word_index))[:-1])
+                    hdf_labels[i] = numpy.array(word)
+                    hdf_labels_shapes[i] = len(word)
+
+                    if i % 1000 == 0:
+                        h5file.flush()
+                    # Update progress
+                    i += 1
+                    value = i if split == 'train' else \
+                        i - TRAIN_SIZE if split == 'test' else \
+                        i - TRAIN_SIZE - TEST_SIZE
+                    bar.update(value)
+
+    # Add the labels
+    split_dict = {}
+    sources = ['features', 'targets']
+    split_dict['train'] = dict(zip(sources, [(0, TRAIN_SIZE)] * 2))
+    split_dict['test'] = dict(
+        zip(sources, [(TRAIN_SIZE, TRAIN_SIZE + TEST_SIZE)] * 2))
+    split_dict['val'] = dict(zip(sources, [
+        (TRAIN_SIZE + TEST_SIZE, TRAIN_SIZE + TEST_SIZE + VAL_SIZE)] * 2))
+    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
+
+    h5file.flush()
+    h5file.close()
+
+    # Remove the extracted temp files from under output_directory
+    shutil.rmtree(os.path.join(output_directory, TAR_PATH.split('/')[0]))
+
+    return (output_path,)
+
+
+def fill_subparser(subparser):
+    """Sets up a subparser to convert the MJSynth dataset files.
+
+    Parameters
+    ----------
+    subparser : :class:`argparse.ArgumentParser`
+        Subparser handling the `mjsynth` command.
+
+    """
+    return convert_mjsynth
diff --git a/fuel/datasets/__init__.py b/fuel/datasets/__init__.py
index 376ad0574..106260273 100644
--- a/fuel/datasets/__init__.py
+++ b/fuel/datasets/__init__.py
@@ -15,3 +15,4 @@
 from fuel.datasets.svhn import SVHN
 from fuel.datasets.text import TextFile
 from fuel.datasets.billion import OneBillionWord
+from fuel.datasets.mjsynth import MJSynth
diff --git a/fuel/datasets/mjsynth.py b/fuel/datasets/mjsynth.py
new file mode 100644
index 000000000..06979619c
--- /dev/null
+++ b/fuel/datasets/mjsynth.py
@@ -0,0 +1,23 @@
+from fuel.datasets import H5PYDataset
+from fuel.transformers import ScaleAndShift
+from fuel.utils import find_in_data_path
+
+
+class MJSynth(H5PYDataset):
+    """The MJSynth dataset of cropped word images.
+
+    Parameters
+    ----------
+    which_sets : tuple of str
+        Which splits to load. Valid values are 'train', 'test' and 'val'.
+
+    """
+    filename = 'mjsynth.hdf5'
+
+    default_transformers = ((ScaleAndShift, [1 / 255.0, 0],
+                             {'which_sources': ('features',)}),)
+
+    def __init__(self, which_sets, **kwargs):
+        super(MJSynth, self).__init__(
+            file_or_path=find_in_data_path(self.filename),
+            which_sets=which_sets, **kwargs)
diff --git a/fuel/downloaders/__init__.py b/fuel/downloaders/__init__.py
index f4393a891..9f0d0eb4b 100644
--- a/fuel/downloaders/__init__.py
+++ b/fuel/downloaders/__init__.py
@@ -18,6 +18,7 @@
 from fuel.downloaders import svhn
 from fuel.downloaders import ilsvrc2010
 from fuel.downloaders import youtube_audio
+from fuel.downloaders import mjsynth
 
 all_downloaders = (
     ('adult', adult.fill_subparser),
@@ -31,4 +32,5 @@
     ('svhn', svhn.fill_subparser),
     ('ilsvrc2010', ilsvrc2010.fill_subparser),
     ('dogs_vs_cats', dogs_vs_cats.fill_subparser),
-    ('youtube_audio', youtube_audio.fill_subparser))
+    ('youtube_audio', youtube_audio.fill_subparser),
+    ('mjsynth', mjsynth.fill_subparser))
diff --git a/fuel/downloaders/mjsynth.py b/fuel/downloaders/mjsynth.py
new file mode 100644
index 000000000..f8fdae7fb
--- /dev/null
+++ b/fuel/downloaders/mjsynth.py
@@ -0,0 +1,19 @@
+from fuel.downloaders.base import default_downloader
+
+
+def fill_subparser(subparser):
+    """Sets up a subparser to download the MJSynth dataset file.
+
+    The MJSynth dataset file `mjsynth.tar.gz` is downloaded from the VGG
+    Text Recognition Data webpage [VGG].
+
+    Parameters
+    ----------
+    subparser : :class:`argparse.ArgumentParser`
+        Subparser handling the `mjsynth` command.
+
+    """
+    subparser.set_defaults(
+        urls=['http://www.robots.ox.ac.uk/~vgg/data/text/mjsynth.tar.gz'],
+        filenames=['mjsynth.tar.gz'])
+    return default_downloader
diff --git a/tests/test_mjsynth.py b/tests/test_mjsynth.py
new file mode 100644
index 000000000..625be6b7f
--- /dev/null
+++ b/tests/test_mjsynth.py
@@ -0,0 +1,47 @@
+import numpy
+
+from fuel.datasets import MJSynth
+from fuel.streams import DataStream
+from fuel.schemes import SequentialScheme
+
+
+def test_mjsynth():
+    train = MJSynth(('train',), load_in_memory=False)
+    assert train.num_examples == 7224586
+    handle = train.open()
+    features, targets = train.get_data(handle, slice(7224576, 7224586))
+
+    assert features.shape[:1] == (10,)
+    assert targets.shape[:1] == (10,)
+    train.close(handle)
+
+    test = MJSynth(('test',), load_in_memory=False)
+    assert test.num_examples == 891924
+    handle = test.open()
+    features, targets = test.get_data(handle, slice(0, 15))
+
+    assert features.shape[:1] == (15,)
+    assert targets.shape[:1] == (15,)
+
+    assert features[0].dtype == numpy.uint8
+    assert targets[0].dtype == numpy.dtype('S1')
+
+    test.close(handle)
+
+    val = MJSynth(('val',), load_in_memory=False)
+    assert val.num_examples == 802731
+    handle = val.open()
+    features, targets = val.get_data(handle, slice(49990, 50000))
+
+    assert features.shape[:1] == (10,)
+    assert targets.shape[:1] == (10,)
+    val.close(handle)
+
+    stream = DataStream.default_stream(
+        test, iteration_scheme=SequentialScheme(10, 10))
+    features = next(stream.get_epoch_iterator())[0]
+
+    # The default transformer rescales features, so every pixel value
+    # must lie in [0, 1].
+    for feature in features:
+        assert feature.min() >= 0.0 and feature.max() <= 1.0

From bad1c9bd6381d1a6085db6f210616982e5911d1e Mon Sep 17 00:00:00 2001
From: Artsiom
Date: Tue, 3 May 2016 22:09:33 +0800
Subject: [PATCH 2/2] Fix the dataset tests

---
 tests/test_mjsynth.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_mjsynth.py b/tests/test_mjsynth.py
index 625be6b7f..3d2e1cd54 100644
--- a/tests/test_mjsynth.py
+++ b/tests/test_mjsynth.py
@@ -3,9 +3,12 @@
 from fuel.datasets import MJSynth
 from fuel.streams import DataStream
 from fuel.schemes import SequentialScheme
+from tests import skip_if_not_available
 
 
 def test_mjsynth():
+    skip_if_not_available(datasets=['mjsynth.hdf5'])
+
     train = MJSynth(('train',), load_in_memory=False)
     assert train.num_examples == 7224586
     handle = train.open()
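
Reviewer note (not part of the patches): a minimal usage sketch of the new dataset. It assumes the archive has already been fetched and converted through the subcommands these commits register, i.e. `fuel-download mjsynth` followed by `fuel-convert mjsynth`, and that the resulting `mjsynth.hdf5` is discoverable on the Fuel data path so that `find_in_data_path` can resolve it. The batch size of 32 below is arbitrary.

    from fuel.datasets import MJSynth
    from fuel.schemes import SequentialScheme
    from fuel.streams import DataStream

    # Load the validation split lazily from the HDF5 file.
    val = MJSynth(('val',), load_in_memory=False)

    # default_stream applies the dataset's default_transformers, i.e.
    # ScaleAndShift with scale 1 / 255.0 and shift 0 on 'features',
    # so pixel values arrive rescaled to [0, 1].
    stream = DataStream.default_stream(
        val, iteration_scheme=SequentialScheme(val.num_examples, 32))

    features, targets = next(stream.get_epoch_iterator())
    # Each feature is a (channel, height, width) image; each target is
    # an array of 'S1' characters spelling the cropped word.
    print(features[0].shape, b''.join(targets[0]))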