Skip to content

Commit

Permalink
adding dataframe pipeline commands, tests, and notebook (mdbloice#94)
Browse files Browse the repository at this point in the history
  • Loading branch information
kmader committed Mar 27, 2018
1 parent cc9946f commit b520717
Show file tree
Hide file tree
Showing 6 changed files with 638 additions and 4 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ install:
- source activate test-environment
# Attempt to install torchvision; on failure, revert back to pre-conda environment.
- conda install -q -y torchvision -c soumith || export PATH="$OPATH"
# Install pandas
- conda install -q -y pandas
- pip install -r requirements.txt
# command to run tests
script: py.test -v
28 changes: 28 additions & 0 deletions Augmentor/ImageUtilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,34 @@ def scan(source_directory, output_directory):

return augmentor_images, class_labels

def scan_dataframe(source_dataframe, image_col, category_col, output_directory):
    """
    Scan a pandas DataFrame for image paths and class labels, building one
    :class:`AugmentorImage` per row.

    :param source_dataframe: A pandas DataFrame containing the dataset.
    :param image_col: Name of the column holding the image file paths.
    :param category_col: Name of the column holding the class names.
    :param output_directory: Directory where augmented images will be saved;
     converted to an absolute path before being stored on each image.
    :return: A tuple ``(augmentor_images, class_labels)`` where
     ``class_labels`` is a list of ``(index, category_name)`` pairs.
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError('Pandas is required to use the scan_dataframe function!\nrun pip install pandas and try again')

    # Ensure the label column is treated as categorical data so we get a
    # stable category list and integer codes.
    cat_col_series = pd.Categorical(source_dataframe[category_col])
    abs_output_directory = os.path.abspath(output_directory)
    class_labels = list(enumerate(cat_col_series.categories))

    augmentor_images = []

    # np.asarray(cat_col_series) replaces the deprecated
    # Categorical.get_values(), which was removed in pandas 1.0.
    for image_path, cat_name, cat_id in zip(source_dataframe[image_col].values,
                                            np.asarray(cat_col_series),
                                            cat_col_series.codes):

        a = AugmentorImage(image_path=image_path, output_directory=abs_output_directory)
        a.class_label = cat_name
        a.class_label_int = cat_id
        # One-hot encode the integer class id.
        categorical_label = np.zeros(len(class_labels), dtype=np.uint32)
        categorical_label[cat_id] = 1
        a.categorical_label = categorical_label
        # Extension without the leading dot. lstrip(".") yields the same
        # result as the old split(".")[1] for normal paths, but returns ""
        # instead of raising IndexError when a path has no extension.
        a.file_format = os.path.splitext(image_path)[1].lstrip(".")
        augmentor_images.append(a)

    return augmentor_images, class_labels


def scan_directory(source_directory):
"""
Expand Down
56 changes: 54 additions & 2 deletions Augmentor/Pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from builtins import *

from .Operations import *
from .ImageUtilities import scan_directory, scan, AugmentorImage
from .ImageUtilities import scan_directory, scan, scan_dataframe, AugmentorImage

import os
import sys
Expand Down Expand Up @@ -128,6 +128,14 @@ def _populate(self, source_directory, output_directory, ground_truth_directory,
# Scan the directory that user supplied.
self.augmentor_images, self.class_labels = scan(source_directory, abs_output_directory)

self._check_images(abs_output_directory)

def _check_images(self, abs_output_directory):
"""
Private method. Used to check and get the dimensions of all of the images
:param abs_output_directory: the absolute path of the output directory
:return:
"""
# Make output directory/directories
if len(set(self.class_labels)) <= 1: # Fixed bad bug by adding set() function here.
if not os.path.exists(abs_output_directory):
Expand All @@ -142,7 +150,6 @@ def _populate(self, source_directory, output_directory, ground_truth_directory,
os.makedirs(os.path.join(abs_output_directory, str(class_label[0])))
except IOError:
print("Insufficient rights to read or write output directory (%s)" % abs_output_directory)

# Check the images, read their dimensions, and remove them if they cannot be read
# TODO: Do not throw an error here, just remove the image and continue.
for augmentor_image in self.augmentor_images:
Expand Down Expand Up @@ -1526,3 +1533,48 @@ def get_ground_truth_paths(self):
paths.append((augmentor_image.image_path, augmentor_image.ground_truth))

return paths

class DataFramePipeline(Pipeline):
    """
    A :class:`Pipeline` whose image set is defined by a pandas DataFrame
    rather than a source directory on disk.
    """
    def __init__(self, source_dataframe, image_col, category_col, output_directory="output", save_format=None):
        """
        Create a new Pipeline object pointing to a dataframe containing the
        paths to your original image dataset.

        The column named by :attr:`image_col` holds the path of each image
        and the column named by :attr:`category_col` holds the name of its
        category.

        :param source_dataframe: A Pandas DataFrame where the images are located.
        :param image_col: Name of the column containing the image file paths.
        :param category_col: Name of the column containing the category names.
        :param output_directory: Specifies where augmented images should be
         saved to the disk. Default is the absolute path
        :param save_format: The file format to use when saving newly created,
         augmented images. Default is JPEG. Legal options are BMP, PNG, and
         GIF.
        :return: A :class:`Pipeline` object.
        """
        super(DataFramePipeline, self).__init__(source_directory=None,
                                                output_directory=output_directory,
                                                save_format=save_format)
        self._populate(source_dataframe,
                       image_col,
                       category_col,
                       output_directory,
                       save_format)

    def _populate(self,
                  source_dataframe,
                  image_col,
                  category_col,
                  output_directory,
                  save_format):
        """
        Private method. Scan the dataframe, build the image list, and verify
        the images (creating the output directories as a side effect).

        :param save_format: Accepted for call compatibility but unused here;
         the save format is stored by the parent class's ``__init__``.
        :return: None.
        """
        # Assume we have an absolute path for the output.
        # Scan the dataframe that the user supplied.
        self.augmentor_images, self.class_labels = scan_dataframe(source_dataframe,
                                                                  image_col,
                                                                  category_col,
                                                                  output_directory)

        self._check_images(output_directory)



4 changes: 2 additions & 2 deletions Augmentor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@
"""

from .Pipeline import Pipeline
from .Pipeline import Pipeline, DataFramePipeline

__author__ = """Marcus D. Bloice"""
__email__ = '[email protected]'
__version__ = '0.2.0'

__all__ = ['Pipeline']
__all__ = ['Pipeline', 'DataFramePipeline']
Loading

0 comments on commit b520717

Please sign in to comment.