Skip to content

Commit

Permalink
adding dataframe pipeline commands, tests, and notebook (mdbloice#94)
Browse files Browse the repository at this point in the history
  • Loading branch information
kmader committed Mar 27, 2018
1 parent cc9946f commit b520717
Show file tree
Hide file tree
Showing 6 changed files with 638 additions and 4 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ install:
- source activate test-environment
# Attempt to install torchvision; on failure, revert back to pre-conda environment.
- conda install -q -y torchvision -c soumith || export PATH="$OPATH"
# Install pandas
- conda install -q -y pandas
- pip install -r requirements.txt
# command to run tests
script: py.test -v
28 changes: 28 additions & 0 deletions Augmentor/ImageUtilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,34 @@ def scan(source_directory, output_directory):

return augmentor_images, class_labels

def scan_dataframe(source_dataframe, image_col, category_col, output_directory):
    """
    Scan a pandas DataFrame for image paths and class labels, building one
    :class:`AugmentorImage` per row.

    :param source_dataframe: A pandas DataFrame containing the dataset.
    :param image_col: Name of the column holding the image file paths.
    :param category_col: Name of the column holding the class names.
    :param output_directory: Directory where augmented images will be saved;
     converted to an absolute path before being stored on each image.
    :return: A tuple ``(augmentor_images, class_labels)`` where
     ``class_labels`` is a list of ``(index, category_name)`` pairs.
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError('Pandas is required to use the scan_dataframe function!\nrun pip install pandas and try again')

    # Ensure the label column is treated as categorical data so we get a
    # stable category list and integer codes.
    cat_col_series = pd.Categorical(source_dataframe[category_col])
    abs_output_directory = os.path.abspath(output_directory)
    class_labels = list(enumerate(cat_col_series.categories))

    augmentor_images = []

    # np.asarray(cat_col_series) replaces the deprecated
    # Categorical.get_values(), which was removed in pandas 1.0.
    for image_path, cat_name, cat_id in zip(source_dataframe[image_col].values,
                                            np.asarray(cat_col_series),
                                            cat_col_series.codes):

        a = AugmentorImage(image_path=image_path, output_directory=abs_output_directory)
        a.class_label = cat_name
        a.class_label_int = cat_id
        # One-hot encode the integer class id.
        categorical_label = np.zeros(len(class_labels), dtype=np.uint32)
        categorical_label[cat_id] = 1
        a.categorical_label = categorical_label
        # Extension without the leading dot. lstrip(".") yields the same
        # result as the old split(".")[1] for normal paths, but returns ""
        # instead of raising IndexError when a path has no extension.
        a.file_format = os.path.splitext(image_path)[1].lstrip(".")
        augmentor_images.append(a)

    return augmentor_images, class_labels


def scan_directory(source_directory):
"""
Expand Down
56 changes: 54 additions & 2 deletions Augmentor/Pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from builtins import *

from .Operations import *
from .ImageUtilities import scan_directory, scan, AugmentorImage
from .ImageUtilities import scan_directory, scan, scan_dataframe, AugmentorImage

import os
import sys
Expand Down Expand Up @@ -128,6 +128,14 @@ def _populate(self, source_directory, output_directory, ground_truth_directory,
# Scan the directory that user supplied.
self.augmentor_images, self.class_labels = scan(source_directory, abs_output_directory)

self._check_images(abs_output_directory)

def _check_images(self, abs_output_directory):
"""
Private method. Used to check and get the dimensions of all of the images
:param abs_output_directory: the absolute path of the output directory
:return:
"""
# Make output directory/directories
if len(set(self.class_labels)) <= 1: # Fixed bad bug by adding set() function here.
if not os.path.exists(abs_output_directory):
Expand All @@ -142,7 +150,6 @@ def _populate(self, source_directory, output_directory, ground_truth_directory,
os.makedirs(os.path.join(abs_output_directory, str(class_label[0])))
except IOError:
print("Insufficient rights to read or write output directory (%s)" % abs_output_directory)

# Check the images, read their dimensions, and remove them if they cannot be read
# TODO: Do not throw an error here, just remove the image and continue.
for augmentor_image in self.augmentor_images:
Expand Down Expand Up @@ -1526,3 +1533,48 @@ def get_ground_truth_paths(self):
paths.append((augmentor_image.image_path, augmentor_image.ground_truth))

return paths

class DataFramePipeline(Pipeline):
    """
    A :class:`Pipeline` whose image set is defined by a pandas DataFrame
    rather than a source directory on disk.
    """
    def __init__(self, source_dataframe, image_col, category_col, output_directory="output", save_format=None):
        """
        Create a new Pipeline object pointing to a dataframe containing the
        paths to your original image dataset.

        The column named by :attr:`image_col` holds the path of each image
        and the column named by :attr:`category_col` holds the name of its
        category.

        :param source_dataframe: A Pandas DataFrame where the images are located.
        :param image_col: Name of the column containing the image file paths.
        :param category_col: Name of the column containing the category names.
        :param output_directory: Specifies where augmented images should be
         saved to the disk. Default is the absolute path
        :param save_format: The file format to use when saving newly created,
         augmented images. Default is JPEG. Legal options are BMP, PNG, and
         GIF.
        :return: A :class:`Pipeline` object.
        """
        super(DataFramePipeline, self).__init__(source_directory=None,
                                                output_directory=output_directory,
                                                save_format=save_format)
        self._populate(source_dataframe,
                       image_col,
                       category_col,
                       output_directory,
                       save_format)

    def _populate(self,
                  source_dataframe,
                  image_col,
                  category_col,
                  output_directory,
                  save_format):
        """
        Private method. Scan the dataframe, build the image list, and verify
        the images (creating the output directories as a side effect).

        :param save_format: Accepted for call compatibility but unused here;
         the save format is stored by the parent class's ``__init__``.
        :return: None.
        """
        # Assume we have an absolute path for the output.
        # Scan the dataframe that the user supplied.
        self.augmentor_images, self.class_labels = scan_dataframe(source_dataframe,
                                                                  image_col,
                                                                  category_col,
                                                                  output_directory)

        self._check_images(output_directory)



4 changes: 2 additions & 2 deletions Augmentor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@
"""

from .Pipeline import Pipeline
from .Pipeline import Pipeline, DataFramePipeline

__author__ = """Marcus D. Bloice"""
__email__ = '[email protected]'
__version__ = '0.2.0'

__all__ = ['Pipeline']
__all__ = ['Pipeline', 'DataFramePipeline']
Loading

0 comments on commit b520717

Please sign in to comment.