twitter · gsteelman · Aug 11, 2014 · Aug 11, 2014 · Aug 14, 2014 · Sep 1, 2014
diff --git a/src/main/java/com/hadoop/compression/lzo/DistributedLzoIndexer.java b/src/main/java/com/hadoop/compression/lzo/DistributedLzoIndexer.java
@@ -9,8 +9,10 @@
 import com.hadoop.mapreduce.LzoIndexOutputFormat;
 import com.hadoop.mapreduce.LzoSplitInputFormat;
 import com.hadoop.mapreduce.LzoSplitRecordReader;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
@@ -25,57 +27,108 @@
 
 public class DistributedLzoIndexer extends Configured implements Tool {
   private static final Log LOG = LogFactory.getLog(DistributedLzoIndexer.class);
-  private final String LZO_EXTENSION = new LzopCodec().getDefaultExtension();
 
+  private static final String LZO_EXTENSION = new LzopCodec().getDefaultExtension();
+  private static final String TEMP_FILE_EXTENSION = "/_temporary";
+  private static final String LZO_SKIP_INDEXING_SMALL_FILES_KEY = "lzo_skip_indexing_small_files";
+  private static final String LZO_SMALL_FILE_SIZE_KEY = "lzo_small_file_size";
+  private static final String LZO_RECURSIVE_INDEXING_KEY = "lzo_recursive_indexing";
+  private static final boolean LZO_SKIP_INDEXING_SMALL_FILES_DEFAULT = false;
+  private static final boolean LZO_RECURSIVE_INDEXING_DEFAULT = true;
+  private static final long LZO_SMALL_FILE_SIZE_DEFAULT = 0;
+  private boolean lzoSkipIndexingSmallFiles = LZO_SKIP_INDEXING_SMALL_FILES_DEFAULT;
+  private boolean lzoRecursiveIndexing = LZO_RECURSIVE_INDEXING_DEFAULT;
+  private long lzoSmallFileSize = LZO_SMALL_FILE_SIZE_DEFAULT;
+  // Mirror of getConf(): a field initializer would capture null, so setConf() keeps it current.
+  private Configuration conf;
+  @Override
+  public void setConf(Configuration conf) { super.setConf(conf); this.conf = conf; }
+
+  /**
+   * Rejects any path whose string form ends with TEMP_FILE_EXTENSION; accepts every other path.
+   */
   private final PathFilter nonTemporaryFilter = new PathFilter() {
+    @Override
     public boolean accept(Path path) {
-      return !path.toString().endsWith("/_temporary");
+      return !path.toString().endsWith(TEMP_FILE_EXTENSION);
     }
   };
 
-  private void walkPath(Path path, PathFilter pathFilter, List<Path> accumulator) {
+  /**
+   * Tells whether the file's length is strictly below the lzoSmallFileSize threshold.
+   */
+  private boolean isSmallFile(FileStatus status) {
+    return lzoSmallFileSize > status.getLen();
+  }
+
+  private void visitPath(Path path, PathFilter pathFilter, List<Path> accumulator, boolean recursive) { // collects the files to index found at path into accumulator
     try {
-      FileSystem fs = path.getFileSystem(getConf());
+      FileSystem fs = path.getFileSystem(this.conf); // NOTE(review): relies on this.conf being populated before the walk starts - confirm
       FileStatus fileStatus = fs.getFileStatus(path);
 
-      if (fileStatus.isDir()) {
-        FileStatus[] children = fs.listStatus(path, pathFilter);
-        for (FileStatus childStatus : children) {
-          walkPath(childStatus.getPath(), pathFilter, accumulator);
-        }
-      } else if (path.toString().endsWith(LZO_EXTENSION)) {
-        Path lzoIndexPath = path.suffix(LzoIndex.LZO_INDEX_SUFFIX);
-        if (fs.exists(lzoIndexPath)) {
-          // If the index exists and is of nonzero size, we're already done.
-          // We re-index a file with a zero-length index, because every file has at least one block.
-          if (fs.getFileStatus(lzoIndexPath).getLen() > 0) {
-            LOG.info("[SKIP] LZO index file already exists for " + path);
-            return;
-          } else {
-            LOG.info("Adding LZO file " + path + " to indexing list (index file exists but is zero length)");
-            accumulator.add(path);
+      if (fileStatus.isDirectory()) {
+        if (recursive) {
+          FileStatus[] children = fs.listStatus(path, pathFilter); // pathFilter decides which children are visited at all
+          for (FileStatus childStatus : children) {
+            visitPath(childStatus.getPath(), pathFilter, accumulator, recursive);
           }
         } else {
-          // If no index exists, we need to index the file.
-          LOG.info("Adding LZO file " + path + " to indexing list (no index currently exists)");
-          accumulator.add(path);
+          LOG.info("[SKIP] Path " + path + " is a directory and recursion is not enabled.");
         }
+      } else if (shouldIndexPath(fileStatus, fs)) { // non-directory: the decision is delegated to shouldIndexPath()
+        accumulator.add(path);
       }
     } catch (IOException ioe) {
       LOG.warn("Error walking path: " + path, ioe);
     }
   }
 
+  /**
+   * Returns true when the file is an LZO file that is not skipped as small and has no index yet
+   * (or only a zero-length one). The index is looked up at the file's path plus LZO_INDEX_SUFFIX.
+   */
+  private boolean shouldIndexPath(FileStatus fileStatus, FileSystem fs) throws IOException {
+    Path path = fileStatus.getPath();
+    if (!path.toString().endsWith(LZO_EXTENSION)) {
+      return false;
+    }
+    if (this.lzoSkipIndexingSmallFiles && isSmallFile(fileStatus)) {
+      LOG.info("[SKIP] Skip indexing small files enabled and " + path + " is too small");
+      return false;
+    }
+    // The index is a sibling named <file> + suffix; new Path(path, suffix) would nest it under the file.
+    Path lzoIndexPath = path.suffix(LzoIndex.LZO_INDEX_SUFFIX);
+    if (!fs.exists(lzoIndexPath)) {
+      LOG.info("Adding LZO file " + path + " to indexing list (no index currently exists)");
+      return true;
+    }
+    // Check the index file's own length: a zero-length index is rebuilt, since every file has a block.
+    if (fs.getFileStatus(lzoIndexPath).getLen() > 0) {
+      LOG.info("[SKIP] LZO index file already exists for " + path);
+      return false;
+    }
+    LOG.info("Adding LZO file " + path + " to indexing list (index file exists but is zero length)");
+    return true;
+  }
+
   public int run(String[] args) throws Exception {
     if (args.length == 0 || (args.length == 1 && "--help".equals(args[0]))) {
       printUsage();
       ToolRunner.printGenericCommandUsage(System.err);
       return -1;
     }
+
+    this.lzoSkipIndexingSmallFiles =
+        this.conf.getBoolean(LZO_SKIP_INDEXING_SMALL_FILES_KEY, LZO_SKIP_INDEXING_SMALL_FILES_DEFAULT);
+
+    this.lzoSmallFileSize =
+        this.conf.getLong(LZO_SMALL_FILE_SIZE_KEY, LZO_SMALL_FILE_SIZE_DEFAULT);
 
+    // Find paths to index based on recursive/not
+    this.lzoRecursiveIndexing = this.conf.getBoolean(LZO_RECURSIVE_INDEXING_KEY, LZO_RECURSIVE_INDEXING_DEFAULT);
     List<Path> inputPaths = new ArrayList<Path>();
-    for (String strPath: args) {
-      walkPath(new Path(strPath), nonTemporaryFilter, inputPaths);
+    for (String strPath : args) {
+      visitPath(new Path(strPath), nonTemporaryFilter, inputPaths, this.lzoRecursiveIndexing);
     }
 
     if (inputPaths.isEmpty()) {
@@ -84,7 +137,7 @@ public int run(String[] args) throws Exception {
       return 0;
     }
 
-    Job job = new Job(getConf());
+    Job job = new Job(this.conf);
     job.setJobName("Distributed Lzo Indexer " + Arrays.toString(args));
 
     job.setOutputKeyClass(Path.class);
@@ -134,7 +187,13 @@ public static void main(String[] args) throws Exception {
     System.exit(exitCode);
   }
 
+  /** Prints the command line and the supported configuration keys to stderr; needs no instance state. */
   public static void printUsage() {
-    System.err.println("Usage: hadoop jar /path/to/this/jar com.hadoop.compression.lzo.DistributedLzoIndexer <file.lzo | directory> [file2.lzo directory3 ...]");
+    System.err.println(
+        "Command: hadoop jar /path/to/this/jar com.hadoop.compression.lzo.DistributedLzoIndexer <file.lzo | directory> [file2.lzo directory3 ...]" +
+        "\nConfiguration options: \"key\" [values] <default> description" +
+        "\n" + LZO_SKIP_INDEXING_SMALL_FILES_KEY + " [true,false] <" + LZO_SKIP_INDEXING_SMALL_FILES_DEFAULT + "> When indexing, skip files smaller than " + LZO_SMALL_FILE_SIZE_KEY + " bytes." +
+        "\n" + LZO_SMALL_FILE_SIZE_KEY + " [long] <" + LZO_SMALL_FILE_SIZE_DEFAULT + "> When indexing, skip files smaller than this number of bytes if " + LZO_SKIP_INDEXING_SMALL_FILES_KEY + " is true." +
+        "\n" + LZO_RECURSIVE_INDEXING_KEY + " [true,false] <" + LZO_RECURSIVE_INDEXING_DEFAULT + "> Look for files to index recursively from paths on command line.");
   }
 }