diff --git a/docsplit.gemspec b/docsplit.gemspec
index d3526bb..2ffb04c 100755
--- a/docsplit.gemspec
+++ b/docsplit.gemspec
@@ -21,4 +21,6 @@ Gem::Specification.new do |s|
 
   s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
                 'docsplit.gemspec', 'LICENSE', 'README']
-end
\ No newline at end of file
+
+  s.add_dependency "nokogiri"
+end
diff --git a/lib/docsplit.rb b/lib/docsplit.rb
index 5001413..395a40f 100755
--- a/lib/docsplit.rb
+++ b/lib/docsplit.rb
@@ -79,6 +79,11 @@ def self.clean_text(text)
     TextCleaner.new.clean(text)
   end
 
+  # Utility method to clean OCR'd hOCR markup via TextCleaner#clean_hocr; returns its result.
+  def self.clean_hocr(html)
+    TextCleaner.new.clean_hocr(html)
+  end
+
   private
 
   # Normalize a value in an options hash for the command line.
diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb
index 7c7af08..f079d45 100755
--- a/lib/docsplit/command_line.rb
+++ b/lib/docsplit/command_line.rb
@@ -91,6 +91,9 @@ def parse_options
         opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
           @options[:ocr] = o
         end
+        opts.on('-c', '--config [FILE]', 'use the specified config file') do |c|
+          @options[:config] = c
+        end
         opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
           @options[:clean] = false
         end
@@ -119,4 +122,4 @@ def parse_options
 
   end
 
-end
\ No newline at end of file
+end
diff --git a/lib/docsplit/text_cleaner.rb b/lib/docsplit/text_cleaner.rb
index c4aac01..ceee965 100644
--- a/lib/docsplit/text_cleaner.rb
+++ b/lib/docsplit/text_cleaner.rb
@@ -1,4 +1,5 @@
 require 'strscan'
+require 'nokogiri'
 
 module Docsplit
 
@@ -32,16 +33,8 @@ class TextCleaner
     REPEATED    = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
     SINGLETONS  = /^[AaIi]$/
 
-    # For the time being, `clean` uses the regular StringScanner, and not the
-    # multibyte-aware version, coercing to ASCII first.
     def clean(text)
-      if String.method_defined?(:encode)
-        text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
-      else
-        require 'iconv' unless defined?(Iconv)
-        text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
-      end
-
+      text = get_conversion_method.call(text)
       scanner = StringScanner.new(text)
       cleaned = []
       spaced  = false
@@ -60,6 +53,31 @@ def clean(text)
       end
     end
 
+    # When cleaning hOCR output, we follow a slightly simplified cleaning
+    # heuristic: for each element with the class 'ocrx_word', take the text
+    # nested inside that element, coerce it to ASCII, and remove the element
+    # from the document if `garbage` flags the word. Returns the XML string.
+    def clean_hocr(xhtml)
+      convert = get_conversion_method
+      xml = Nokogiri::XML(xhtml)
+      xml.css('.ocrx_word').each do |elt|
+        word = elt.xpath(".//text()").text
+        elt.remove if garbage(convert.call(word))
+      end
+      xml.to_s
+    end
+
+    # Build the lambda that coerces text to ASCII before cleaning, since the
+    # cleaner uses the regular StringScanner rather than a multibyte-aware
+    # one. Picks String#encode when available, otherwise falls back to Iconv.
+    def get_conversion_method
+      unless String.method_defined?(:encode)
+        require 'iconv' unless defined?(Iconv)
+        return lambda { |str| Iconv.iconv('ascii//translit//ignore', 'utf-8', str).first }
+      end
+      lambda { |str| str.encode('ascii', :invalid => :replace, :undef => :replace, :replace => '?') }
+    end
+
     # Is a given word OCR garbage?
     def garbage(w)
       acronym = w =~ ACRONYM
diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
index 0d55f32..b10f6e0 100644
--- a/lib/docsplit/text_extractor.rb
+++ b/lib/docsplit/text_extractor.rb
@@ -1,3 +1,5 @@
+require 'nokogiri'
+
 module Docsplit
 
   # Delegates to **pdftotext** and **tesseract** in order to extract text from
@@ -21,6 +23,10 @@ class TextExtractor
 
     MIN_TEXT_PER_PAGE = 100 # in bytes
 
+    HOCR_SECTIONS = [ [ '.ocr_par',   "\n\n" ],
+                      [ '.ocr_line',  "\n"   ],
+                      [ '.ocrx_word', " "    ] ]
+
     def initialize
       @pages_to_ocr = []
     end
@@ -66,16 +72,20 @@ def extract_from_ocr(pdf, pages)
           escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
           run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
-          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
-          clean_text(file + '.txt') if @clean_ocr
+          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{@config} 2>&1"
+          run "cp #{escaped_tiff} #{base_path}_#{page}.tif" if @gen_hocr
+          clean_ocr(file) if @clean_ocr
+          generate_text_and_annotate(file) if @gen_hocr
           FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
         escaped_tiff = ESCAPE[tiff]
         run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
-        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
-        clean_text(base_path + '.txt') if @clean_ocr
+        run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{@config} 2>&1"
+        run "cp #{escaped_tiff} #{base_path}.tif" if @gen_hocr
+        clean_ocr(base_path) if @clean_ocr
+        generate_text_and_annotate(base_path) if @gen_hocr
       end
     ensure
       FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
@@ -84,15 +94,69 @@ def extract_from_ocr(pdf, pages)
 
     private
 
-    def clean_text(file)
-      File.open(file, 'r+') do |f|
-        text = f.read
+    # Clean one OCR output file in place: '.html' (hOCR) or plain '.txt'.
+    def clean_ocr(basename)
+      File.open(basename + (@gen_hocr ? ".html" : ".txt"), 'r+') do |f|
+        raw = f.read
         f.truncate(0)
         f.rewind
-        f.write(Docsplit.clean_text(text))
+        cleaned = @gen_hocr ? Docsplit.clean_hocr(raw) : Docsplit.clean_text(raw)
+        f.write(cleaned)
       end
     end
 
+    # Tesseract emits no plain-text file when it produces hOCR, so build one
+    # here. Walks the hOCR document in `basename`.html, writes the words and
+    # separators to `basename`.txt, and records each word's start/stop
+    # character offsets in that text as data-start/data-stop attributes,
+    # then rewrites the .html file with the annotated document.
+    def generate_text_and_annotate(basename)
+      File.open(basename + '.txt', 'w') do |txt|
+        File.open(basename + '.html', 'r+') do |hocr|
+          doc = Nokogiri::XML(hocr.read)
+          generate_text_position(doc) do |chunk, offset, node|
+            # Every chunk (word or separator) goes to the text file.
+            txt.write(chunk)
+            # Separators arrive with a nil node; only real words get annotated.
+            next unless node
+            node['data-start'] = offset
+            node['data-stop' ] = offset + chunk.size
+          end
+          # Overwrite the hOCR file in place with the annotated markup.
+          hocr.truncate(0)
+          hocr.rewind
+          hocr.write(doc.to_xml)
+        end
+      end
+    end
+
+    # Walk the hOCR tree one HOCR_SECTIONS level at a time (paragraph, line,
+    # word), yielding (text, pos, element) for each word and (separator, pos,
+    # nil) between sibling sections. `pos` is the running character offset of
+    # the emitted text; the offset after the last emission is returned.
+    def generate_text_position(root, index=0, pos=0, &block)
+      raise RuntimeError, "bad section list" if index >= HOCR_SECTIONS.size
+      selector, joiner = HOCR_SECTIONS[index]
+      at_word_level = index == HOCR_SECTIONS.size - 1
+      nodes = root.css(selector)
+      nodes.each do |node|
+        if at_word_level
+          # Base section: hand the word and its element to the caller.
+          block.call(node.text, pos, node) if block
+          pos += node.text.size
+        else
+          # Not the base section yet, so descend one level.
+          pos = generate_text_position(node, index + 1, pos, &block)
+        end
+        # Sections are 'joined': no separator after the last one. The
+        # separator is generated text, so it carries no element.
+        next if node == nodes.last
+        block.call(joiner, pos, nil) if block
+        pos += joiner.size
+      end
+      pos
+    end
+
     # Run an external process and raise an exception if it fails.
     def run(command)
       result = `#{command}`
@@ -123,8 +187,19 @@ def extract_options(options)
       @forbid_ocr = options[:ocr] == false
       @clean_ocr  = !(options[:clean] == false)
       @language   = options[:language] || 'eng'
+      @gen_hocr   = check_tesseract_config(options[:config])
+      @config     = options[:config] || ''
+    end
+
+    # True when the last `tessedit_create_hocr` line of the tesseract config file
+    # has a value other than "0"; false for a nil path or when no line matches.
+    def check_tesseract_config(config)
+      return false unless config
+      # File.readlines closes the file itself, so no handle is left open.
+      hocr_configs = File.readlines(config).grep(/tessedit_create_hocr/)
+      !hocr_configs.empty? && hocr_configs.last.split[1] != "0"
+    end
 
   end
 
-end
\ No newline at end of file
+end
diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb
index 00d24e3..09444dd 100755
--- a/test/unit/test_extract_text.rb
+++ b/test/unit/test_extract_text.rb
@@ -1,5 +1,6 @@
 here = File.expand_path(File.dirname(__FILE__))
 require File.join(here, '..', 'test_helper')
+require 'fileutils'
 require 'tmpdir'
 
 class ExtractTextTest < Test::Unit::TestCase
@@ -38,6 +39,33 @@ def test_ocr_extraction
     end
   end
 
+  def test_hocr_extraction
+    # Create a config that enables hOCR output
+    FileUtils.mkdir_p(OUTPUT)
+    File.write("#{OUTPUT}/config", "tessedit_create_hocr 1")
+    begin
+      Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :config => "#{OUTPUT}/config")
+    ensure
+      # Remove the config even when extraction raises, so it cannot pollute the tests below
+      FileUtils.rm("#{OUTPUT}/config")
+    end
+    files = []
+    4.times do |i|
+      file = "corrosion_#{i + 1}.txt"
+      files.push(file)
+      assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with extracted text should have reasonable size"
+      # This page does not need OCR, so no html or tif output is expected for it.
+      next if i == 2
+      file = "corrosion_#{i + 1}.html"
+      files.push(file)
+      assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with annotated html should have reasonable size"
+      file = "corrosion_#{i + 1}.tif"
+      files.push(file)
+      assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that tif file should have reasonable size"
+    end
+    assert_directory_contains(OUTPUT, files)
+  end
+
   def test_ocr_extraction_in_mock_language
     exception = assert_raise(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")}
     assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'"