diff --git a/docsplit.gemspec b/docsplit.gemspec index d3526bb..2ffb04c 100755 --- a/docsplit.gemspec +++ b/docsplit.gemspec @@ -21,4 +21,6 @@ Gem::Specification.new do |s| s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*', 'docsplit.gemspec', 'LICENSE', 'README'] -end \ No newline at end of file + + s.add_dependency "nokogiri" +end diff --git a/lib/docsplit.rb b/lib/docsplit.rb index 5001413..395a40f 100755 --- a/lib/docsplit.rb +++ b/lib/docsplit.rb @@ -79,6 +79,11 @@ def self.clean_text(text) TextCleaner.new.clean(text) end + # Utility method to clean OCR'd text in hOCR output format. + def self.clean_hocr(html) + TextCleaner.new.clean_hocr(html) + end + private # Normalize a value in an options hash for the command line. diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb index 7c7af08..f079d45 100755 --- a/lib/docsplit/command_line.rb +++ b/lib/docsplit/command_line.rb @@ -91,6 +91,9 @@ def parse_options opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o| @options[:ocr] = o end + opts.on('-c', '--config [FILE]', 'use the specified config file') do |c| + @options[:config] = c + end opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c| @options[:clean] = false end @@ -119,4 +122,4 @@ def parse_options end -end \ No newline at end of file +end diff --git a/lib/docsplit/text_cleaner.rb b/lib/docsplit/text_cleaner.rb index c4aac01..ceee965 100644 --- a/lib/docsplit/text_cleaner.rb +++ b/lib/docsplit/text_cleaner.rb @@ -1,4 +1,5 @@ require 'strscan' +require 'nokogiri' module Docsplit @@ -32,16 +33,8 @@ class TextCleaner REPEATED = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/ SINGLETONS = /^[AaIi]$/ - # For the time being, `clean` uses the regular StringScanner, and not the - # multibyte-aware version, coercing to ASCII first. 
def clean(text) - if String.method_defined?(:encode) - text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?') - else - require 'iconv' unless defined?(Iconv) - text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first - end - + text = get_conversion_method.call(text) scanner = StringScanner.new(text) cleaned = [] spaced = false @@ -60,6 +53,31 @@ def clean(text) end end + # When cleaning hOCR output, we follow a slightly simplified cleaning + # heuristic. Simply look at the individual word embedded within the + # XML text node that is a child of the XML element with the class + # attribute set to 'ocrx_word'. If it is garbage, delete that node. + def clean_hocr(xhtml) + convert = get_conversion_method + xml = Nokogiri::XML(xhtml) + xml.css('.ocrx_word').each do |elt| + word = elt.xpath(".//text()").text + elt.remove if garbage(convert.call(word)) + end + xml.to_s + end + + # For the time being, `clean` uses the regular StringScanner, and not the + # multibyte-aware version, coercing to ASCII first. 
def garbage(w) acronym = w =~ ACRONYM diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 0d55f32..b10f6e0 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -1,3 +1,5 @@ +require 'nokogiri' + module Docsplit # Delegates to **pdftotext** and **tesseract** in order to extract text from @@ -21,6 +23,10 @@ class TextExtractor MIN_TEXT_PER_PAGE = 100 # in bytes + HOCR_SECTIONS = [ [ '.ocr_par', "\n\n" ], + [ '.ocr_line', "\n" ], + [ '.ocrx_word', " " ] ] + def initialize @pages_to_ocr = [] end @@ -66,16 +72,20 @@ def extract_from_ocr(pdf, pages) escaped_tiff = ESCAPE[tiff] file = "#{base_path}_#{page}" run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" - run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1" - clean_text(file + '.txt') if @clean_ocr + run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{@config} 2>&1" + FileUtils.cp(tiff, "#{file}.tif") if @gen_hocr + clean_ocr(file) if @clean_ocr + generate_text_and_annotate(file) if @gen_hocr FileUtils.remove_entry_secure tiff end else tiff = "#{tempdir}/#{@pdf_name}.tif" escaped_tiff = ESCAPE[tiff] run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" - run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1" - clean_text(base_path + '.txt') if @clean_ocr + run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{@config} 2>&1" + FileUtils.cp(tiff, "#{base_path}.tif") if @gen_hocr + clean_ocr(base_path) if @clean_ocr + generate_text_and_annotate(base_path) if @gen_hocr end ensure FileUtils.remove_entry_secure tempdir if File.exists?(tempdir) @@ -84,15 +94,69 @@ def extract_from_ocr(pdf, pages) private - def clean_text(file) - File.open(file, 'r+') do |f| - text = f.read + def clean_ocr(basename) + ext = @gen_hocr 
? "html" : "txt" + File.open(basename + ".#{ext}", 'r+') do |f| + content = f.read f.truncate(0) f.rewind - f.write(Docsplit.clean_text(text)) + meth = @gen_hocr ? "hocr" : "text" + f.write(Docsplit.send("clean_#{meth}".to_sym, content)) end end + # When generating hOCR output, tesseract doesn't generate text output. + # This method will generate the text output, and also add the corresponding + # character position of the words back into the hOCR file as HTML data + # attributes. + def generate_text_and_annotate(basename) + File.open(basename + '.txt', 'w') do |output| + File.open(basename + '.html', 'r+') do |input| + xml = Nokogiri::XML(input.read) + generate_text_position(xml) do |text, pos, elt| + # Write the output text file + output.write(text) + + # Annotate the hOCR element we are given + if elt + elt['data-start'] = pos + elt['data-stop' ] = pos + text.size + end + end + input.truncate(0) + input.rewind + input.write(xml.to_xml) + end + end + end + + def generate_text_position(root, index=0, pos=0, &block) + raise RuntimeError, "bad section list" if index >= HOCR_SECTIONS.size + # Select the sections we want at this level + sections = root.css(HOCR_SECTIONS[index][0]) + sections.each do |section| + if index < HOCR_SECTIONS.size - 1 + # It is not the base section, so recurse. + pos = generate_text_position(section, index + 1, pos, &block) + else + # It is the base section (a word), so emit the + # text and the xml element so the caller can + # annotate. + block.call(section.text, pos, section) if block + pos += section.text.size + end + + # We 'join' the sections with the specified separator. + # Emit the section join text, but without the xml + # element, since this is just generated text. + if section != sections.last + block.call(HOCR_SECTIONS[index][1], pos, nil) if block + pos += HOCR_SECTIONS[index][1].size + end + end + pos + end + # Run an external process and raise an exception if it fails. 
def run(command) result = `#{command}` @@ -123,8 +187,19 @@ def extract_options(options) @forbid_ocr = options[:ocr] == false @clean_ocr = !(options[:clean] == false) @language = options[:language] || 'eng' + @gen_hocr = check_tesseract_config(options[:config]) + @config = options[:config] ? ESCAPE[options[:config]] : '' + end + + def check_tesseract_config(config) + return false unless config + hocr_configs = File.readlines(config).grep(/tessedit_create_hocr/) + if hocr_configs.size > 0 + return hocr_configs.last.split[1] != "0" + end + false end end -end \ No newline at end of file +end diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb index 00d24e3..09444dd 100755 --- a/test/unit/test_extract_text.rb +++ b/test/unit/test_extract_text.rb @@ -1,5 +1,6 @@ here = File.expand_path(File.dirname(__FILE__)) require File.join(here, '..', 'test_helper') +require 'fileutils' require 'tmpdir' class ExtractTextTest < Test::Unit::TestCase @@ -38,6 +39,33 @@ def test_ocr_extraction end end + def test_hocr_extraction + # Create a config that enables hOCR output + FileUtils.mkdir_p(OUTPUT) + File.write("#{OUTPUT}/config", "tessedit_create_hocr 1") + + Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :config => "#{OUTPUT}/config") + + # Remove the file to avoid polluting the tests below + FileUtils.rm("#{OUTPUT}/config") + + files = [] + 4.times do |i| + file = "corrosion_#{i + 1}.txt" + files.push(file) + assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with extracted text should have reasonable size" + # This page does not need OCR. 
+ next if i == 2 + file = "corrosion_#{i + 1}.html" + files.push(file) + assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with annotated html should have reasonable size" + file = "corrosion_#{i + 1}.tif" + files.push(file) + assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that tif file should have reasonable size" + end + assert_directory_contains(OUTPUT, files) + end + def test_ocr_extraction_in_mock_language exception = assert_raise(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")} assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'"