This repository has been archived by the owner on Feb 18, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtetracube-ocr.rb
64 lines (54 loc) · 1.47 KB
/
tetracube-ocr.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env ruby
require 'fileutils'
require 'optparse'
require 'shellwords'
# Values collected from the command line, keyed by :book and :lang.
options = {}

opt_parse = OptionParser.new do |parser|
  parser.banner = "Usage: tetracube-ocr.rb [options]"

  # Each value-taking switch stores its argument under the matching key.
  parser.on('-f', '--file FILE', "Path to your book") { |file| options[:book] = file }
  parser.on('-l', '--lang LANG', "Language") { |code| options[:lang] = code }

  # Print the generated usage text and stop.
  parser.on('-h', '--help', "Display this help") do
    puts parser
    exit
  end
end
opt_parse.parse!

# Either value is nil when its switch was not given.
path, lang = options.values_at(:book, :lang)
# Runs OCR over a scanned book: the PDF or DjVu source is first rendered to a
# multipage TIFF inside a scratch directory, then that TIFF is handed to
# Tesseract.
class Book
  attr_accessor :path, :lang

  # @param path [String] path to the source book (.pdf or .djvu, any case)
  # @param lang [String, nil] value for Tesseract's -l switch; the switch is
  #   left out entirely when this is nil or empty
  def initialize(path, lang)
    @path = path
    @lang = lang
    @ext = File.extname(@path).downcase
    @file_name = File.basename(@path, ".*")
    @temp_dir = "/tmp/#{@file_name}"
    @temp_file = "#{@temp_dir}/multipage.tif"
    @file_dir = File.dirname(@path)
  end

  # Builds the shell command that renders the book into the temporary TIFF.
  #
  # Paths are shell-escaped rather than wrapped in single quotes, so names
  # containing quotes, spaces or other metacharacters reach the converter
  # unchanged instead of breaking (or injecting into) the command line.
  #
  # For any extension other than .pdf / .djvu a message is written to stderr
  # and the process exits with status 1.
  #
  # @return [String] command line suitable for Kernel#system
  def pdf_cmd
    source = Shellwords.escape(@path)
    target = Shellwords.escape(@temp_file)

    case @ext
    when '.pdf'
      # Ghostscript: tiffgray device with LZW compression; its output is discarded.
      "gs -r200 -dNOPAUSE -q -dGraphicsAlphaBits=4 -dTextAlphaBits=4 " \
        "-sDEVICE=tiffgray -sCompression=lzw -dBATCH -sOutputFile=#{target} " \
        "-- #{source} >> /dev/null 2>> /dev/null"
    when '.djvu'
      "ddjvu -format=tiff -mode=black -quality=150 #{source} #{target}"
    else
      warn "File type #{@ext} is not supported"
      exit 1
    end
  end

  # Converts the book to TIFF, runs Tesseract on it and removes the scratch
  # directory afterwards, whether or not the run succeeded.
  #
  # @return [Boolean] true when both external commands succeeded
  def start_ocr
    # Resolve the converter first: an unsupported file type exits here,
    # before any scratch directory has been created.
    convert_cmd = pdf_cmd
    FileUtils.mkdir_p(@temp_dir)
    begin
      puts "Please wait…"
      unless system(convert_cmd)
        warn "Converting #{@path} to TIFF failed"
        return false
      end
      unless system(ocr_cmd)
        warn "tesseract failed"
        return false
      end
      puts "OCR is done!"
      true
    ensure
      FileUtils.rm_rf(@temp_dir)
    end
  end

  private

  # Builds the Tesseract command; the output base is the book's own
  # directory and base name.
  def ocr_cmd
    words = ['tesseract', @temp_file, File.join(@file_dir, @file_name)]
    words.push('-l', @lang.to_s) unless @lang.to_s.empty?
    Shellwords.join(words)
  end
end
# Entry point. Without --file, path is nil and Book.new would crash with a
# TypeError from File.extname(nil); show the usage text and exit non-zero instead.
abort opt_parse.help if path.nil?

book = Book.new(path, lang)
book.start_ocr