-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathocr_images.py
executable file
·220 lines (186 loc) · 7.93 KB
/
ocr_images.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Perform OCR on an image (default 60dpi greyscale), and calculate
# the Levenshtein distance of the results from the ground truth, if found
# This requires a modified version of tesseract that can handle
# low-resolution images
import sys
import os
import subprocess
# import json
import argparse
from PIL import Image
import parse_hocr
import compare_hocr
import hocr_metrics
"""
Command line: $0 [options] <img>.png
This program will process the image with tesseract, once for each
requested scaling, and the resulting .hocr file(s) will be named
<outbase>-<scaling>.hocr, where <outbase> defaults to <img> and any '.'
is removed from the scaling name.
The merged result will be saved as <outbase>-merged-<scalings>.hocr and
a text version will be saved as <outbase>-merged-<scalings>.txt.
If there is a known ground truth, it should be saved as <img>.gt.txt,
or specified on the command line. The Levenshtein distances will be
calculated if this file exists, and the results will be output to
<outbase>-merged-<scalings>.metrics and <outbase>-merged-<scalings>.csv
"""
# Command-line interface.  Each entry pairs the option strings (or the
# positional name) with the keyword arguments handed to add_argument().
# Arguments are registered in the order listed here.
_ARGUMENT_SPECS = (
    (('-s', '--scalings'),
     dict(default='C0',
          help='Scalings/blurs to use; '
               'B/L/C = box/bilinear/bicubic followed by blur '
               'amount, eg B0, and '
               'use specially-trained network')),
    # A default of 0 stands for "not given"; the script replaces it by 60
    # after parsing.
    (('-r', '--resolution'),
     dict(metavar='RES', default=0, type=int,
          help='Resolution of images (default=60)')),
    (('-d', '--debug'),
     dict(action='store_true',
          help='Produce debugging information')),
    (('-g', '--ground-truth'),
     dict(help='Ground truth text file (default is image '
               'name with .gt.txt extension)')),
    (('--simulate',),
     dict(action='store_true',
          help='Downscale a given 300dpi image before starting')),
    (('-f', '--force'),
     dict(action='store_true',
          help='Force rerunning of tesseract')),
    (('--outbase',),
     dict(help='basename of output files; default is basename '
               'of input image file')),
    (('--force-image',),
     dict(action='store_true',
          help='Force regenerating simulated image')),
    (('--tessbin-dir',),
     dict(help='Directory in which tesseract appears; '
               'default is to search on PATH')),
    # May be repeated; each use appends one VAR=value string to a list.
    (('--tessenv',),
     dict(action='append',
          help='Add this to the tesseract environment, eg '
               '"DYLD_LIBRARY_PATH=../tesseract/src/api/.libs"'
               ' Can be used multiple times')),
    (('--tessdata-path',),
     dict(required=True,
          help='Use this path to the tessdata directory')),
    (('--tessdata',),
     dict(default='dataRES_SCALING+BLUR',
          help='Use this directory for the tessdata; '
               'RES is replaced by the resolution '
               'SCALING is replaced by the scaling name and '
               'BLUR is replaced by the blur amount')),
    (('-w', '--wmetrics'),
     dict(action='store_true',
          help='Produce word-level metrics for image')),
    (('image',),
     dict(help='Image to process')),
)

arg_parser = argparse.ArgumentParser(
    description='Processes images with multiple tesseract runs')
for _names, _options in _ARGUMENT_SPECS:
    arg_parser.add_argument(*_names, **_options)
args = arg_parser.parse_args()
debug = args.debug

# Run the tesseract found in --tessbin-dir if one was given; otherwise
# leave it to the PATH search.
tessbin = (os.path.join(args.tessbin_dir, 'tesseract')
           if args.tessbin_dir else 'tesseract')

# Work from the image's own directory, remembering the starting directory
# so that it can be restored at the end of the script.
# NOTE(review): relative paths given in other options are therefore
# resolved from the image directory once the chdir has happened - confirm
# that this is intended.
curdir = os.getcwd()
imgdir, imgfn = os.path.split(args.image)
imgbase, imgext = os.path.splitext(imgfn)
if imgdir:
    os.chdir(imgdir)

# Output files are named after the image unless --outbase overrides it.
outbase = args.outbase or imgbase

# Export each --tessenv VAR=value pair into our own environment, which
# the tesseract subprocess then inherits.  Only the first '=' separates
# the name from the value.
for env in args.tessenv or []:
    var, separator, val = env.partition('=')
    if separator:
        os.environ[var] = val
    else:
        print('--tessenv value does not have an = in it: %s' % env)
        print('ignoring this environment variable')

# A resolution of 0 means that none was requested: fall back to 60.
if args.resolution == 0:
    args.resolution = 60
# Choose the image that tesseract will read.  With --simulate, the given
# image is taken to be 300 dpi and is box-downscaled by the integer factor
# 300 // resolution into <imgbase>-simulated-<resolution>dpi.png; without
# it the given image is used as it stands.
if args.simulate:
    if not 0 < args.resolution <= 300:
        # 300 // resolution would be zero or negative here, which cannot
        # serve as a downscaling factor (and zero would raise
        # ZeroDivisionError when dividing the image size).
        sys.exit('Cannot simulate a resolution of %d dpi from a 300 dpi '
                 'image; the resolution must be between 1 and 300'
                 % args.resolution)
    # Our target images are 300 dpi
    if 300 % args.resolution != 0:
        print('Warning: resolution %d is not a factor of 300; '
              'using rounded-down quotient instead!' % args.resolution)
    factor = 300 // args.resolution
    imggbase = imgbase + '-simulated-%ddpi' % args.resolution
    # Only read and rescale the source image when the simulated image has
    # to be (re)generated.
    if args.force_image or not os.path.isfile(imggbase + '.png'):
        with Image.open(imgfn) as source_img:
            img = source_img
            if img.mode == '1':
                # presumably so that the downscaled bilevel image can
                # hold intermediate grey levels - TODO confirm
                img = img.convert(mode='L')
            (wd, ht) = img.size
            img = img.resize((wd // factor, ht // factor),
                             resample=Image.BOX)
            img.save(imggbase + '.png')
        # The input image changed, so any existing tesseract output is
        # stale and must be regenerated.
        args.force = True
else:
    imggbase = imgbase
# One parsed page is collected per scaling that gets processed.
pages = []
# --scalings is a comma-separated list such as "B0,C1.5".
scalings = args.scalings.split(',')
# All scaling names run together with any '.' removed; used in the names
# of the merged output files.
scalingsstr = ''.join(scalings).replace('.', '')
# Raw text of the first hOCR file processed (filled in by the main loop).
orighocr = None

# Load the ground truth text, either from --ground-truth or from
# <imgbase>.gt.txt.  If it cannot be read, gt is None and all comparisons
# against the ground truth are skipped.
gt_path = args.ground_truth if args.ground_truth else imgbase + '.gt.txt'
try:
    # Use a context manager so that the file is closed promptly.
    with open(gt_path) as gt_file:
        gt = gt_file.read()
except OSError:
    gt = None
    print('Failed to read ground truth file; skipping comparisons')
# Scaling letter -> (value passed to tesseract as low_resolution_scaling,
# name substituted for SCALING in the tessdata directory name).
scaling_types = {'B': (0, 'box'),
                 'L': (1, 'bilinear'),
                 'C': (2, 'bicubic')}
ext = '.png'

# Run tesseract once per requested scaling (unless its hOCR output already
# exists and --force was not given), then parse the hOCR output and
# collect the parsed page for merging.
for scaling in scalings:
    if not scaling:
        # An empty entry (e.g. from a trailing or doubled comma in
        # --scalings) has no type letter to look up.
        print('Ignoring empty scaling specification')
        continue
    if scaling[0] not in scaling_types:
        print('Unknown scaling type %s' % scaling[0])
        continue
    scaling_type = scaling_types[scaling[0]]
    # Everything after the type letter is the blur amount.
    blur = scaling[1:]
    # Drop any '.' so that it does not appear in the output file names.
    scaling = scaling.replace('.', '')
    imgout = outbase + '-' + scaling

    if args.force or not os.path.isfile(imgout + '.hocr'):
        # Build the tessdata directory name from the --tessdata template.
        ddir = args.tessdata.replace('RES', str(args.resolution))
        ddir = ddir.replace('SCALING', scaling_type[1])
        ddir = ddir.replace('BLUR', blur)
        ddir = os.path.join(args.tessdata_path, ddir, 'eng')
        # The tesseract subprocess inherits this from our environment.
        os.environ['TESSDATA_PREFIX'] = ddir
        cmd = [tessbin,
               '--dpi', '300', '-l', 'eng',
               '-c', 'low_resolution_input=true',
               '-c', 'low_resolution_dpi=%d' % args.resolution,
               '-c', 'low_resolution_scaling=%d' % scaling_type[0],
               '-c', 'low_resolution_blurring=%s' % blur,
               '--psm', '6',
               imggbase + ext, imgout, 'txt', 'hocr']
        if debug:
            print('About to run: %s' % ' '.join(cmd), file=sys.stderr)
            print('TESSDATA_PREFIX = %s' % ddir)
        # check=True: abort the script if tesseract fails.
        subprocess.run(cmd, check=True)

    tree, tidied = parse_hocr.parse_hocr_file(imgout + '.hocr',
                                              resolution=args.resolution)
    pages.append(tidied)
    if not orighocr:
        # Keep the raw text of the first hOCR file; the merged result is
        # later written as an updated copy of it.
        with open(imgout + '.hocr') as hocr_file:
            orighocr = hocr_file.read()

    # produce word-level metrics if requested (needs the ground truth)
    if args.wmetrics and gt is not None:
        hocr_metrics.compute_hocr_diff(tidied, gt)
        hocr_metrics.output_hocr_diff_metrics(tidied,
                                              imgout + '-wmetrics.csv')
    del tree, tidied
# Merge the collected pages and write the merged hOCR, its plain-text
# version and, when a ground truth is available, the comparison metrics.
if orighocr:
    merged_page = compare_hocr.merge_ocr_pages(pages, debug)
    # Produce an hOCR string based on the first hOCR file read, updated
    # with the merged result.
    merged_hocr = compare_hocr.update_hocr(orighocr, merged_page)

    # Every merged output file shares this stem and differs only in its
    # extension.
    merged_base = outbase + '-merged-%s' % scalingsstr

    with open(merged_base + '.hocr', 'w') as hocr_out:
        print(merged_hocr, end='', file=hocr_out)

    merged_text = parse_hocr.ocr_page_to_text(merged_page)
    with open(merged_base + '.txt', 'w') as text_out:
        print(merged_text, file=text_out)

    if gt is not None:
        metrics_text, metrics_csv = hocr_metrics.get_metrics(
            merged_text, gt, merged_base)
        for suffix, content in (('.metrics', metrics_text),
                                ('.csv', metrics_csv)):
            with open(merged_base + suffix, 'w') as metrics_out:
                print(content, end='', file=metrics_out)
else:
    print('No processing done; exiting')

# Go back to the directory the script was started from.
os.chdir(curdir)