-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathImgOCR.py
52 lines (46 loc) · 1.17 KB
/
ImgOCR.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# coding: utf-8
"""
使用说明:
1. 安装tesseract。参考:https://github.com/tesseract-ocr/tesseract
2. 安装中文语言包并放在对应位置:https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.chi_sim.tar.gz
"""
import io
import os
import subprocess
import sys
import traceback
def processImages(path, save, lang='chi_sim'):
    """Run tesseract OCR on every image file found directly in ``path``.

    For each regular, non-hidden file this runs
    ``tesseract <image> <save>/result-<stem> -l <lang>`` and then hands the
    generated ``.txt`` file to ``cleanResult``. One status line is printed
    per file; a failure on one file is reported and the batch continues.

    Args:
        path: Directory containing the images to recognise.
        save: Directory that receives the ``result-<stem>.txt`` files. A
            relative value is resolved against ``path``.
        lang: Tesseract language code passed to ``-l``.

    Returns:
        None.
    """
    # Work with absolute paths instead of os.chdir(), so the caller's
    # working directory is left untouched.
    src_dir = os.path.abspath(path)
    out_dir = os.path.join(src_dir, save)

    # sorted() makes the processing order deterministic; os.listdir() gives
    # no ordering guarantee.
    for name in sorted(os.listdir(src_dir)):
        image = os.path.join(src_dir, name)
        # Skip hidden entries (such as ".DS_Store") and sub-directories
        # explicitly rather than blindly dropping the first directory entry.
        if name.startswith('.') or not os.path.isfile(image):
            continue

        # Output naming is unchanged: everything before the first dot.
        result = os.path.join(out_dir, 'result-%s' % name.split('.')[0])
        # An argument list (no shell) keeps file names containing spaces or
        # shell metacharacters intact.
        cmd = ['tesseract', image, result, '-l', lang]
        try:
            # check_call waits for tesseract to finish and raises on a
            # non-zero exit status, so the output is complete before cleanup.
            subprocess.check_call(cmd)
            cleanResult(result + '.txt')
        except Exception:
            # Best-effort batch: report the failure and go on to the next file.
            print("process %s error: %s"
                  % (' '.join(cmd), traceback.format_exc()))
        else:
            print("process %s successfully." % name)
# Replacement rules applied by cleanResult to the OCR text: every occurrence
# of the first string of a pair is replaced by the second.
# NOTE(review): ':' -> ':' and ',' -> ',' map a string to itself and ', ' is
# listed twice; presumably some of these were full-width punctuation before
# the file was copied -- confirm against the upstream source.
transfers = dict([
    ('o ', '。'),
    ('o', '。'),
    ('\n', ''),
    (', ', ','),
    (':', ':'),
    (': ', ':'),
    ('。 ', '。'),
    (',', ','),
    (', ', ','),
])
def _to_text(value):
    """Return ``value`` as text, decoding UTF-8 byte strings if necessary."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def cleanResult(path, replacements=None):
    """Rewrite the text file at ``path`` with the replacement rules applied.

    The file is read as UTF-8, every key of the rule mapping is replaced by
    its value, and the result is written back to the same file.

    Args:
        path: Path of the text file to clean in place.
        replacements: Optional mapping of ``old -> new`` strings. Defaults to
            the module-level ``transfers`` table. Rules are applied in the
            mapping's iteration order, and earlier replacements can affect
            what later ones match.

    Returns:
        None.
    """
    rules = transfers if replacements is None else replacements

    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding; io.open accepts ``encoding`` on both Python 2 and Python 3.
    with io.open(path, encoding='utf-8') as f:
        text = f.read()

    for old, new in rules.items():
        # Rule strings may be byte strings (Python 2 literals); normalise
        # them to text before replacing in the decoded file content.
        text = text.replace(_to_text(old), _to_text(new))

    with io.open(path, 'w', encoding='utf-8') as f:
        f.write(text)
if __name__ == '__main__':
    # Usage: ImgOCR.py [IMAGE_DIR [RESULT_DIR]]
    # Without arguments the original hard-coded folders are used.
    args = sys.argv[1:]
    path = args[0] if len(args) > 0 else "/Users/elexu/Pictures/OCR"
    save = args[1] if len(args) > 1 else "/Users/elexu/Pictures/OCR_result"
    processImages(path, save)