-
-
Notifications
You must be signed in to change notification settings - Fork 44
/
run.sh
49 lines (37 loc) · 1.24 KB
/
run.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/bin/bash
#
# Batch OCR driver for the PDF files in contracts/.
#
# For every contracts/<name>.pdf this script:
#   1. runs ImageMagick `convert` to write PNG output under contracts/<name>/,
#   2. runs tesseract on each PNG found there, writing plain-text output
#      under tess-out-txt/<name>/ and hOCR output under tess-out-html/<name>/,
#   3. runs ABBYY/process.py three times (-txt, -docx, -xml), writing to
#      abbyy-out/{txt,docx,xml}/<name>.txt.
#
# Usage: run without arguments from the directory that contains contracts/
# and ABBYY/. A failing step is reported on stderr and the remaining work
# continues; the exit status is non-zero if any document had a failure.

set -u

readonly input_dir="contracts"
readonly pdf_suffix=".pdf"
readonly png_suffix=".png"

#######################################
# Print a diagnostic on stderr.
# Arguments: message words
#######################################
warn() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
}

#######################################
# Strip the directory part and a trailing suffix from a path.
# Arguments: $1 - path, $2 - suffix to remove (matched literally)
# Outputs:   the bare name on stdout, e.g. "contracts/a.pdf" ".pdf" -> "a"
#######################################
doc_stem() {
  local path=$1 suffix=$2
  local base=${path##*/}
  # The quoted suffix is matched as a literal string, not as a glob pattern.
  printf '%s\n' "${base%"$suffix"}"
}

#######################################
# Run the tesseract and ABBYY passes for one PDF.
# Globals:   input_dir, pdf_suffix, png_suffix (read)
# Arguments: $1 - path of the PDF, e.g. contracts/foo.pdf
# Returns:   0 if every step succeeded, 1 otherwise
#######################################
process_document() {
  local pdf=$1
  local stem png page fmt
  local status=0

  stem=$(doc_stem "$pdf" "$pdf_suffix")

  local png_dir="${input_dir}/${stem}"
  local txt_dir="tess-out-txt/${stem}"
  local html_dir="tess-out-html/${stem}"

  # -p keeps re-runs working and creates missing output roots.
  if mkdir -p -- "$png_dir" "$txt_dir" "$html_dir" &&
    convert -transparent white -fuzz 10% "$pdf" \
      "${png_dir}/${stem}${png_suffix}"; then
    # Every PNG in the directory is processed rather than a single fixed
    # name; presumably convert can emit several files for one PDF
    # (e.g. one per page) -- TODO confirm.
    for png in "$png_dir"/*"$png_suffix"; do
      [[ -e "$png" ]] || continue # glob matched nothing
      page=$(doc_stem "$png" "$png_suffix")

      # tesseract text invocation
      if ! tesseract "$png" "${txt_dir}/${page}"; then
        warn "tesseract (text) failed for ${png}"
        status=1
      fi

      # tesseract hOCR invocation
      if ! tesseract "$png" "${html_dir}/${page}" hocr; then
        warn "tesseract (hocr) failed for ${png}"
        status=1
      fi
    done
  else
    warn "PNG conversion failed for ${pdf}; skipping tesseract"
    status=1
  fi

  # ABBYY works on the PDF itself, so it runs even if the PNG step failed.
  # NOTE(review): all three outputs are named <name>.txt, including the
  # docx and xml ones. Kept as-is in case downstream tooling relies on
  # these names -- confirm whether .docx/.xml extensions were intended.
  for fmt in txt docx xml; do
    if ! mkdir -p -- "abbyy-out/${fmt}" ||
      ! python ABBYY/process.py "$pdf" \
        "abbyy-out/${fmt}/${stem}.txt" "-${fmt}"; then
      warn "ABBYY ${fmt} export failed for ${pdf}"
      status=1
    fi
  done

  return "$status"
}

#######################################
# Process every PDF directly inside contracts/.
# Returns: 0 if all documents succeeded (or none were found), 1 otherwise
#######################################
main() {
  local pdf
  local failures=0

  # Only *.pdf is matched so the per-document directories created under
  # contracts/ are not picked up as inputs on a later run.
  for pdf in "$input_dir"/*"$pdf_suffix"; do
    [[ -f "$pdf" ]] || continue
    process_document "$pdf" || failures=$((failures + 1))
  done

  if ((failures > 0)); then
    warn "${failures} document(s) had failures"
    return 1
  fi
  return 0
}

main