-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathpdf_to_excel.py
99 lines (80 loc) · 4.58 KB
/
pdf_to_excel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import subprocess
import sys
import camelot
def is_good_enough(table):
if table['accuracy'] > 80.0 and table['whitespace'] < 25.0:
return True
else:
return False
def main(argv):
script_dir = os.path.dirname(os.path.realpath(__file__))
print(script_dir + '/pdf_to_excel.py: Start')
if len(sys.argv) > 1:
start_dir = sys.argv[1]
else:
# we will process the current directory
start_dir = '.'
for dir_name, subdirs, file_list in os.walk(start_dir):
os.chdir(dir_name)
for filename in file_list:
file_ext = os.path.splitext(filename)[1]
if file_ext == '.pdf':
full_path = dir_name + '/' + filename
new_filename = os.path.splitext(filename)[0] + "-OCR.pdf"
# let's try to find some text into the PDF
print("attempting to OCR: " + full_path)
# add --deskew only if we are sure that the images might need it
# because it might create large file together with the option --optmize 0
cmd = ["ocrmypdf", "--skip-text --deskew --rotate-pages --clean --optimize 0", '"' + filename + '"', '"' + new_filename + '"']
# you should run OCRmyPDF as a command line tool
# see https://ocrmypdf.readthedocs.io/en/latest/batch.html#api
proc = subprocess.run(
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
result = proc.stdout
if proc.returncode == 0:
print("OCR complete")
elif proc.returncode == 6:
print("OCR processing skipped")
else:
print("Error in OCR processing")
print("Starting searching for tables in ", new_filename)
# now we can detect tables
lattice_tables = camelot.read_pdf(new_filename, process_background=True, flavor='lattice', pages='1-end')
stream_tables = camelot.read_pdf(new_filename, flavor='stream', pages='1-end')
# the final tables
tables = []
# we check whether the number of tables are the same
if len(lattice_tables) > 0 and len(stream_tables) > 0 and len(lattice_tables) == len(stream_tables):
for index in range(len(lattice_tables)):
# we check whether the tables are both good enough
if is_good_enough(lattice_tables[index].parsing_report) and is_good_enough(stream_tables[index].parsing_report):
# they probably represent the same table
if lattice_tables[index].parsing_report['page'] == stream_tables[index].parsing_report['page'] and lattice_tables[index].parsing_report['order'] == stream_tables[index].parsing_report['order']:
total_lattice = 1
total_stream = 1
for num in lattice_tables[index].shape:
total_lattice *= num
for num in stream_tables[index].shape:
total_stream *= num
# we pick the table with the most cells
if(total_lattice >= total_stream):
tables.append(lattice_tables[index])
else:
tables.append(stream_tables[index])
elif is_good_enough(lattice_tables[index].parsing_report):
tables.append(lattice_tables[index])
elif is_good_enough(stream_tables[index].parsing_report):
tables.append(stream_tables[index])
elif len(lattice_tables) >= len(stream_tables):
tables = lattice_tables
else:
tables = stream_tables
if tables is not None and len(tables) > 0:
# let's check whether is TableList object or just a list of tables
if isinstance(tables, camelot.core.TableList) is False:
tables = camelot.core.TableList(tables)
tables.export(os.path.splitext(new_filename)[0] + '.xls', f='excel')
print(script_dir + '/pdf_to_excel.py: End')
if __name__ == '__main__':
main(sys.argv)