-
Notifications
You must be signed in to change notification settings - Fork 1
/
heuristic_table_detection.py
102 lines (78 loc) · 3.01 KB
/
heuristic_table_detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
''' Code taken from https://github.com/okfn/pdftables
from pdftables.pdf_document import PDFDocument
from pdftables.pdftables import page_to_tables
from pdftables.display import to_string
filepath = 'example.pdf'
fileobj = open(filepath,'rb')
doc = PDFDocument.from_fileobj(fileobj)
for page_number, page in enumerate(doc.get_pages()):
tables = page_to_tables(page)
for table in tables:
print(to_string(table.data))
'''
import os
import re
import subprocess
from io import StringIO

import PyPDF2
# Possible bugfix
from PyPDF2 import PdfFileReader, utils
keywords = ('\nTableau', '\nTabelle', '\nTabella', '\nTable')  # TODO add more

# One alternation over every caption keyword, longest first. The regex scan is
# non-overlapping, so a keyword that is a prefix of another one ('\nTable' vs
# '\nTableau') can no longer count the same caption a second time.
_KEYWORD_PATTERN = re.compile(
    '|'.join(re.escape(k) for k in sorted(keywords, key=len, reverse=True))
)


# counts the number of tables in a given pdf page
def count_tables_page(page):
    """Count table captions in the text of a single PDF page.

    A caption is any occurrence of one of the module-level ``keywords``
    (each keyword starts with a newline, so only words at the start of an
    extracted line match).

    Args:
        page: object exposing ``extractText()`` that returns the page text.

    Returns:
        Number of keyword occurrences found; each occurrence is counted once
        even when several keywords share a prefix.
    """
    page_content = page.extractText()
    # Debug aid kept from the original: dumps the extracted text to stdout.
    print(page_content)
    return len(_KEYWORD_PATTERN.findall(page_content))
# counts the number of tables in a given pdf document
def count_tables_doc(pdf_file):
    """Count table captions across all pages of one PDF document.

    Args:
        pdf_file: value handed straight to ``PyPDF2.PdfFileReader``
            (the caller in this file passes a file opened in ``'rb'`` mode).

    Returns:
        The sum of ``count_tables_page`` over every page, or ``-1`` when
        constructing the reader raises ``PdfReadError``.
    """
    try:
        reader = PyPDF2.PdfFileReader(pdf_file)
    except PyPDF2.utils.PdfReadError:
        # Sentinel understood by count_tables_dir as "unreadable document".
        return -1

    return sum(
        count_tables_page(reader.getPage(page_index))
        for page_index in range(reader.getNumPages())
    )
# counts the number of tables in a given directory and its subdirectories
def count_tables_dir(dirname):
    """Count table captions in every PDF below a directory tree.

    Walks ``dirname`` recursively, runs ``count_tables_doc`` on each file
    whose name contains ``".pdf"`` and aggregates the results.

    Args:
        dirname: path of the root directory to walk.

    Returns:
        Tuple ``(n_tables, n_errors)``: the total caption count over all
        readable documents, and the number of documents for which
        ``count_tables_doc`` returned a negative value.
    """
    n_tables = 0
    n_errors = 0
    print('dirname: ' + dirname)
    # r=root, d=directories, f = files
    for r, d, f in os.walk(dirname):
        for file in f:
            if ".pdf" not in file:  # not really required but why not
                continue
            # Open inside a context manager so the handle is released after
            # each document instead of leaking one descriptor per PDF.
            with open(os.path.join(r, file), mode='rb') as pdf_file:
                # PDFs that haven't been fully downloaded and lack the EOF
                # marker are reported by count_tables_doc as a negative count.
                i = count_tables_doc(pdf_file)
            if i < 0:
                n_errors += 1
            else:
                n_tables += i
    return n_tables, n_errors
'''
# Disabled experiment: PyPDF2 raises "EOF marker not found" on some files.
# Attempted workaround from https://codedprojects.wordpress.com/2017/06/09/how-to-fix-pypdf-error-eof-marker-not-found/
def decompress_pdf(temp_buffer):
temp_buffer.seek(0) # Make sure we're at the start of the file.
process = subprocess.Popen(['pdftk.exe',
'-', # Read from stdin.
'output',
'-', # Write to stdout.
'uncompress'],
stdin=temp_buffer,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
return StringIO(stdout)
with open(os.path.join(r, file), encoding='latin1') as input_file:
input_buffer = StringIO(input_file.read())
try:
input_pdf = PdfFileReader(input_buffer)
except utils.PdfReadError:
input_pdf = PdfFileReader(decompress_pdf(input_file))
i += count_tables_doc(input_pdf)
'''