-
Notifications
You must be signed in to change notification settings - Fork 0
/
table.py
151 lines (118 loc) · 5.04 KB
/
table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# table.py
# (c) 2020 CincoNoveSeis Jornalismo Ltda.
#
# This program is licensed under the GNU General Public License, version 3.
# See the LICENSE file for details.
from util import as_dec, is_number
from interface import PollParser
from pdfminer.layout import LTTextBoxHorizontal
from itertools import tee
from functools import reduce
from unidecode import unidecode
from pprint import pprint
# Debug switch: any truthy value makes the parser print each text box it
# visits and pretty-print the intermediate lists it builds.
LOG = 0
# TableParser deals with polls encoded into tables.
# PDF tables are annoying because they can have two layouts, rather
# arbitrarily, i.e.:
#
# (1) One textbox per column, with rows separated by \n;
# (2) One textbox per cell.
#
# TableParser tries to detect whether we're dealing with one or the other,
# and parses accordingly.
class TableParser(PollParser):
    """Parse poll results that are laid out as a table on a PDF page.

    Text boxes extracted from a PDF table come in one of two shapes:
    one text box per column (rows separated by newlines) or one text box
    per cell.  ``handle_relevant_page`` detects which shape it is looking
    at and pairs labels with numbers accordingly; when both shapes occur
    on the same page it can fall back to a "mixed mode".

    Subclasses tune the behaviour through the ``is_proper_field``,
    ``rule_out_page`` and ``allow_mixed_mode`` hooks.
    """

    @classmethod
    def handle_relevant_page(cls, page, already_have):
        """Extract ``{label: number}`` pairs from the table on ``page``.

        Args:
            page: Iterable of layout elements.  It is iterated twice (via
                ``itertools.tee``); only ``LTTextBoxHorizontal`` elements
                are inspected.
            already_have: Nested mapping indexed as
                ``already_have[p][st][e]``.  The entry is set to ``True``
                for every ``(p, st, e)`` triple that ``cls.get_pste``
                reports for a text box on this page.

        Returns:
            ``None`` if no text box contains ``'TOTAL'`` or if
            ``cls.rule_out_page`` rejects any text box.  Otherwise a tuple
            ``(e, p, results)`` where ``e`` and ``p`` come from the last
            triple reported by ``cls.get_pste`` (``None`` if there was
            none) and ``results`` maps each non-numeric label to the
            ``as_dec`` value of the number it was paired with.
        """
        total_scan, page = tee(page)
        tot_ref = cls._find_total_ref(total_scan)
        if tot_ref is None:
            return None
        total_right, total_bottom, total_height = tot_ref

        p = None
        e = None
        fields = []
        for element in page:
            if not isinstance(element, LTTextBoxHorizontal):
                continue
            if LOG:
                print(element)
            text = unidecode(element.get_text())
            if cls.rule_out_page(text):
                return None

            pste = cls.get_pste(text, already_have)
            if pste is not None:
                (p, st, e) = pste
                already_have[p][st][e] = True

            if not cls.is_proper_field(text):
                continue

            # Only boxes starting to the left of the TOTAL box's right edge
            # are table content.
            if not element.bbox[0] < total_right:
                continue
            # NOTE(review): a TOTAL box taller than 20 units is presumably a
            # whole column packed into one text box -- confirm.  In that case
            # a box qualifies when its bbox[1] is at or below TOTAL's;
            # otherwise its bbox[3] must be strictly below TOTAL's bbox[1].
            tall_total = total_height > 20 and element.bbox[1] <= total_bottom
            if tall_total or element.bbox[3] < total_bottom:
                fields.append({
                    'text': text.strip(),
                    'y': element.bbox[1],
                    'newlines': cls.field_has_newlines(element),
                    'height': abs(element.bbox[3] - element.bbox[1]),
                })

        groups = []
        if len(fields) == 2 and fields[0]['newlines'] and fields[1]['newlines']:
            # One text box per column: line i of one column belongs with
            # line i of the other.
            columns = [f['text'].strip().split('\n') for f in fields]
            groups = list(zip(columns[0], columns[1]))
        elif len(fields) >= 2 and not any(f['newlines'] for f in fields):
            # One text box per cell: order by vertical position and pair
            # neighbours.
            fields = sorted(fields, key=lambda f: f['y'])
            cells = [f['text'].strip().replace('\n', '') for f in fields]
            groups = cls._pair_up(cells)
        elif len(fields) >= 2 and cls.allow_mixed_mode():
            print('WARNING: Operating on mixed mode. Verify results carefully.')
            # Break every multi-line box into one synthetic cell per line.
            # The cells are collected in a separate list so that ``fields``
            # is not modified while it is being iterated.
            cells = [f for f in fields if not f['newlines']]
            for field in fields:
                if not field['newlines']:
                    continue
                lines = field['text'].strip().split('\n')
                line_height = field['height'] / len(lines)
                for row, line in enumerate(lines):
                    # The first line gets the largest y offset, the last
                    # line sits at the box's own y.
                    rows_below = len(lines) - 1 - row
                    cells.append({
                        'text': line,
                        'y': field['y'] + line_height * rows_below,
                        'newlines': False,
                    })
            fields = sorted(cells, key=lambda f: f['y'], reverse=True)
            groups = cls._pair_up([f['text'] for f in fields])

        if LOG:
            pprint(fields)

        # Sorting each pair puts the number first whenever it compares lower
        # than the label as a string (digits order before letters); pairs
        # that do not end up as (number, non-number) are discarded.
        groups = [sorted(pair) for pair in groups]
        groups = [g for g in groups
                  if is_number(g[0]) and not is_number(g[1])]

        if LOG:
            pprint(groups)

        results = {label: as_dec(number) for number, label in groups}
        return (e, p, results)

    @classmethod
    def _find_total_ref(cls, elements):
        """Return (bbox[2], bbox[1], height) of the first 'TOTAL' text box.

        Returns ``None`` when no ``LTTextBoxHorizontal`` in ``elements``
        contains the string ``'TOTAL'`` after transliteration.
        """
        for element in elements:
            if not isinstance(element, LTTextBoxHorizontal):
                continue
            if 'TOTAL' in unidecode(element.get_text()):
                return (element.bbox[2],
                        element.bbox[1],
                        abs(element.bbox[3] - element.bbox[1]))
        return None

    @staticmethod
    def _pair_up(items):
        """Group ``items`` into consecutive 2-tuples, dropping an odd last item."""
        it = iter(items)
        return list(zip(it, it))

    @classmethod
    def is_proper_field(cls, text):
        """Hook: return whether ``text`` may be table content (default: always)."""
        return True

    @classmethod
    def rule_out_page(cls, text):
        """Hook: return True to abandon the whole page (default: never)."""
        return False

    @classmethod
    def allow_mixed_mode(cls):
        """Hook: return whether mixed column/cell layouts may be parsed."""
        return True