-
Notifications
You must be signed in to change notification settings - Fork 0
/
parana.py
108 lines (85 loc) · 3.34 KB
/
parana.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# parana.py
# (c) 2020 CincoNoveSeis Jornalismo Ltda.
#
# This program is licensed under the GNU General Public License, version 3.
# See the LICENSE file for details.
from util import as_dec, is_number, generate_pairs
from interface import PollParser
from pdfminer.layout import LTTextBoxHorizontal
from itertools import combinations
from unidecode import unidecode
def merge(ret):
    """Flatten an iterable of dicts into one dict.

    Dicts are applied in order, so when a key appears more than once the
    value from the later dict replaces the earlier one.
    """
    combined = {}
    for mapping in ret:
        combined.update(mapping.items())
    return combined
def rule_out_mayor_page(text):
    """Report whether *text* contains one of the disqualifying phrases.

    The text is transliterated to ASCII with ``unidecode`` before the
    check, so the phrases are matched without accents. Returns True when
    either 'Em funcao da questao' or 'comparativa' occurs in it.
    """
    normalized = unidecode(text)
    markers = ('Em funcao da questao', 'comparativa')
    return any(marker in normalized for marker in markers)
class ParanaParser2016(PollParser):
    """Page filter and text extractor for the 2016 report layout."""

    @classmethod
    def is_relevant_page(cls, text):
        """Decide whether a page's text should be handed to the handler.

        The text is transliterated to ASCII first. A page qualifies when
        it contains at least one of the two question openings and none of
        the excluded phrases.
        """
        ascii_text = unidecode(text)
        wanted = (
            'Se o segundo turno da eleicao',
            'Se as eleicoes para Prefeito',
        )
        unwanted = (
            'Em funcao da questao',
            'JEITO NENHUM',
            'comparativa',
            'AGORA fossem',
        )
        if not any(phrase in ascii_text for phrase in wanted):
            return False
        return not any(phrase in ascii_text for phrase in unwanted)

    @classmethod
    def handle_relevant_page(cls, page, _):
        """Group the page's horizontal text boxes into rows and pair them.

        Only ``LTTextBoxHorizontal`` elements are considered. Each box is
        filed under ``bbox[1]`` rounded to 4 decimals, and the boxes of a
        row are ordered by ``bbox[0]`` before their stripped text is kept.

        Returns None as soon as any box matches ``rule_out_mayor_page``.
        Otherwise returns a 3-tuple: a flag that is True unless some box
        contains 'ESPONTANEA' (after transliteration), the constant 'p',
        and the result of ``generate_pairs`` on the row mapping.
        """
        flag = True
        rows = {}
        for element in page:
            if not isinstance(element, LTTextBoxHorizontal):
                continue

            raw = element.get_text()
            if rule_out_mayor_page(raw):
                return None
            if 'ESPONTANEA' in unidecode(raw):
                flag = False

            row_y = round(element.bbox[1], 4)
            rows.setdefault(row_y, []).append(
                {'text': raw.strip(), 'x': element.bbox[0]}
            )

        # Order every row by x, then keep only the text of each box.
        ordered = {}
        for row_y, cells in rows.items():
            cells.sort(key=lambda cell: cell['x'])
            ordered[row_y] = [cell['text'] for cell in cells]

        return (flag, 'p', generate_pairs(ordered))
class ParanaParser2018(PollParser):
    """Page filter and text extractor for the 2018 report layout."""

    @classmethod
    def is_relevant_page(cls, text):
        """Decide whether a page's text should be handed to the handler.

        The text is transliterated to ASCII first. A page qualifies when
        it contains 'Situacao Eleitoral -' or 'Segundo Turno -' and none
        of the excluded phrases.
        """
        text = unidecode(text)
        return (('Situacao Eleitoral -' in text or
                 'Segundo Turno -' in text) and
                'VOTOS VALIDOS' not in text and
                'Comparativo' not in text and
                'Governador/' not in text and
                'Governador /' not in text)

    @staticmethod
    def _merge_close_rows(texts, tolerance=5):
        """Merge, in place, rows of *texts* whose keys differ by < tolerance.

        *texts* maps a numeric row key to a list of strings. Every pair of
        keys is examined in insertion order; when two keys are closer than
        *tolerance*, the later row's strings are appended to the earlier
        row and the later key is removed.

        A key that was already merged away is resolved to the row that
        absorbed it, so three or more neighbouring rows collapse into one
        instead of raising ``KeyError`` on the removed key.
        """
        merged_into = {}

        def resolve(key):
            # Follow the chain of merges to the key that still exists.
            while key in merged_into:
                key = merged_into[key]
            return key

        # Snapshot the keys: the dict is mutated while pairs are consumed.
        for first, second in combinations(tuple(texts), 2):
            if abs(first - second) >= tolerance:
                continue
            keep, drop = resolve(first), resolve(second)
            if keep == drop:
                # Both already live in the same row; nothing left to do.
                continue
            texts[keep].extend(texts.pop(drop))
            merged_into[drop] = keep

    @classmethod
    def handle_relevant_page(cls, page, _):
        """Group the page's horizontal text boxes into rows and pair them.

        Only ``LTTextBoxHorizontal`` elements are considered. The position
        code is taken from the first box whose stripped text mentions
        'Governador' ('g'), 'Senador' ('s') or 'Presidente' ('pr'); it
        stays None when no box does. Boxes with ``bbox[0] >= 250`` are
        skipped; the rest are filed under ``bbox[1]`` rounded to 4
        decimals, and rows whose keys are less than 5 apart are merged.

        Returns a 3-tuple: the constant True, the position code, and the
        result of ``generate_pairs`` on the row mapping.
        """
        position = None
        texts = {}
        for element in page:
            if not isinstance(element, LTTextBoxHorizontal):
                continue

            text = element.get_text().strip()
            # The position is detected before the x filter, so a box that
            # is later skipped can still determine it.
            if position is None:
                if 'Governador' in text:
                    position = 'g'
                elif 'Senador' in text:
                    position = 's'
                elif 'Presidente' in text:
                    position = 'pr'

            if element.bbox[0] >= 250:
                continue

            texts.setdefault(round(element.bbox[1], 4), []).append(text)

        cls._merge_close_rows(texts)

        return (True, position, generate_pairs(texts))