Skip to content

Commit

Permalink
Versão erro no 'gazette_themed_excerpts_extraction' tratado (#59)
Browse files Browse the repository at this point in the history
PR para ajudar a sincronizar os avanços da trilha de segmentadores.

@Jefersonalves @ogecece
  • Loading branch information
Giulio Carvalho authored Nov 21, 2023
2 parents e2062af + 4e5a6ec commit 9555960
Show file tree
Hide file tree
Showing 11 changed files with 346 additions and 5 deletions.
1 change: 1 addition & 0 deletions associations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .diario_ama import extrair_diarios_municipais
77 changes: 77 additions & 0 deletions associations/diario_ama.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import re

from .diario_municipal import Diario, Municipio

# At the end of the regex there is a conditional structure that checks whether the
# next match is a \s or the word SECRETARIA. This was added to handle the
# 2018-10-02 gazette, in which the municipality of Coité do Nóia was not detected;
# the following word (SECRETARIA) is used to resolve that case.
# Notable exceptions:
# - The string "VAMOS" (municipality Poço das Trincheiras, 06/01/2022, act CCB3A6AB)
#   must not be captured as part of the municipality name — hence the (?!VAMOS) lookahead.
re_nomes_municipios = (
    r"ESTADO DE ALAGOAS(?:| )\n{1,2}PREFEITURA MUNICIPAL DE (.*\n{0,2}(?!VAMOS).*$)\n\s(?:\s|SECRETARIA)")


def extrair_diarios_municipais(texto_diario: str, gazette: dict, territories: list) -> list:
    """Split the text of an AMA association gazette into per-municipality gazettes.

    The input text contains one shared AMA header followed by sections, one per
    municipality, each introduced by an "ESTADO DE ALAGOAS / PREFEITURA MUNICIPAL
    DE <name>" banner and terminated by "Código Identificador" markers.

    Returns a list of dicts (``Diario.__dict__``), one per municipality found.
    """
    texto_diario_slice = texto_diario.lstrip().splitlines()

    # Processing
    linhas_apagar = []  # line indices to delete at the end
    ama_header = texto_diario_slice[0]
    ama_header_count = 0
    codigo_count = 0
    codigo_total = texto_diario.count("Código Identificador")

    for num_linha, linha in enumerate(texto_diario_slice):
        # Remove repeated AMA headers, keeping only the first occurrence.
        if linha.startswith(ama_header):
            ama_header_count += 1
            if ama_header_count > 1:
                linhas_apagar.append(num_linha)

        # Remove the trailing lines: everything after the last
        # "Código Identificador" marker belongs to no municipality.
        if codigo_count == codigo_total:
            linhas_apagar.append(num_linha)
        elif linha.startswith("Código Identificador"):
            codigo_count += 1

    # Drop the collected lines from the slice.
    texto_diario_slice = [l for n, l in enumerate(
        texto_diario_slice) if n not in linhas_apagar]

    # Seed each municipality's gazette text with the shared AMA header.
    texto_diarios = {}
    nomes_municipios = re.findall(
        re_nomes_municipios, texto_diario, re.MULTILINE)
    for municipio in nomes_municipios:
        municipio = Municipio(municipio)
        texto_diarios[municipio] = ama_header + '\n\n'

    num_linha = 0
    municipio_atual = None
    while num_linha < len(texto_diario_slice):
        linha = texto_diario_slice[num_linha].rstrip()

        # A new municipality banner switches the accumulation target.
        if linha.startswith("ESTADO DE ALAGOAS"):
            nome = nome_municipio(texto_diario_slice, num_linha)
            if nome is not None:
                municipio_atual = Municipio(nome)

        # Accumulation only starts once some municipality has been found.
        if municipio_atual is None:
            num_linha += 1
            continue

        # This line belongs to the current municipality.
        texto_diarios[municipio_atual] += linha + '\n'
        num_linha += 1

    diarios = []
    for municipio, diario in texto_diarios.items():
        diarios.append(Diario(municipio, ama_header, diario, gazette, territories).__dict__)
    return diarios


def nome_municipio(texto_diario_slice: list, num_linha: int):
    """Look for a municipality name in the lines following a state banner.

    Joins a 10-line window starting at ``num_linha`` so the multi-line header
    regex can match across lines.

    FIX: the parameter was annotated ``slice`` but actually receives the list of
    gazette lines.

    Returns the cleaned-up municipality name, or ``None`` when the header regex
    does not match within the window.
    """
    texto = '\n'.join(texto_diario_slice[num_linha:num_linha + 10])
    # re.search is enough here: only the first capture is ever used.
    match = re.search(re_nomes_municipios, texto, re.MULTILINE)
    if match:
        return match.group(1).strip().replace('\n', '')
    return None
103 changes: 103 additions & 0 deletions associations/diario_municipal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import json
import re
import unicodedata
from datetime import date, datetime
from .utils import get_territorie_info
import hashlib
from io import BytesIO


class Municipio:
    """A municipality extracted from an AMA association gazette header.

    Holds the cleaned display name (``nome``) and an ASCII slug (``id``) used
    for hashing/equality, so two differently-formatted headers for the same
    municipality collapse into one dictionary key.
    """

    def __init__(self, municipio):
        # Initial cleanup: trailing whitespace and embedded newlines.
        municipio = municipio.rstrip().replace('\n', '')
        # Some municipality names carry trailing junk, e.g. a "/AL" suffix
        # (Viçosa, gazette 2022-01-17, act 8496EC0A). Strip any of the known
        # trailing patterns to avoid slugs like "vicosa-/al-secretaria-...".
        # FIX: pattern is now a raw string — "\/" is an invalid escape in a
        # normal string literal and triggers a SyntaxWarning on modern Python.
        municipio = re.sub(
            r"(\/AL.*|GABINETE DO PREFEITO.*|PODER.*|http.*|PORTARIA.*|Extrato.*|ATA DE.*|SECRETARIA.*|Fundo.*|SETOR.*|ERRATA.*|- AL.*|GABINETE.*)",
            "",
            municipio,
        )
        self.id = self._computa_id(municipio)
        self.nome = municipio

    def _computa_id(self, nome_municipio):
        """Build the ASCII slug: lowercase, hyphens for spaces, accents stripped."""
        ret = nome_municipio.strip().lower().replace(" ", "-")
        ret = unicodedata.normalize('NFKD', ret)
        ret = ret.encode('ASCII', 'ignore').decode("utf-8")
        return ret

    def __hash__(self):
        return hash(self.id)

    def __eq__(self, other):
        # FIX: defer to the other operand for foreign types instead of raising
        # AttributeError on other.id.
        if not isinstance(other, Municipio):
            return NotImplemented
        return self.id == other.id

    def __str__(self):
        return json.dumps(self.__dict__, indent=2, default=str, ensure_ascii=False)


class Diario:
    """A single municipality's gazette carved out of an AMA association gazette.

    ``__dict__`` of instances is what gets indexed as a document, so attribute
    names here are the index schema.
    """

    # Portuguese month name -> month number, used when parsing the header date.
    _mapa_meses = {
        "Janeiro": 1,
        "Fevereiro": 2,
        "Março": 3,
        "Abril": 4,
        "Maio": 5,
        "Junho": 6,
        "Julho": 7,
        "Agosto": 8,
        "Setembro": 9,
        "Outubro": 10,
        "Novembro": 11,
        "Dezembro": 12,
    }

    def __init__(self, municipio: "Municipio", cabecalho: str, texto: str, gazette: dict, territories: list):
        # The AMA header looks like "<state>, <... DD de <Month> de YYYY ...> Nº <edition>".
        self.territory_id, self.territory_name, self.state_code = get_territorie_info(
            name=municipio.nome,
            state=cabecalho.split(",")[0],
            territories=territories)

        self.source_text = texto.rstrip()
        self.date = self._extrai_data_publicacao(cabecalho)
        self.edition_number = cabecalho.split("Nº")[1].strip()
        self.is_extra_edition = False
        self.power = "executive_legislative"
        self.file_url = gazette["file_url"]
        self.file_path = gazette["file_path"]
        self.file_checksum = self.md5sum(BytesIO(self.source_text.encode(encoding='UTF-8')))
        self.id = gazette["id"]
        # NOTE(review): datetime.utcnow() yields a naive datetime and is
        # deprecated since Python 3.12; kept as-is so indexed timestamps stay
        # identical in format — confirm before migrating to now(timezone.utc).
        self.scraped_at = datetime.utcnow()
        self.created_at = self.scraped_at
        self.file_raw_txt = f"/{self.territory_id}/{self.date}/{self.file_checksum}.txt"
        self.processed = True
        self.url = self.file_raw_txt

    def _extrai_data_publicacao(self, ama_header: str):
        """Parse the publication date ("DD de <Month> de YYYY") from the AMA header.

        Raises IndexError when the header has no date and KeyError on an
        unknown month name.
        """
        match = re.findall(
            r".*(\d{2}) de (\w*) de (\d{4})", ama_header, re.MULTILINE)[0]
        mes = Diario._mapa_meses[match[1]]
        return date(year=int(match[2]), month=mes, day=int(match[0]))

    def md5sum(self, file):
        """Calculate the md5 checksum of a file-like object without reading its
        whole content in memory.
        from io import BytesIO
        md5sum(BytesIO(b'file content to hash'))
        '784406af91dd5a54fbb9c84c2236595a'
        """
        m = hashlib.md5()
        while True:
            d = file.read(8096)
            if not d:
                break
            m.update(d)
        return m.hexdigest()

    def __hash__(self):
        return hash(self.id)

    def __eq__(self, other):
        # FIX: guard against foreign types instead of raising AttributeError.
        if not isinstance(other, Diario):
            return NotImplemented
        return self.id == other.id

    def __str__(self):
        # BUG FIX: __str__ must return a str; the previous implementation
        # returned a dict, raising TypeError whenever str(diario) was evaluated.
        # Mirrors Municipio.__str__ for consistency.
        return json.dumps(self.__dict__, indent=2, default=str, ensure_ascii=False)
1 change: 1 addition & 0 deletions associations/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .get_territory_info import get_territorie_info
27 changes: 27 additions & 0 deletions associations/utils/get_territory_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@

import unicodedata


def get_territorie_info(state: str, name: str, territories: list):
    """Find a territory by state and (normalized) municipality name.

    Names are normalized via ``limpar_name`` on both sides so accents,
    apostrophes, case and the "Major Izidoro" spelling variant do not matter.

    Returns a ``(id, territory_name, state_code)`` tuple.

    Raises ValueError when no territory matches. (FIX: the previous version
    silently returned None, which surfaced as an opaque "cannot unpack
    non-iterable NoneType" TypeError at the caller.)
    """
    state = state.strip().lower()
    name = limpar_name(name)

    for territorie in territories:
        territorie_name = limpar_name(territorie["territory_name"])
        if territorie["state"].lower() == state and territorie_name == name:
            return territorie["id"], territorie["territory_name"], territorie["state_code"]

    raise ValueError(f"Territory not found: {name} ({state})")


def limpar_name(name: str):
    """Normalize a territory name for comparison.

    Drops apostrophes and accents, lowercases and trims, and canonicalizes the
    known alternative spelling of "Major Izidoro".
    """
    normalized = unicodedata.normalize("NFD", name.replace("'", ""))
    normalized = normalized.encode("ascii", "ignore").decode("utf-8")
    normalized = normalized.lower().strip()

    # Known spelling variant in the IBGE data vs. the gazette text.
    if normalized == "major izidoro":
        return "major isidoro"
    return normalized
7 changes: 6 additions & 1 deletion main/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
extract_themed_excerpts_from_gazettes,
get_gazettes_to_be_processed,
get_themes,
get_territories_gazettes,
tag_entities_in_excerpts,

)


Expand Down Expand Up @@ -43,9 +45,12 @@ def execute_pipeline():
themes = get_themes()

gazettes_to_be_processed = get_gazettes_to_be_processed(execution_mode, database)
territories = get_territories_gazettes(database)

indexed_gazette_ids = extract_text_from_gazettes(
gazettes_to_be_processed, database, storage, index, text_extractor
gazettes_to_be_processed, database, storage, index, text_extractor, territories
)

for theme in themes:
themed_excerpt_ids = extract_themed_excerpts_from_gazettes(
theme, indexed_gazette_ids, index
Expand Down
2 changes: 2 additions & 0 deletions tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@
TextExtractorInterface,
)
from .list_gazettes_to_be_processed import get_gazettes_to_be_processed
from .list_territories import get_territories_gazettes

7 changes: 7 additions & 0 deletions tasks/gazette_segmentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from associations import extrair_diarios_municipais


def extrarir_diarios(pdf_text, gazette, territories):
    """Split an association gazette's text into per-municipality gazette dicts.

    Thin wrapper around ``extrair_diarios_municipais``. The function name keeps
    its historical misspelling because other modules import it by this name.
    """
    return extrair_diarios_municipais(pdf_text, gazette, territories)
69 changes: 65 additions & 4 deletions tasks/gazette_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
from pathlib import Path
from typing import Dict, Iterable, List
from .gazette_segmentation import extrarir_diarios

from .interfaces import (
DatabaseInterface,
Expand All @@ -18,6 +19,7 @@ def extract_text_from_gazettes(
storage: StorageInterface,
index: IndexInterface,
text_extractor: TextExtractorInterface,
territories: Iterable[Dict]
) -> List[str]:
"""
Extracts the text from a list of gazettes
Expand All @@ -26,18 +28,35 @@ def extract_text_from_gazettes(
create_index(index)

ids = []
association_ids = []

for gazette in gazettes:
try:
processed_gazette = try_process_gazette_file(
gazette, database, storage, index, text_extractor
)

if str(gazette["territory_id"][-4:]).strip() == "0000":

association_ids = try_process_gazette_association_file(
gazette, database, storage, index, text_extractor, territories
)
else:
processed_gazette = try_process_gazette_file(
gazette, database, storage, index, text_extractor
)

except Exception as e:
logging.warning(
f"Could not process gazette: {gazette['file_path']}. Cause: {e}"
)
else:
ids.append(processed_gazette["file_checksum"])

if association_ids:
ids += [association["file_checksum"] for association in association_ids.copy()]
association_ids.clear()

else:
ids.append(processed_gazette["file_checksum"])


return ids


Expand All @@ -58,9 +77,43 @@ def try_process_gazette_file(
index.index_document(gazette, document_id=gazette["file_checksum"])
delete_gazette_files(gazette_file)
set_gazette_as_processed(gazette, database)

return gazette


def try_process_gazette_association_file(
    gazette: Dict,
    database: DatabaseInterface,
    storage: StorageInterface,
    index: IndexInterface,
    text_extractor: TextExtractorInterface,
    territories: Iterable[Dict]
) -> List:
    """
    Do all the work to extract the content from an association gazette file:
    download the PDF, extract and upload its raw text, split it into one
    gazette per municipality, index each split document, then clean up and
    mark the original gazette as processed. Returns the list of split
    gazette dicts.
    """

    logging.debug(f"Processing gazette {gazette['file_path']}")
    pdf = download_gazette_file(gazette, storage)
    get_gazette_text_and_define_url(gazette, pdf, text_extractor)
    upload_gazette_raw_text(gazette, storage)
    pdf_txt = try_to_extract_content(pdf, text_extractor)
    # NOTE(review): the text appears to be extracted twice — once inside
    # get_gazette_text_and_define_url and again via try_to_extract_content;
    # confirm whether one of the calls can be dropped.
    diarios = extrarir_diarios(
        pdf_text=pdf_txt,
        gazette=gazette,
        territories=territories
    )

    # Each municipality's gazette is indexed as its own document, keyed by its
    # own checksum rather than the original file's.
    for diario in diarios:

        upload_gazette_raw_text_association(diario, storage)
        index.index_document(diario, document_id=diario["file_checksum"])

    delete_gazette_files(pdf)
    set_gazette_as_processed(gazette, database)
    return diarios


def create_index(index: IndexInterface) -> None:
body = {
"mappings": {
Expand Down Expand Up @@ -146,6 +199,14 @@ def upload_gazette_raw_text(gazette: Dict, storage):
file_endpoint = get_file_endpoint()
gazette["file_raw_txt"] = f"{file_endpoint}/{file_raw_txt}"

def upload_gazette_raw_text_association(gazette: Dict, storage):
    """
    Upload a split (per-municipality) gazette's raw text to the storage, then
    rewrite "file_raw_txt" and "url" as full endpoint URLs.
    """
    raw_text_path = gazette["file_raw_txt"]
    storage.upload_content(raw_text_path, gazette["source_text"])

    file_endpoint = get_file_endpoint()
    # raw_text_path already starts with "/", so no separator is inserted here.
    gazette["file_raw_txt"] = f"{file_endpoint}{raw_text_path}"
    gazette["url"] = f"{file_endpoint}/{gazette['file_path']}"

def get_gazette_text_and_define_url(
gazette: Dict, gazette_file: str, text_extractor: TextExtractorInterface
Expand Down
Loading

0 comments on commit 9555960

Please sign in to comment.