From 4e5a6ecd843ebe19bab00a4a95409894b3aa6d30 Mon Sep 17 00:00:00 2001
From: Winzen
Date: Tue, 21 Nov 2023 04:30:00 -0300
Subject: [PATCH] Handle error in 'gazette_themed_excerpts_extraction'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 associations/__init__.py                 |   1 +
 associations/diario_ama.py               |  77 +++++++++++++++++
 associations/diario_municipal.py         | 103 +++++++++++++++++++++++
 associations/utils/__init__.py           |   1 +
 associations/utils/get_territory_info.py |  27 ++++++
 main/__main__.py                         |   7 +-
 tasks/__init__.py                        |   2 +
 tasks/gazette_segmentation.py            |   7 ++
 tasks/gazette_text_extraction.py         |  69 ++++++++++++++-
 tasks/list_gazettes_to_be_processed.py   |  27 ++++++
 tasks/list_territories.py                |  30 +++++++
 11 files changed, 346 insertions(+), 5 deletions(-)
 create mode 100644 associations/__init__.py
 create mode 100644 associations/diario_ama.py
 create mode 100644 associations/diario_municipal.py
 create mode 100644 associations/utils/__init__.py
 create mode 100644 associations/utils/get_territory_info.py
 create mode 100644 tasks/gazette_segmentation.py
 create mode 100644 tasks/list_territories.py

diff --git a/associations/__init__.py b/associations/__init__.py
new file mode 100644
index 0000000..bfc6242
--- /dev/null
+++ b/associations/__init__.py
@@ -0,0 +1 @@
+from .diario_ama import extrair_diarios_municipais
\ No newline at end of file
diff --git a/associations/diario_ama.py b/associations/diario_ama.py
new file mode 100644
index 0000000..bf20d4c
--- /dev/null
+++ b/associations/diario_ama.py
@@ -0,0 +1,77 @@
+import re
+
+from .diario_municipal import Diario, Municipio
+
+# At the end of the regex there is a conditional check on whether the next match is a \s or SECRETARIA. This was added to fix a problem in the 2018-10-02 gazette, in which the municipality of Coité do Nóia was not detected by the code. To handle it, the next word (SECRETARIA) is used to cover that case.
+# Notable exceptions
+# String: VAMOS, municipality of Poço das Trincheiras, 06/01/2022, act CCB3A6AB
+re_nomes_municipios = (
+    r"ESTADO DE ALAGOAS(?:| )\n{1,2}PREFEITURA MUNICIPAL DE (.*\n{0,2}(?!VAMOS).*$)\n\s(?:\s|SECRETARIA)")
+
+
+def extrair_diarios_municipais(texto_diario: str, gazette: dict, territories: list):
+    texto_diario_slice = texto_diario.lstrip().splitlines()
+
+    # Processing
+    linhas_apagar = []  # lines of the slice to be removed at the end.
+    ama_header = texto_diario_slice[0]
+    ama_header_count = 0
+    codigo_count = 0
+    codigo_total = texto_diario.count("Código Identificador")
+
+    for num_linha, linha in enumerate(texto_diario_slice):
+        # Remove the AMA header, but keep its first occurrence.
+        if linha.startswith(ama_header):
+            ama_header_count += 1
+            if ama_header_count > 1:
+                linhas_apagar.append(num_linha)
+
+        # Remove the trailing lines
+        if codigo_count == codigo_total:
+            linhas_apagar.append(num_linha)
+        elif linha.startswith("Código Identificador"):
+            codigo_count += 1
+
+    # Drop the marked lines from the slice
+    texto_diario_slice = [l for n, l in enumerate(
+        texto_diario_slice) if n not in linhas_apagar]
+
+    # Insert the header into each municipality's gazette.
+    texto_diarios = {}
+    nomes_municipios = re.findall(
+        re_nomes_municipios, texto_diario, re.MULTILINE)
+    for municipio in nomes_municipios:
+        municipio = Municipio(municipio)
+        texto_diarios[municipio] = ama_header + '\n\n'
+
+    num_linha = 0
+    municipio_atual = None
+    while num_linha < len(texto_diario_slice):
+        linha = texto_diario_slice[num_linha].rstrip()
+
+        if linha.startswith("ESTADO DE ALAGOAS"):
+            nome = nome_municipio(texto_diario_slice, num_linha)
+            if nome is not None:
+                municipio_atual = Municipio(nome)
+
+        # Only start once some municipality has been found.
+        if municipio_atual is None:
+            num_linha += 1
+            continue
+
+        # The content belongs to a municipality
+        texto_diarios[municipio_atual] += linha + '\n'
+        num_linha += 1
+
+    diarios = []
+    for municipio, diario in texto_diarios.items():
+        diarios.append(Diario(municipio, ama_header, diario, gazette, territories).__dict__)
+    return diarios
+
+
+def nome_municipio(texto_diario_slice: list, num_linha: int):
+    texto = '\n'.join(texto_diario_slice[num_linha:num_linha+10])
+    match = re.findall(re_nomes_municipios, texto, re.MULTILINE)
+    if len(match) > 0:
+        return match[0].strip().replace('\n', '')
+    return None
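
As an illustration of what `re_nomes_municipios` is meant to capture, here is a minimal sketch; the excerpt below is invented to mimic the AMA section-header layout and is not taken from a real gazette:

    import re
    from associations.diario_ama import re_nomes_municipios

    # hypothetical section header in the AMA layout
    trecho = (
        "ESTADO DE ALAGOAS\n"
        "PREFEITURA MUNICIPAL DE BOCA DA MATA\n"
        "\n"
        "SECRETARIA MUNICIPAL DE ADMINISTRACAO\n"
    )
    print(re.findall(re_nomes_municipios, trecho, re.MULTILINE))
    # expected: ['BOCA DA MATA']
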
diff --git a/associations/diario_municipal.py b/associations/diario_municipal.py
new file mode 100644
index 0000000..1071590
--- /dev/null
+++ b/associations/diario_municipal.py
@@ -0,0 +1,103 @@
+import json
+import re
+import unicodedata
+from datetime import date, datetime
+from .utils import get_territory_info
+import hashlib
+from io import BytesIO
+
+
+class Municipio:
+
+    def __init__(self, municipio):
+        municipio = municipio.rstrip().replace('\n', '')  # initial cleanup
+        # Some municipality names have a trailing /AL, e.g. Viçosa in the 2022-01-17 gazette, act 8496EC0A. To avoid errors such as "vicosa-/al-secretaria-municipal...", the next line removes it.
+        municipio = re.sub(r"(\/AL.*|GABINETE DO PREFEITO.*|PODER.*|http.*|PORTARIA.*|Extrato.*|ATA DE.*|SECRETARIA.*|Fundo.*|SETOR.*|ERRATA.*|- AL.*|GABINETE.*)", "", municipio)
+        self.id = self._computa_id(municipio)
+        self.nome = municipio
+
+    def _computa_id(self, nome_municipio):
+        ret = nome_municipio.strip().lower().replace(" ", "-")
+        ret = unicodedata.normalize('NFKD', ret)
+        ret = ret.encode('ASCII', 'ignore').decode("utf-8")
+        return ret
+
+    def __hash__(self):
+        return hash(self.id)
+
+    def __eq__(self, other):
+        return self.id == other.id
+
+    def __str__(self):
+        return json.dumps(self.__dict__, indent=2, default=str, ensure_ascii=False)
+
+
+class Diario:
+
+    _mapa_meses = {
+        "Janeiro": 1,
+        "Fevereiro": 2,
+        "Março": 3,
+        "Abril": 4,
+        "Maio": 5,
+        "Junho": 6,
+        "Julho": 7,
+        "Agosto": 8,
+        "Setembro": 9,
+        "Outubro": 10,
+        "Novembro": 11,
+        "Dezembro": 12,
+    }
+
+    def __init__(self, municipio: Municipio, cabecalho: str, texto: str, gazette: dict, territories: list):
+
+
+        self.territory_id, self.territory_name, self.state_code = get_territory_info(
+            name=municipio.nome,
+            state=cabecalho.split(",")[0],
+            territories=territories)
+
+        self.source_text = texto.rstrip()
+        self.date = self._extrai_data_publicacao(cabecalho)
+        self.edition_number = cabecalho.split("Nº")[1].strip()
+        self.is_extra_edition = False
+        self.power = "executive_legislative"
+        self.file_url = gazette["file_url"]
+        self.file_path = gazette["file_path"]
+        self.file_checksum = self.md5sum(BytesIO(self.source_text.encode(encoding='UTF-8')))
+        self.id = gazette["id"]
+        self.scraped_at = datetime.utcnow()
+        self.created_at = self.scraped_at
+        self.file_raw_txt = f"/{self.territory_id}/{self.date}/{self.file_checksum}.txt"
+        self.processed = True
+        self.url = self.file_raw_txt
+
+    def _extrai_data_publicacao(self, ama_header: str):
+        match = re.findall(
+            r".*(\d{2}) de (\w*) de (\d{4})", ama_header, re.MULTILINE)[0]
+        mes = Diario._mapa_meses[match[1]]
+        return date(year=int(match[2]), month=mes, day=int(match[0]))
+
+    def md5sum(self, file):
+        """Calculate the md5 checksum of a file-like object without reading its
+        whole content in memory.
+        >>> from io import BytesIO
+        >>> md5sum(BytesIO(b'file content to hash'))
+        '784406af91dd5a54fbb9c84c2236595a'
+        """
+        m = hashlib.md5()
+        while True:
+            d = file.read(8096)
+            if not d:
+                break
+            m.update(d)
+        return m.hexdigest()
+
+    def __hash__(self):
+        return hash(self.id)
+
+    def __eq__(self, other):
+        return self.id == other.id
+
+    def __str__(self):
+        return str(self.__dict__)
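
A short usage sketch of the `Municipio` cleanup and id slug added above; the input strings are hypothetical examples of the suffixes the `re.sub` call strips:

    from associations.diario_municipal import Municipio

    print(Municipio("Viçosa/AL").id)                          # vicosa
    print(Municipio("Poço das Trincheiras").id)               # poco-das-trincheiras
    print(Municipio("Água Branca GABINETE DO PREFEITO").id)   # agua-branca
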
diff --git a/associations/utils/__init__.py b/associations/utils/__init__.py
new file mode 100644
index 0000000..d032955
--- /dev/null
+++ b/associations/utils/__init__.py
@@ -0,0 +1 @@
+from .get_territory_info import get_territory_info
\ No newline at end of file
diff --git a/associations/utils/get_territory_info.py b/associations/utils/get_territory_info.py
new file mode 100644
index 0000000..11bed5c
--- /dev/null
+++ b/associations/utils/get_territory_info.py
@@ -0,0 +1,27 @@
+
+import unicodedata
+
+
+def get_territory_info(state: str, name: str, territories: list):
+
+    state = state.strip()
+    name = limpar_name(name)
+
+    for territory in territories:
+        territory_name = limpar_name(territory["territory_name"])
+        if territory["state"].lower() == state.lower() and territory_name == name:
+
+            return territory["id"], territory["territory_name"], territory["state_code"]
+
+
+def limpar_name(name: str):
+
+    clean_name = name.replace("'", "")
+    clean_name = unicodedata.normalize("NFD", clean_name)
+    clean_name = clean_name.encode("ascii", "ignore").decode("utf-8")
+    clean_name = clean_name.lower()
+    clean_name = clean_name.strip()
+
+    clean_name = "major isidoro" if clean_name == "major izidoro" else clean_name
+
+    return clean_name
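
A usage sketch for `get_territory_info`; the territories list below is an illustrative stand-in for rows of the territories table (the id value is made up), and it also exercises the izidoro/isidoro special case handled by `limpar_name`:

    from associations.utils import get_territory_info

    # illustrative row only; real rows come from the territories table
    territories = [
        {"id": "2704500", "territory_name": "Major Isidoro", "state_code": "AL", "state": "Alagoas"},
    ]
    print(get_territory_info(state="Alagoas", name="MAJOR IZIDORO", territories=territories))
    # expected: ('2704500', 'Major Isidoro', 'AL')
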
diff --git a/main/__main__.py b/main/__main__.py
index 9a430fe..cc64a08 100644
--- a/main/__main__.py
+++ b/main/__main__.py
@@ -11,7 +11,9 @@
     extract_themed_excerpts_from_gazettes,
     get_gazettes_to_be_processed,
     get_themes,
+    get_territories_gazettes,
     tag_entities_in_excerpts,
+
 )
@@ -43,9 +45,12 @@ def execute_pipeline():
     themes = get_themes()
     gazettes_to_be_processed = get_gazettes_to_be_processed(execution_mode, database)
 
+    territories = get_territories_gazettes(database)
+
     indexed_gazette_ids = extract_text_from_gazettes(
-        gazettes_to_be_processed, database, storage, index, text_extractor
+        gazettes_to_be_processed, database, storage, index, text_extractor, territories
     )
+
     for theme in themes:
         themed_excerpt_ids = extract_themed_excerpts_from_gazettes(
             theme, indexed_gazette_ids, index
diff --git a/tasks/__init__.py b/tasks/__init__.py
index bb16ccd..b738481 100644
--- a/tasks/__init__.py
+++ b/tasks/__init__.py
@@ -10,3 +10,5 @@
     TextExtractorInterface,
 )
 from .list_gazettes_to_be_processed import get_gazettes_to_be_processed
+from .list_territories import get_territories_gazettes
+
diff --git a/tasks/gazette_segmentation.py b/tasks/gazette_segmentation.py
new file mode 100644
index 0000000..4bc546c
--- /dev/null
+++ b/tasks/gazette_segmentation.py
@@ -0,0 +1,7 @@
+from associations import extrair_diarios_municipais
+
+
+def extrair_diarios(pdf_text, gazette, territories):
+
+    diarios = extrair_diarios_municipais(pdf_text, gazette, territories)
+    return diarios
diff --git a/tasks/gazette_text_extraction.py b/tasks/gazette_text_extraction.py
index 846b8a5..664263c 100644
--- a/tasks/gazette_text_extraction.py
+++ b/tasks/gazette_text_extraction.py
@@ -3,6 +3,7 @@
 import os
 from pathlib import Path
 from typing import Dict, Iterable, List
+from .gazette_segmentation import extrair_diarios
 
 from .interfaces import (
     DatabaseInterface,
@@ -18,6 +19,7 @@ def extract_text_from_gazettes(
     storage: StorageInterface,
     index: IndexInterface,
     text_extractor: TextExtractorInterface,
+    territories: Iterable[Dict]
 ) -> List[str]:
     """
     Extracts the text from a list of gazettes
@@ -26,18 +28,35 @@ def extract_text_from_gazettes(
     create_index(index)
 
     ids = []
+    association_ids = []
+
     for gazette in gazettes:
         try:
-            processed_gazette = try_process_gazette_file(
-                gazette, database, storage, index, text_extractor
-            )
+
+            if str(gazette["territory_id"][-4:]).strip() == "0000":
+
+                association_ids = try_process_gazette_association_file(
+                    gazette, database, storage, index, text_extractor, territories
+                )
+            else:
+                processed_gazette = try_process_gazette_file(
+                    gazette, database, storage, index, text_extractor
+                )
+
         except Exception as e:
             logging.warning(
                 f"Could not process gazette: {gazette['file_path']}. Cause: {e}"
            )
         else:
-            ids.append(processed_gazette["file_checksum"])
+
+            if association_ids:
+                ids += [association["file_checksum"] for association in association_ids.copy()]
+                association_ids.clear()
+            else:
+                ids.append(processed_gazette["file_checksum"])
+
+
     return ids
@@ -58,9 +77,43 @@ def try_process_gazette_file(
     index.index_document(gazette, document_id=gazette["file_checksum"])
     delete_gazette_files(gazette_file)
     set_gazette_as_processed(gazette, database)
+
     return gazette
 
 
+def try_process_gazette_association_file(
+    gazette: Dict,
+    database: DatabaseInterface,
+    storage: StorageInterface,
+    index: IndexInterface,
+    text_extractor: TextExtractorInterface,
+    territories: Iterable[Dict]
+) -> List:
+    """
+    Do all the work to extract the content from the gazette files
+    """
+
+    logging.debug(f"Processing gazette {gazette['file_path']}")
+    pdf = download_gazette_file(gazette, storage)
+    get_gazette_text_and_define_url(gazette, pdf, text_extractor)
+    upload_gazette_raw_text(gazette, storage)
+    pdf_txt = try_to_extract_content(pdf, text_extractor)
+    diarios = extrair_diarios(
+        pdf_text=pdf_txt,
+        gazette=gazette,
+        territories=territories
+    )
+
+    for diario in diarios:
+
+        upload_gazette_raw_text_association(diario, storage)
+        index.index_document(diario, document_id=diario["file_checksum"])
+
+    delete_gazette_files(pdf)
+    set_gazette_as_processed(gazette, database)
+    return diarios
+
+
 def create_index(index: IndexInterface) -> None:
     body = {
         "mappings": {
@@ -146,6 +199,14 @@ def upload_gazette_raw_text(gazette: Dict, storage):
     file_endpoint = get_file_endpoint()
     gazette["file_raw_txt"] = f"{file_endpoint}/{file_raw_txt}"
 
+def upload_gazette_raw_text_association(gazette: Dict, storage):
+    """
+    Upload the gazette raw text and define the URL to access the file in the storage
+    """
+    storage.upload_content(gazette["file_raw_txt"], gazette["source_text"])
+    file_endpoint = get_file_endpoint()
+    gazette["file_raw_txt"] = f"{file_endpoint}{gazette['file_raw_txt']}"
+    gazette["url"] = f"{file_endpoint}/{gazette['file_path']}"
 
 def get_gazette_text_and_define_url(
     gazette: Dict, gazette_file: str, text_extractor: TextExtractorInterface
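
The routing above treats a gazette whose `territory_id` ends in "0000" as an association-wide file and sends it through `try_process_gazette_association_file`. A minimal sketch of that check; the ids below are illustrative, assuming the state-wide territory id convention:

    for territory_id in ("2700000", "2704302"):
        is_association = str(territory_id[-4:]).strip() == "0000"
        print(territory_id, "->", "association file" if is_association else "single municipality")
    # 2700000 -> association file
    # 2704302 -> single municipality
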
diff --git a/tasks/list_gazettes_to_be_processed.py b/tasks/list_gazettes_to_be_processed.py
index 1547e7b..e1fb4f1 100644
--- a/tasks/list_gazettes_to_be_processed.py
+++ b/tasks/list_gazettes_to_be_processed.py
@@ -7,6 +7,7 @@ def get_gazettes_to_be_processed(
     execution_mode: str, database: DatabaseInterface
 ) -> Iterable[Dict]:
+
     if execution_mode == "DAILY":
         yield from get_gazettes_extracted_since_yesterday(database)
     elif execution_mode == "ALL":
@@ -123,6 +124,23 @@ def get_unprocessed_gazettes(
         yield format_gazette_data(gazette)
 
 
+def get_territories_gazettes(
+    database: DatabaseInterface,
+) -> Iterable[Dict]:
+
+    command = """
+        SELECT
+            *
+        FROM
+            territories
+        ;
+    """
+
+    territories = [format_territories_data(territory) for territory in database.select(command)]
+
+    return territories
+
+
 def format_gazette_data(data):
     return {
         "id": data[0],
@@ -141,3 +159,12 @@ def format_gazette_data(data):
         "territory_name": data[13],
         "state_code": data[14],
     }
+
+
+def format_territories_data(data):
+    return {
+        "id": data[0],
+        "territory_name": data[1],
+        "state_code": data[2],
+        "state": data[3],
+    }
diff --git a/tasks/list_territories.py b/tasks/list_territories.py
new file mode 100644
index 0000000..3969e30
--- /dev/null
+++ b/tasks/list_territories.py
@@ -0,0 +1,30 @@
+import logging
+from typing import Dict, Iterable
+
+from .interfaces import DatabaseInterface
+
+
+def get_territories_gazettes(
+    database: DatabaseInterface,
+) -> Iterable[Dict]:
+
+    command = """
+        SELECT
+            *
+        FROM
+            territories
+        ;
+    """
+
+    territories = [format_territories_data(territory) for territory in database.select(command)]
+
+    return territories
+
+
+def format_territories_data(data):
+    return {
+        "id": data[0],
+        "territory_name": data[1],
+        "state_code": data[2],
+        "state": data[3],
+    }
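
Because `get_territories_gazettes` uses `SELECT *`, `format_territories_data` assumes the territories table columns come back in the order (id, territory_name, state_code, state). A sketch of that mapping with an illustrative row:

    from tasks.list_territories import format_territories_data

    row = ("2704500", "Major Isidoro", "AL", "Alagoas")  # illustrative values
    print(format_territories_data(row))
    # {'id': '2704500', 'territory_name': 'Major Isidoro', 'state_code': 'AL', 'state': 'Alagoas'}
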