Skip to content

Commit

Permalink
Versão erro no 'gazette_themed_excerpts_extraction' tratado (#59)
Browse files Browse the repository at this point in the history
PR para ajudar a sincronizar os avanços da trilha de segmentadores.

@Jefersonalves @ogecece
  • Loading branch information
Giulio Carvalho authored Nov 21, 2023
2 parents e2062af + 4e5a6ec commit 9555960
Show file tree
Hide file tree
Showing 11 changed files with 346 additions and 5 deletions.
1 change: 1 addition & 0 deletions associations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .diario_ama import extrair_diarios_municipais
77 changes: 77 additions & 0 deletions associations/diario_ama.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import re

from .diario_municipal import Diario, Municipio

# At the end of the regex there is a conditional structure that checks whether the
# next match is a \s or the word SECRETARIA. This was added to handle the
# 2018-10-02 gazette, in which the municipality of Coité do Nóia was not detected;
# the following word (SECRETARIA) is used to resolve that case.
# Notable exceptions:
# - The string "VAMOS" (municipality Poço das Trincheiras, 06/01/2022, act CCB3A6AB)
#   must not be captured as part of the municipality name — hence the (?!VAMOS) lookahead.
re_nomes_municipios = (
    r"ESTADO DE ALAGOAS(?:| )\n{1,2}PREFEITURA MUNICIPAL DE (.*\n{0,2}(?!VAMOS).*$)\n\s(?:\s|SECRETARIA)")


def extrair_diarios_municipais(texto_diario: str, gazette: dict, territories: list) -> list:
    """Split the text of an AMA association gazette into per-municipality gazettes.

    The input text contains one shared AMA header followed by sections, one per
    municipality, each introduced by an "ESTADO DE ALAGOAS / PREFEITURA MUNICIPAL
    DE <name>" banner and terminated by "Código Identificador" markers.

    Returns a list of dicts (``Diario.__dict__``), one per municipality found.
    """
    texto_diario_slice = texto_diario.lstrip().splitlines()

    # Processing
    linhas_apagar = []  # line indices to delete at the end
    ama_header = texto_diario_slice[0]
    ama_header_count = 0
    codigo_count = 0
    codigo_total = texto_diario.count("Código Identificador")

    for num_linha, linha in enumerate(texto_diario_slice):
        # Remove repeated AMA headers, keeping only the first occurrence.
        if linha.startswith(ama_header):
            ama_header_count += 1
            if ama_header_count > 1:
                linhas_apagar.append(num_linha)

        # Remove the trailing lines: everything after the last
        # "Código Identificador" marker belongs to no municipality.
        if codigo_count == codigo_total:
            linhas_apagar.append(num_linha)
        elif linha.startswith("Código Identificador"):
            codigo_count += 1

    # Drop the collected lines from the slice.
    texto_diario_slice = [l for n, l in enumerate(
        texto_diario_slice) if n not in linhas_apagar]

    # Seed each municipality's gazette text with the shared AMA header.
    texto_diarios = {}
    nomes_municipios = re.findall(
        re_nomes_municipios, texto_diario, re.MULTILINE)
    for municipio in nomes_municipios:
        municipio = Municipio(municipio)
        texto_diarios[municipio] = ama_header + '\n\n'

    num_linha = 0
    municipio_atual = None
    while num_linha < len(texto_diario_slice):
        linha = texto_diario_slice[num_linha].rstrip()

        # A new municipality banner switches the accumulation target.
        if linha.startswith("ESTADO DE ALAGOAS"):
            nome = nome_municipio(texto_diario_slice, num_linha)
            if nome is not None:
                municipio_atual = Municipio(nome)

        # Accumulation only starts once some municipality has been found.
        if municipio_atual is None:
            num_linha += 1
            continue

        # This line belongs to the current municipality.
        texto_diarios[municipio_atual] += linha + '\n'
        num_linha += 1

    diarios = []
    for municipio, diario in texto_diarios.items():
        diarios.append(Diario(municipio, ama_header, diario, gazette, territories).__dict__)
    return diarios


def nome_municipio(texto_diario_slice: list, num_linha: int):
    """Look for a municipality name in the lines following a state banner.

    Joins a 10-line window starting at ``num_linha`` so the multi-line header
    regex can match across lines.

    FIX: the parameter was annotated ``slice`` but actually receives the list of
    gazette lines.

    Returns the cleaned-up municipality name, or ``None`` when the header regex
    does not match within the window.
    """
    texto = '\n'.join(texto_diario_slice[num_linha:num_linha + 10])
    # re.search is enough here: only the first capture is ever used.
    match = re.search(re_nomes_municipios, texto, re.MULTILINE)
    if match:
        return match.group(1).strip().replace('\n', '')
    return None
103 changes: 103 additions & 0 deletions associations/diario_municipal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import json
import re
import unicodedata
from datetime import date, datetime
from .utils import get_territorie_info
import hashlib
from io import BytesIO


class Municipio:
    """A municipality extracted from an AMA association gazette header.

    Holds the cleaned display name (``nome``) and an ASCII slug (``id``) used
    for hashing/equality, so two differently-formatted headers for the same
    municipality collapse into one dictionary key.
    """

    def __init__(self, municipio):
        # Initial cleanup: trailing whitespace and embedded newlines.
        municipio = municipio.rstrip().replace('\n', '')
        # Some municipality names carry trailing junk, e.g. a "/AL" suffix
        # (Viçosa, gazette 2022-01-17, act 8496EC0A). Strip any of the known
        # trailing patterns to avoid slugs like "vicosa-/al-secretaria-...".
        # FIX: pattern is now a raw string — "\/" is an invalid escape in a
        # normal string literal and triggers a SyntaxWarning on modern Python.
        municipio = re.sub(
            r"(\/AL.*|GABINETE DO PREFEITO.*|PODER.*|http.*|PORTARIA.*|Extrato.*|ATA DE.*|SECRETARIA.*|Fundo.*|SETOR.*|ERRATA.*|- AL.*|GABINETE.*)",
            "",
            municipio,
        )
        self.id = self._computa_id(municipio)
        self.nome = municipio

    def _computa_id(self, nome_municipio):
        """Build the ASCII slug: lowercase, hyphens for spaces, accents stripped."""
        ret = nome_municipio.strip().lower().replace(" ", "-")
        ret = unicodedata.normalize('NFKD', ret)
        ret = ret.encode('ASCII', 'ignore').decode("utf-8")
        return ret

    def __hash__(self):
        return hash(self.id)

    def __eq__(self, other):
        # FIX: defer to the other operand for foreign types instead of raising
        # AttributeError on other.id.
        if not isinstance(other, Municipio):
            return NotImplemented
        return self.id == other.id

    def __str__(self):
        return json.dumps(self.__dict__, indent=2, default=str, ensure_ascii=False)


class Diario:
    """A single municipality's gazette carved out of an AMA association gazette.

    ``__dict__`` of instances is what gets indexed as a document, so attribute
    names here are the index schema.
    """

    # Portuguese month name -> month number, used when parsing the header date.
    _mapa_meses = {
        "Janeiro": 1,
        "Fevereiro": 2,
        "Março": 3,
        "Abril": 4,
        "Maio": 5,
        "Junho": 6,
        "Julho": 7,
        "Agosto": 8,
        "Setembro": 9,
        "Outubro": 10,
        "Novembro": 11,
        "Dezembro": 12,
    }

    def __init__(self, municipio: "Municipio", cabecalho: str, texto: str, gazette: dict, territories: list):
        # The AMA header looks like "<state>, <... DD de <Month> de YYYY ...> Nº <edition>".
        self.territory_id, self.territory_name, self.state_code = get_territorie_info(
            name=municipio.nome,
            state=cabecalho.split(",")[0],
            territories=territories)

        self.source_text = texto.rstrip()
        self.date = self._extrai_data_publicacao(cabecalho)
        self.edition_number = cabecalho.split("Nº")[1].strip()
        self.is_extra_edition = False
        self.power = "executive_legislative"
        self.file_url = gazette["file_url"]
        self.file_path = gazette["file_path"]
        self.file_checksum = self.md5sum(BytesIO(self.source_text.encode(encoding='UTF-8')))
        self.id = gazette["id"]
        # NOTE(review): datetime.utcnow() yields a naive datetime and is
        # deprecated since Python 3.12; kept as-is so indexed timestamps stay
        # identical in format — confirm before migrating to now(timezone.utc).
        self.scraped_at = datetime.utcnow()
        self.created_at = self.scraped_at
        self.file_raw_txt = f"/{self.territory_id}/{self.date}/{self.file_checksum}.txt"
        self.processed = True
        self.url = self.file_raw_txt

    def _extrai_data_publicacao(self, ama_header: str):
        """Parse the publication date ("DD de <Month> de YYYY") from the AMA header.

        Raises IndexError when the header has no date and KeyError on an
        unknown month name.
        """
        match = re.findall(
            r".*(\d{2}) de (\w*) de (\d{4})", ama_header, re.MULTILINE)[0]
        mes = Diario._mapa_meses[match[1]]
        return date(year=int(match[2]), month=mes, day=int(match[0]))

    def md5sum(self, file):
        """Calculate the md5 checksum of a file-like object without reading its
        whole content in memory.
        from io import BytesIO
        md5sum(BytesIO(b'file content to hash'))
        '784406af91dd5a54fbb9c84c2236595a'
        """
        m = hashlib.md5()
        while True:
            d = file.read(8096)
            if not d:
                break
            m.update(d)
        return m.hexdigest()

    def __hash__(self):
        return hash(self.id)

    def __eq__(self, other):
        # FIX: guard against foreign types instead of raising AttributeError.
        if not isinstance(other, Diario):
            return NotImplemented
        return self.id == other.id

    def __str__(self):
        # BUG FIX: __str__ must return a str; the previous implementation
        # returned a dict, raising TypeError whenever str(diario) was evaluated.
        # Mirrors Municipio.__str__ for consistency.
        return json.dumps(self.__dict__, indent=2, default=str, ensure_ascii=False)
1 change: 1 addition & 0 deletions associations/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .get_territory_info import get_territorie_info
27 changes: 27 additions & 0 deletions associations/utils/get_territory_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@

import unicodedata


def get_territorie_info(state: str, name: str, territories: list):
    """Find a territory by state and (normalized) municipality name.

    Names are normalized via ``limpar_name`` on both sides so accents,
    apostrophes, case and the "Major Izidoro" spelling variant do not matter.

    Returns a ``(id, territory_name, state_code)`` tuple.

    Raises ValueError when no territory matches. (FIX: the previous version
    silently returned None, which surfaced as an opaque "cannot unpack
    non-iterable NoneType" TypeError at the caller.)
    """
    state = state.strip().lower()
    name = limpar_name(name)

    for territorie in territories:
        territorie_name = limpar_name(territorie["territory_name"])
        if territorie["state"].lower() == state and territorie_name == name:
            return territorie["id"], territorie["territory_name"], territorie["state_code"]

    raise ValueError(f"Territory not found: {name} ({state})")


def limpar_name(name: str):
    """Normalize a territory name for comparison.

    Drops apostrophes and accents, lowercases and trims, and canonicalizes the
    known alternative spelling of "Major Izidoro".
    """
    normalized = unicodedata.normalize("NFD", name.replace("'", ""))
    normalized = normalized.encode("ascii", "ignore").decode("utf-8")
    normalized = normalized.lower().strip()

    # Known spelling variant in the IBGE data vs. the gazette text.
    if normalized == "major izidoro":
        return "major isidoro"
    return normalized
7 changes: 6 additions & 1 deletion main/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
extract_themed_excerpts_from_gazettes,
get_gazettes_to_be_processed,
get_themes,
get_territories_gazettes,
tag_entities_in_excerpts,

)


Expand Down Expand Up @@ -43,9 +45,12 @@ def execute_pipeline():
themes = get_themes()

gazettes_to_be_processed = get_gazettes_to_be_processed(execution_mode, database)
territories = get_territories_gazettes(database)

indexed_gazette_ids = extract_text_from_gazettes(
gazettes_to_be_processed, database, storage, index, text_extractor
gazettes_to_be_processed, database, storage, index, text_extractor, territories
)

for theme in themes:
themed_excerpt_ids = extract_themed_excerpts_from_gazettes(
theme, indexed_gazette_ids, index
Expand Down
2 changes: 2 additions & 0 deletions tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@
TextExtractorInterface,
)
from .list_gazettes_to_be_processed import get_gazettes_to_be_processed
from .list_territories import get_territories_gazettes

7 changes: 7 additions & 0 deletions tasks/gazette_segmentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from associations import extrair_diarios_municipais


def extrarir_diarios(pdf_text, gazette, territories):
    """Split an association gazette's text into per-municipality gazette dicts.

    Thin wrapper around ``extrair_diarios_municipais``. The function name keeps
    its historical misspelling because other modules import it by this name.
    """
    return extrair_diarios_municipais(pdf_text, gazette, territories)
69 changes: 65 additions & 4 deletions tasks/gazette_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
from pathlib import Path
from typing import Dict, Iterable, List
from .gazette_segmentation import extrarir_diarios

from .interfaces import (
DatabaseInterface,
Expand All @@ -18,6 +19,7 @@ def extract_text_from_gazettes(
storage: StorageInterface,
index: IndexInterface,
text_extractor: TextExtractorInterface,
territories: Iterable[Dict]
) -> List[str]:
"""
Extracts the text from a list of gazettes
Expand All @@ -26,18 +28,35 @@ def extract_text_from_gazettes(
create_index(index)

ids = []
association_ids = []

for gazette in gazettes:
try:
processed_gazette = try_process_gazette_file(
gazette, database, storage, index, text_extractor
)

if str(gazette["territory_id"][-4:]).strip() == "0000":

association_ids = try_process_gazette_association_file(
gazette, database, storage, index, text_extractor, territories
)
else:
processed_gazette = try_process_gazette_file(
gazette, database, storage, index, text_extractor
)

except Exception as e:
logging.warning(
f"Could not process gazette: {gazette['file_path']}. Cause: {e}"
)
else:
ids.append(processed_gazette["file_checksum"])

if association_ids:
ids += [association["file_checksum"] for association in association_ids.copy()]
association_ids.clear()

else:
ids.append(processed_gazette["file_checksum"])


return ids


Expand All @@ -58,9 +77,43 @@ def try_process_gazette_file(
index.index_document(gazette, document_id=gazette["file_checksum"])
delete_gazette_files(gazette_file)
set_gazette_as_processed(gazette, database)

return gazette


def try_process_gazette_association_file(
    gazette: Dict,
    database: DatabaseInterface,
    storage: StorageInterface,
    index: IndexInterface,
    text_extractor: TextExtractorInterface,
    territories: Iterable[Dict]
) -> List:
    """
    Do all the work to extract the content from an association gazette file:
    download the PDF, extract and upload its raw text, split it into one
    gazette per municipality, index each split document, then clean up and
    mark the original gazette as processed. Returns the list of split
    gazette dicts.
    """

    logging.debug(f"Processing gazette {gazette['file_path']}")
    pdf = download_gazette_file(gazette, storage)
    get_gazette_text_and_define_url(gazette, pdf, text_extractor)
    upload_gazette_raw_text(gazette, storage)
    pdf_txt = try_to_extract_content(pdf, text_extractor)
    # NOTE(review): the text appears to be extracted twice — once inside
    # get_gazette_text_and_define_url and again via try_to_extract_content;
    # confirm whether one of the calls can be dropped.
    diarios = extrarir_diarios(
        pdf_text=pdf_txt,
        gazette=gazette,
        territories=territories
    )

    # Each municipality's gazette is indexed as its own document, keyed by its
    # own checksum rather than the original file's.
    for diario in diarios:

        upload_gazette_raw_text_association(diario, storage)
        index.index_document(diario, document_id=diario["file_checksum"])

    delete_gazette_files(pdf)
    set_gazette_as_processed(gazette, database)
    return diarios


def create_index(index: IndexInterface) -> None:
body = {
"mappings": {
Expand Down Expand Up @@ -146,6 +199,14 @@ def upload_gazette_raw_text(gazette: Dict, storage):
file_endpoint = get_file_endpoint()
gazette["file_raw_txt"] = f"{file_endpoint}/{file_raw_txt}"

def upload_gazette_raw_text_association(gazette: Dict, storage):
    """
    Upload a split (per-municipality) gazette's raw text to the storage, then
    rewrite "file_raw_txt" and "url" as full endpoint URLs.
    """
    raw_text_path = gazette["file_raw_txt"]
    storage.upload_content(raw_text_path, gazette["source_text"])

    file_endpoint = get_file_endpoint()
    # raw_text_path already starts with "/", so no separator is inserted here.
    gazette["file_raw_txt"] = f"{file_endpoint}{raw_text_path}"
    gazette["url"] = f"{file_endpoint}/{gazette['file_path']}"

def get_gazette_text_and_define_url(
gazette: Dict, gazette_file: str, text_extractor: TextExtractorInterface
Expand Down
Loading

0 comments on commit 9555960

Please sign in to comment.