From 4e5a6ecd843ebe19bab00a4a95409894b3aa6d30 Mon Sep 17 00:00:00 2001
From: Winzen
Date: Tue, 21 Nov 2023 04:30:00 -0300
Subject: [PATCH] Handle error in 'gazette_themed_excerpts_extraction'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 associations/__init__.py                 |   1 +
 associations/diario_ama.py               |  77 +++++++++++++++++
 associations/diario_municipal.py         | 103 +++++++++++++++++++++++
 associations/utils/__init__.py           |   1 +
 associations/utils/get_territory_info.py |  27 ++++++
 main/__main__.py                         |   7 +-
 tasks/__init__.py                        |   2 +
 tasks/gazette_segmentation.py            |   7 ++
 tasks/gazette_text_extraction.py         |  69 ++++++++++++++-
 tasks/list_gazettes_to_be_processed.py   |  27 ++++++
 tasks/list_territories.py                |  30 +++++++
 11 files changed, 346 insertions(+), 5 deletions(-)
 create mode 100644 associations/__init__.py
 create mode 100644 associations/diario_ama.py
 create mode 100644 associations/diario_municipal.py
 create mode 100644 associations/utils/__init__.py
 create mode 100644 associations/utils/get_territory_info.py
 create mode 100644 tasks/gazette_segmentation.py
 create mode 100644 tasks/list_territories.py

diff --git a/associations/__init__.py b/associations/__init__.py
new file mode 100644
index 0000000..bfc6242
--- /dev/null
+++ b/associations/__init__.py
@@ -0,0 +1 @@
+from .diario_ama import extrair_diarios_municipais
\ No newline at end of file
diff --git a/associations/diario_ama.py b/associations/diario_ama.py
new file mode 100644
index 0000000..bf20d4c
--- /dev/null
+++ b/associations/diario_ama.py
@@ -0,0 +1,77 @@
+import re
+
+from .diario_municipal import Diario, Municipio
+
+# At the end of the regex there is a conditional check on whether the next match is a \s or SECRETARIA. This was added to fix a problem in the 2018-10-02 gazette, in which the municipality of Coité do Nóia was not detected by the code. To handle it, the next word (SECRETARIA) is used to cover that case.
+# Notable exceptions
+# String: VAMOS, municipality of Poço das Trincheiras, 06/01/2022, act CCB3A6AB
+re_nomes_municipios = (
+    r"ESTADO DE ALAGOAS(?:| )\n{1,2}PREFEITURA MUNICIPAL DE (.*\n{0,2}(?!VAMOS).*$)\n\s(?:\s|SECRETARIA)")
+
+
+def extrair_diarios_municipais(texto_diario: str, gazette: dict, territories: list):
+    texto_diario_slice = texto_diario.lstrip().splitlines()
+
+    # Processing
+    linhas_apagar = []  # lines of the slice to be removed at the end.
+    ama_header = texto_diario_slice[0]
+    ama_header_count = 0
+    codigo_count = 0
+    codigo_total = texto_diario.count("Código Identificador")
+
+    for num_linha, linha in enumerate(texto_diario_slice):
+        # Remove the AMA header, but keep its first occurrence.
+        if linha.startswith(ama_header):
+            ama_header_count += 1
+            if ama_header_count > 1:
+                linhas_apagar.append(num_linha)
+
+        # Remove the trailing lines
+        if codigo_count == codigo_total:
+            linhas_apagar.append(num_linha)
+        elif linha.startswith("Código Identificador"):
+            codigo_count += 1
+
+    # Drop the marked lines from the slice
+    texto_diario_slice = [l for n, l in enumerate(
+        texto_diario_slice) if n not in linhas_apagar]
+
+    # Insert the header into each municipality's gazette.
+    texto_diarios = {}
+    nomes_municipios = re.findall(
+        re_nomes_municipios, texto_diario, re.MULTILINE)
+    for municipio in nomes_municipios:
+        municipio = Municipio(municipio)
+        texto_diarios[municipio] = ama_header + '\n\n'
+
+    num_linha = 0
+    municipio_atual = None
+    while num_linha < len(texto_diario_slice):
+        linha = texto_diario_slice[num_linha].rstrip()
+
+        if linha.startswith("ESTADO DE ALAGOAS"):
+            nome = nome_municipio(texto_diario_slice, num_linha)
+            if nome is not None:
+                municipio_atual = Municipio(nome)
+
+        # Only start once some municipality has been found.
+        if municipio_atual is None:
+            num_linha += 1
+            continue
+
+        # The content belongs to a municipality
+        texto_diarios[municipio_atual] += linha + '\n'
+        num_linha += 1
+
+    diarios = []
+    for municipio, diario in texto_diarios.items():
+        diarios.append(Diario(municipio, ama_header, diario, gazette, territories).__dict__)
+    return diarios
+
+
+def nome_municipio(texto_diario_slice: list, num_linha: int):
+    texto = '\n'.join(texto_diario_slice[num_linha:num_linha+10])
+    match = re.findall(re_nomes_municipios, texto, re.MULTILINE)
+    if len(match) > 0:
+        return match[0].strip().replace('\n', '')
+    return None
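
As an illustration of what `re_nomes_municipios` is meant to capture, here is a minimal sketch; the excerpt below is invented to mimic the AMA section-header layout and is not taken from a real gazette:

    import re
    from associations.diario_ama import re_nomes_municipios

    # hypothetical section header in the AMA layout
    trecho = (
        "ESTADO DE ALAGOAS\n"
        "PREFEITURA MUNICIPAL DE BOCA DA MATA\n"
        "\n"
        "SECRETARIA MUNICIPAL DE ADMINISTRACAO\n"
    )
    print(re.findall(re_nomes_municipios, trecho, re.MULTILINE))
    # expected: ['BOCA DA MATA']
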
diff --git a/associations/diario_municipal.py b/associations/diario_municipal.py
new file mode 100644
index 0000000..1071590
--- /dev/null
+++ b/associations/diario_municipal.py
@@ -0,0 +1,103 @@
+import json
+import re
+import unicodedata
+from datetime import date, datetime
+from .utils import get_territory_info
+import hashlib
+from io import BytesIO
+
+
+class Municipio:
+
+    def __init__(self, municipio):
+        municipio = municipio.rstrip().replace('\n', '')  # initial cleanup
+        # Some municipality names have a trailing /AL, e.g. Viçosa in the 2022-01-17 gazette, act 8496EC0A. To avoid errors such as "vicosa-/al-secretaria-municipal...", the next line removes it.
+        municipio = re.sub(r"(\/AL.*|GABINETE DO PREFEITO.*|PODER.*|http.*|PORTARIA.*|Extrato.*|ATA DE.*|SECRETARIA.*|Fundo.*|SETOR.*|ERRATA.*|- AL.*|GABINETE.*)", "", municipio)
+        self.id = self._computa_id(municipio)
+        self.nome = municipio
+
+    def _computa_id(self, nome_municipio):
+        ret = nome_municipio.strip().lower().replace(" ", "-")
+        ret = unicodedata.normalize('NFKD', ret)
+        ret = ret.encode('ASCII', 'ignore').decode("utf-8")
+        return ret
+
+    def __hash__(self):
+        return hash(self.id)
+
+    def __eq__(self, other):
+        return self.id == other.id
+
+    def __str__(self):
+        return json.dumps(self.__dict__, indent=2, default=str, ensure_ascii=False)
+
+
+class Diario:
+
+    _mapa_meses = {
+        "Janeiro": 1,
+        "Fevereiro": 2,
+        "Março": 3,
+        "Abril": 4,
+        "Maio": 5,
+        "Junho": 6,
+        "Julho": 7,
+        "Agosto": 8,
+        "Setembro": 9,
+        "Outubro": 10,
+        "Novembro": 11,
+        "Dezembro": 12,
+    }
+
+    def __init__(self, municipio: Municipio, cabecalho: str, texto: str, gazette: dict, territories: list):
+
+
+        self.territory_id, self.territory_name, self.state_code = get_territory_info(
+            name=municipio.nome,
+            state=cabecalho.split(",")[0],
+            territories=territories)
+
+        self.source_text = texto.rstrip()
+        self.date = self._extrai_data_publicacao(cabecalho)
+        self.edition_number = cabecalho.split("Nº")[1].strip()
+        self.is_extra_edition = False
+        self.power = "executive_legislative"
+        self.file_url = gazette["file_url"]
+        self.file_path = gazette["file_path"]
+        self.file_checksum = self.md5sum(BytesIO(self.source_text.encode(encoding='UTF-8')))
+        self.id = gazette["id"]
+        self.scraped_at = datetime.utcnow()
+        self.created_at = self.scraped_at
+        self.file_raw_txt = f"/{self.territory_id}/{self.date}/{self.file_checksum}.txt"
+        self.processed = True
+        self.url = self.file_raw_txt
+
+    def _extrai_data_publicacao(self, ama_header: str):
+        match = re.findall(
+            r".*(\d{2}) de (\w*) de (\d{4})", ama_header, re.MULTILINE)[0]
+        mes = Diario._mapa_meses[match[1]]
+        return date(year=int(match[2]), month=mes, day=int(match[0]))
+
+    def md5sum(self, file):
+        """Calculate the md5 checksum of a file-like object without reading its
+        whole content in memory.
+        >>> from io import BytesIO
+        >>> md5sum(BytesIO(b'file content to hash'))
+        '784406af91dd5a54fbb9c84c2236595a'
+        """
+        m = hashlib.md5()
+        while True:
+            d = file.read(8096)
+            if not d:
+                break
+            m.update(d)
+        return m.hexdigest()
+
+    def __hash__(self):
+        return hash(self.id)
+
+    def __eq__(self, other):
+        return self.id == other.id
+
+    def __str__(self):
+        return str(self.__dict__)
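
A short usage sketch of the `Municipio` cleanup and id slug added above; the input strings are hypothetical examples of the suffixes the `re.sub` call strips:

    from associations.diario_municipal import Municipio

    print(Municipio("Viçosa/AL").id)                          # vicosa
    print(Municipio("Poço das Trincheiras").id)               # poco-das-trincheiras
    print(Municipio("Água Branca GABINETE DO PREFEITO").id)   # agua-branca
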
diff --git a/associations/utils/__init__.py b/associations/utils/__init__.py
new file mode 100644
index 0000000..d032955
--- /dev/null
+++ b/associations/utils/__init__.py
@@ -0,0 +1 @@
+from .get_territory_info import get_territory_info
\ No newline at end of file
diff --git a/associations/utils/get_territory_info.py b/associations/utils/get_territory_info.py
new file mode 100644
index 0000000..11bed5c
--- /dev/null
+++ b/associations/utils/get_territory_info.py
@@ -0,0 +1,27 @@
+
+import unicodedata
+
+
+def get_territory_info(state: str, name: str, territories: list):
+
+    state = state.strip()
+    name = limpar_name(name)
+
+    for territory in territories:
+        territory_name = limpar_name(territory["territory_name"])
+        if territory["state"].lower() == state.lower() and territory_name == name:
+
+            return territory["id"], territory["territory_name"], territory["state_code"]
+
+
+def limpar_name(name: str):
+
+    clean_name = name.replace("'", "")
+    clean_name = unicodedata.normalize("NFD", clean_name)
+    clean_name = clean_name.encode("ascii", "ignore").decode("utf-8")
+    clean_name = clean_name.lower()
+    clean_name = clean_name.strip()
+
+    clean_name = "major isidoro" if clean_name == "major izidoro" else clean_name
+
+    return clean_name
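
A usage sketch for `get_territory_info`; the territories list below is an illustrative stand-in for rows of the territories table (the id value is made up), and it also exercises the izidoro/isidoro special case handled by `limpar_name`:

    from associations.utils import get_territory_info

    # illustrative row only; real rows come from the territories table
    territories = [
        {"id": "2704500", "territory_name": "Major Isidoro", "state_code": "AL", "state": "Alagoas"},
    ]
    print(get_territory_info(state="Alagoas", name="MAJOR IZIDORO", territories=territories))
    # expected: ('2704500', 'Major Isidoro', 'AL')
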
diff --git a/main/__main__.py b/main/__main__.py
index 9a430fe..cc64a08 100644
--- a/main/__main__.py
+++ b/main/__main__.py
@@ -11,7 +11,9 @@
     extract_themed_excerpts_from_gazettes,
     get_gazettes_to_be_processed,
     get_themes,
+    get_territories_gazettes,
     tag_entities_in_excerpts,
+
 )
@@ -43,9 +45,12 @@ def execute_pipeline():
     themes = get_themes()
     gazettes_to_be_processed = get_gazettes_to_be_processed(execution_mode, database)
 
+    territories = get_territories_gazettes(database)
+
     indexed_gazette_ids = extract_text_from_gazettes(
-        gazettes_to_be_processed, database, storage, index, text_extractor
+        gazettes_to_be_processed, database, storage, index, text_extractor, territories
     )
+
     for theme in themes:
         themed_excerpt_ids = extract_themed_excerpts_from_gazettes(
             theme, indexed_gazette_ids, index
diff --git a/tasks/__init__.py b/tasks/__init__.py
index bb16ccd..b738481 100644
--- a/tasks/__init__.py
+++ b/tasks/__init__.py
@@ -10,3 +10,5 @@
     TextExtractorInterface,
 )
 from .list_gazettes_to_be_processed import get_gazettes_to_be_processed
+from .list_territories import get_territories_gazettes
+
diff --git a/tasks/gazette_segmentation.py b/tasks/gazette_segmentation.py
new file mode 100644
index 0000000..4bc546c
--- /dev/null
+++ b/tasks/gazette_segmentation.py
@@ -0,0 +1,7 @@
+from associations import extrair_diarios_municipais
+
+
+def extrair_diarios(pdf_text, gazette, territories):
+
+    diarios = extrair_diarios_municipais(pdf_text, gazette, territories)
+    return diarios
diff --git a/tasks/gazette_text_extraction.py b/tasks/gazette_text_extraction.py
index 846b8a5..664263c 100644
--- a/tasks/gazette_text_extraction.py
+++ b/tasks/gazette_text_extraction.py
@@ -3,6 +3,7 @@
 import os
 from pathlib import Path
 from typing import Dict, Iterable, List
+from .gazette_segmentation import extrair_diarios
 
 from .interfaces import (
     DatabaseInterface,
@@ -18,6 +19,7 @@ def extract_text_from_gazettes(
     storage: StorageInterface,
     index: IndexInterface,
     text_extractor: TextExtractorInterface,
+    territories: Iterable[Dict]
 ) -> List[str]:
     """
     Extracts the text from a list of gazettes
@@ -26,18 +28,35 @@ def extract_text_from_gazettes(
     create_index(index)
 
     ids = []
+    association_ids = []
+
     for gazette in gazettes:
         try:
-            processed_gazette = try_process_gazette_file(
-                gazette, database, storage, index, text_extractor
-            )
+
+            if str(gazette["territory_id"][-4:]).strip() == "0000":
+
+                association_ids = try_process_gazette_association_file(
+                    gazette, database, storage, index, text_extractor, territories
+                )
+            else:
+                processed_gazette = try_process_gazette_file(
+                    gazette, database, storage, index, text_extractor
+                )
+
         except Exception as e:
             logging.warning(
                 f"Could not process gazette: {gazette['file_path']}. Cause: {e}"
            )
         else:
-            ids.append(processed_gazette["file_checksum"])
+
+            if association_ids:
+                ids += [association["file_checksum"] for association in association_ids.copy()]
+                association_ids.clear()
+            else:
+                ids.append(processed_gazette["file_checksum"])
+
+
     return ids
@@ -58,9 +77,43 @@ def try_process_gazette_file(
     index.index_document(gazette, document_id=gazette["file_checksum"])
     delete_gazette_files(gazette_file)
     set_gazette_as_processed(gazette, database)
+
     return gazette
 
 
+def try_process_gazette_association_file(
+    gazette: Dict,
+    database: DatabaseInterface,
+    storage: StorageInterface,
+    index: IndexInterface,
+    text_extractor: TextExtractorInterface,
+    territories: Iterable[Dict]
+) -> List:
+    """
+    Do all the work to extract the content from the gazette files
+    """
+
+    logging.debug(f"Processing gazette {gazette['file_path']}")
+    pdf = download_gazette_file(gazette, storage)
+    get_gazette_text_and_define_url(gazette, pdf, text_extractor)
+    upload_gazette_raw_text(gazette, storage)
+    pdf_txt = try_to_extract_content(pdf, text_extractor)
+    diarios = extrair_diarios(
+        pdf_text=pdf_txt,
+        gazette=gazette,
+        territories=territories
+    )
+
+    for diario in diarios:
+
+        upload_gazette_raw_text_association(diario, storage)
+        index.index_document(diario, document_id=diario["file_checksum"])
+
+    delete_gazette_files(pdf)
+    set_gazette_as_processed(gazette, database)
+    return diarios
+
+
 def create_index(index: IndexInterface) -> None:
     body = {
         "mappings": {
@@ -146,6 +199,14 @@ def upload_gazette_raw_text(gazette: Dict, storage):
     file_endpoint = get_file_endpoint()
     gazette["file_raw_txt"] = f"{file_endpoint}/{file_raw_txt}"
 
+def upload_gazette_raw_text_association(gazette: Dict, storage):
+    """
+    Upload the gazette raw text and define the URL to access the file in the storage
+    """
+    storage.upload_content(gazette["file_raw_txt"], gazette["source_text"])
+    file_endpoint = get_file_endpoint()
+    gazette["file_raw_txt"] = f"{file_endpoint}{gazette['file_raw_txt']}"
+    gazette["url"] = f"{file_endpoint}/{gazette['file_path']}"
 
 def get_gazette_text_and_define_url(
     gazette: Dict, gazette_file: str, text_extractor: TextExtractorInterface
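
The routing above treats a gazette whose `territory_id` ends in "0000" as an association-wide file and sends it through `try_process_gazette_association_file`. A minimal sketch of that check; the ids below are illustrative, assuming the state-wide territory id convention:

    for territory_id in ("2700000", "2704302"):
        is_association = str(territory_id[-4:]).strip() == "0000"
        print(territory_id, "->", "association file" if is_association else "single municipality")
    # 2700000 -> association file
    # 2704302 -> single municipality
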
diff --git a/tasks/list_gazettes_to_be_processed.py b/tasks/list_gazettes_to_be_processed.py
index 1547e7b..e1fb4f1 100644
--- a/tasks/list_gazettes_to_be_processed.py
+++ b/tasks/list_gazettes_to_be_processed.py
@@ -7,6 +7,7 @@ def get_gazettes_to_be_processed(
     execution_mode: str, database: DatabaseInterface
 ) -> Iterable[Dict]:
+
     if execution_mode == "DAILY":
         yield from get_gazettes_extracted_since_yesterday(database)
     elif execution_mode == "ALL":
@@ -123,6 +124,23 @@ def get_unprocessed_gazettes(
         yield format_gazette_data(gazette)
 
 
+def get_territories_gazettes(
+    database: DatabaseInterface,
+) -> Iterable[Dict]:
+
+    command = """
+        SELECT
+            *
+        FROM
+            territories
+        ;
+    """
+
+    territories = [format_territories_data(territory) for territory in database.select(command)]
+
+    return territories
+
+
 def format_gazette_data(data):
     return {
         "id": data[0],
@@ -141,3 +159,12 @@ def format_gazette_data(data):
         "territory_name": data[13],
         "state_code": data[14],
     }
+
+
+def format_territories_data(data):
+    return {
+        "id": data[0],
+        "territory_name": data[1],
+        "state_code": data[2],
+        "state": data[3],
+    }
diff --git a/tasks/list_territories.py b/tasks/list_territories.py
new file mode 100644
index 0000000..3969e30
--- /dev/null
+++ b/tasks/list_territories.py
@@ -0,0 +1,30 @@
+import logging
+from typing import Dict, Iterable
+
+from .interfaces import DatabaseInterface
+
+
+def get_territories_gazettes(
+    database: DatabaseInterface,
+) -> Iterable[Dict]:
+
+    command = """
+        SELECT
+            *
+        FROM
+            territories
+        ;
+    """
+
+    territories = [format_territories_data(territory) for territory in database.select(command)]
+
+    return territories
+
+
+def format_territories_data(data):
+    return {
+        "id": data[0],
+        "territory_name": data[1],
+        "state_code": data[2],
+        "state": data[3],
+    }
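
Because `get_territories_gazettes` uses `SELECT *`, `format_territories_data` assumes the territories table columns come back in the order (id, territory_name, state_code, state). A sketch of that mapping with an illustrative row:

    from tasks.list_territories import format_territories_data

    row = ("2704500", "Major Isidoro", "AL", "Alagoas")  # illustrative values
    print(format_territories_data(row))
    # {'id': '2704500', 'territory_name': 'Major Isidoro', 'state_code': 'AL', 'state': 'Alagoas'}
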