Cria módulo de segmentação para suportar novos segmentadores

- Cria módulo `segmentation` para organizar o código dos segmentadores - O módulo segue o padrão de classe base e classes derivadas para cada segmentador específico - O primeiro segmentador base é o `AssociationSegmenter` para segmentar documentos de associações de municípios - Uma dataclass para segmentos `GazetteSegment` também foi criada - Os segmentadores são instanciados por meio de uma fábrica `get_segmenter`
okfn-brasil · Nov 27, 2023 · 14537e2 · 14537e2
1 parent 9555960
commit 14537e2
Show file tree

Hide file tree

Showing 12 changed files with 276 additions and 182 deletions.
diff --git a/segmentation/__init__.py b/segmentation/__init__.py
@@ -0,0 +1,5 @@
+from .factory import get_segmenter
+
+__all__ = [
+    "get_segmenter",
+]
diff --git a/segmentation/base/__init__.py b/segmentation/base/__init__.py
@@ -0,0 +1,7 @@
+from .gazette_segment import GazetteSegment
+from .association_segmenter import AssociationSegmenter
+
+__all__ = [
+    "GazetteSegment",
+    "AssociationSegmenter",
+]
diff --git a/segmentation/base/association_segmenter.py b/segmentation/base/association_segmenter.py
@@ -0,0 +1,27 @@
+from typing import Union, Dict, List
+from segmentation.base import GazetteSegment
+
+
+class AssociationSegmenter:
+    def __init__(self, association_gazette: str):
+        self.association_gazette = association_gazette
+
+    def get_gazette_segments(self, *args, **kwargs) -> List[Union[GazetteSegment, Dict]]:
+        """
+        Returns a list of GazetteSegment
+        """
+        raise NotImplementedError
+
+    def split_text_by_territory(self, *args, **kwargs) -> Union[Dict[str, str], List[str]]:
+        """
+        Segment a association text by territory
+        and returns a list of text segments
+        """
+        raise NotImplementedError
+
+    def build_segment(self, *args, **kwargs) -> GazetteSegment:
+        """
+        Returns a GazetteSegment
+        """
+        raise NotImplementedError
+
diff --git a/segmentation/base/gazette_segment.py b/segmentation/base/gazette_segment.py
@@ -0,0 +1,26 @@
+from datetime import date, datetime
+from dataclasses import dataclass
+
+
+@dataclass
+class GazetteSegment:
+    """
+    Dataclass to represent a gazette segment of a association
+    related to a city
+    """
+    territory_name: str
+    source_text: str
+    date: date
+    edition_number: str
+    is_extra_edition: bool
+    power: str
+    file_checksum: str
+    scraped_at: datetime
+    created_at: datetime
+    processed: bool
+    file_path: str
+    file_url: str
+    state_code: str
+    territory_id: str
+    file_raw_txt: str
+    url: str
diff --git a/segmentation/factory.py b/segmentation/factory.py
@@ -0,0 +1,42 @@
+from typing import Any
+
+from segmentation.base import AssociationSegmenter
+from segmentation import segmenters
+
+
+def get_segmenter(territory_id: str, association_gazzete: dict[str, Any]) -> AssociationSegmenter:
+    """
+    Factory method to return a AssociationSegmenter
+
+    Example
+    -------
+    >>> association_gazette = {
+        "territory_name": "Associação",
+        "created_at": datetime.datetime.now(),
+        "date": datetime.datetime.now(),
+        "edition_number": 1,
+        "file_path": 'raw/pdf.pdf',
+        "file_url": 'localhost:8000/raw/pdf.pdf',
+        "is_extra_edition": True,
+        "power": 'executive',
+        "scraped_at": datetime.datetime.now(),
+        "state_code": 'AL',
+        "source_text": texto,
+    }
+    >>> from segmentation import get_segmenter
+    >>> segmenter = get_segmenter(territory_id, association_gazette)
+    >>> segments = segmenter.get_gazette_segments()
+
+    Notes
+    -----
+    This method implements a factory method pattern.
+    See: https://github.com/faif/python-patterns/blob/master/patterns/creational/factory.py
+    """
+
+    territory_to_segmenter_class = {
+        "2700000": "ALAssociacaoMunicipiosSegmenter",
+    }
+
+    segmenter_class_name = territory_to_segmenter_class[territory_id]
+    segmenter_class = getattr(segmenters, segmenter_class_name)
+    return segmenter_class(association_gazzete)
diff --git a/segmentation/segmenters/__init__.py b/segmentation/segmenters/__init__.py
@@ -0,0 +1,5 @@
+from .al_associacao_municipios import ALAssociacaoMunicipiosSegmenter
+
+__all__ = [
+    "ALAssociacaoMunicipiosSegmenter",
+]
diff --git a/segmentation/segmenters/al_associacao_municipios.py b/segmentation/segmenters/al_associacao_municipios.py
@@ -0,0 +1,136 @@
+import re
+
+from typing import Any
+from segmentation.base import AssociationSegmenter, GazetteSegment
+from tasks.utils import get_checksum
+
+class ALAssociacaoMunicipiosSegmenter(AssociationSegmenter):
+    def __init__(self, association_gazzete: dict[str, Any]):
+        super().__init__(association_gazzete)
+        # No final do regex, existe uma estrutura condicional que verifica se o próximo match é um \s ou SECRETARIA. Isso foi feito para resolver um problema no diário de 2018-10-02, em que o município de Coité do Nóia não foi percebido pelo código. Para resolver isso, utilizamos a próxima palavra (SECRETARIA) para tratar esse caso.
+        # Exceções Notáveis
+        # String: VAMOS, município Poço das Trincheiras, 06/01/2022, ato CCB3A6AB
+        self.RE_NOMES_MUNICIPIOS = (
+            r"ESTADO DE ALAGOAS(?:| )\n{1,2}PREFEITURA MUNICIPAL DE (.*\n{0,2}(?!VAMOS).*$)\n\s(?:\s|SECRETARIA)"
+        )
+        self.association_source_text = self.association_gazette["source_text"]
+
+    def get_gazette_segments(self) -> list[dict[str, Any]]:
+        """
+        Returns a list of dicts with the gazettes metadata
+        """
+        territory_to_text_split = self.split_text_by_territory()
+        gazette_segments = []
+        for municipio, texto_diario in territory_to_text_split.items():
+            segmento = self.build_segment(municipio, texto_diario)
+            gazette_segments.append(segmento.__dict__)
+        return gazette_segments
+
+    def split_text_by_territory(self) -> dict[str, str]:
+        """
+        Segment a association text by territory
+        and returns a dict with the territory name and the text segment
+        """
+        texto_diario_slice = self.association_source_text.lstrip().splitlines()
+
+        # Processamento
+        linhas_apagar = []  # slice de linhas a ser apagadas ao final.
+        ama_header = texto_diario_slice[0]
+        ama_header_count = 0
+        codigo_count = 0
+        codigo_total = self.association_source_text.count("Código Identificador")
+
+        for num_linha, linha in enumerate(texto_diario_slice):
+            # Remoção do cabeçalho AMA, porém temos que manter a primeira aparição.
+            if linha.startswith(ama_header):
+                ama_header_count += 1
+                if ama_header_count > 1:
+                    linhas_apagar.append(num_linha)
+
+            # Remoção das linhas finais
+            if codigo_count == codigo_total:
+                linhas_apagar.append(num_linha)
+            elif linha.startswith("Código Identificador"):
+                codigo_count += 1
+
+        # Apagando linhas do slice
+        texto_diario_slice = [l for n, l in enumerate(
+            texto_diario_slice) if n not in linhas_apagar]
+
+        # Inserindo o cabeçalho no diário de cada município.
+        territory_to_text_split = {}
+        nomes_municipios = re.findall(
+            self.RE_NOMES_MUNICIPIOS, self.association_source_text, re.MULTILINE)
+        for municipio in nomes_municipios:
+            nome_municipio_normalizado = self._normalize_territory_name(municipio)
+            territory_to_text_split[nome_municipio_normalizado] = ama_header + '\n\n'
+
+        num_linha = 0
+        municipio_atual = None
+        while num_linha < len(texto_diario_slice):
+            linha = texto_diario_slice[num_linha].rstrip()
+
+            if linha.startswith("ESTADO DE ALAGOAS"):
+                nome = self._extract_territory_name(texto_diario_slice, num_linha)
+                if nome is not None:
+                    nome_normalizado = self._normalize_territory_name(nome)
+                    municipio_atual = nome_normalizado
+
+            # Só começa, quando algum muncípio for encontrado.
+            if municipio_atual is None:
+                num_linha += 1
+                continue
+
+            # Conteúdo faz parte de um muncípio
+            territory_to_text_split[municipio_atual] += linha + '\n'
+            num_linha += 1
+
+        return territory_to_text_split
+
+    def build_segment(self, territory, segment_text) -> GazetteSegment:
+        file_checksum = get_checksum(segment_text)
+        processed = True
+        territory_name = territory
+        source_text = segment_text.rstrip()
+
+        # TODO: get territory data and replace the None values
+        territory_id = None
+        # file_raw_txt = f"/{territory_id}/{date}/{file_checksum}.txt"
+        file_raw_txt = None
+        # url = file_raw_txt
+        url = None
+
+        return GazetteSegment(
+            # same association values
+            created_at=self.association_gazette.get("created_at"),
+            date=self.association_gazette.get("date"),
+            edition_number=self.association_gazette.get("edition_number"),
+            file_path=self.association_gazette.get("file_path"),
+            file_url=self.association_gazette.get("file_url"),
+            is_extra_edition=self.association_gazette.get("is_extra_edition"),
+            power=self.association_gazette.get("power"),
+            scraped_at=self.association_gazette.get("scraped_at"),
+            state_code=self.association_gazette.get("state_code"),
+            url=self.association_gazette.get("url"),
+
+            # segment specific values
+            file_checksum=file_checksum,
+            processed=processed,
+            territory_name=territory_name,
+            source_text=source_text,
+            territory_id=territory_id,
+            file_raw_txt=file_raw_txt,
+        )
+
+    def _normalize_territory_name(self, municipio: str) -> str:
+        municipio = municipio.rstrip().replace('\n', '')  # limpeza inicial
+        # Alguns nomes de municípios possuem um /AL no final, exemplo: Viçosa no diário 2022-01-17, ato 8496EC0A. Para evitar erros como "vicosa-/al-secretaria-municipal...", a linha seguir remove isso. 
+        municipio = re.sub("(\/AL.*|GABINETE DO PREFEITO.*|PODER.*|http.*|PORTARIA.*|Extrato.*|ATA DE.*|SECRETARIA.*|Fundo.*|SETOR.*|ERRATA.*|- AL.*|GABINETE.*)", "", municipio)
+        return municipio
+
+    def _extract_territory_name(self, texto_diario_slice: list[str], num_linha: int):
+        texto = '\n'.join(texto_diario_slice[num_linha:num_linha+10])
+        match = re.findall(self.RE_NOMES_MUNICIPIOS, texto, re.MULTILINE)
+        if len(match) > 0:
+            return match[0].strip().replace('\n', '')
+        return None
diff --git a/tasks/__init__.py b/tasks/__init__.py
@@ -11,4 +11,4 @@
 )
 from .list_gazettes_to_be_processed import get_gazettes_to_be_processed
 from .list_territories import get_territories_gazettes
-
+from .get_territorie_info import get_territorie_info
diff --git a/tasks/diario_ama.py b/tasks/diario_ama.py