-
-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Cria módulo de segmentação para suportar novos segmentadores
- Cria módulo `segmentation` para organizar o código dos segmentadores
- O módulo segue o padrão de classe base e classes derivadas para cada segmentador específico
- O primeiro segmentador base é o `AssociationSegmenter` para segmentar documentos de associações de municípios
- Uma dataclass para segmentos `GazetteSegment` também foi criada
- Os segmentadores são instanciados por meio de uma fábrica `get_segmenter`
- Loading branch information
1 parent
9555960
commit 14537e2
Showing
12 changed files
with
276 additions
and
182 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from .factory import get_segmenter | ||
|
||
__all__ = [ | ||
"get_segmenter", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from .gazette_segment import GazetteSegment | ||
from .association_segmenter import AssociationSegmenter | ||
|
||
__all__ = [ | ||
"GazetteSegment", | ||
"AssociationSegmenter", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from typing import Union, Dict, List | ||
from segmentation.base import GazetteSegment | ||
|
||
|
||
class AssociationSegmenter:
    """
    Base class for segmenters that split an association gazette into
    per-territory segments.

    Every method other than ``__init__`` raises ``NotImplementedError``
    here; concrete segmenters are expected to override them.
    """

    def __init__(self, association_gazette: Dict):
        # The gazette is a mapping (metadata plus the extracted text), not a
        # plain string: the factory passes a dict and the concrete segmenter
        # reads ``association_gazette["source_text"]``.
        self.association_gazette = association_gazette

    def get_gazette_segments(self, *args, **kwargs) -> List[Union["GazetteSegment", Dict]]:
        """
        Returns a list of GazetteSegment (or their dict representation)
        """
        raise NotImplementedError

    def split_text_by_territory(self, *args, **kwargs) -> Union[Dict[str, str], List[str]]:
        """
        Segment an association text by territory
        and returns the text segments, either as a list or as a
        mapping keyed by territory name
        """
        raise NotImplementedError

    def build_segment(self, *args, **kwargs) -> "GazetteSegment":
        """
        Returns a GazetteSegment
        """
        raise NotImplementedError
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
from dataclasses import dataclass
from datetime import date, datetime
from typing import Optional
|
||
|
||
@dataclass
class GazetteSegment:
    """
    Dataclass to represent a gazette segment of an association
    related to a city
    """
    # Name of the territory (municipality) this segment belongs to.
    territory_name: str
    # Text of the segment.
    source_text: str
    date: date
    edition_number: str
    is_extra_edition: bool
    power: str
    # Checksum associated with the segment.
    file_checksum: str
    scraped_at: datetime
    created_at: datetime
    processed: bool
    file_path: str
    file_url: str
    state_code: str
    # The fields below may legitimately be None: the Alagoas segmenter does
    # not resolve territory data yet and passes None for them.
    territory_id: Optional[str]
    file_raw_txt: Optional[str]
    url: Optional[str]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from typing import Any | ||
|
||
from segmentation.base import AssociationSegmenter | ||
from segmentation import segmenters | ||
|
||
|
||
def get_segmenter(territory_id: str, association_gazzete: dict[str, Any]) -> "AssociationSegmenter":
    """
    Factory method to return a AssociationSegmenter

    Parameters
    ----------
    territory_id
        Identifier used to select the segmenter class.
    association_gazzete
        Gazette data handed unchanged to the segmenter constructor.

    Raises
    ------
    KeyError
        If no segmenter is registered for ``territory_id``.

    Example
    -------
    >>> import datetime
    >>> association_gazette = {
    ...     "territory_name": "Associação",
    ...     "created_at": datetime.datetime.now(),
    ...     "date": datetime.datetime.now(),
    ...     "edition_number": 1,
    ...     "file_path": 'raw/pdf.pdf',
    ...     "file_url": 'localhost:8000/raw/pdf.pdf',
    ...     "is_extra_edition": True,
    ...     "power": 'executive',
    ...     "scraped_at": datetime.datetime.now(),
    ...     "state_code": 'AL',
    ...     "source_text": texto,  # text extracted from the gazette file
    ... }
    >>> from segmentation import get_segmenter
    >>> segmenter = get_segmenter("2700000", association_gazette)
    >>> segments = segmenter.get_gazette_segments()

    Notes
    -----
    This method implements a factory method pattern.
    See: https://github.com/faif/python-patterns/blob/master/patterns/creational/factory.py
    """

    # Maps a territory id to the name of the class exported by the
    # ``segmenters`` package that knows how to split that gazette.
    territory_to_segmenter_class = {
        "2700000": "ALAssociacaoMunicipiosSegmenter",
    }

    try:
        segmenter_class_name = territory_to_segmenter_class[territory_id]
    except KeyError:
        # Same exception type as a plain dict lookup, but with a message
        # that says what was being looked up.
        raise KeyError(
            f"No segmenter registered for territory_id {territory_id!r}"
        ) from None

    segmenter_class = getattr(segmenters, segmenter_class_name)
    return segmenter_class(association_gazzete)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from .al_associacao_municipios import ALAssociacaoMunicipiosSegmenter | ||
|
||
__all__ = [ | ||
"ALAssociacaoMunicipiosSegmenter", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
import re | ||
|
||
from typing import Any | ||
from segmentation.base import AssociationSegmenter, GazetteSegment | ||
from tasks.utils import get_checksum | ||
|
||
class ALAssociacaoMunicipiosSegmenter(AssociationSegmenter):
    """
    Segmenter that splits the text of an Alagoas association gazette into
    one text per municipality.

    Municipality sections are located through the
    "ESTADO DE ALAGOAS / PREFEITURA MUNICIPAL DE <name>" heading.
    """

    def __init__(self, association_gazzete: dict[str, Any]):
        super().__init__(association_gazzete)
        # The end of the regex has an alternation that checks whether the
        # next match is a \s or SECRETARIA. This was done to fix a problem in
        # the 2018-10-02 gazette, where the municipality of Coité do Nóia was
        # not detected by the code; the following word (SECRETARIA) is used
        # to handle that case.
        # Notable exceptions
        # String: VAMOS, municipality Poço das Trincheiras, 06/01/2022, act CCB3A6AB
        self.RE_NOMES_MUNICIPIOS = (
            r"ESTADO DE ALAGOAS(?:| )\n{1,2}PREFEITURA MUNICIPAL DE (.*\n{0,2}(?!VAMOS).*$)\n\s(?:\s|SECRETARIA)"
        )
        self.association_source_text = self.association_gazette["source_text"]

    def get_gazette_segments(self) -> list[dict[str, Any]]:
        """
        Returns a list of dicts with the gazettes metadata

        Each dict is the ``__dict__`` of the GazetteSegment built for one
        territory returned by ``split_text_by_territory``.
        """
        territory_to_text_split = self.split_text_by_territory()
        return [
            self.build_segment(municipio, texto_diario).__dict__
            for municipio, texto_diario in territory_to_text_split.items()
        ]

    def split_text_by_territory(self) -> dict[str, str]:
        """
        Segment a association text by territory
        and returns a dict with the territory name and the text segment

        Every segment starts with the first line of the gazette (the AMA
        header) followed by a blank line. An empty or whitespace-only
        source text yields an empty dict.
        """
        texto_diario_slice = self.association_source_text.lstrip().splitlines()
        if not texto_diario_slice:
            # No header line and nothing to split.
            return {}

        ama_header = texto_diario_slice[0]
        ama_header_count = 0
        codigo_count = 0
        codigo_total = self.association_source_text.count("Código Identificador")
        linhas_apagar = set()  # indexes of the lines dropped before splitting

        for num_linha, linha in enumerate(texto_diario_slice):
            # Remove the repeated AMA header, keeping its first occurrence.
            if linha.startswith(ama_header):
                ama_header_count += 1
                if ama_header_count > 1:
                    linhas_apagar.add(num_linha)

            # Remove the trailing lines: everything after the last line that
            # starts with "Código Identificador".
            # NOTE(review): when the text has no "Código Identificador" at
            # all, both counters are 0 and every line is dropped — confirm
            # this is intended.
            if codigo_count == codigo_total:
                linhas_apagar.add(num_linha)
            elif linha.startswith("Código Identificador"):
                codigo_count += 1

        texto_diario_slice = [
            linha
            for num_linha, linha in enumerate(texto_diario_slice)
            if num_linha not in linhas_apagar
        ]

        # Every municipality found in the whole text gets a segment that
        # starts with the header, even if no content line ends up in it.
        cabecalho = ama_header + '\n\n'
        partes_por_municipio = {}
        nomes_municipios = re.findall(
            self.RE_NOMES_MUNICIPIOS, self.association_source_text, re.MULTILINE)
        for municipio in nomes_municipios:
            nome_municipio_normalizado = self._normalize_territory_name(municipio)
            partes_por_municipio[nome_municipio_normalizado] = [cabecalho]

        municipio_atual = None
        for num_linha, linha_bruta in enumerate(texto_diario_slice):
            linha = linha_bruta.rstrip()

            if linha.startswith("ESTADO DE ALAGOAS"):
                nome = self._extract_territory_name(texto_diario_slice, num_linha)
                if nome is not None:
                    municipio_atual = self._normalize_territory_name(nome)

            # Content is only collected once a municipality has been found.
            if municipio_atual is None:
                continue

            # setdefault: the name extracted from the cleaned lines may not
            # have been registered by the whole-text scan above; start a new
            # segment for it instead of failing with KeyError.
            partes_por_municipio.setdefault(municipio_atual, [cabecalho]).append(linha + '\n')

        return {
            municipio: "".join(partes)
            for municipio, partes in partes_por_municipio.items()
        }

    def build_segment(self, territory, segment_text) -> GazetteSegment:
        """
        Returns a GazetteSegment for ``territory``

        Gazette-level values are copied from the association gazette
        (missing keys become None); the checksum is computed from
        ``segment_text`` as received, while ``source_text`` stores it with
        trailing whitespace removed.
        """
        file_checksum = get_checksum(segment_text)
        gazette = self.association_gazette

        return GazetteSegment(
            # same association values
            created_at=gazette.get("created_at"),
            date=gazette.get("date"),
            edition_number=gazette.get("edition_number"),
            file_path=gazette.get("file_path"),
            file_url=gazette.get("file_url"),
            is_extra_edition=gazette.get("is_extra_edition"),
            power=gazette.get("power"),
            scraped_at=gazette.get("scraped_at"),
            state_code=gazette.get("state_code"),
            url=gazette.get("url"),

            # segment specific values
            file_checksum=file_checksum,
            processed=True,
            territory_name=territory,
            source_text=segment_text.rstrip(),
            # TODO: get territory data and replace the None values. The
            # planned raw text path is "/{territory_id}/{date}/{file_checksum}.txt",
            # and the segment url is meant to point to it.
            territory_id=None,
            file_raw_txt=None,
        )

    def _normalize_territory_name(self, municipio: str) -> str:
        """
        Returns the municipality name without line breaks and without the
        known suffixes that follow it in the heading
        """
        municipio = municipio.rstrip().replace('\n', '')  # initial cleanup
        # Some municipality names carry a /AL at the end, e.g. Viçosa in the
        # 2022-01-17 gazette, act 8496EC0A. To avoid results such as
        # "vicosa-/al-secretaria-municipal...", the line below removes it.
        # Raw string: "\/" is not a valid escape in a normal string literal.
        municipio = re.sub(r"(\/AL.*|GABINETE DO PREFEITO.*|PODER.*|http.*|PORTARIA.*|Extrato.*|ATA DE.*|SECRETARIA.*|Fundo.*|SETOR.*|ERRATA.*|- AL.*|GABINETE.*)", "", municipio)
        return municipio

    def _extract_territory_name(self, texto_diario_slice: list[str], num_linha: int):
        """
        Returns the municipality name found in the 10 lines starting at
        ``num_linha``, or None when the heading pattern does not match
        """
        texto = '\n'.join(texto_diario_slice[num_linha:num_linha+10])
        match = re.search(self.RE_NOMES_MUNICIPIOS, texto, re.MULTILINE)
        if match is None:
            return None
        return match.group(1).strip().replace('\n', '')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.