Skip to content

Commit

Permalink
Refatoração do processo de Segmentação (#65)
Browse files Browse the repository at this point in the history
* Torna em task a listagem dos territórios através de consulta no banco
de dados
* Torna uma util a função para obter os dados de um território a partir
do nome
* Adiciona tratamento para casos em que o nome do município vem seguido
de sufixos até então não mapeados
* Lança exceção caso o município não seja encontrado para que seja
capturada e gere um erro no processamento do município
* Refatora a função `_fix_territory_name` que corrige a escrita dos
nomes dos municípios quando existem erros conhecidos
* Refatora a função `extract_text_from_gazettes` na qual é feita a
bifurcação do processamento de diários comuns ou associações
* Adiciona o argumento `territories` à função
`ALAssociacaoMunicipiosSegmenter.get_gazette_segments`. Esta função é
chamada em `try_process_gazette_association_file`
  • Loading branch information
Giulio Carvalho authored Dec 13, 2023
2 parents 646e8bf + 4e51a52 commit d551f0d
Show file tree
Hide file tree
Showing 13 changed files with 402 additions and 424 deletions.
8 changes: 7 additions & 1 deletion main/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@
from storage import create_storage_interface
from index import create_index_interface
from tasks import (
create_gazettes_index,
create_themed_excerpts_index,
embedding_rerank_excerpts,
extract_text_from_gazettes,
extract_themed_excerpts_from_gazettes,
get_gazettes_to_be_processed,
get_themes,
get_territories,
tag_entities_in_excerpts,
)

Expand Down Expand Up @@ -42,12 +45,15 @@ def execute_pipeline():
text_extractor = create_apache_tika_text_extraction()
themes = get_themes()

create_gazettes_index(index)
territories = get_territories(database)
gazettes_to_be_processed = get_gazettes_to_be_processed(execution_mode, database)
indexed_gazette_ids = extract_text_from_gazettes(
gazettes_to_be_processed, database, storage, index, text_extractor
gazettes_to_be_processed, territories, database, storage, index, text_extractor
)

for theme in themes:
create_themed_excerpts_index(theme, index)
themed_excerpt_ids = extract_themed_excerpts_from_gazettes(
theme, indexed_gazette_ids, index
)
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ requests==2.25.0
scikit-learn==1.0.2
sentence-transformers==2.2.0
huggingface-hub==0.10.1 # fix: https://github.com/UKPLab/sentence-transformers/issues/1762
python-slugify[unidecode]==8.0.1
7 changes: 3 additions & 4 deletions segmentation/base/association_segmenter.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from typing import Union, Dict, List
from typing import Any, Dict, Iterable, List, Union
from segmentation.base import GazetteSegment


class AssociationSegmenter:
def __init__(self, association_gazette: str, territory_to_data: Dict):
self.association_gazette = association_gazette
self.territory_to_data = territory_to_data
def __init__(self, territories: Iterable[Dict[str, Any]]):
self.territories = territories

def get_gazette_segments(self, *args, **kwargs) -> List[Union[GazetteSegment, Dict]]:
"""
Expand Down
45 changes: 26 additions & 19 deletions segmentation/factory.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,34 @@
from typing import Any, Dict
from typing import Any, Dict, Iterable

from segmentation.base import AssociationSegmenter
from segmentation import segmenters


def get_segmenter(territory_id: str, association_gazzete: Dict[str, Any], territory_to_data: Dict) -> AssociationSegmenter:
_segmenter_instances = {}


def get_segmenter(territory_id: str, territories: Iterable[Dict[str, Any]]) -> AssociationSegmenter:
"""
Factory method to return a AssociationSegmenter
Example
-------
>>> association_gazette = {
"territory_name": "Associação",
"created_at": datetime.datetime.now(),
"date": datetime.datetime.now(),
"edition_number": 1,
"file_path": 'raw/pdf.pdf',
"file_url": 'localhost:8000/raw/pdf.pdf',
"is_extra_edition": True,
"power": 'executive',
"scraped_at": datetime.datetime.now(),
"state_code": 'AL',
"source_text": texto,
}
>>> territory_id = "9999999"
>>> territories = [
{
"id": "9999999",
"territory_name": "Bairro do Limoeiro",
"state_code": "ZZ",
"state": "Limoeirolândia",
}, {
"id": "0000000",
"territory_name": "Castelo Rá-Tim-Bum",
"state_code": "SP",
"state": "São Paulo",
},
]
>>> from segmentation import get_segmenter
>>> segmenter = get_segmenter(territory_id, association_gazette)
>>> segmenter = get_segmenter(territory_id, territories)
>>> segments = segmenter.get_gazette_segments()
Notes
Expand All @@ -37,6 +41,9 @@ def get_segmenter(territory_id: str, association_gazzete: Dict[str, Any], territ
"2700000": "ALAssociacaoMunicipiosSegmenter",
}

segmenter_class_name = territory_to_segmenter_class[territory_id]
segmenter_class = getattr(segmenters, segmenter_class_name)
return segmenter_class(association_gazzete, territory_to_data)
if territory_id not in _segmenter_instances:
segmenter_class_name = territory_to_segmenter_class[territory_id]
segmenter_class = getattr(segmenters, segmenter_class_name)
_segmenter_instances[territory_id] = segmenter_class(territories)

return _segmenter_instances[territory_id]
211 changes: 68 additions & 143 deletions segmentation/segmenters/al_associacao_municipios.py
Original file line number Diff line number Diff line change
@@ -1,163 +1,88 @@
import re
import unicodedata
import logging

from typing import Any, Dict, List
from segmentation.base import AssociationSegmenter, GazetteSegment
from tasks.utils import get_checksum
from tasks.utils import batched, get_checksum, get_territory_data, get_territory_slug

class ALAssociacaoMunicipiosSegmenter(AssociationSegmenter):
def __init__(self, association_gazzete: Dict[str, Any], territory_to_data: Dict[str, Any]):
super().__init__(association_gazzete, territory_to_data)
# No final do regex, existe uma estrutura condicional que verifica se o próximo match é um \s ou SECRETARIA. Isso foi feito para resolver um problema no diário de 2018-10-02, em que o município de Coité do Nóia não foi percebido pelo código. Para resolver isso, utilizamos a próxima palavra (SECRETARIA) para tratar esse caso.
# Exceções Notáveis
# String: VAMOS, município Poço das Trincheiras, 06/01/2022, ato CCB3A6AB
self.RE_NOMES_MUNICIPIOS = (
r"ESTADO DE ALAGOAS(?:| )\n{1,2}PREFEITURA MUNICIPAL DE (.*\n{0,2}(?!VAMOS).*$)\n\s(?:\s|SECRETARIA)"
)
self.association_source_text = self.association_gazette["source_text"]
self.territory_to_data = self._format_territory_to_data(territory_to_data)

def get_gazette_segments(self) -> List[Dict[str, Any]]:
class ALAssociacaoMunicipiosSegmenter(AssociationSegmenter):
RE_NOMES_MUNICIPIOS = re.compile(
r"""
(ESTADO\sDE\sALAGOAS(?:|\s)\n{1,2}PREFEITURA\sMUNICIPAL\sDE\s) # Marcador de início do cabeçalho de publicação do município
((?!EDUCAÇÃO).*?\n{0,2}(?!VAMOS).*?$) # Nome do município (pode estar presente em até duas linhas). Exceções Notáveis: VAMOS, Poço das Trincheiras, 06/01/2022, ato CCB3A6AB; EDUCAÇÃO, Dois Riachos, 07/12/2023, ato ABCCE576
(\n\s(?:\s|SECRETARIA|Secretaria)) # Marcador de fim do cabeçalho (pula mais de duas linhas). Exceções Notáveis: SECRETARIA, Coité do Nóia, 02/10/2018, ato 12F7DE15; Secretaria, Qubrângulo, 18/07/2023, atos 27FB2D83 a 1FAF9421
""",
re.MULTILINE | re.VERBOSE,
)

def get_gazette_segments(self, gazette: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Returns a list of dicts with the gazettes metadata
"""
territory_to_text_split = self.split_text_by_territory()
gazette_segments = []
for municipio, texto_diario in territory_to_text_split.items():
segmento = self.build_segment(municipio, texto_diario)
gazette_segments.append(segmento.__dict__)
territory_to_text_map = self.split_text_by_territory(gazette["source_text"])
gazette_segments = [
self.build_segment(territory_slug, segment_text, gazette).__dict__
for territory_slug, segment_text in territory_to_text_map.items()
]
return gazette_segments

def split_text_by_territory(self) -> Dict[str, str]:
def split_text_by_territory(self, text: str) -> Dict[str, str]:
"""
Segment a association text by territory
and returns a dict with the territory name and the text segment
"""
texto_diario_slice = self.association_source_text.lstrip().splitlines()

# Processamento
linhas_apagar = [] # slice de linhas a ser apagadas ao final.
ama_header = texto_diario_slice[0]
ama_header_count = 0
codigo_count = 0
codigo_total = self.association_source_text.count("Código Identificador")

for num_linha, linha in enumerate(texto_diario_slice):
# Remoção do cabeçalho AMA, porém temos que manter a primeira aparição.
if linha.startswith(ama_header):
ama_header_count += 1
if ama_header_count > 1:
linhas_apagar.append(num_linha)

# Remoção das linhas finais
if codigo_count == codigo_total:
linhas_apagar.append(num_linha)
elif linha.startswith("Código Identificador"):
codigo_count += 1

# Apagando linhas do slice
texto_diario_slice = [l for n, l in enumerate(
texto_diario_slice) if n not in linhas_apagar]

# Inserindo o cabeçalho no diário de cada município.
territory_to_text_split = {}
nomes_municipios = re.findall(
self.RE_NOMES_MUNICIPIOS, self.association_source_text, re.MULTILINE)
for municipio in nomes_municipios:
nome_municipio_normalizado = self._normalize_territory_name(municipio)
territory_to_text_split[nome_municipio_normalizado] = ama_header + '\n\n'

num_linha = 0
municipio_atual = None
while num_linha < len(texto_diario_slice):
linha = texto_diario_slice[num_linha].rstrip()

if linha.startswith("ESTADO DE ALAGOAS"):
nome = self._extract_territory_name(texto_diario_slice, num_linha)
if nome is not None:
nome_normalizado = self._normalize_territory_name(nome)
municipio_atual = nome_normalizado

# Só começa, quando algum muncípio for encontrado.
if municipio_atual is None:
num_linha += 1
continue

# Conteúdo faz parte de um muncípio
territory_to_text_split[municipio_atual] += linha + '\n'
num_linha += 1

return territory_to_text_split

def build_segment(self, raw_territory_name, segment_text) -> GazetteSegment:
file_checksum = get_checksum(segment_text)
processed = True
source_text = segment_text.rstrip()
state = self.association_gazette.get("state_code")
raw_territory_name = self._fix_territory_name(raw_territory_name)

territory_data = self.territory_to_data.get((self._clear_state_code(state), self._clear_city_name(raw_territory_name)))
ama_header = text.lstrip().split("\n", maxsplit=1)[0].rstrip()
# clean headers
clean_text = "\n".join(re.split(re.escape(ama_header), text))
# clean final lines
clean_text = "\n".join(
re.split(r"(Código Ide ?ntificador:\s*\w+)", clean_text)[:-1]
)

territory_id = territory_data["id"]
territory_name = territory_data["territory_name"]
date = self.association_gazette["date"]
file_raw_txt = f"/{territory_id}/{date}/{file_checksum}.txt"

return GazetteSegment(
# same association values
id=self.association_gazette.get("id"),
created_at=self.association_gazette.get("created_at"),
date=self.association_gazette.get("date"),
edition_number=self.association_gazette.get("edition_number"),
file_path=self.association_gazette.get("file_path"),
file_url=self.association_gazette.get("file_url"),
is_extra_edition=self.association_gazette.get("is_extra_edition"),
power=self.association_gazette.get("power"),
scraped_at=self.association_gazette.get("scraped_at"),
state_code=state,
url=self.association_gazette.get("url"),
raw_segments = re.split(self.RE_NOMES_MUNICIPIOS, clean_text)[1:]

territory_to_text_map = {}
for pattern_batch in batched(raw_segments, 4):
territory_name = pattern_batch[1]
clean_territory_name = self._normalize_territory_name(territory_name)
territory_slug = get_territory_slug(clean_territory_name, "AL")
previous_text_or_header = territory_to_text_map.setdefault(
territory_slug, f"{ama_header}\n"
)
raw_batch_text = "".join(pattern_batch)
new_territory_text = f"{previous_text_or_header}\n{raw_batch_text}"
territory_to_text_map[territory_slug] = new_territory_text

return territory_to_text_map

def build_segment(
self, territory_slug: str, segment_text: str, gazette: Dict
) -> GazetteSegment:
logging.debug(
f"Creating segment for territory \"{territory_slug}\" from {gazette['file_path']} file."
)
territory_data = get_territory_data(territory_slug, self.territories)

return GazetteSegment(**{
**gazette,
# segment specific values
file_checksum=file_checksum,
processed=processed,
territory_name=territory_name,
source_text=source_text,
territory_id=territory_id,
file_raw_txt=file_raw_txt,
"processed": True,
"file_checksum": get_checksum(segment_text),
"source_text": segment_text.strip(),
"territory_name": territory_data["territory_name"],
"territory_id": territory_data["id"],
})

def _normalize_territory_name(self, territory_name: str) -> str:
clean_name = territory_name.strip().replace("\n", "")
# Alguns nomes de municípios possuem um /AL no final, exemplo: Viçosa no diário 2022-01-17, ato 8496EC0A. Para evitar erros como "vicosa-/al-secretaria-municipal...", a linha seguir remove isso.
clean_name = re.sub(
"\s*(\/AL.*|GABINETE DO PREFEITO.*|PODER.*|http.*|PORTARIA.*|Extrato.*|ATA DE.*|SECRETARIA.*|Fundo.*|SETOR.*|ERRATA.*|- AL.*|GABINETE.*|EXTRATO.*|SÚMULA.*|RATIFICAÇÃO.*)",
"",
clean_name,
)

def _normalize_territory_name(self, municipio: str) -> str:
municipio = municipio.rstrip().replace('\n', '') # limpeza inicial
# Alguns nomes de municípios possuem um /AL no final, exemplo: Viçosa no diário 2022-01-17, ato 8496EC0A. Para evitar erros como "vicosa-/al-secretaria-municipal...", a linha seguir remove isso.
municipio = re.sub("(\/AL.*|GABINETE DO PREFEITO.*|PODER.*|http.*|PORTARIA.*|Extrato.*|ATA DE.*|SECRETARIA.*|Fundo.*|SETOR.*|ERRATA.*|- AL.*|GABINETE.*)", "", municipio)
return municipio

def _extract_territory_name(self, texto_diario_slice: List[str], num_linha: int):
texto = '\n'.join(texto_diario_slice[num_linha:num_linha+10])
match = re.findall(self.RE_NOMES_MUNICIPIOS, texto, re.MULTILINE)
if len(match) > 0:
return match[0].strip().replace('\n', '')
return None

def _format_territory_to_data(self, territory_to_data: Dict[str, Any]):
territory_to_data = {
(self._clear_state_code(k[0]), self._clear_city_name(k[1])): v for k, v in territory_to_data.items()
name_to_fixed = {
"MAJOR IZIDORO": "MAJOR ISIDORO",
}
return territory_to_data

def _clear_city_name(self, name: str):
clean_name = name.replace("'", "")
clean_name = unicodedata.normalize("NFD", clean_name)
clean_name = clean_name.encode("ascii", "ignore").decode("utf-8")
clean_name = clean_name.lower()
clean_name = clean_name.strip()
return clean_name

def _clear_state_code(self, code: str):
return code.lower().strip()

def _fix_territory_name(self, name: str):
#clean_name = "major isidoro" if clean_name == "major izidoro" else clean_name
if name == "major izidoro":
return "major isidoro"
return name
return name_to_fixed.get(clean_name, clean_name)
2 changes: 2 additions & 0 deletions tasks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .create_index import create_gazettes_index, create_themed_excerpts_index
from .gazette_excerpts_embedding_reranking import embedding_rerank_excerpts
from .gazette_excerpts_entities_tagging import tag_entities_in_excerpts
from .gazette_text_extraction import extract_text_from_gazettes
Expand All @@ -10,3 +11,4 @@
TextExtractorInterface,
)
from .list_gazettes_to_be_processed import get_gazettes_to_be_processed
from .list_territories import get_territories
Loading

0 comments on commit d551f0d

Please sign in to comment.