Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[pt] add Portuguese Wiktionary extractor #929

Merged
merged 1 commit into from
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/nl/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def parse_section(
page_data: list[WordEntry],
base_data: WordEntry,
forms_data: WordEntry,
level_node: WikiNode,
level_node: LevelNode,
) -> list[Etymology]:
# title templates
# https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
Expand Down
31 changes: 31 additions & 0 deletions src/wiktextract/extractor/pt/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from pydantic import BaseModel, ConfigDict, Field


class PortugueseBaseModel(BaseModel):
    # Common base for the models declared below. The shared configuration
    # rejects unknown fields, enables strict mode, and validates both
    # default values and later attribute assignments.
    model_config = ConfigDict(
        validate_default=True,
        validate_assignment=True,
        strict=True,
        extra="forbid",
    )


class Sense(PortugueseBaseModel):
    # One sense of a word entry. Every field defaults to an empty list.

    # Gloss strings extracted from a numbered ("#") list item.
    glosses: list[str] = []
    # Tags for this sense; the page parser uses "no-gloss" for the
    # placeholder sense it adds to entries that have no senses.
    tags: list[str] = []
    # NOTE(review): presumably tags that are not normalized yet - confirm.
    raw_tags: list[str] = []
    categories: list[str] = []
    topics: list[str] = []


class WordEntry(PortugueseBaseModel):
    # One extracted entry: a word in one language under one
    # part-of-speech section.
    model_config = ConfigDict(title="Portuguese Wiktionary")

    # The four fields below are required and must be non-empty strings.
    word: str = Field(description="Word string", min_length=1)
    lang_code: str = Field(
        description="Wiktionary language code", min_length=1
    )
    lang: str = Field(description="Localized language name", min_length=1)
    pos: str = Field(description="Part of speech type", min_length=1)

    # Heading text of the part-of-speech section the entry was built from.
    pos_title: str = ""
    senses: list[Sense] = []
    categories: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
61 changes: 61 additions & 0 deletions src/wiktextract/extractor/pt/page.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from typing import Any

from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
LevelNode,
NodeKind,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch a heading section to the extractor matching its title.

    The heading arguments of ``level_node`` are cleaned to plain text. If
    that text is a key of ``POS_DATA`` the section is passed to
    ``extract_pos_section``; any other section is ignored.
    """
    # Passed to clean_node (presumably to receive category links found in
    # the heading - confirm); the dict is not read afterwards.
    heading_categories = {}
    heading = clean_node(wxr, heading_categories, level_node.largs)
    if heading not in POS_DATA:
        return
    extract_pos_section(wxr, page_data, base_data, level_node, heading)


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Portuguese Wiktionary page into a list of entry dicts.

    Every level-1 heading is treated as a language section. The language
    name is the cleaned heading text and the language code is the name of
    the first template in the heading with surrounding "-" removed. Child
    sections are handed to ``parse_section``, which appends entries to the
    result list. Entries that end up without senses get a placeholder
    sense tagged "no-gloss".

    Returns the entries dumped to dicts with default-valued fields omitted.
    """
    # page layout
    # https://pt.wiktionary.org/wiki/Wikcionário:Livro_de_estilo
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level1_node in tree.find_child(NodeKind.LEVEL1):
        # WordEntry declares `lang` and `lang_code` with min_length=1, so an
        # empty heading or an all-dash template name would fail validation
        # and abort the whole page; fall back to "unknown" instead.
        lang_name = clean_node(wxr, None, level1_node.largs) or "unknown"
        lang_code = "unknown"
        for lang_template in level1_node.find_content(NodeKind.TEMPLATE):
            # Only the first template in the heading is considered.
            lang_code = lang_template.template_name.strip("-") or "unknown"
            break
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for next_level_node in level1_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)

    for data in page_data:
        if not data.senses:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]
38 changes: 38 additions & 0 deletions src/wiktextract/extractor/pt/pos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from wikitextprocessor import LevelNode, NodeKind, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Sense, WordEntry
from .section_titles import POS_DATA


def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Append a new entry for a part-of-speech section and fill its senses.

    A deep copy of ``base_data`` is appended to ``page_data`` and given the
    section title plus the ``pos`` value and optional ``tags`` that
    ``POS_DATA`` lists for that title. Each item of every list whose marker
    starts and ends with "#" is then passed to ``extract_gloss_list_item``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # The second argument makes find_child yield (index, node) pairs; the
    # index is not needed here.
    for _, list_node in level_node.find_child(NodeKind.LIST, True):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, list_item)


def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item_node: WikiNode,
) -> None:
    """Turn one list item into a sense of ``word_entry``.

    The item's children, excluding nested lists, are cleaned to plain
    text. If the text is non-empty it becomes the gloss of a new ``Sense``
    appended to ``word_entry.senses``; otherwise nothing is added.
    """
    # Nested lists are left out of the gloss text.
    non_list_children = list(list_item_node.invert_find_child(NodeKind.LIST))
    sense = Sense()
    gloss_text = clean_node(wxr, sense, non_list_children)
    if not gloss_text:
        return
    sense.glosses.append(gloss_text)
    word_entry.senses.append(sense)
26 changes: 26 additions & 0 deletions src/wiktextract/extractor/pt/section_titles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Maps a part-of-speech section heading to the data stored on the entry:
# a required "pos" value and an optional "tags" list.
POS_DATA = {
    # Single-word parts of speech
    "Artigo": {"pos": "article"},
    "Adjetivo": {"pos": "adj"},
    "Advérbio": {"pos": "adv"},
    "Conjunção": {"pos": "conj"},
    "Interjeição": {"pos": "intj"},
    "Numeral": {"pos": "num"},
    "Partícula": {"pos": "particle"},
    "Preposição": {"pos": "prep"},
    "Posposição": {"pos": "postp"},
    "Pronome": {"pos": "pron"},
    "Substantivo": {"pos": "noun"},
    "Verbo": {"pos": "verb"},
    "Forma verbal": {"pos": "verb", "tags": ["form-of"]},
    # Phrases and expressions
    "Locução substantiva": {"pos": "phrase", "tags": ["substantive"]},
    "Locução adjetiva": {"pos": "phrase", "tags": ["adjectival"]},
    "Locução adverbial": {"pos": "phrase", "tags": ["adverbial"]},
    "Locução prepositiva": {"pos": "phrase", "tags": ["prepositional"]},
    "Expressão": {"pos": "phrase"},
    # Abbreviations, contractions, affixes and symbols
    "Abreviatura": {"pos": "abbrev", "tags": ["abbreviation"]},
    "Contração": {"pos": "contraction", "tags": ["contraction"]},
    "Prefixo": {"pos": "prefix", "tags": ["morpheme"]},
    "Sufixo": {"pos": "suffix", "tags": ["morpheme"]},
    "Sigla": {"pos": "abbrev", "tags": ["abbreviation"]},
    "Símbolo": {"pos": "symbol"},
}