Skip to content

Commit

Permalink
[pt] add Portuguese Wiktionary extractor
Browse files Browse the repository at this point in the history
only extract gloss list text
  • Loading branch information
xxyzz committed Nov 29, 2024
1 parent 5d8b946 commit 789d6c3
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/nl/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def parse_section(
page_data: list[WordEntry],
base_data: WordEntry,
forms_data: WordEntry,
level_node: WikiNode,
level_node: LevelNode,
) -> list[Etymology]:
# title templates
# https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
Expand Down
31 changes: 31 additions & 0 deletions src/wiktextract/extractor/pt/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from pydantic import BaseModel, ConfigDict, Field


class PortugueseBaseModel(BaseModel):
    """Shared base for all Portuguese-extractor pydantic models.

    Enforces strict validation: unknown fields are rejected
    (``extra="forbid"``) and values are validated on assignment and
    for defaults, not just at construction time.
    """

    model_config = ConfigDict(
        extra="forbid",
        strict=True,
        validate_assignment=True,
        validate_default=True,
    )


class Sense(PortugueseBaseModel):
    """A single sense (definition) of a word entry."""

    glosses: list[str] = []  # cleaned gloss text, filled by the pos extractor
    tags: list[str] = []
    raw_tags: list[str] = []  # presumably tags not yet normalized — TODO confirm
    categories: list[str] = []  # collected by clean_node while cleaning gloss nodes
    topics: list[str] = []


class WordEntry(PortugueseBaseModel):
    """One word entry: a word in one language with one part of speech."""

    model_config = ConfigDict(title="Portuguese Wiktionary")

    word: str = Field(description="Word string", min_length=1)
    lang_code: str = Field(description="Wiktionary language code", min_length=1)
    lang: str = Field(description="Localized language name", min_length=1)
    pos: str = Field(description="Part of speech type", min_length=1)
    pos_title: str = ""  # original Portuguese section title, e.g. "Substantivo"
    senses: list[Sense] = []
    categories: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
61 changes: 61 additions & 0 deletions src/wiktextract/extractor/pt/page.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from typing import Any

from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
LevelNode,
NodeKind,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one page section to the matching extractor.

    Only sections whose cleaned title appears in ``POS_DATA`` are
    handled; all other sections are ignored for now.
    """
    categories: dict = {}
    section_title = clean_node(wxr, categories, level_node.largs)
    # NOTE(review): categories collected from the title templates are
    # currently discarded — confirm whether they should be attached
    # to the word entry.
    if section_title not in POS_DATA:
        return
    extract_pos_section(wxr, page_data, base_data, level_node, section_title)


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Portuguese Wiktionary page into word-entry dicts.

    Page layout:
    https://pt.wiktionary.org/wiki/Wikcionário:Livro_de_estilo
    """
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for lang_level in root.find_child(NodeKind.LEVEL1):
        lang_name = clean_node(wxr, None, lang_level.largs)
        # The level-1 heading holds a language template; its name with
        # surrounding hyphens stripped is used as the language code.
        lang_code = next(
            (
                template.template_name.strip("-")
                for template in lang_level.find_content(NodeKind.TEMPLATE)
            ),
            "unknown",
        )
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for section_node in lang_level.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, section_node)

    # Entries that gained no sense at all get an explicit "no-gloss" marker.
    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
38 changes: 38 additions & 0 deletions src/wiktextract/extractor/pt/pos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from wikitextprocessor import LevelNode, NodeKind, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Sense, WordEntry
from .section_titles import POS_DATA


def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Create a word entry for a part-of-speech section and extract
    its gloss list.

    A deep copy of ``base_data`` is appended to ``page_data``, then
    filled with the POS code and tags looked up in ``POS_DATA`` under
    ``pos_title`` (the caller guarantees the key exists).
    """
    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos_title = pos_title
    pos_data = POS_DATA[pos_title]
    page_data[-1].pos = pos_data["pos"]
    page_data[-1].tags.extend(pos_data.get("tags", []))

    # find_child(..., True) yields (index, node) pairs; the index is
    # unused, so bind it to "_" instead of a dead local.
    for _, list_node in level_node.find_child(NodeKind.LIST, True):
        # Only "#"-style ordered lists hold glosses; skip "*", ":", etc.
        if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, page_data[-1], list_item)


def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item_node: WikiNode,
) -> None:
    """Turn one gloss list item into a ``Sense`` on *word_entry*.

    Child lists (sub-items) are excluded from the gloss text; any
    categories found while cleaning are collected onto the new sense
    by ``clean_node``. Items that clean to an empty string produce no
    sense.
    """
    new_sense = Sense()
    text_nodes = list(list_item_node.invert_find_child(NodeKind.LIST))
    gloss_text = clean_node(wxr, new_sense, text_nodes)
    if gloss_text != "":
        new_sense.glosses.append(gloss_text)
        word_entry.senses.append(new_sense)
26 changes: 26 additions & 0 deletions src/wiktextract/extractor/pt/section_titles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Maps Portuguese Wiktionary part-of-speech section titles to the
# extractor's POS code plus optional extra tags for the word entry.
POS_DATA = {
    "Artigo": {"pos": "article"},
    "Adjetivo": {"pos": "adj"},
    "Advérbio": {"pos": "adv"},
    "Conjunção": {"pos": "conj"},
    "Interjeição": {"pos": "intj"},
    "Numeral": {"pos": "num"},
    "Partícula": {"pos": "particle"},
    "Preposição": {"pos": "prep"},
    "Posposição": {"pos": "postp"},
    "Pronome": {"pos": "pron"},
    "Substantivo": {"pos": "noun"},
    "Verbo": {"pos": "verb"},
    "Forma verbal": {"pos": "verb", "tags": ["form-of"]},
    "Locução substantiva": {"pos": "phrase", "tags": ["substantive"]},
    "Locução adjetiva": {"pos": "phrase", "tags": ["adjectival"]},
    "Locução adverbial": {"pos": "phrase", "tags": ["adverbial"]},
    "Locução prepositiva": {"pos": "phrase", "tags": ["prepositional"]},
    "Expressão": {"pos": "phrase"},
    "Abreviatura": {"pos": "abbrev", "tags": ["abbreviation"]},
    "Contração": {"pos": "contraction", "tags": ["contraction"]},
    "Prefixo": {"pos": "prefix", "tags": ["morpheme"]},
    "Sufixo": {"pos": "suffix", "tags": ["morpheme"]},
    "Sigla": {"pos": "abbrev", "tags": ["abbreviation"]},
    "Símbolo": {"pos": "symbol"},
}

0 comments on commit 789d6c3

Please sign in to comment.