From 789d6c31a9fee77c0a32c90c599d8e6145e1e3a7 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Fri, 29 Nov 2024 17:03:11 +0800
Subject: [PATCH] [pt] add Portuguese Wiktionary extractor

only extract gloss list text
---
 src/wiktextract/extractor/nl/page.py          |  2 +-
 src/wiktextract/extractor/pt/models.py        | 35 +++++++++
 src/wiktextract/extractor/pt/page.py          | 72 +++++++++++++++++++
 src/wiktextract/extractor/pt/pos.py           | 49 +++++++++++++
 .../extractor/pt/section_titles.py            | 28 +++++++
 5 files changed, 185 insertions(+), 1 deletion(-)
 create mode 100644 src/wiktextract/extractor/pt/models.py
 create mode 100644 src/wiktextract/extractor/pt/page.py
 create mode 100644 src/wiktextract/extractor/pt/pos.py
 create mode 100644 src/wiktextract/extractor/pt/section_titles.py

diff --git a/src/wiktextract/extractor/nl/page.py b/src/wiktextract/extractor/nl/page.py
index 0210da81..f7371c63 100644
--- a/src/wiktextract/extractor/nl/page.py
+++ b/src/wiktextract/extractor/nl/page.py
@@ -35,7 +35,7 @@ def parse_section(
     page_data: list[WordEntry],
     base_data: WordEntry,
     forms_data: WordEntry,
-    level_node: WikiNode,
+    level_node: LevelNode,
 ) -> list[Etymology]:
     # title templates
     # https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py
new file mode 100644
index 00000000..609324d7
--- /dev/null
+++ b/src/wiktextract/extractor/pt/models.py
@@ -0,0 +1,35 @@
+from pydantic import BaseModel, ConfigDict, Field
+
+
+# Shared strict configuration for the Portuguese extractor models: unknown
+# fields are forbidden and values are validated on assignment and on default.
+class PortugueseBaseModel(BaseModel):
+    model_config = ConfigDict(
+        extra="forbid",
+        strict=True,
+        validate_assignment=True,
+        validate_default=True,
+    )
+
+
+# A single sense of an entry; `glosses` holds the cleaned gloss text.
+class Sense(PortugueseBaseModel):
+    glosses: list[str] = []
+    tags: list[str] = []
+    raw_tags: list[str] = []
+    categories: list[str] = []
+    topics: list[str] = []
+
+
+# One extracted entry: a word under one language and one POS section.
+class WordEntry(PortugueseBaseModel):
+    model_config = ConfigDict(title="Portuguese Wiktionary")
+    word: str = Field(description="Word string", min_length=1)
+    lang_code: str = Field(description="Wiktionary language code", min_length=1)
+    lang: str = Field(description="Localized language name", min_length=1)
+    pos: str = Field(description="Part of speech type", min_length=1)
+    pos_title: str = ""
+    senses: list[Sense] = []
+    categories: list[str] = []
+    tags: list[str] = []
+    raw_tags: list[str] = []
diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py
new file mode 100644
index 00000000..e953f7bb
--- /dev/null
+++ b/src/wiktextract/extractor/pt/page.py
@@ -0,0 +1,72 @@
+from typing import Any
+
+from wikitextprocessor.parser import (
+    LEVEL_KIND_FLAGS,
+    LevelNode,
+    NodeKind,
+    WikiNode,
+)
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Sense, WordEntry
+from .pos import extract_pos_section
+from .section_titles import POS_DATA
+
+
+def parse_section(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    base_data: WordEntry,
+    level_node: LevelNode,
+) -> None:
+    """Dispatch one section below a language heading by its title.
+
+    Only titles that are keys of `POS_DATA` are handled; any other section
+    is ignored.
+    """
+    title_text = clean_node(wxr, None, level_node.largs)
+    if title_text in POS_DATA:
+        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
+
+
+def parse_page(
+    wxr: WiktextractContext, page_title: str, page_text: str
+) -> list[dict[str, Any]]:
+    """Parse one page and return its entries as dicts.
+
+    Each level-1 heading starts a language section. The language code is the
+    name of the first template yielded by `find_content` with leading and
+    trailing hyphens stripped, or "unknown" when there is none. Entries that
+    end up without senses get one sense tagged "no-gloss". Each entry is
+    dumped with default values excluded.
+    """
+    # page layout
+    # https://pt.wiktionary.org/wiki/Wikcionário:Livro_de_estilo
+    wxr.wtp.start_page(page_title)
+    tree = wxr.wtp.parse(page_text)
+    page_data: list[WordEntry] = []
+    for level1_node in tree.find_child(NodeKind.LEVEL1):
+        # `WordEntry.lang` requires min_length=1, so an empty heading text
+        # must not reach the model or validation would raise.
+        lang_name = clean_node(wxr, None, level1_node.largs) or "unknown"
+        lang_code = "unknown"
+        for lang_template in level1_node.find_content(NodeKind.TEMPLATE):
+            lang_code = lang_template.template_name.strip("-")
+            break
+        if (
+            wxr.config.capture_language_codes is not None
+            and lang_code not in wxr.config.capture_language_codes
+        ):
+            continue
+        wxr.wtp.start_section(lang_name)
+        base_data = WordEntry(
+            word=wxr.wtp.title,
+            lang_code=lang_code,
+            lang=lang_name,
+            pos="unknown",
+        )
+        for next_level_node in level1_node.find_child(LEVEL_KIND_FLAGS):
+            parse_section(wxr, page_data, base_data, next_level_node)
+
+    for data in page_data:
+        if not data.senses:
+            data.senses.append(Sense(tags=["no-gloss"]))
+    return [m.model_dump(exclude_defaults=True) for m in page_data]
diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py
new file mode 100644
index 00000000..83b8dc33
--- /dev/null
+++ b/src/wiktextract/extractor/pt/pos.py
@@ -0,0 +1,49 @@
+from wikitextprocessor import LevelNode, NodeKind, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Sense, WordEntry
+from .section_titles import POS_DATA
+
+
+def extract_pos_section(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    base_data: WordEntry,
+    level_node: LevelNode,
+    pos_title: str,
+) -> None:
+    """Append a new entry for a part-of-speech section and fill its senses.
+
+    The entry is a deep copy of `base_data`. `pos_title` must be a key of
+    `POS_DATA`, which supplies the `pos` value and optional extra tags.
+    """
+    page_data.append(base_data.model_copy(deep=True))
+    page_data[-1].pos_title = pos_title
+    pos_data = POS_DATA[pos_title]
+    page_data[-1].pos = pos_data["pos"]
+    page_data[-1].tags.extend(pos_data.get("tags", []))
+
+    for list_node in level_node.find_child(NodeKind.LIST):
+        # lists whose marker starts and ends with "#" are read as gloss lists
+        if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
+            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+                extract_gloss_list_item(wxr, page_data[-1], list_item)
+
+
+def extract_gloss_list_item(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    list_item_node: WikiNode,
+) -> None:
+    """Add the text of one gloss list item to `word_entry` as a new sense.
+
+    Child list nodes are left out of the gloss text, and an item whose
+    cleaned text is empty adds no sense.
+    """
+    gloss_nodes = list(list_item_node.invert_find_child(NodeKind.LIST))
+    sense = Sense()
+    gloss_str = clean_node(wxr, sense, gloss_nodes)
+    if gloss_str:
+        sense.glosses.append(gloss_str)
+        word_entry.senses.append(sense)
diff --git a/src/wiktextract/extractor/pt/section_titles.py b/src/wiktextract/extractor/pt/section_titles.py
new file mode 100644
index 00000000..7c23a5c3
--- /dev/null
+++ b/src/wiktextract/extractor/pt/section_titles.py
@@ -0,0 +1,28 @@
+# Maps a part-of-speech section title to the `pos` value and the optional
+# extra `tags` given to entries extracted from that section.
+POS_DATA = {
+    "Artigo": {"pos": "article"},
+    "Adjetivo": {"pos": "adj"},
+    "Advérbio": {"pos": "adv"},
+    "Conjunção": {"pos": "conj"},
+    "Interjeição": {"pos": "intj"},
+    "Numeral": {"pos": "num"},
+    "Partícula": {"pos": "particle"},
+    "Preposição": {"pos": "prep"},
+    "Posposição": {"pos": "postp"},
+    "Pronome": {"pos": "pron"},
+    "Substantivo": {"pos": "noun"},
+    "Verbo": {"pos": "verb"},
+    "Forma verbal": {"pos": "verb", "tags": ["form-of"]},
+    "Locução substantiva": {"pos": "phrase", "tags": ["substantive"]},
+    "Locução adjetiva": {"pos": "phrase", "tags": ["adjectival"]},
+    "Locução adverbial": {"pos": "phrase", "tags": ["adverbial"]},
+    "Locução prepositiva": {"pos": "phrase", "tags": ["prepositional"]},
+    "Expressão": {"pos": "phrase"},
+    "Abreviatura": {"pos": "abbrev", "tags": ["abbreviation"]},
+    "Contração": {"pos": "contraction", "tags": ["contraction"]},
+    "Prefixo": {"pos": "prefix", "tags": ["morpheme"]},
+    "Sufixo": {"pos": "suffix", "tags": ["morpheme"]},
+    "Sigla": {"pos": "abbrev", "tags": ["abbreviation"]},
+    "Símbolo": {"pos": "symbol"},
+}