From e5a4a0a13b1b806552d7825593b6197f6b782981 Mon Sep 17 00:00:00 2001
From: Empiriker
Date: Fri, 24 Nov 2023 13:04:45 +0100
Subject: [PATCH] Extract glosses from Spanish Wiktionary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
---
 src/wiktextract/extractor/es/gloss.py  | 70 ++++++++++++++++++++
 src/wiktextract/extractor/es/models.py |  3 +
 src/wiktextract/extractor/es/page.py   | 11 +++-
 tests/test_es_gloss.py                 | 88 ++++++++++++++++++++++++++
 4 files changed, 169 insertions(+), 3 deletions(-)
 create mode 100644 src/wiktextract/extractor/es/gloss.py
 create mode 100644 tests/test_es_gloss.py

diff --git a/src/wiktextract/extractor/es/gloss.py b/src/wiktextract/extractor/es/gloss.py
new file mode 100644
index 000000000..51562d419
--- /dev/null
+++ b/src/wiktextract/extractor/es/gloss.py
@@ -0,0 +1,70 @@
+import re
+from typing import List
+from wiktextract.extractor.es.models import Sense, WordEntry
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
+from wikitextprocessor import WikiNode, NodeKind
+from wikitextprocessor.parser import WikiNodeChildrenList
+
+
+def extract_gloss(
+    wxr: WiktextractContext,
+    page_data: List[WordEntry],
+    list_node: WikiNode,
+) -> None:
+    """Append one Sense per list item of ``list_node`` to ``page_data[-1]``.
+
+    For each LIST_ITEM child, the cleaned text of the item's ``definition``
+    nodes becomes the gloss. The cleaned text of the item's ``children`` is
+    parsed as an optional leading sense number (stored as ``senseid``)
+    followed by tags separated by commas or the word "y".
+
+    LIST nodes found among the definition nodes are kept out of the gloss
+    and reported through ``wxr.wtp.debug``.
+    """
+    for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+        gloss_data = Sense(glosses=[])
+
+        definition: WikiNodeChildrenList = []
+        other: WikiNodeChildrenList = []
+
+        for node in list_item.definition:
+            if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+                other.append(node)
+            else:
+                definition.append(node)
+
+        gloss = clean_node(wxr, gloss_data, definition)
+        gloss_data.glosses.append(gloss)
+
+        gloss_note = clean_node(wxr, gloss_data, list_item.children)
+
+        match = re.match(r"^(\d+)", gloss_note)
+
+        if match:
+            gloss_data.senseid = int(match.group(1))
+            tag_string = gloss_note[match.end() :].strip()
+        else:
+            # No sense number: the whole note is tag text. Keep it out of
+            # ``tags`` itself, which must stay a list for the appends below.
+            tag_string = gloss_note.strip()
+
+        # Split on commas or the standalone word "y"; a bare "y" pattern
+        # would also cut through any tag that merely contains that letter.
+        for tag in re.split(r",|\by\b", tag_string):
+            tag = (
+                tag.strip()
+                .removesuffix(".")
+                .removesuffix("Main")
+                .removeprefix("Main")
+            )
+            if tag:
+                gloss_data.tags.append(tag)
+
+        if other:
+            wxr.wtp.debug(
+                f"Found nodes that are not part of definition: {other}",
+                sortid="extractor/es/gloss/extract_gloss/46",
+            )
+
+        page_data[-1].senses.append(gloss_data)
diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py
index 1250bedb2..4f695b301 100644
--- a/src/wiktextract/extractor/es/models.py
+++ b/src/wiktextract/extractor/es/models.py
@@ -79,6 +79,9 @@ class Sense(LoggingExtraFieldsModel):
     subsenses: list["Sense"] = Field(
         default=[], description="List of subsenses"
     )
+    senseid: Optional[int] = Field(
+        default=None, description="Sense number used in Wiktionary"
+    )
 
 
 class WordEntry(LoggingExtraFieldsModel):
diff --git a/src/wiktextract/extractor/es/page.py b/src/wiktextract/extractor/es/page.py
index 538a94bed..3d7642256 100644
--- a/src/wiktextract/extractor/es/page.py
+++ b/src/wiktextract/extractor/es/page.py
@@ -5,6 +5,7 @@ from wikitextprocessor import NodeKind, WikiNode
 
 
 from wiktextract.datautils import append_base_data
+from wiktextract.extractor.es.gloss import extract_gloss
 from wiktextract.extractor.es.pronunciation import extract_pronunciation
 from wiktextract.extractor.es.models import WordEntry, PydanticLogger
 
@@ -76,9 +77,13 @@ def process_pos_block(
         ):
             # XXX: Extract forms
             pass
-        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
-            # XXX: Extract data
-            pass
+        elif (
+            isinstance(child, WikiNode)
+            and child.kind == NodeKind.LIST
+            and child.sarg == ";"
+        ):
+            extract_gloss(wxr, page_data, child)
+
         else:
             # XXX: Extract data
             pass
diff --git a/tests/test_es_gloss.py b/tests/test_es_gloss.py
new file mode 100644
index 000000000..ed3cbe487
--- /dev/null
+++ b/tests/test_es_gloss.py
@@ -0,0 +1,88 @@
+from typing import List
+import unittest
+
+from wikitextprocessor import Wtp
+from wiktextract.extractor.es.gloss import extract_gloss
+from wiktextract.extractor.es.models import WordEntry
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestESGloss(unittest.TestCase):
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="es"),
+            WiktionaryConfig(dump_file_lang_code="es"),
+        )
+
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()
+
+    def get_default_page_data(self) -> List[WordEntry]:
+        return [WordEntry(word="test", lang_code="es", lang_name="Language")]
+
+    def test_es_extract_glosses(self):
+        # https://es.wiktionary.org/wiki/ayudar
+
+        self.wxr.wtp.add_page("Plantilla:plm", 10, "Contribuir")
+        self.wxr.wtp.start_page("")
+
+        root = self.wxr.wtp.parse(
+            """;1: {{plm|contribuir}} [[esfuerzo]] o [[recurso]]s para la [[realización]] de algo.
+;2: Por antonomasia, [[cooperar]] a que alguno [[salir|salga]] de una [[situación]] [[dificultoso|dificultosa]]"""
+        )
+
+        page_data = self.get_default_page_data()
+
+        extract_gloss(self.wxr, page_data, root.children[0])
+
+        self.assertEqual(
+            page_data[0].model_dump(exclude_defaults=True)["senses"],
+            [
+                {
+                    "glosses": [
+                        "Contribuir esfuerzo o recursos para la realización de algo."
+                    ],
+                    "senseid": 1,
+                },
+                {
+                    "glosses": [
+                        "Por antonomasia, cooperar a que alguno salga de una situación dificultosa"
+                    ],
+                    "senseid": 2,
+                },
+            ],
+        )
+
+    def test_es_extract_gloss_categories(self):
+        # https://es.wiktionary.org/wiki/amor
+        self.wxr.wtp.add_page("Plantilla:plm", 10, "Sentimiento")
+        self.wxr.wtp.add_page(
+            "Plantilla:sentimientos",
+            10,
+            "Humanidades. [[Categoría:ES:Sentimientos]]",
+        )
+        self.wxr.wtp.start_page("")
+
+        root = self.wxr.wtp.parse(
+            ";1 {{sentimientos}}: {{plm|sentimiento}} [[afectivo]] de [[atracción]], [[unión]] y [[afinidad]] que se experimenta hacia una persona, animal o cosa"
+        )
+
+        page_data = self.get_default_page_data()
+
+        extract_gloss(self.wxr, page_data, root.children[0])
+
+        self.assertEqual(
+            page_data[0].model_dump(exclude_defaults=True)["senses"],
+            [
+                {
+                    "glosses": [
+                        "Sentimiento afectivo de atracción, unión y afinidad que se experimenta hacia una persona, animal o cosa"
+                    ],
+                    "senseid": 1,
+                    "tags": ["Humanidades"],
+                    "categories": ["ES:Sentimientos"],
+                }
+            ],
+        )