From 7a65b8c51495cccee4bfe876e5d5ce4ff4e7dc9e Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 11 Dec 2024 16:40:54 +0800 Subject: [PATCH] [it] extract example lists --- src/wiktextract/extractor/it/example.py | 24 +++++++++++++ src/wiktextract/extractor/it/models.py | 7 ++++ src/wiktextract/extractor/it/pos.py | 7 +++- tests/test_it_example.py | 45 +++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 src/wiktextract/extractor/it/example.py create mode 100644 tests/test_it_example.py diff --git a/src/wiktextract/extractor/it/example.py b/src/wiktextract/extractor/it/example.py new file mode 100644 index 00000000..6117b854 --- /dev/null +++ b/src/wiktextract/extractor/it/example.py @@ -0,0 +1,24 @@ +from wikitextprocessor import NodeKind, WikiNode + +from ...page import clean_node +from ...wxr_context import WiktextractContext +from .models import Example, Sense + + +def extract_example_list_item( + wxr: WiktextractContext, sense: Sense, list_item: WikiNode +) -> None: + example = Example() + for node in list_item.children: + if isinstance(node, WikiNode): + match node.kind: + case NodeKind.ITALIC: + example.text = clean_node(wxr, sense, node) + case NodeKind.LIST: + for tr_list_item in node.find_child(NodeKind.LIST_ITEM): + example.translation = clean_node( + wxr, sense, tr_list_item.children + ) + + if example.text != "": + sense.examples.append(example) diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py index e0bfacaf..113da01c 100644 --- a/src/wiktextract/extractor/it/models.py +++ b/src/wiktextract/extractor/it/models.py @@ -10,11 +10,18 @@ class ItalianBaseModel(BaseModel): ) +class Example(ItalianBaseModel): + text: str = "" + translation: str = "" + ref: str = "" + + class Sense(ItalianBaseModel): glosses: list[str] = [] tags: list[str] = [] raw_tags: list[str] = [] categories: list[str] = [] + examples: list[Example] = [] class WordEntry(ItalianBaseModel): diff --git a/src/wiktextract/extractor/it/pos.py b/src/wiktextract/extractor/it/pos.py index c77e427f..590cbd56 100644 --- a/src/wiktextract/extractor/it/pos.py +++ b/src/wiktextract/extractor/it/pos.py @@ -2,6 +2,7 @@ from ...page import clean_node from ...wxr_context import WiktextractContext +from .example import extract_example_list_item from .models import Sense, WordEntry from .section_titles import POS_DATA @@ -43,7 +44,11 @@ def extract_gloss_list_item( sense.raw_tags.append(raw_tag) case _: gloss_nodes.append(node) - elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST): + elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: + if node.sarg.endswith("*"): + for example_list_item in node.find_child(NodeKind.LIST_ITEM): + extract_example_list_item(wxr, sense, example_list_item) + else: gloss_nodes.append(node) gloss_str = clean_node(wxr, sense, gloss_nodes) if gloss_str != "": diff --git a/tests/test_it_example.py b/tests/test_it_example.py new file mode 100644 index 00000000..ae66a81e --- /dev/null +++ b/tests/test_it_example.py @@ -0,0 +1,45 @@ +from unittest import TestCase + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.it.page import parse_page +from wiktextract.wxr_context import WiktextractContext + + +class TestItExample(TestCase): + maxDiff = None + + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="it"), + WiktionaryConfig( + dump_file_lang_code="it", capture_language_codes=None + ), + ) + + def test_list_example(self): + self.wxr.wtp.add_page("Template:-br-", 10, "Bretone") + data = parse_page( + self.wxr, + "dog", + """== {{-br-}} == +===Sostantivo=== +# mutazione +#* ''Da '''dog''', e '''dog'''.'' +#*: Il tuo cappello, il suo cappello.""", + ) + self.assertEqual( + data[0]["senses"], + [ + { + "glosses": ["mutazione"], + "examples": [ + { + "text": "Da dog, e dog.", + "translation": "Il tuo cappello, il suo cappello.", + } + ], + } + ], + )