Skip to content

Commit

Permalink
[it] extract example lists
Browse files (browse the repository at this point in the history)
  • Loading branch information
xxyzz committed Dec 11, 2024
1 parent e948032 commit 7a65b8c
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 1 deletion.
24 changes: 24 additions & 0 deletions src/wiktextract/extractor/it/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from wikitextprocessor import NodeKind, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Example, Sense


def extract_example_list_item(
    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
) -> None:
    """Build an ``Example`` from one example list item and attach it to *sense*.

    Only ``WikiNode`` children of *list_item* are inspected:

    * an italic node supplies the example text;
    * a nested list supplies the translation, taken from its list items.

    If several italic nodes or several nested list items are present, the
    last one processed wins because each assignment overwrites the previous
    value. The example is appended to ``sense.examples`` only when its text
    is not the empty string.
    """
    parsed = Example()
    for child in list_item.children:
        # Plain (non-WikiNode) children carry nothing we extract here.
        if not isinstance(child, WikiNode):
            continue
        kind = child.kind
        if kind == NodeKind.ITALIC:
            parsed.text = clean_node(wxr, sense, child)
        elif kind == NodeKind.LIST:
            for sub_item in child.find_child(NodeKind.LIST_ITEM):
                parsed.translation = clean_node(wxr, sense, sub_item.children)

    # An example without text is dropped, even if a translation was found.
    if parsed.text == "":
        return
    sense.examples.append(parsed)
7 changes: 7 additions & 0 deletions src/wiktextract/extractor/it/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,18 @@ class ItalianBaseModel(BaseModel):
)


class Example(ItalianBaseModel):
# One usage example attached to a sense; every field defaults to "".
# Example sentence; extract_example_list_item sets it from the italic node
# of an example list item.
text: str = ""
# Translation of the example; set from the items of the list nested under
# the example list item.
translation: str = ""
# NOTE(review): not assigned anywhere in the visible code; presumably a
# source reference for the example. Confirm against later extractor code.
ref: str = ""


class Sense(ItalianBaseModel):
# One sense of a word entry; filled in by the gloss and example extractors.
# NOTE(review): the `= []` defaults presumably rely on the pydantic base
# model copying defaults per instance; confirm ItalianBaseModel's base.
# Gloss strings for this sense.
glosses: list[str] = []
# NOTE(review): presumably normalized counterparts of raw_tags; the code
# that fills this field is not visible here, so confirm.
tags: list[str] = []
# Tag strings appended by extract_gloss_list_item.
raw_tags: list[str] = []
# NOTE(review): presumably collected by clean_node when it is handed this
# sense; confirm against clean_node.
categories: list[str] = []
# Examples appended by extract_example_list_item (only those with text).
examples: list[Example] = []


class WordEntry(ItalianBaseModel):
Expand Down
7 changes: 6 additions & 1 deletion src/wiktextract/extractor/it/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .models import Sense, WordEntry
from .section_titles import POS_DATA

Expand Down Expand Up @@ -43,7 +44,11 @@ def extract_gloss_list_item(
sense.raw_tags.append(raw_tag)
case _:
gloss_nodes.append(node)
elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
if node.sarg.endswith("*"):
for example_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, example_list_item)
else:
gloss_nodes.append(node)
gloss_str = clean_node(wxr, sense, gloss_nodes)
if gloss_str != "":
Expand Down
45 changes: 45 additions & 0 deletions tests/test_it_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.it.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestItExample(TestCase):
    """Example extraction tests for the Italian Wiktionary extractor."""

    # Show full diffs when an assertion on the nested structures fails.
    maxDiff = None

    def setUp(self) -> None:
        # Each test gets its own context configured for the "it" edition.
        wtp = Wtp(lang_code="it")
        config = WiktionaryConfig(
            dump_file_lang_code="it", capture_language_codes=None
        )
        self.wxr = WiktextractContext(wtp, config)

    def test_list_example(self):
        # A "#*" item under a gloss is an example; the "#*:" item nested
        # under it is that example's translation.
        self.wxr.wtp.add_page("Template:-br-", 10, "Bretone")
        wikitext = """== {{-br-}} ==
===Sostantivo===
# mutazione
#* ''Da '''dog''', e '''dog'''.''
#*: Il tuo cappello, il suo cappello."""
        expected_example = {
            "text": "Da dog, e dog.",
            "translation": "Il tuo cappello, il suo cappello.",
        }
        expected_senses = [
            {
                "glosses": ["mutazione"],
                "examples": [expected_example],
            }
        ]
        page_data = parse_page(self.wxr, "dog", wikitext)
        self.assertEqual(page_data[0]["senses"], expected_senses)

0 comments on commit 7a65b8c

Please sign in to comment.