Skip to content

Commit

Permalink
[it] handle example list with more than one italic nodes layout
Browse files — browse the repository at this point in the history
  • Loading branch information
xxyzz committed Dec 17, 2024
1 parent b7ab69f commit d533e56
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 10 deletions.
44 changes: 40 additions & 4 deletions src/wiktextract/extractor/it/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,15 @@ def extract_example_list_item(
text_nodes = []
roman = ""
translation = ""
ref = ""
has_zh_tradsem = False
for index, node in enumerate(list_item.children):
if (
isinstance(node, TemplateNode)
and node.template_name == "zh-tradsem"
):
examples.extend(extract_zh_tradsem(wxr, node))
has_zh_tradsem = True
elif isinstance(node, WikiNode):
match node.kind:
case NodeKind.ITALIC:
Expand All @@ -39,17 +42,38 @@ def extract_example_list_item(
case _ if lang_code in ["zh", "ja"]:
if before_italic:
text_nodes.append(node)
elif (
isinstance(node, str) and lang_code in ["zh", "ja"] and "-" in node
):
elif isinstance(node, str) and "-" in node:
for t_node in list_item.find_child(NodeKind.TEMPLATE):
if t_node.template_name == "Term":
ref = clean_node(wxr, None, t_node).strip("()")
break
translation = clean_node(
wxr,
sense,
wxr.wtp.node_to_wikitext(
[node[node.index("-") + 1 :]]
+ list_item.children[index + 1 :]
+ [
n
for n in list_item.children[index + 1 :]
if not (
isinstance(n, TemplateNode)
and n.template_name == "Term"
)
]
),
)
if not has_zh_tradsem and len(examples) > 1:
examples.clear()
examples.append(
Example(
text=clean_node(
wxr,
None,
list_item.children[:index]
+ [node[: node.index("-")]],
)
)
)
break
elif lang_code in ["zh", "ja"] and len(examples) == 0 and before_italic:
text_nodes.append(node)
Expand All @@ -69,11 +93,23 @@ def extract_example_list_item(
)
examples.append(example)

if not has_zh_tradsem and len(examples) > 1:
examples.clear()
examples.append(
Example(
text=clean_node(
wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
)
)
)

for example in examples:
if roman != "":
example.roman = roman
if translation != "":
example.translation = translation
if ref != "":
example.ref = ref
if example.text != "":
sense.examples.append(example)

Expand Down
12 changes: 6 additions & 6 deletions src/wiktextract/extractor/it/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,12 +91,7 @@ def extract_gloss_list_item(
else:
gloss_nodes.append(t_str)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
if node.sarg.endswith("*"):
for example_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(
wxr, sense, example_list_item, word_entry.lang_code
)
elif (
if (
node.sarg.endswith(":")
and len(sense.examples) > 0
and sense.examples[-1].translation == ""
Expand All @@ -105,6 +100,11 @@ def extract_gloss_list_item(
sense.examples[-1].translation = clean_node(
wxr, sense, tr_list_item.children
)
elif node.sarg.endswith(("*", ":")):
for example_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(
wxr, sense, example_list_item, word_entry.lang_code
)
else:
gloss_nodes.append(node)
gloss_str = clean_node(wxr, sense, gloss_nodes)
Expand Down
76 changes: 76 additions & 0 deletions tests/test_it_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,79 @@ def test_zh_tradsem(self):
}
],
)

def test_double_italic_nodes_with_translation(self):
    """Check an example line made of two italic runs joined by a hyphen.

    The first italic run is expected as the example text and the part
    after the hyphen as its translation, producing a single example.
    """
    # Wikitext lines must stay at column 0: list markers are only
    # recognised at the start of a line.
    wikitext = """== {{-en-}} ==
===Sostantivo===
# acqua
#: ''May I have a glass of '''water'''?'' - ''Posso avere un bicchiere d''''acqua'''''?"""
    expected_senses = [
        {
            "glosses": ["acqua"],
            "examples": [
                {
                    "text": "May I have a glass of water?",
                    "translation": "Posso avere un bicchiere d'acqua?",
                }
            ],
        }
    ]

    self.wxr.wtp.add_page("Template:-en-", 10, "Inglese")
    page_data = parse_page(self.wxr, "water", wikitext)

    self.assertEqual(page_data[0]["senses"], expected_senses)

def test_double_italic_nodes_no_translation(self):
    """Check an example line with two italic runs and no hyphen.

    Both italic runs are expected to be merged into the text of one
    example, with no translation field.
    """
    # Wikitext lines must stay at column 0: list markers are only
    # recognised at the start of a line.
    wikitext = """== {{-it-}} ==
===Sostantivo===
#chi [[esiste]]
#* ''gli '''esseri''' viventi''; ''gli '''esseri''' animati''"""
    expected_senses = [
        {
            "glosses": ["chi esiste"],
            "examples": [
                {"text": "gli esseri viventi; gli esseri animati"}
            ],
        }
    ]

    self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
    page_data = parse_page(self.wxr, "essere", wikitext)

    self.assertEqual(page_data[0]["senses"], expected_senses)

def test_term_ref_template(self):
    """Check that a trailing ``Term`` template becomes the example ref.

    The stub ``Term`` template wraps its argument in parentheses; the
    expected ``ref`` has those parentheses removed and the template text
    is expected to be absent from the translation.
    """
    # Wikitext lines must stay at column 0: list markers are only
    # recognised at the start of a line.
    wikitext = """== {{-la-}} ==
===Verbo===
# [[assolvere]], [[liberare]] dalle [[accuse]], [[giudicare]] [[innocente]]
#* ''et eum omni [[ignominia]] '''liberat''''' - e lo [[assolve]] da ogni [[ignominia]] {{Term|[[:w:Marco Tullio Cicerone|Cicerone]], [[:w:Pro Cluentio|Pro Cluentio]], [[:s:la:Pro_Aulo_Cluentio_Habito|XLVII, 132]]}}"""
    expected_example = {
        "text": "et eum omni ignominia liberat",
        "translation": "e lo assolve da ogni ignominia",
        "ref": "Cicerone, Pro Cluentio, XLVII, 132",
    }
    expected_senses = [
        {
            "glosses": [
                "assolvere, liberare dalle accuse, giudicare innocente"
            ],
            "examples": [expected_example],
        }
    ]

    # Register the page stubs before parsing so both templates expand.
    self.wxr.wtp.add_page("Template:-la-", 10, "Latino")
    self.wxr.wtp.add_page("Template:Term", 10, "({{{1}}})")
    page_data = parse_page(self.wxr, "libero", wikitext)

    self.assertEqual(page_data[0]["senses"], expected_senses)

0 comments on commit d533e56

Please sign in to comment.