Skip to content

Commit

Permalink
Merge pull request #948 from xxyzz/it
Browse files Browse the repository at this point in the history
[it] improve pos and proverb sections code
  • Loading branch information
xxyzz authored Dec 17, 2024
2 parents 98779e3 + d533e56 commit 681a778
Show file tree
Hide file tree
Showing 11 changed files with 333 additions and 15 deletions.
5 changes: 5 additions & 0 deletions src/wiktextract/data/overrides/it.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,10 @@
"body": "===Note / Riferimenti===\n",
"namespace_id": 10,
"need_pre_expand": true
},
"Template:-verb-": {
"body": "{{Sezione voce|Immagine=Open_book_01.svg|Dimensione=30px|Sezione=verbo|Sezione al plurale=verbi|Genere=m|Lingua={{{1|}}}}}{{#invoke:Categorizzazione verbi italiani|main|{{{1|}}}}}",
"namespace_id": 10,
"need_pre_expand": true
}
}
44 changes: 40 additions & 4 deletions src/wiktextract/extractor/it/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,15 @@ def extract_example_list_item(
text_nodes = []
roman = ""
translation = ""
ref = ""
has_zh_tradsem = False
for index, node in enumerate(list_item.children):
if (
isinstance(node, TemplateNode)
and node.template_name == "zh-tradsem"
):
examples.extend(extract_zh_tradsem(wxr, node))
has_zh_tradsem = True
elif isinstance(node, WikiNode):
match node.kind:
case NodeKind.ITALIC:
Expand All @@ -39,17 +42,38 @@ def extract_example_list_item(
case _ if lang_code in ["zh", "ja"]:
if before_italic:
text_nodes.append(node)
elif (
isinstance(node, str) and lang_code in ["zh", "ja"] and "-" in node
):
elif isinstance(node, str) and "-" in node:
for t_node in list_item.find_child(NodeKind.TEMPLATE):
if t_node.template_name == "Term":
ref = clean_node(wxr, None, t_node).strip("()")
break
translation = clean_node(
wxr,
sense,
wxr.wtp.node_to_wikitext(
[node[node.index("-") + 1 :]]
+ list_item.children[index + 1 :]
+ [
n
for n in list_item.children[index + 1 :]
if not (
isinstance(n, TemplateNode)
and n.template_name == "Term"
)
]
),
)
if not has_zh_tradsem and len(examples) > 1:
examples.clear()
examples.append(
Example(
text=clean_node(
wxr,
None,
list_item.children[:index]
+ [node[: node.index("-")]],
)
)
)
break
elif lang_code in ["zh", "ja"] and len(examples) == 0 and before_italic:
text_nodes.append(node)
Expand All @@ -69,11 +93,23 @@ def extract_example_list_item(
)
examples.append(example)

if not has_zh_tradsem and len(examples) > 1:
examples.clear()
examples.append(
Example(
text=clean_node(
wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
)
)
)

for example in examples:
if roman != "":
example.roman = roman
if translation != "":
example.translation = translation
if ref != "":
example.ref = ref
if example.text != "":
sense.examples.append(example)

Expand Down
27 changes: 25 additions & 2 deletions src/wiktextract/extractor/it/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@ def extract_linkage_section(
linkages = []
for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
linkages.extend(extract_linkage_list_item(wxr, list_item))
linkages.extend(
extract_proverb_list_item(wxr, list_item)
if linkage_type == "proverbs"
else extract_linkage_list_item(wxr, list_item)
)

for data in page_data:
if data.lang_code == page_data[-1].lang_code:
Expand Down Expand Up @@ -43,8 +47,27 @@ def extract_linkage_list_item(
elif isinstance(node, str):
for word_str in node.split(","):
word_str = word_str.strip()
if word_str != "":
if word_str.startswith("(") and word_str.endswith(")"):
raw_tags.append(word_str.strip("()"))
elif word_str != "":
linkages.append(Linkage(word=word_str, raw_tags=raw_tags))
raw_tags.clear()

return linkages


def extract_proverb_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Linkage]:
    """Parse one proverb list item.

    The italic node holds the proverb text; any plain-text tail after a
    ":" (plus the remaining child nodes) is its sense/explanation.
    Returns a single-element list, or an empty list when no italic
    proverb text was found.
    """
    entry = Linkage(word="")
    children = list_item.children
    for idx, child in enumerate(children):
        if isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            entry.word = clean_node(wxr, None, child)
            continue
        if isinstance(child, str) and ":" in child:
            # Everything after the colon, together with the rest of the
            # list item, forms the sense text.
            after_colon = child[child.index(":") + 1 :]
            entry.sense = clean_node(
                wxr, None, [after_colon] + children[idx + 1 :]
            )
            break
    if entry.word == "":
        return []
    return [entry]
1 change: 1 addition & 0 deletions src/wiktextract/extractor/it/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class Linkage(ItalianBaseModel):
word: str
tags: list[str] = []
raw_tags: list[str] = []
sense: str = ""


class WordEntry(ItalianBaseModel):
Expand Down
49 changes: 42 additions & 7 deletions src/wiktextract/extractor/it/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,24 @@
from .section_titles import POS_DATA
from .tag_form_line import extract_tag_form_line_nodes

# https://it.wiktionary.org/wiki/Categoria:Template_per_i_verbi
# Template names that introduce a sub-section within a verb POS section
# (participle forms and verb-usage categories such as transitive,
# reflexive, …).  When one of these templates is encountered while a
# sense list has already been collected, a new word entry is started and
# the template's cleaned text is recorded as a raw tag on that entry.
POS_SUBSECTION_TEMPLATES = frozenset(
    [
        "-participio passato-",
        "-participio presente-",
        "Ausiliare",
        "Deponente",
        "Intransitivo",
        "Medio",
        "Passivo",
        "Reciproco",
        "Riflessivo",
        "Transitivo",
    ]
)

def extract_pos_section(

def add_new_pos_data(
wxr: WiktextractContext,
page_data: list[WordEntry],
base_data: WordEntry,
Expand All @@ -23,6 +39,15 @@ def extract_pos_section(
for link_node in level_node.find_child(NodeKind.LINK):
clean_node(wxr, page_data[-1], link_node)


def extract_pos_section(
wxr: WiktextractContext,
page_data: list[WordEntry],
base_data: WordEntry,
level_node: LevelNode,
pos_title: str,
) -> None:
add_new_pos_data(wxr, page_data, base_data, level_node, pos_title)
first_gloss_list_index = len(level_node.children)
for index, node in enumerate(level_node.children):
if (
Expand All @@ -35,6 +60,16 @@ def extract_pos_section(
extract_gloss_list_item(wxr, page_data[-1], list_item)
if index < first_gloss_list_index:
first_gloss_list_index = index
elif (
isinstance(node, TemplateNode)
and node.template_name in POS_SUBSECTION_TEMPLATES
):
if len(page_data[-1].senses) > 0:
add_new_pos_data(
wxr, page_data, base_data, level_node, pos_title
)
raw_tag = clean_node(wxr, page_data[-1], node).strip("= \n")
page_data[-1].raw_tags.append(raw_tag)

extract_tag_form_line_nodes(
wxr, page_data[-1], level_node.children[:first_gloss_list_index]
Expand All @@ -56,12 +91,7 @@ def extract_gloss_list_item(
else:
gloss_nodes.append(t_str)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
if node.sarg.endswith("*"):
for example_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(
wxr, sense, example_list_item, word_entry.lang_code
)
elif (
if (
node.sarg.endswith(":")
and len(sense.examples) > 0
and sense.examples[-1].translation == ""
Expand All @@ -70,6 +100,11 @@ def extract_gloss_list_item(
sense.examples[-1].translation = clean_node(
wxr, sense, tr_list_item.children
)
elif node.sarg.endswith(("*", ":")):
for example_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(
wxr, sense, example_list_item, word_entry.lang_code
)
else:
gloss_nodes.append(node)
gloss_str = clean_node(wxr, sense, gloss_nodes)
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/it/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def extract_translation_section(
page_data: list[WordEntry],
level_node: LevelNode,
) -> None:
# https://it.wiktionary.org/wiki/Aiuto:Traduzioni
sense = ""
translations = []
cats = {}
Expand Down
2 changes: 1 addition & 1 deletion tests/test_it_etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from wiktextract.wxr_context import WiktextractContext


class TestItGloss(TestCase):
class TestItEtymology(TestCase):
maxDiff = None

def setUp(self) -> None:
Expand Down
76 changes: 76 additions & 0 deletions tests/test_it_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,79 @@ def test_zh_tradsem(self):
}
],
)

def test_double_italic_nodes_with_translation(self):
    """Example line with two italic nodes joined by " - ".

    The first italic node is the example text, the plain " - "
    separator marks the start of the Italian translation (the second
    italic node).  Both must land in one Example, not two.
    """
    self.wxr.wtp.add_page("Template:-en-", 10, "Inglese")
    data = parse_page(
        self.wxr,
        "water",
        """== {{-en-}} ==
===Sostantivo===
# acqua
#: ''May I have a glass of '''water'''?'' - ''Posso avere un bicchiere d''''acqua'''''?""",
    )
    self.assertEqual(
        data[0]["senses"],
        [
            {
                "glosses": ["acqua"],
                "examples": [
                    {
                        "text": "May I have a glass of water?",
                        "translation": "Posso avere un bicchiere d'acqua?",
                    }
                ],
            }
        ],
    )

def test_double_italic_nodes_no_translation(self):
    """Two italic nodes separated by ";" but with no "-" translation
    marker: the whole line is a single example text, no translation.
    """
    self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
    data = parse_page(
        self.wxr,
        "essere",
        """== {{-it-}} ==
===Sostantivo===
#chi [[esiste]]
#* ''gli '''esseri''' viventi''; ''gli '''esseri''' animati''""",
    )
    self.assertEqual(
        data[0]["senses"],
        [
            {
                "glosses": ["chi esiste"],
                "examples": [
                    {"text": "gli esseri viventi; gli esseri animati"}
                ],
            }
        ],
    )

def test_term_ref_template(self):
    """A trailing "{{Term|...}}" template on the example line becomes
    the example's ``ref`` field (parentheses stripped) and is excluded
    from the translation text.
    """
    self.wxr.wtp.add_page("Template:-la-", 10, "Latino")
    self.wxr.wtp.add_page("Template:Term", 10, "({{{1}}})")
    data = parse_page(
        self.wxr,
        "libero",
        """== {{-la-}} ==
===Verbo===
# [[assolvere]], [[liberare]] dalle [[accuse]], [[giudicare]] [[innocente]]
#* ''et eum omni [[ignominia]] '''liberat''''' - e lo [[assolve]] da ogni [[ignominia]] {{Term|[[:w:Marco Tullio Cicerone|Cicerone]], [[:w:Pro Cluentio|Pro Cluentio]], [[:s:la:Pro_Aulo_Cluentio_Habito|XLVII, 132]]}}""",
    )
    self.assertEqual(
        data[0]["senses"],
        [
            {
                "glosses": [
                    "assolvere, liberare dalle accuse, giudicare innocente"
                ],
                "examples": [
                    {
                        "text": "et eum omni ignominia liberat",
                        "translation": "e lo assolve da ogni ignominia",
                        "ref": "Cicerone, Pro Cluentio, XLVII, 132",
                    }
                ],
            }
        ],
    )
Loading

0 comments on commit 681a778

Please sign in to comment.