Skip to content

Commit

Permalink
Merge pull request #948 from xxyzz/it
Browse files Browse the repository at this point in the history
[it] improve pos and proverb sections code
  • Loading branch information
xxyzz authored Dec 17, 2024
2 parents 98779e3 + d533e56 commit 681a778
Show file tree
Hide file tree
Showing 11 changed files with 333 additions and 15 deletions.
5 changes: 5 additions & 0 deletions src/wiktextract/data/overrides/it.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,10 @@
"body": "===Note / Riferimenti===\n",
"namespace_id": 10,
"need_pre_expand": true
},
"Template:-verb-": {
"body": "{{Sezione voce|Immagine=Open_book_01.svg|Dimensione=30px|Sezione=verbo|Sezione al plurale=verbi|Genere=m|Lingua={{{1|}}}}}{{#invoke:Categorizzazione verbi italiani|main|{{{1|}}}}}",
"namespace_id": 10,
"need_pre_expand": true
}
}
44 changes: 40 additions & 4 deletions src/wiktextract/extractor/it/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,15 @@ def extract_example_list_item(
text_nodes = []
roman = ""
translation = ""
ref = ""
has_zh_tradsem = False
for index, node in enumerate(list_item.children):
if (
isinstance(node, TemplateNode)
and node.template_name == "zh-tradsem"
):
examples.extend(extract_zh_tradsem(wxr, node))
has_zh_tradsem = True
elif isinstance(node, WikiNode):
match node.kind:
case NodeKind.ITALIC:
Expand All @@ -39,17 +42,38 @@ def extract_example_list_item(
case _ if lang_code in ["zh", "ja"]:
if before_italic:
text_nodes.append(node)
elif (
isinstance(node, str) and lang_code in ["zh", "ja"] and "-" in node
):
elif isinstance(node, str) and "-" in node:
for t_node in list_item.find_child(NodeKind.TEMPLATE):
if t_node.template_name == "Term":
ref = clean_node(wxr, None, t_node).strip("()")
break
translation = clean_node(
wxr,
sense,
wxr.wtp.node_to_wikitext(
[node[node.index("-") + 1 :]]
+ list_item.children[index + 1 :]
+ [
n
for n in list_item.children[index + 1 :]
if not (
isinstance(n, TemplateNode)
and n.template_name == "Term"
)
]
),
)
if not has_zh_tradsem and len(examples) > 1:
examples.clear()
examples.append(
Example(
text=clean_node(
wxr,
None,
list_item.children[:index]
+ [node[: node.index("-")]],
)
)
)
break
elif lang_code in ["zh", "ja"] and len(examples) == 0 and before_italic:
text_nodes.append(node)
Expand All @@ -69,11 +93,23 @@ def extract_example_list_item(
)
examples.append(example)

if not has_zh_tradsem and len(examples) > 1:
examples.clear()
examples.append(
Example(
text=clean_node(
wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
)
)
)

for example in examples:
if roman != "":
example.roman = roman
if translation != "":
example.translation = translation
if ref != "":
example.ref = ref
if example.text != "":
sense.examples.append(example)

Expand Down
27 changes: 25 additions & 2 deletions src/wiktextract/extractor/it/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@ def extract_linkage_section(
linkages = []
for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
linkages.extend(extract_linkage_list_item(wxr, list_item))
linkages.extend(
extract_proverb_list_item(wxr, list_item)
if linkage_type == "proverbs"
else extract_linkage_list_item(wxr, list_item)
)

for data in page_data:
if data.lang_code == page_data[-1].lang_code:
Expand Down Expand Up @@ -43,8 +47,27 @@ def extract_linkage_list_item(
elif isinstance(node, str):
for word_str in node.split(","):
word_str = word_str.strip()
if word_str != "":
if word_str.startswith("(") and word_str.endswith(")"):
raw_tags.append(word_str.strip("()"))
elif word_str != "":
linkages.append(Linkage(word=word_str, raw_tags=raw_tags))
raw_tags.clear()

return linkages


def extract_proverb_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Linkage]:
    """Parse one proverb list item.

    The italic node holds the proverb text; any plain-text tail after a
    ":" (plus the remaining child nodes) is its sense/explanation.
    Returns a single-element list, or an empty list when no italic
    proverb text was found.
    """
    entry = Linkage(word="")
    children = list_item.children
    for idx, child in enumerate(children):
        if isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            entry.word = clean_node(wxr, None, child)
            continue
        if isinstance(child, str) and ":" in child:
            # Everything after the colon, together with the rest of the
            # list item, forms the sense text.
            after_colon = child[child.index(":") + 1 :]
            entry.sense = clean_node(
                wxr, None, [after_colon] + children[idx + 1 :]
            )
            break
    if entry.word == "":
        return []
    return [entry]
1 change: 1 addition & 0 deletions src/wiktextract/extractor/it/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class Linkage(ItalianBaseModel):
word: str
tags: list[str] = []
raw_tags: list[str] = []
sense: str = ""


class WordEntry(ItalianBaseModel):
Expand Down
49 changes: 42 additions & 7 deletions src/wiktextract/extractor/it/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,24 @@
from .section_titles import POS_DATA
from .tag_form_line import extract_tag_form_line_nodes

# https://it.wiktionary.org/wiki/Categoria:Template_per_i_verbi
# Template names that introduce a sub-section within a verb POS section
# (participle forms and verb-usage categories such as transitive,
# reflexive, …).  When one of these templates is encountered while a
# sense list has already been collected, a new word entry is started and
# the template's cleaned text is recorded as a raw tag on that entry.
POS_SUBSECTION_TEMPLATES = frozenset(
    [
        "-participio passato-",
        "-participio presente-",
        "Ausiliare",
        "Deponente",
        "Intransitivo",
        "Medio",
        "Passivo",
        "Reciproco",
        "Riflessivo",
        "Transitivo",
    ]
)

def extract_pos_section(

def add_new_pos_data(
wxr: WiktextractContext,
page_data: list[WordEntry],
base_data: WordEntry,
Expand All @@ -23,6 +39,15 @@ def extract_pos_section(
for link_node in level_node.find_child(NodeKind.LINK):
clean_node(wxr, page_data[-1], link_node)


def extract_pos_section(
wxr: WiktextractContext,
page_data: list[WordEntry],
base_data: WordEntry,
level_node: LevelNode,
pos_title: str,
) -> None:
add_new_pos_data(wxr, page_data, base_data, level_node, pos_title)
first_gloss_list_index = len(level_node.children)
for index, node in enumerate(level_node.children):
if (
Expand All @@ -35,6 +60,16 @@ def extract_pos_section(
extract_gloss_list_item(wxr, page_data[-1], list_item)
if index < first_gloss_list_index:
first_gloss_list_index = index
elif (
isinstance(node, TemplateNode)
and node.template_name in POS_SUBSECTION_TEMPLATES
):
if len(page_data[-1].senses) > 0:
add_new_pos_data(
wxr, page_data, base_data, level_node, pos_title
)
raw_tag = clean_node(wxr, page_data[-1], node).strip("= \n")
page_data[-1].raw_tags.append(raw_tag)

extract_tag_form_line_nodes(
wxr, page_data[-1], level_node.children[:first_gloss_list_index]
Expand All @@ -56,12 +91,7 @@ def extract_gloss_list_item(
else:
gloss_nodes.append(t_str)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
if node.sarg.endswith("*"):
for example_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(
wxr, sense, example_list_item, word_entry.lang_code
)
elif (
if (
node.sarg.endswith(":")
and len(sense.examples) > 0
and sense.examples[-1].translation == ""
Expand All @@ -70,6 +100,11 @@ def extract_gloss_list_item(
sense.examples[-1].translation = clean_node(
wxr, sense, tr_list_item.children
)
elif node.sarg.endswith(("*", ":")):
for example_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(
wxr, sense, example_list_item, word_entry.lang_code
)
else:
gloss_nodes.append(node)
gloss_str = clean_node(wxr, sense, gloss_nodes)
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/it/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def extract_translation_section(
page_data: list[WordEntry],
level_node: LevelNode,
) -> None:
# https://it.wiktionary.org/wiki/Aiuto:Traduzioni
sense = ""
translations = []
cats = {}
Expand Down
2 changes: 1 addition & 1 deletion tests/test_it_etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from wiktextract.wxr_context import WiktextractContext


class TestItGloss(TestCase):
class TestItEtymology(TestCase):
maxDiff = None

def setUp(self) -> None:
Expand Down
76 changes: 76 additions & 0 deletions tests/test_it_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,79 @@ def test_zh_tradsem(self):
}
],
)

def test_double_italic_nodes_with_translation(self):
    """Example line with two italic nodes joined by " - ".

    The first italic node is the example text, the plain " - "
    separator marks the start of the Italian translation (the second
    italic node).  Both must land in one Example, not two.
    """
    self.wxr.wtp.add_page("Template:-en-", 10, "Inglese")
    data = parse_page(
        self.wxr,
        "water",
        """== {{-en-}} ==
===Sostantivo===
# acqua
#: ''May I have a glass of '''water'''?'' - ''Posso avere un bicchiere d''''acqua'''''?""",
    )
    self.assertEqual(
        data[0]["senses"],
        [
            {
                "glosses": ["acqua"],
                "examples": [
                    {
                        "text": "May I have a glass of water?",
                        "translation": "Posso avere un bicchiere d'acqua?",
                    }
                ],
            }
        ],
    )

def test_double_italic_nodes_no_translation(self):
    """Two italic nodes separated by ";" but with no "-" translation
    marker: the whole line is a single example text, no translation.
    """
    self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
    data = parse_page(
        self.wxr,
        "essere",
        """== {{-it-}} ==
===Sostantivo===
#chi [[esiste]]
#* ''gli '''esseri''' viventi''; ''gli '''esseri''' animati''""",
    )
    self.assertEqual(
        data[0]["senses"],
        [
            {
                "glosses": ["chi esiste"],
                "examples": [
                    {"text": "gli esseri viventi; gli esseri animati"}
                ],
            }
        ],
    )

def test_term_ref_template(self):
    """A trailing "{{Term|...}}" template on the example line becomes
    the example's ``ref`` field (parentheses stripped) and is excluded
    from the translation text.
    """
    self.wxr.wtp.add_page("Template:-la-", 10, "Latino")
    self.wxr.wtp.add_page("Template:Term", 10, "({{{1}}})")
    data = parse_page(
        self.wxr,
        "libero",
        """== {{-la-}} ==
===Verbo===
# [[assolvere]], [[liberare]] dalle [[accuse]], [[giudicare]] [[innocente]]
#* ''et eum omni [[ignominia]] '''liberat''''' - e lo [[assolve]] da ogni [[ignominia]] {{Term|[[:w:Marco Tullio Cicerone|Cicerone]], [[:w:Pro Cluentio|Pro Cluentio]], [[:s:la:Pro_Aulo_Cluentio_Habito|XLVII, 132]]}}""",
    )
    self.assertEqual(
        data[0]["senses"],
        [
            {
                "glosses": [
                    "assolvere, liberare dalle accuse, giudicare innocente"
                ],
                "examples": [
                    {
                        "text": "et eum omni ignominia liberat",
                        "translation": "e lo assolve da ogni ignominia",
                        "ref": "Cicerone, Pro Cluentio, XLVII, 132",
                    }
                ],
            }
        ],
    )
Loading

0 comments on commit 681a778

Please sign in to comment.