Merge pull request #947 from xxyzz/it
[it] improve sound section code and extract linkage sections
xxyzz authored Dec 16, 2024
2 parents 8a39820 + b92d96e commit 98779e3
Showing 9 changed files with 341 additions and 35 deletions.
1 change: 1 addition & 0 deletions src/wiktextract/extractor/it/etymology.py
@@ -8,6 +8,7 @@
 def extract_etymology_section(
     wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
 ) -> None:
+    # https://it.wiktionary.org/wiki/Aiuto:Etimologia
     etymology_texts = []
     for list_node in level_node.find_child(NodeKind.LIST):
         for list_item in list_node.find_child(NodeKind.LIST_ITEM):
50 changes: 50 additions & 0 deletions src/wiktextract/extractor/it/linkage.py
@@ -0,0 +1,50 @@
from wikitextprocessor import LevelNode, NodeKind, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Linkage, WordEntry


def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    linkages = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            linkages.extend(extract_linkage_list_item(wxr, list_item))

    for data in page_data:
        if data.lang_code == page_data[-1].lang_code:
            getattr(data, linkage_type).extend(linkages)


def extract_linkage_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Linkage]:
    raw_tags = []
    linkages = []
    for node in list_item.children:
        if isinstance(node, WikiNode):
            match node.kind:
                case NodeKind.LINK:
                    node_str = clean_node(wxr, None, node)
                    if node_str != "":
                        linkages.append(
                            Linkage(word=node_str, raw_tags=raw_tags)
                        )
                        raw_tags.clear()
                case NodeKind.TEMPLATE | NodeKind.ITALIC:
                    node_str = clean_node(wxr, None, node)
                    if node_str.startswith("(") and node_str.endswith(")"):
                        raw_tags.append(node_str.strip("()"))
        elif isinstance(node, str):
            for word_str in node.split(","):
                word_str = word_str.strip()
                if word_str != "":
                    linkages.append(Linkage(word=word_str, raw_tags=raw_tags))
                    raw_tags.clear()

    return linkages
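
To make the behaviour of the new linkage code easier to follow, here is a minimal, self-contained sketch of the same tag-accumulation pattern working on pre-cleaned strings instead of parsed wikitext nodes; the helper name and the sample input are invented for illustration and are not part of this change.

# Hypothetical, standalone illustration of the pattern in
# extract_linkage_list_item: parenthesised text is held as a raw tag and
# attached to the next extracted word, plain text is split on commas.
def split_linkage_line(parts: list[str]) -> list[dict]:
    raw_tags: list[str] = []
    linkages: list[dict] = []
    for part in parts:
        part = part.strip()
        if part.startswith("(") and part.endswith(")"):
            raw_tags.append(part.strip("()"))
            continue
        for word in part.split(","):
            word = word.strip()
            if word != "":
                linkages.append({"word": word, "raw_tags": list(raw_tags)})
                raw_tags = []
    return linkages


print(split_linkage_line(["(senso figurato)", "(di freddo)", "forte, intenso"]))
# [{'word': 'forte', 'raw_tags': ['senso figurato', 'di freddo']},
#  {'word': 'intenso', 'raw_tags': []}]
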
21 changes: 20 additions & 1 deletion src/wiktextract/extractor/it/models.py
@@ -60,6 +60,18 @@ class Sound(ItalianBaseModel):
     flac_url: str = ""
     tags: list[str] = []
     raw_tags: list[str] = []
+    sense: str = ""
+
+
+class Hyphenation(ItalianBaseModel):
+    hyphenation: str = ""
+    sense: str = ""
+
+
+class Linkage(ItalianBaseModel):
+    word: str
+    tags: list[str] = []
+    raw_tags: list[str] = []
 
 
 class WordEntry(ItalianBaseModel):
@@ -77,5 +89,12 @@ class WordEntry(ItalianBaseModel):
     forms: list[Form] = []
     etymology_texts: list[str] = []
     etymology_examples: list[Example] = []
-    hyphenation: str = ""
+    hyphenations: list[Hyphenation] = []
     sounds: list[Sound] = []
+    synonyms: list[Linkage] = []
+    antonyms: list[Linkage] = []
+    derived: list[Linkage] = []
+    related: list[Linkage] = []
+    hyponyms: list[Linkage] = []
+    hypernyms: list[Linkage] = []
+    proverbs: list[Linkage] = []
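
For orientation, an entry serialized from the extended WordEntry model could now carry data along these lines. The list fields below come from the models in this change; the word and lang_code keys and all concrete values are assumed for illustration.

# Hypothetical output fragment; values are made up.
example_entry = {
    "word": "cane",
    "lang_code": "it",
    "hyphenations": [{"hyphenation": "cà | ne"}],
    "sounds": [{"ipa": "/ˈka.ne/"}],
    "synonyms": [{"word": "animale"}],
    "derived": [{"word": "canile"}],
}
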
7 changes: 6 additions & 1 deletion src/wiktextract/extractor/it/page.py
@@ -5,9 +5,10 @@
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
 from .etymology import extract_citation_section, extract_etymology_section
+from .linkage import extract_linkage_section
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
-from .section_titles import POS_DATA
+from .section_titles import LINKAGE_SECTIONS, POS_DATA
 from .sound import extract_hyphenation_section, extract_pronunciation_section
 from .translation import extract_translation_section
 
@@ -31,6 +32,10 @@ def parse_section(
         extract_hyphenation_section(wxr, page_data, level_node)
     elif title_text == "Pronuncia":
         extract_pronunciation_section(wxr, page_data, level_node)
+    elif title_text in LINKAGE_SECTIONS:
+        extract_linkage_section(
+            wxr, page_data, level_node, LINKAGE_SECTIONS[title_text]
+        )
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
         parse_section(wxr, page_data, base_data, next_level)
12 changes: 5 additions & 7 deletions src/wiktextract/extractor/it/pos.py
@@ -50,13 +50,11 @@ def extract_gloss_list_item(
     sense = Sense()
     for node in list_item.children:
         if isinstance(node, TemplateNode):
-            match node.template_name:
-                case "Term":
-                    raw_tag = clean_node(wxr, sense, node).strip("() \n")
-                    if raw_tag != "":
-                        sense.raw_tags.append(raw_tag)
-                case _:
-                    gloss_nodes.append(node)
+            t_str = clean_node(wxr, sense, node)
+            if t_str.startswith("(") and t_str.endswith(")"):
+                sense.raw_tags.append(t_str.strip("()"))
+            else:
+                gloss_nodes.append(t_str)
         elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
             if node.sarg.endswith("*"):
                 for example_list_item in node.find_child(NodeKind.LIST_ITEM):
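
The pos.py change above drops the Term-specific template match in favour of a generic check on the rendered text: any template whose cleaned output is fully parenthesised becomes a raw tag of the sense, and anything else joins the gloss. A tiny standalone sketch of that rule; the sample strings are invented for illustration.

# Hypothetical rendered template outputs and how the new check classifies them.
def classify(t_str: str) -> str:
    if t_str.startswith("(") and t_str.endswith(")"):
        return "raw_tag: " + t_str.strip("()")
    return "gloss: " + t_str


print(classify("(botanica)"))  # raw_tag: botanica
print(classify("mammifero"))   # gloss: mammifero
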
15 changes: 15 additions & 0 deletions src/wiktextract/extractor/it/section_titles.py
@@ -62,3 +62,18 @@
     "Codice / Simbolo": {"pos": "symbol"},
     "Carattere hiragana": {"pos": "character", "tags": ["hiragana"]},
 }
+
+
+LINKAGE_SECTIONS = {
+    "Sinonimi": "synonyms",
+    "Contrari": "antonyms",
+    "Derivati": "derived",
+    "Termini correlati": "related",
+    "Varianti": "related",
+    "Alterati": "related",
+    "Iponimi": "hyponyms",
+    "Iperonimi": "hypernyms",
+    "Da non confondere con": "related",
+    "Proverbi e modi di dire": "proverbs",
+    "Parole derivate": "derived",
+}
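
The values in LINKAGE_SECTIONS are WordEntry list-field names, which is what lets parse_section pass the looked-up name to extract_linkage_section and have it resolved with getattr. A minimal sketch of that dispatch using a plain stand-in object instead of the real pydantic model; the Entry class and the appended dict are invented for illustration.

LINKAGE_SECTIONS = {"Sinonimi": "synonyms", "Contrari": "antonyms"}


class Entry:
    def __init__(self) -> None:
        self.synonyms: list[dict] = []
        self.antonyms: list[dict] = []


entry = Entry()
title_text = "Sinonimi"
if title_text in LINKAGE_SECTIONS:
    getattr(entry, LINKAGE_SECTIONS[title_text]).append({"word": "animale"})
print(entry.synonyms)  # [{'word': 'animale'}]
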
124 changes: 100 additions & 24 deletions src/wiktextract/extractor/it/sound.py
@@ -1,47 +1,123 @@
-from wikitextprocessor import LevelNode, NodeKind
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
 from ..share import set_sound_file_url_fields
-from .models import Sound, WordEntry
+from .models import Hyphenation, Sound, WordEntry
 
 
 def extract_hyphenation_section(
     wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
 ) -> None:
-    hyphenation = ""
+    # https://it.wiktionary.org/wiki/Aiuto:Sillabazione
+    hyphenations = []
     for list_node in level_node.find_child(NodeKind.LIST):
-        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
-            hyphenation = clean_node(wxr, None, list_item.children)
+        match list_node.sarg:
+            case ";":
+                for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+                    h_str = clean_node(wxr, None, list_item.children)
+                    if h_str != "":
+                        hyphenations.append(Hyphenation(hyphenation=h_str))
+                    break
+            case "*":
+                for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+                    h_data = Hyphenation()
+                    for node in list_item.find_child(
+                        NodeKind.ITALIC | NodeKind.BOLD
+                    ):
+                        match node.kind:
+                            case NodeKind.ITALIC:
+                                h_data.sense = clean_node(
+                                    wxr, None, node
+                                ).strip("()")
+                            case NodeKind.BOLD:
+                                h_data.hyphenation = clean_node(wxr, None, node)
+                    if h_data.hyphenation != "":
+                        hyphenations.append(h_data)
+
+    # no list
+    for node in level_node.find_child(NodeKind.BOLD):
+        h_str = clean_node(wxr, None, node)
+        if h_str != "":
+            hyphenations.append(Hyphenation(hyphenation=h_str))
 
     for data in page_data:
         if data.lang_code == page_data[-1].lang_code:
-            data.hyphenation = hyphenation
+            data.hyphenations.extend(hyphenations)
 
 
 def extract_pronunciation_section(
     wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
 ) -> None:
+    # https://it.wiktionary.org/wiki/Aiuto:Pronuncia
     sounds = []
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            extract_sound_list_item(wxr, list_item, sounds)
+
+    # no list
     for t_node in level_node.find_child(NodeKind.TEMPLATE):
-        match t_node.template_name.lower():
-            case "ipa":
-                ipa = clean_node(
-                    wxr, None, t_node.template_parameters.get(1, "")
-                )
-                if ipa != "":
-                    sounds.append(Sound(ipa=ipa))
-            case "audio":
-                sound_file = clean_node(
-                    wxr, None, t_node.template_parameters.get(1, "")
-                )
-                if sound_file != "":
-                    if len(sounds) > 0:
-                        set_sound_file_url_fields(wxr, sound_file, sounds[-1])
-                    else:
-                        sound = Sound()
-                        set_sound_file_url_fields(wxr, sound_file, sound)
-                        sounds.append(sound)
+        extract_sound_template(wxr, t_node, sounds, "", [])
 
     for data in page_data:
         if data.lang_code == page_data[-1].lang_code:
             data.sounds.extend(sounds)
+
+
+def extract_sound_list_item(
+    wxr: WiktextractContext, list_item: WikiNode, sounds: list[Sound]
+) -> None:
+    sense = ""
+    raw_tags = []
+    for node in list_item.find_child(NodeKind.ITALIC | NodeKind.TEMPLATE):
+        match node.kind:
+            case NodeKind.ITALIC:
+                sense = clean_node(wxr, None, node).strip("()")
+            case NodeKind.TEMPLATE:
+                if node.template_name.lower() == "glossa":
+                    raw_tags.append(clean_node(wxr, None, node).strip("()"))
+                else:
+                    extract_sound_template(wxr, node, sounds, sense, raw_tags)
+
+
+def extract_sound_template(
+    wxr: WiktextractContext,
+    t_node: TemplateNode,
+    sounds: list[Sound],
+    sense: str,
+    raw_tags: list[str],
+) -> None:
+    match t_node.template_name:
+        case "IPA" | "SAMPA":
+            # https://it.wiktionary.org/wiki/Template:IPA
+            # https://it.wiktionary.org/wiki/Template:SAMPA
+            for arg_name in range(1, 5):
+                if arg_name not in t_node.template_parameters:
+                    break
+                ipa = clean_node(
+                    wxr, None, t_node.template_parameters.get(arg_name, "")
+                )
+                if ipa != "":
+                    sound = Sound(ipa=ipa, sense=sense, raw_tags=raw_tags)
+                    if t_node.template_name.lower() == "sampa":
+                        sound.tags.append("SAMPA")
+                    sounds.append(sound)
+        case "Audio" | "audio":
+            # https://it.wiktionary.org/wiki/Template:Audio
+            sound_file = clean_node(
+                wxr, None, t_node.template_parameters.get(1, "")
+            )
+            raw_tag = clean_node(
+                wxr, None, t_node.template_parameters.get(2, "")
+            )
+            if sound_file != "":
+                if len(sounds) > 0:
+                    set_sound_file_url_fields(wxr, sound_file, sounds[-1])
+                    if raw_tag != "":
+                        sounds[-1].raw_tags.append(raw_tag)
+                else:
+                    sound = Sound(sense=sense, raw_tags=raw_tags)
+                    set_sound_file_url_fields(wxr, sound_file, sound)
+                    if raw_tag != "":
+                        sound.raw_tags.append(raw_tag)
+                    sounds.append(sound)
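
As a rough illustration of what the reworked pronunciation code now targets, a list-based Pronuncia line and the Sound data it should yield might look as follows; the wikitext and values are hypothetical and not taken from the test suite.

# Hypothetical Pronuncia list item and the resulting Sound data; invented
# values. set_sound_file_url_fields would additionally fill the audio URL
# fields of the same Sound from the "It-cane.ogg" file name.
wikitext_line = "* {{glossa|sostantivo}} {{IPA|/ˈka.ne/}} {{Audio|It-cane.ogg}}"
expected_sounds = [
    {"ipa": "/ˈka.ne/", "raw_tags": ["sostantivo"]},
]
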
44 changes: 44 additions & 0 deletions tests/test_it_linkage.py
@@ -0,0 +1,44 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.it.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestItLinkage(TestCase):
    maxDiff = None

    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="it"),
            WiktionaryConfig(
                dump_file_lang_code="it", capture_language_codes=None
            ),
        )

    def test_synonyms(self):
        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
        self.wxr.wtp.add_page(
            "Template:Fig", 10, "<small>(''senso figurato'')</small>"
        )
        data = parse_page(
            self.wxr,
            "cane",
            """== {{-it-}} ==
===Sostantivo===
# [[animale]]
===Sinonimi===
* [[animale]], amico dell’uomo
* {{Fig}} ''(di freddo)'' [[forte]], [[intenso]]""",
        )
        self.assertEqual(
            data[0]["synonyms"],
            [
                {"word": "animale"},
                {"word": "amico dell’uomo"},
                {"word": "forte", "raw_tags": ["senso figurato", "di freddo"]},
                {"word": "intenso"},
            ],
        )
(The diff for the ninth changed file did not load and is not shown above.)
