Skip to content

Commit

Permalink
Merge pull request #592 from xxyzz/es
Browse files Browse the repository at this point in the history
Fix some check JSON errors and pydantic errors in es edition
  • Loading branch information
xxyzz authored Apr 17, 2024
2 parents 6e7adb9 + 72bf857 commit 95353f4
Show file tree
Hide file tree
Showing 8 changed files with 171 additions and 47 deletions.
9 changes: 3 additions & 6 deletions src/wiktextract/extractor/es/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ def extract_gloss(
definition.append(node)

gloss = clean_node(wxr, gloss_data, definition)
gloss_data.glosses.append(gloss)
if len(gloss) > 0:
gloss_data.glosses.append(gloss)

gloss_note = clean_node(wxr, gloss_data, list_item.children)
match = re.match(r"^(\d+)", gloss_note)
Expand All @@ -58,11 +59,7 @@ def extract_gloss(
if len(other) > 0:
for node in other:
if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
process_sense_data_list(
wxr,
page_data[-1].senses[-1],
node,
)
process_sense_data_list(wxr, page_data[-1], node)
else:
wxr.wtp.debug(
f"Found nodes that are not part of definition: {node}",
Expand Down
10 changes: 8 additions & 2 deletions src/wiktextract/extractor/es/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,10 @@ def process_linkage_template(
for key, value_raw in template_node.template_parameters.items():
value = clean_node(wxr, None, value_raw)
if isinstance(key, int):
getattr(word_entry, linkage_type).append(Linkage(word=value))
linkage_data = Linkage(word=value)
if len(word_entry.senses) > 0:
linkage_data.senseid = word_entry.senses[-1].get("senseid")
getattr(word_entry, linkage_type).append(linkage_data)
elif isinstance(key, str):
if key.startswith("nota"):
idx = int(key[4:]) - 1 if len(key) > 4 else 0
Expand Down Expand Up @@ -79,4 +82,7 @@ def process_linkage_list_children(
if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
word = clean_node(wxr, None, node)
if len(word) > 0:
getattr(word_entry, linkage_type).append(Linkage(word=word))
linkage_data = Linkage(word=word)
if len(word_entry.senses) > 0:
linkage_data.senseid = word_entry.senses[-1].get("senseid")
getattr(word_entry, linkage_type).append(linkage_data)
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/es/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class Linkage(BaseModelWrap):
alternative_spelling: str = Field(
default="", description="Alternative spelling of the word"
)
senseid: str = ""


class Translation(BaseModelWrap):
Expand Down Expand Up @@ -150,5 +151,6 @@ class WordEntry(BaseModelWrap):
meronyms: list[Linkage] = []
related: list[Linkage] = []
synonyms: list[Linkage] = []
proverbs: list[Linkage] = []
tags: list[str] = []
extra_sounds: dict[str, str] = {}
20 changes: 9 additions & 11 deletions src/wiktextract/extractor/es/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,13 @@ def parse_section(
process_etymology_block(wxr, base_data, level_node)
for nested_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, nested_level_node)
elif section_title in TRANSLATIONS_TITLES:
if wxr.config.capture_translations:
for template_node in level_node.find_child_recursively(
NodeKind.TEMPLATE
):
if template_node.template_name == "t+" and len(page_data) > 0:
extract_translation(wxr, page_data[-1], template_node)
elif (
section_title in TRANSLATIONS_TITLES and wxr.config.capture_translations
):
if len(page_data) == 0:
page_data.append(base_data.model_copy(deep=True))
for template_node in level_node.find_child(NodeKind.TEMPLATE):
extract_translation(wxr, page_data[-1], template_node)

elif section_title in LINKAGE_TITLES:
if len(page_data) == 0:
Expand Down Expand Up @@ -262,9 +262,7 @@ def process_group(
if template_name == "clear":
return
elif template_name.removesuffix("s") in LINKAGE_TITLES:
process_linkage_template(
wxr, page_data[-1].senses[-1], group[0]
)
process_linkage_template(wxr, page_data[-1], group[0])
elif template_name == "ejemplo":
extract_example(wxr, page_data[-1].senses[-1], group)
elif template_name == "uso":
Expand All @@ -282,7 +280,7 @@ def process_group(
list_node = group[0]
# List groups seem to not be followed by string nodes.
# We, therefore, only process the list_node.
process_sense_data_list(wxr, page_data[-1].senses[-1], list_node)
process_sense_data_list(wxr, page_data[-1], list_node)

elif (
isinstance(child, WikiNode)
Expand Down
17 changes: 6 additions & 11 deletions src/wiktextract/extractor/es/sense_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,13 @@

from .example import process_example_list
from .linkage import process_linkage_list_children
from .models import Sense
from .models import WordEntry
from .section_titles import LINKAGE_TITLES


def process_sense_data_list(
wxr: WiktextractContext,
sense_data: Sense,
list_node: WikiNode,
):
wxr: WiktextractContext, word_entry: list[WordEntry], list_node: WikiNode
) -> None:
list_marker = list_node.sarg

if list_marker == ":;":
Expand All @@ -31,13 +29,10 @@ def process_sense_data_list(
)

if list_type == "ejemplo":
process_example_list(wxr, sense_data, list_item)
process_example_list(wxr, word_entry.senses[-1], list_item)
elif list_type in LINKAGE_TITLES:
process_linkage_list_children(
wxr,
sense_data,
children[1:],
LINKAGE_TITLES[list_type],
wxr, word_entry, children[1:], LINKAGE_TITLES[list_type]
)
elif list_type == "ámbito":
# XXX: Extract scope tag
Expand All @@ -54,7 +49,7 @@ def process_sense_data_list(
elif list_marker in ["::", ":::"]:
# E.g. https://es.wiktionary.org/wiki/silepsis
for list_item in list_node.find_child_recursively(NodeKind.LIST_ITEM):
process_example_list(wxr, sense_data, list_item)
process_example_list(wxr, word_entry.senses[-1], list_item)

else:
wxr.wtp.debug(
Expand Down
91 changes: 78 additions & 13 deletions src/wiktextract/extractor/es/translation.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import itertools
from typing import Optional

from mediawiki_langcodes import code_to_name
from wikitextprocessor import WikiNode
from wikitextprocessor.parser import TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand All @@ -10,11 +11,75 @@


def extract_translation(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Dispatch a translation template node to the matching handler.

    ``{{t}}`` is the current Spanish Wiktionary translation template
    (https://es.wiktionary.org/wiki/Plantilla:t); ``{{t+}}`` is the
    obsolete variant (https://es.wiktionary.org/wiki/Plantilla:t+).
    Any other template name is ignored.
    """
    # NOTE: the scraped diff interleaved the old and new signatures here;
    # this is the reconstructed post-commit dispatcher.
    if template_node.template_name == "t":
        process_t_template(wxr, word_entry, template_node)
    elif template_node.template_name == "t+":
        process_t_plus_template(wxr, word_entry, template_node)


# Maps the value of a numbered "g" (gender) parameter of {{t}} to
# canonical English tag name(s); "mf" expands to both tags, which the
# consumer extends into the tags list rather than appending.
T_GENDERS = {
    "m": "masculine",
    "f": "feminine",
    "mf": ["masculine", "feminine"],
    "n": "neuter",
}
# Maps the value of a numbered "n" (grammatical number) parameter of
# {{t}} to its canonical English tag name.
T_NUMBERS = {
    "s": "singular",
    "p": "plural",
    "d": "dual",
}


def process_t_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract translations from a ``{{t}}`` template.

    https://es.wiktionary.org/wiki/Plantilla:t

    The template carries numbered parameter groups (``t1``/``a1``/``g1``…,
    ``t2``/…), one group per translation. A group is only kept when its
    ``t<i>`` word parameter cleans to a non-empty string.
    """
    params = template_node.template_parameters
    lang_code = params.get(1, "")
    lang_name = code_to_name(lang_code, "es")

    index = 1
    while "t" + str(index) in params:
        tr_data = Translation(lang_code=lang_code, lang=lang_name, word="")
        # Parameter prefix -> Translation field it feeds.
        for prefix, field in (
            ("t", "word"),
            ("a", "senseids"),
            ("tl", "roman"),
            ("nota", "raw_tags"),
            ("g", "tags"),
            ("n", "tags"),
        ):
            param_name = prefix + str(index)
            if param_name not in params:
                continue
            value = clean_node(wxr, None, params[param_name])
            # Gender/number codes are translated to canonical tags;
            # unknown codes map to None and are dropped.
            if prefix == "g":
                value = T_GENDERS.get(value)
            elif prefix == "n":
                value = T_NUMBERS.get(value)
            if value is None:
                continue

            current = getattr(tr_data, field)
            if isinstance(current, list):
                # List-valued fields accumulate; "mf" yields a list of
                # two tags, which is flattened in.
                if isinstance(value, list):
                    current.extend(value)
                else:
                    current.append(value)
            else:
                setattr(tr_data, field, value)

        if len(tr_data.word) > 0:
            word_entry.translations.append(tr_data)
        index += 1


def process_t_plus_template(
wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
# obsolete template: https://es.wiktionary.org/wiki/Plantilla:t+

lang_code = template_node.template_parameters.get(1) # Language code
lang = code_to_name(lang_code, "es")
Expand All @@ -29,17 +94,17 @@ def extract_translation(
if key == 1:
continue # Skip language code

value = clean_node(
wxr, {}, template_node.template_parameters[key]
).strip()

value = clean_node(wxr, None, template_node.template_parameters[key])
if isinstance(key, int):
if value == ",":
if current_translation:
if (
current_translation is not None
and len(current_translation.word) > 0
):
word_entry.translations.append(current_translation)

current_translation = None
senseids = []
current_translation = None
senseids = []
elif (
value.isdigit()
or (value != "," and "," in value)
Expand Down Expand Up @@ -98,5 +163,5 @@ def extract_translation(
current_translation.roman = value

# Add the last translation if it exists
if current_translation:
if current_translation is not None and len(current_translation.word) > 0:
word_entry.translations.append(current_translation)
5 changes: 1 addition & 4 deletions tests/test_es_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,7 @@ def test_es_process_linkage_template(self):
"input": "{{sinónimo|automóvil|coche|nota2=España|carro|nota3=Colombia, Estados Unidos, México, Venezuela}}",
"expected": [
{"word": "automóvil"},
{
"word": "coche",
"note": "España",
},
{"word": "coche", "note": "España"},
{
"word": "carro",
"note": "Colombia, Estados Unidos, México, Venezuela",
Expand Down
64 changes: 64 additions & 0 deletions tests/test_es_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ def get_default_page_data(self) -> list[WordEntry]:
def test_es_extract_translation(self):
# Test cases from https://es.wiktionary.org/wiki/Plantilla:t+
test_cases = [
{
# https://es.wiktionary.org/wiki/calderón
"input": "{{t+|ar|}}",
"expected": [],
},
{
"input": "{{t+|af|1|kat}}",
"expected": [
Expand Down Expand Up @@ -142,3 +147,62 @@ def test_es_extract_translation(self):
translations,
case["expected"],
)

def test_t_roman(self):
    # {{t}} with numbered "a" (sense id -> senseids), "tl"
    # (transliteration -> roman) and "nota" (free-form note -> raw_tags)
    # parameters; two translation groups in one template.
    self.wxr.wtp.start_page("hola")
    word_entry = WordEntry(word="hola", lang_code="es", lang="Español")
    root = self.wxr.wtp.parse(
        "{{t|zh|a1=1|t1=你好|tl1=nĭ hăo|t2=您好|tl2=nín hăo|nota2=formal}}"
    )
    extract_translation(self.wxr, word_entry, root.children[0])
    self.assertEqual(
        [
            t.model_dump(exclude_defaults=True)
            for t in word_entry.translations
        ],
        [
            {
                "lang": "chino",
                "lang_code": "zh",
                "word": "你好",
                "senseids": ["1"],
                "roman": "nĭ hăo",
            },
            {
                "lang": "chino",
                "lang_code": "zh",
                "word": "您好",
                "roman": "nín hăo",
                "raw_tags": ["formal"],
            },
        ],
    )

def test_t_gender(self):
    # {{t}} with numbered "g" (gender) parameters: "m"/"f" are mapped
    # through T_GENDERS to "masculine"/"feminine" tags.
    self.wxr.wtp.start_page("hola")
    word_entry = WordEntry(word="hola", lang_code="es", lang="Español")
    root = self.wxr.wtp.parse(
        "{{t|th|a1=1|t1=สวัสดีครับ|g1=m|t2=สวัสดีค่ะ|g2=f}}"
    )
    extract_translation(self.wxr, word_entry, root.children[0])
    self.assertEqual(
        [
            t.model_dump(exclude_defaults=True)
            for t in word_entry.translations
        ],
        [
            {
                "lang": "tailandés",
                "lang_code": "th",
                "word": "สวัสดีครับ",
                "senseids": ["1"],
                "tags": ["masculine"],
            },
            {
                "lang": "tailandés",
                "lang_code": "th",
                "word": "สวัสดีค่ะ",
                "tags": ["feminine"],
            },
        ],
    )

0 comments on commit 95353f4

Please sign in to comment.