Merge pull request #482 from tatuylonen/de-ipa

More other-edition field stuff
tatuylonen · Jan 31, 2024 · b03fb53 · b03fb53
2 parents b38b8f6 + 372b117
commit b03fb53
Show file tree

Hide file tree

Showing 8 changed files with 113 additions and 65 deletions.
diff --git a/src/wiktextract/extractor/de/models.py b/src/wiktextract/extractor/de/models.py
@@ -39,9 +39,6 @@ class Translation(BaseModelWrap):
         description="Tags specifying the translated term, usually gender information",
     )
     notes: list[str] = Field(default=[], description="A list of notes")
-    roman: str = Field(
-        default="", description="Transliteration in roman characters"
-    )
 
 
 class Example(BaseModelWrap):
@@ -123,22 +120,18 @@ class Sense(BaseModelWrap):
 
 
 class Sound(BaseModelWrap):
-    ipa: list[str] = Field(
-        default=[], description="International Phonetic Alphabet"
-    )
+    ipa: str = Field(default="", description="International Phonetic Alphabet")
     # phonetic_transcription: list[str] = Field(
     #     default=[], description="Phonetic transcription, less exact than IPA."
     # )
-    audio: list[str] = Field(default=[], description="Audio file name")
-    wav_url: list[str] = Field(default=[])
-    ogg_url: list[str] = Field(default=[])
-    mp3_url: list[str] = Field(default=[])
-    oga_url: list[str] = Field(default=[])
-    flac_url: list[str] = Field(default=[])
-    lang_code: list[str] = Field(
-        default=[], description="Wiktionary language code"
-    )
-    lang: list[str] = Field(default=[], description="Localized language name")
+    audio: str = Field(default="", description="Audio file name")
+    wav_url: str = Field(default="")
+    ogg_url: str = Field(default="")
+    mp3_url: str = Field(default="")
+    oga_url: str = Field(default="")
+    flac_url: str = Field(default="")
+    lang_code: str = Field(default="", description="Wiktionary language code")
+    lang: str = Field(default="", description="Localized language name")
     # roman: list[str] = Field(
     #     default=[], description="Translitaration to Roman characters"
     # )

diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py
@@ -21,7 +21,8 @@ def extract_pronunciation(
             NodeKind.LIST_ITEM
         ):
             wxr.wtp.debug(
-                f"Found unexpected non-list-item node in pronunciation section: {not_list_item_node}",
+                f"Found unexpected non-list-item node in pronunciation "
+                f"section: {not_list_item_node}",
                 sortid="extractor/de/pronunciation/extract_pronunciation/28",
             )
 
@@ -37,8 +38,10 @@ def extract_pronunciation(
                 or not rest
             ):
                 wxr.wtp.debug(
-                    f"Found unexpected non-template node in pronunciation section: {head_template}",
-                    sortid="extractor/de/pronunciation/extract_pronunciation/37",
+                    f"Found unexpected non-template node in pronunciation "
+                    f"section: {head_template}",
+                    sortid="extractor/de/pronunciation/"
+                    "extract_pronunciation/37",
                 )
                 continue
             if head_template.template_name == "IPA":
@@ -91,29 +94,34 @@ def process_ipa(
 
 def process_lautschrift_template(
     wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
-):
+) -> None:
     template_parameters = node.template_parameters
 
     ipa = template_parameters.get(1)
 
     lang_code = template_parameters.get("spr")
     if lang_code:
         lang = code_to_name(lang_code, "de")
-        add_sound_data_without_appending_to_existing_properties(
-            wxr,
-            sound_data,
-            {
-                "ipa": [ipa],
-                "lang_code": lang_code,
-                "lang": lang,
-            },
-        )
+        new_data = {
+            "lang_code": lang_code,
+            "lang": lang,
+        }
     else:
-        sound_data[-1].ipa.append(ipa)
+        new_data = dict()
+
+    new_data["ipa"] = ipa
+
+    add_sound_data_without_appending_to_existing_properties(
+        wxr,
+        sound_data,
+        new_data,
+    )
 
 
 def process_hoerbeispiele(
-    wxr: WiktextractContext, sound_data: list[Sound], nodes: list[WikiNode]
+    wxr: WiktextractContext,
+    sound_data: list[Sound],
+    nodes: list[Union[str, WikiNode]],
 ):
     for node in nodes:
         if is_template_node_with_name(node, "Audio"):
@@ -157,7 +165,7 @@ def is_template_node_with_name(node: Union[WikiNode, str], template_name: str):
 def add_sound_data_without_appending_to_existing_properties(
     wxr: WiktextractContext,
     sound_data: list[Sound],
-    new_sound_data: list[dict],
+    new_sound_data: dict,
 ):
     """Creates a new IPA data entry if properties exist in previous entry."""
     if any(
@@ -171,7 +179,7 @@ def add_sound_data_without_appending_to_existing_properties(
     for key, value in new_sound_data.items():
         if key in sound_data[-1].model_fields:
             if isinstance(value, str):
-                getattr(sound_data[-1], key).append(value)
+                setattr(sound_data[-1], key, value)
             else:
                 getattr(sound_data[-1], key).extend(value)
         else:

diff --git a/src/wiktextract/extractor/es/etymology.py b/src/wiktextract/extractor/es/etymology.py
@@ -1,4 +1,6 @@
+from typing import cast
 from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import TemplateNode
 from wiktextract.extractor.es.models import EtymologyTemplate, WordEntry
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
@@ -23,23 +25,43 @@ def process_etymology_block(
     """
 
     has_etymology_info = False
+
     for template_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
+        # no-op type-annotation cast; we softly assert template_node is a
+        # TemplateNode, which has .template_name, to quiet the type-checker.
+        template_node = cast(TemplateNode, template_node)
+        if "etim" not in template_node.template_name:
+            # We don't want to keep any template data other than
+            # the main etymology templates (and maybe Plantilla:etim)
+            continue
+
         entry.etymology_templates = entry.etymology_templates or []
 
         etymology_template = EtymologyTemplate(
             name=template_node.template_name,
             expansion=clean_node(wxr, None, template_node),
         )
 
+        if etymology_template.expansion in (
+            # "Please fill in this etymology, thank you..."
+            "Si puedes, incorpórala: ver cómo.",
+            "Préstamo no adaptado.",
+            "Este lema en este idioma es ampliable. "
+            "Retira este aviso si la mayor parte de las acepciones ya están incluidas.",
+        ):
+            continue
+
         args = {}
         for index, param in template_node.template_parameters.items():
             args[str(index)] = (
                 param
                 if isinstance(param, str)
                 else clean_node(wxr, None, param)
             )
+            # if any index other than "leng" is encountered,
+            # has_etymology_info becomes True
             has_etymology_info = has_etymology_info or index != "leng"
-        if args:
+        if args and not (len(args) == 1 and "leng" in args):
             etymology_template.args = args
 
         entry.etymology_templates.append(etymology_template)

diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py
@@ -23,6 +23,7 @@ class Translation(BaseModelWrap):
     lang_code: str = Field(
         description="Wiktionary language code of the translation term"
     )
+    lang: str = Field(description="Name of the language of translation")
     senseids: list[str] = Field(
         default=[],
         description="List of senseids where this translation applies",

diff --git a/src/wiktextract/extractor/es/translation.py b/src/wiktextract/extractor/es/translation.py
@@ -1,5 +1,6 @@
 from typing import Optional
 
+from mediawiki_langcodes import code_to_name
 from wikitextprocessor import WikiNode
 from wiktextract.extractor.es.models import Translation, WordEntry
 from wiktextract.extractor.share import split_senseids
@@ -15,6 +16,9 @@ def extract_translation(
     # Documentation: https://es.wiktionary.org/wiki/Plantilla:t+
 
     lang_code = template_node.template_parameters.get(1)  # Language code
+    lang = code_to_name(lang_code, "es")
+    if not lang:
+        lang = f"Unknown({lang_code})"
 
     # Initialize variables
     current_translation: Optional[Translation] = None
@@ -82,7 +86,10 @@ def extract_translation(
 
                 else:
                     current_translation = Translation(
-                        word=value, lang_code=lang_code, senseids=list(senseids)
+                        word=value,
+                        lang_code=lang_code,
+                        lang=lang,
+                        senseids=list(senseids),
                     )
         elif isinstance(key, str):
             if key == "tr":

diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py
@@ -27,36 +27,37 @@ def test_de_process_ipa(self):
                 "input": "{{Lautschrift|ipa1}}",
                 "expected": [
                     {
-                        "ipa": ["ipa1"],
+                        "ipa": "ipa1",
                     }
                 ],
             },
             {
                 "input": "{{Lautschrift|ipa1|spr=de}}",
                 "expected": [
                     {
-                        "ipa": ["ipa1"],
-                        "lang": ["Deutsch"],
-                        "lang_code": ["de"],
+                        "ipa": "ipa1",
+                        "lang": "Deutsch",
+                        "lang_code": "de",
                     }
                 ],
             },
             {
                 "input": "{{Lautschrift|ipa1}} {{Lautschrift|ipa2}}{{Lautschrift|ipa3|spr=de}}",
                 "expected": [
-                    {"ipa": ["ipa1", "ipa2"]},
+                    {"ipa": "ipa1"},
+                    {"ipa": "ipa2"},
                     {
-                        "ipa": ["ipa3"],
-                        "lang": ["Deutsch"],
-                        "lang_code": ["de"],
+                        "ipa": "ipa3",
+                        "lang": "Deutsch",
+                        "lang_code": "de",
                     },
                 ],
             },
             {
                 "input": "{{Lautschrift|ipa1}}, ''tag1'' {{Lautschrift|ipa2}}",
                 "expected": [
-                    {"ipa": ["ipa1"]},
-                    {"ipa": ["ipa2"], "tags": ["tag1"]},
+                    {"ipa": "ipa1"},
+                    {"ipa": "ipa2", "tags": ["tag1"]},
                 ],
             },
         ]
@@ -90,7 +91,7 @@ def test_de_process_hoerbeispiele(self):
                 "input": "{{Audio|" + filename1 + "}}",
                 "expected": [
                     {
-                        "audio": [filename1],
+                        "audio": filename1,
                         "mp3_url": None,  # None indicates we don't care about the exact value
                         "ogg_url": None,
                     }
@@ -104,12 +105,12 @@ def test_de_process_hoerbeispiele(self):
                 + "}}",
                 "expected": [
                     {
-                        "audio": [filename1],
+                        "audio": filename1,
                         "mp3_url": None,
                         "ogg_url": None,
                     },
                     {
-                        "audio": [filename2],
+                        "audio": filename2,
                         "ogg_url": None,
                         "mp3_url": None,
                         "wav_url": None,
@@ -124,13 +125,13 @@ def test_de_process_hoerbeispiele(self):
                 + "}}",
                 "expected": [
                     {
-                        "audio": [filename1],
+                        "audio": filename1,
                         "mp3_url": None,
                         "ogg_url": None,
                         "tags": ["tag1"],
                     },
                     {
-                        "audio": [filename2],
+                        "audio": filename2,
                         "mp3_url": None,
                         "ogg_url": None,
                         "wav_url": None,

diff --git a/tests/test_es_etymology.py b/tests/test_es_etymology.py
@@ -34,17 +34,7 @@ def test_es_extract_etymology(self):
             {
                 # https://es.wiktionary.org/wiki/Schreck
                 "input": "{{etimología|leng=de}}",
-                "expected": {
-                    "etymology_templates": [
-                        {
-                            "args": {
-                                "leng": "de",
-                            },
-                            "name": "etimología",
-                            "expansion": "Si puedes, incorpórala: ver cómo.",
-                        },
-                    ],
-                },
+                "expected": dict(),
             },
             {
                 # https://es.wiktionary.org/wiki/bagre