From ea34867b325d59afb5805aff845b59f9a29773e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= <kristian@clausal.com>
Date: Tue, 30 Jan 2024 10:20:39 +0200
Subject: [PATCH 1/4] Fields should not default to lists

For example "ipa" should not be a list inside a larger
"sounds" list entry; if there are several "ipa" entries,
make them separate "sounds" list entries, with their own
tags etc. if necessary.
---
 src/wiktextract/extractor/de/models.py        | 25 ++++-------
 src/wiktextract/extractor/de/pronunciation.py | 42 +++++++++++--------
 tests/test_de_pronunciation.py                | 31 +++++++-------
 3 files changed, 50 insertions(+), 48 deletions(-)

diff --git a/src/wiktextract/extractor/de/models.py b/src/wiktextract/extractor/de/models.py
index a5ebfbcc..1a0d9a7b 100644
--- a/src/wiktextract/extractor/de/models.py
+++ b/src/wiktextract/extractor/de/models.py
@@ -39,9 +39,6 @@ class Translation(BaseModelWrap):
         description="Tags specifying the translated term, usually gender information",
     )
     notes: list[str] = Field(default=[], description="A list of notes")
-    roman: str = Field(
-        default="", description="Transliteration in roman characters"
-    )
 
 
 class Example(BaseModelWrap):
@@ -123,22 +120,18 @@ class Sense(BaseModelWrap):
 
 
 class Sound(BaseModelWrap):
-    ipa: list[str] = Field(
-        default=[], description="International Phonetic Alphabet"
-    )
+    ipa: str = Field(default="", description="International Phonetic Alphabet")
     # phonetic_transcription: list[str] = Field(
     #     default=[], description="Phonetic transcription, less exact than IPA."
     # )
-    audio: list[str] = Field(default=[], description="Audio file name")
-    wav_url: list[str] = Field(default=[])
-    ogg_url: list[str] = Field(default=[])
-    mp3_url: list[str] = Field(default=[])
-    oga_url: list[str] = Field(default=[])
-    flac_url: list[str] = Field(default=[])
-    lang_code: list[str] = Field(
-        default=[], description="Wiktionary language code"
-    )
-    lang: list[str] = Field(default=[], description="Localized language name")
+    audio: str = Field(default="", description="Audio file name")
+    wav_url: str = Field(default="")
+    ogg_url: str = Field(default="")
+    mp3_url: str = Field(default="")
+    oga_url: str = Field(default="")
+    flac_url: str = Field(default="")
+    lang_code: str = Field(default="", description="Wiktionary language code")
+    lang: str = Field(default="", description="Localized language name")
     # roman: list[str] = Field(
     #     default=[], description="Translitaration to Roman characters"
     # )
diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py
index 7c3e0919..90b1d171 100644
--- a/src/wiktextract/extractor/de/pronunciation.py
+++ b/src/wiktextract/extractor/de/pronunciation.py
@@ -21,7 +21,8 @@ def extract_pronunciation(
             NodeKind.LIST_ITEM
         ):
             wxr.wtp.debug(
-                f"Found unexpected non-list-item node in pronunciation section: {not_list_item_node}",
+                f"Found unexpected non-list-item node in pronunciation "
+                f"section: {not_list_item_node}",
                 sortid="extractor/de/pronunciation/extract_pronunciation/28",
             )
 
@@ -37,8 +38,10 @@ def extract_pronunciation(
                 or not rest
             ):
                 wxr.wtp.debug(
-                    f"Found unexpected non-template node in pronunciation section: {head_template}",
-                    sortid="extractor/de/pronunciation/extract_pronunciation/37",
+                    f"Found unexpected non-template node in pronunciation "
+                    f"section: {head_template}",
+                    sortid="extractor/de/pronunciation/"
+                    "extract_pronunciation/37",
                 )
                 continue
             if head_template.template_name == "IPA":
@@ -91,7 +94,7 @@ def process_ipa(
 
 def process_lautschrift_template(
     wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
-):
+) -> None:
     template_parameters = node.template_parameters
 
     ipa = template_parameters.get(1)
@@ -99,21 +102,26 @@ def process_lautschrift_template(
     lang_code = template_parameters.get("spr")
     if lang_code:
         lang = code_to_name(lang_code, "de")
-        add_sound_data_without_appending_to_existing_properties(
-            wxr,
-            sound_data,
-            {
-                "ipa": [ipa],
-                "lang_code": lang_code,
-                "lang": lang,
-            },
-        )
+        new_data = {
+            "lang_code": lang_code,
+            "lang": lang,
+        }
     else:
-        sound_data[-1].ipa.append(ipa)
+        new_data = dict()
+
+    new_data["ipa"] = ipa
+
+    add_sound_data_without_appending_to_existing_properties(
+        wxr,
+        sound_data,
+        new_data,
+    )
 
 
 def process_hoerbeispiele(
-    wxr: WiktextractContext, sound_data: list[Sound], nodes: list[WikiNode]
+    wxr: WiktextractContext,
+    sound_data: list[Sound],
+    nodes: list[Union[str, WikiNode]],
 ):
     for node in nodes:
         if is_template_node_with_name(node, "Audio"):
@@ -157,7 +165,7 @@ def is_template_node_with_name(node: Union[WikiNode, str], template_name: str):
 def add_sound_data_without_appending_to_existing_properties(
     wxr: WiktextractContext,
     sound_data: list[Sound],
-    new_sound_data: list[dict],
+    new_sound_data: dict,
 ):
     """Creates a new IPA data entry if properties exist in previous entry."""
     if any(
@@ -171,7 +179,7 @@ def add_sound_data_without_appending_to_existing_properties(
     for key, value in new_sound_data.items():
         if key in sound_data[-1].model_fields:
             if isinstance(value, str):
-                getattr(sound_data[-1], key).append(value)
+                setattr(sound_data[-1], key, value)
             else:
                 getattr(sound_data[-1], key).extend(value)
         else:
diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py
index 773d9b30..995e1adb 100644
--- a/tests/test_de_pronunciation.py
+++ b/tests/test_de_pronunciation.py
@@ -27,7 +27,7 @@ def test_de_process_ipa(self):
                 "input": "{{Lautschrift|ipa1}}",
                 "expected": [
                     {
-                        "ipa": ["ipa1"],
+                        "ipa": "ipa1",
                     }
                 ],
             },
@@ -35,28 +35,29 @@ def test_de_process_ipa(self):
                 "input": "{{Lautschrift|ipa1|spr=de}}",
                 "expected": [
                     {
-                        "ipa": ["ipa1"],
-                        "lang": ["Deutsch"],
-                        "lang_code": ["de"],
+                        "ipa": "ipa1",
+                        "lang": "Deutsch",
+                        "lang_code": "de",
                     }
                 ],
             },
             {
                 "input": "{{Lautschrift|ipa1}} {{Lautschrift|ipa2}}{{Lautschrift|ipa3|spr=de}}",
                 "expected": [
-                    {"ipa": ["ipa1", "ipa2"]},
+                    {"ipa": "ipa1"},
+                    {"ipa": "ipa2"},
                     {
-                        "ipa": ["ipa3"],
-                        "lang": ["Deutsch"],
-                        "lang_code": ["de"],
+                        "ipa": "ipa3",
+                        "lang": "Deutsch",
+                        "lang_code": "de",
                     },
                 ],
             },
             {
                 "input": "{{Lautschrift|ipa1}}, ''tag1'' {{Lautschrift|ipa2}}",
                 "expected": [
-                    {"ipa": ["ipa1"]},
-                    {"ipa": ["ipa2"], "tags": ["tag1"]},
+                    {"ipa": "ipa1"},
+                    {"ipa": "ipa2", "tags": ["tag1"]},
                 ],
             },
         ]
@@ -90,7 +91,7 @@ def test_de_process_hoerbeispiele(self):
                 "input": "{{Audio|" + filename1 + "}}",
                 "expected": [
                     {
-                        "audio": [filename1],
+                        "audio": filename1,
                         "mp3_url": None,  # None indicates we don't care about the exact value
                         "ogg_url": None,
                     }
@@ -104,12 +105,12 @@ def test_de_process_hoerbeispiele(self):
                 + "}}",
                 "expected": [
                     {
-                        "audio": [filename1],
+                        "audio": filename1,
                         "mp3_url": None,
                         "ogg_url": None,
                     },
                     {
-                        "audio": [filename2],
+                        "audio": filename2,
                         "ogg_url": None,
                         "mp3_url": None,
                         "wav_url": None,
@@ -124,13 +125,13 @@ def test_de_process_hoerbeispiele(self):
                 + "}}",
                 "expected": [
                     {
-                        "audio": [filename1],
+                        "audio": filename1,
                         "mp3_url": None,
                         "ogg_url": None,
                         "tags": ["tag1"],
                     },
                     {
-                        "audio": [filename2],
+                        "audio": filename2,
                         "mp3_url": None,
                         "ogg_url": None,
                         "wav_url": None,

From 197fc8bbe2e75f7e0e08cfda9ae3fa99445db409 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= <kristian@clausal.com>
Date: Tue, 30 Jan 2024 14:08:44 +0200
Subject: [PATCH 2/4] Es-edition: "lang"-field required

---
 src/wiktextract/extractor/es/models.py      |  1 +
 src/wiktextract/extractor/es/translation.py |  9 +++++-
 tests/test_es_translation.py                | 34 ++++++++++++++++++---
 3 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py
index afcb2795..c5e12c7b 100644
--- a/src/wiktextract/extractor/es/models.py
+++ b/src/wiktextract/extractor/es/models.py
@@ -23,6 +23,7 @@ class Translation(BaseModelWrap):
     lang_code: str = Field(
         description="Wiktionary language code of the translation term"
     )
+    lang: str = Field(description="Name of the language of translation")
     senseids: list[str] = Field(
         default=[],
         description="List of senseids where this translation applies",
diff --git a/src/wiktextract/extractor/es/translation.py b/src/wiktextract/extractor/es/translation.py
index 9da20996..e9532d9a 100644
--- a/src/wiktextract/extractor/es/translation.py
+++ b/src/wiktextract/extractor/es/translation.py
@@ -1,5 +1,6 @@
 from typing import Optional
 
+from mediawiki_langcodes import code_to_name
 from wikitextprocessor import WikiNode
 from wiktextract.extractor.es.models import Translation, WordEntry
 from wiktextract.extractor.share import split_senseids
@@ -15,6 +16,9 @@ def extract_translation(
     # Documentation: https://es.wiktionary.org/wiki/Plantilla:t+
 
     lang_code = template_node.template_parameters.get(1)  # Language code
+    lang = code_to_name(lang_code, "es")
+    if not lang:
+        lang = f"Unknown({lang_code})"
 
     # Initialize variables
     current_translation: Optional[Translation] = None
@@ -82,7 +86,10 @@ def extract_translation(
 
                 else:
                     current_translation = Translation(
-                        word=value, lang_code=lang_code, senseids=list(senseids)
+                        word=value,
+                        lang_code=lang_code,
+                        lang=lang,
+                        senseids=list(senseids),
                     )
         elif isinstance(key, str):
             if key == "tr":
diff --git a/tests/test_es_translation.py b/tests/test_es_translation.py
index 49c49982..9cee527b 100644
--- a/tests/test_es_translation.py
+++ b/tests/test_es_translation.py
@@ -32,13 +32,19 @@ def test_es_extract_translation(self):
             {
                 "input": "{{t+|af|1|kat}}",
                 "expected": [
-                    {"lang_code": "af", "word": "kat", "senseids": ["1"]}
+                    {
+                        "lang": "afrikáans",
+                        "lang_code": "af",
+                        "word": "kat",
+                        "senseids": ["1"],
+                    }
                 ],
             },
             {
                 "input": "{{t+|de|1, 2|Katze|f|,|1|Kater|m|nota|gato macho|,|8|Tic Tac Toe}}",
                 "expected": [
                     {
+                        "lang": "alemán",
                         "lang_code": "de",
                         "word": "Katze",
                         "senseids": ["1", "2"],
@@ -46,6 +52,7 @@ def test_es_extract_translation(self):
                     },
                     {
                         "lang_code": "de",
+                        "lang": "alemán",
                         "word": "Kater",
                         "senseids": ["1"],
                         "tags": ["m"],
@@ -53,6 +60,7 @@ def test_es_extract_translation(self):
                     },
                     {
                         "lang_code": "de",
+                        "lang": "alemán",
                         "word": "Tic Tac Toe",
                         "senseids": ["8"],
                     },
@@ -62,6 +70,7 @@ def test_es_extract_translation(self):
                 "input": "{{t+|fr|1|profession|nl|de|bateleur}}",
                 "expected": [
                     {
+                        "lang": "francés",
                         "lang_code": "fr",
                         "word": "profession de bateleur",
                         "senseids": ["1"],
@@ -72,6 +81,7 @@ def test_es_extract_translation(self):
                 "input": "{{t+|hy|1|կատու|tr|katu}}",
                 "expected": [
                     {
+                        "lang": "armenio",
                         "lang_code": "hy",
                         "word": "կատու",
                         "roman": "katu",
@@ -83,6 +93,7 @@ def test_es_extract_translation(self):
                 "input": "{{t+|hy|1|կատու|tr=katu}}",
                 "expected": [
                     {
+                        "lang": "armenio",
                         "lang_code": "hy",
                         "word": "կատու",
                         "roman": "katu",
@@ -93,9 +104,24 @@ def test_es_extract_translation(self):
             {
                 "input": "{{t+|de|amphibisch|adj|,|Amphibie|sust|,|Amphibium|sust}}",
                 "expected": [
-                    {"lang_code": "de", "word": "amphibisch", "tags": ["adj"]},
-                    {"lang_code": "de", "word": "Amphibie", "tags": ["sust"]},
-                    {"lang_code": "de", "word": "Amphibium", "tags": ["sust"]},
+                    {
+                        "lang": "alemán",
+                        "lang_code": "de",
+                        "word": "amphibisch",
+                        "tags": ["adj"],
+                    },
+                    {
+                        "lang": "alemán",
+                        "lang_code": "de",
+                        "word": "Amphibie",
+                        "tags": ["sust"],
+                    },
+                    {
+                        "lang": "alemán",
+                        "lang_code": "de",
+                        "word": "Amphibium",
+                        "tags": ["sust"],
+                    },
                 ],
             },
         ]

From 55e843dab7c9c018a2282955e1e8cfa9cd8d1d06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= <kristian@clausal.com>
Date: Wed, 31 Jan 2024 09:21:42 +0200
Subject: [PATCH 3/4] Es-edition: skip certain etymology templates

There are a lot of kludges here, so the next commit will
probably remove them completely. Committing this anyway to keep
the history, in case something here turns out to be useful.
---
 src/wiktextract/extractor/es/etymology.py | 45 ++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/src/wiktextract/extractor/es/etymology.py b/src/wiktextract/extractor/es/etymology.py
index 5460dc32..6adfb995 100644
--- a/src/wiktextract/extractor/es/etymology.py
+++ b/src/wiktextract/extractor/es/etymology.py
@@ -1,9 +1,19 @@
 from wikitextprocessor import NodeKind, WikiNode
 from wiktextract.extractor.es.models import EtymologyTemplate, WordEntry
-from wiktextract.page import clean_node
+from wiktextract.page import clean_node, LEVEL_KINDS
 from wiktextract.wxr_context import WiktextractContext
 
 
+IGNORED_NODE_LEVELS = [NodeKind.HTML] + list(LEVEL_KINDS)
+
+SKIPPED_TEMPLATES = (
+    "ampliable",
+    "arcoiris",
+    "clear",
+    "cita requerida",
+)
+
+
 def process_etymology_block(
     wxr: WiktextractContext,
     entry: WordEntry,
@@ -23,7 +33,27 @@ def process_etymology_block(
     """
 
     has_etymology_info = False
+    ignore_these_templates: list[WikiNode] = []
+    for ignored_node in level_node.find_child_recursively(IGNORED_NODE_LEVELS):
+        # stuff inside <ref></ref> should be ignored, and obvious mistakes
+        # like a sub-section to etymology (wrong level kind, in es.wiktionary
+        # these should all apparently be sibligns, so Etymology sections
+        # shouldn't have other levels as their children (example:
+        # calzado around 2024-01-30
+        if ignored_node.kind == NodeKind.HTML and ignored_node.sarg == "reg":
+            # HTML nodes other than REG should be fine
+            continue
+        ignore_these_templates.extend(
+            ignored_node.find_child_recursively(NodeKind.TEMPLATE)
+        )
+
     for template_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
+        if template_node in ignore_these_templates:
+            continue
+
+        if template_node.template_name in SKIPPED_TEMPLATES:
+            continue
+
         entry.etymology_templates = entry.etymology_templates or []
 
         etymology_template = EtymologyTemplate(
@@ -31,6 +61,15 @@ def process_etymology_block(
             expansion=clean_node(wxr, None, template_node),
         )
 
+        if etymology_template.expansion in (
+            # "Please fill in this etymology, thank you..."
+            "Si puedes, incorpórala: ver cómo",
+            "Préstamo no adaptado.",
+            "Este lema en este idioma es ampliable. "
+            "Retira este aviso si la mayor parte de las acepciones ya están incluidas.",
+        ):
+            continue
+
         args = {}
         for index, param in template_node.template_parameters.items():
             args[str(index)] = (
@@ -42,6 +81,10 @@ def process_etymology_block(
         if args:
             etymology_template.args = args
 
+        # DEBUG
+        if not args:
+            print(f"EMPTY ARGS in {entry.word}, {etymology_template}")
+
         entry.etymology_templates.append(etymology_template)
 
     if has_etymology_info:

From 372b1172f392b82bdc9302dc77bd45b695769fbd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= <kristian@clausal.com>
Date: Wed, 31 Jan 2024 09:46:08 +0200
Subject: [PATCH 4/4] Es-edition: Remove kludges, use substring "whitelist"

All the kludges in the previous commit would have been
unnecessary had I realized that we only want to keep
etymology template data for *etymology templates*, not for
just any template. A simple substring check should suffice.
It is unlikely, but possible, that unrelated templates
have "etim" in their name.
---
 src/wiktextract/extractor/es/etymology.py | 47 +++++++----------------
 tests/test_es_etymology.py                | 12 +-----
 2 files changed, 14 insertions(+), 45 deletions(-)

diff --git a/src/wiktextract/extractor/es/etymology.py b/src/wiktextract/extractor/es/etymology.py
index 6adfb995..9d9d020a 100644
--- a/src/wiktextract/extractor/es/etymology.py
+++ b/src/wiktextract/extractor/es/etymology.py
@@ -1,19 +1,11 @@
+from typing import cast
 from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import TemplateNode
 from wiktextract.extractor.es.models import EtymologyTemplate, WordEntry
-from wiktextract.page import clean_node, LEVEL_KINDS
+from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
 
-IGNORED_NODE_LEVELS = [NodeKind.HTML] + list(LEVEL_KINDS)
-
-SKIPPED_TEMPLATES = (
-    "ampliable",
-    "arcoiris",
-    "clear",
-    "cita requerida",
-)
-
-
 def process_etymology_block(
     wxr: WiktextractContext,
     entry: WordEntry,
@@ -33,25 +25,14 @@ def process_etymology_block(
     """
 
     has_etymology_info = False
-    ignore_these_templates: list[WikiNode] = []
-    for ignored_node in level_node.find_child_recursively(IGNORED_NODE_LEVELS):
-        # stuff inside <ref></ref> should be ignored, and obvious mistakes
-        # like a sub-section to etymology (wrong level kind, in es.wiktionary
-        # these should all apparently be sibligns, so Etymology sections
-        # shouldn't have other levels as their children (example:
-        # calzado around 2024-01-30
-        if ignored_node.kind == NodeKind.HTML and ignored_node.sarg == "reg":
-            # HTML nodes other than REG should be fine
-            continue
-        ignore_these_templates.extend(
-            ignored_node.find_child_recursively(NodeKind.TEMPLATE)
-        )
 
     for template_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
-        if template_node in ignore_these_templates:
-            continue
-
-        if template_node.template_name in SKIPPED_TEMPLATES:
+        # no-op type-annotation cast; we softly assert template_node is a
+        # TemplateNode, which has .template_name, to quiet the type-checker.
+        template_node = cast(TemplateNode, template_node)
+        if "etim" not in template_node.template_name:
+            # Keep template data only for the main etymology templates
+            # (and maybe Plantilla:etim); skip every other template.
             continue
 
         entry.etymology_templates = entry.etymology_templates or []
@@ -63,7 +44,7 @@ def process_etymology_block(
 
         if etymology_template.expansion in (
             # "Please fill in this etymology, thank you..."
-            "Si puedes, incorpórala: ver cómo",
+            "Si puedes, incorpórala: ver cómo.",
             "Préstamo no adaptado.",
             "Este lema en este idioma es ampliable. "
             "Retira este aviso si la mayor parte de las acepciones ya están incluidas.",
@@ -77,14 +58,12 @@ def process_etymology_block(
                 if isinstance(param, str)
                 else clean_node(wxr, None, param)
             )
+            # Any parameter other than "leng" counts as real etymology
+            # info, so it sets has_etymology_info to True.
             has_etymology_info = has_etymology_info or index != "leng"
-        if args:
+        if args and not (len(args) == 1 and "leng" in args):
             etymology_template.args = args
 
-        # DEBUG
-        if not args:
-            print(f"EMPTY ARGS in {entry.word}, {etymology_template}")
-
         entry.etymology_templates.append(etymology_template)
 
     if has_etymology_info:
diff --git a/tests/test_es_etymology.py b/tests/test_es_etymology.py
index 7527f143..f80a7f37 100644
--- a/tests/test_es_etymology.py
+++ b/tests/test_es_etymology.py
@@ -34,17 +34,7 @@ def test_es_extract_etymology(self):
             {
                 # https://es.wiktionary.org/wiki/Schreck
                 "input": "{{etimología|leng=de}}",
-                "expected": {
-                    "etymology_templates": [
-                        {
-                            "args": {
-                                "leng": "de",
-                            },
-                            "name": "etimología",
-                            "expansion": "Si puedes, incorpórala: ver cómo.",
-                        },
-                    ],
-                },
+                "expected": dict(),
             },
             {
                 # https://es.wiktionary.org/wiki/bagre