Merge pull request #565 from xxyzz/de

Fix exceptions/warnings and translate some raw tags in de edition
tatuylonen · Mar 29, 2024 · 8944339 · 8944339
2 parents e15ea73 + 4175115
commit 8944339
Show file tree

Hide file tree

Showing 8 changed files with 213 additions and 84 deletions.
diff --git a/src/wiktextract/extractor/de/example.py b/src/wiktextract/extractor/de/example.py
@@ -51,35 +51,33 @@ def extract_examples(
             for ref_node in ref_nodes:
                 extract_reference(wxr, example_data, ref_node)
 
-            example_text = clean_node(wxr, {}, list_item_node.children)
+            example_text = clean_node(wxr, None, list_item_node.children)
 
             senseid, example_text = match_senseid(example_text)
 
-            if example_text:
+            if len(example_text) > 0:
                 example_data.text = example_text
-
-            if senseid:
-                for sense in word_entry.senses:
-                    if sense.senseid == senseid:
-                        sense.examples.append(copy.deepcopy(example_data))
-
-            else:
-                if example_data:
+                if len(senseid) > 0:
+                    for sense in word_entry.senses:
+                        if sense.senseid == senseid:
+                            sense.examples.append(copy.deepcopy(example_data))
+                else:
                     wxr.wtp.debug(
-                        f"Found example data without senseid and text: {example_data}",
+                        f"Found example data without senseid: {example_data}",
                         sortid="extractor/de/examples/extract_examples/28",
                     )
+
     for non_list_node in level_node.invert_find_child(NodeKind.LIST):
         wxr.wtp.debug(
-            f"Found unexpected non-list node in example section: {non_list_node}",
+            f"Found unexpected non-list node in examples: {non_list_node}",
             sortid="extractor/de/examples/extract_examples/33",
         )
 
 
 def extract_reference(
     wxr: WiktextractContext, example_data: Example, ref_node: WikiNode
 ):
-    example_data.raw_ref = clean_node(wxr, {}, ref_node.children)
+    example_data.raw_ref = clean_node(wxr, None, ref_node.children)
 
     template_nodes = list(ref_node.find_child(NodeKind.TEMPLATE))
 

diff --git a/src/wiktextract/extractor/de/gloss.py b/src/wiktextract/extractor/de/gloss.py
@@ -1,11 +1,10 @@
-import re
-
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import LevelNode, TemplateNode
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
 from .models import Sense, WordEntry
+from .tags import translate_raw_tags
 from .utils import match_senseid
 
 
@@ -59,19 +58,32 @@ def process_gloss_list_item(
                                 raw_tag = clean_node(wxr, None, k_arg_value)
                                 sense_data.raw_tags.append(raw_tag)
                         clean_node(wxr, sense_data, gloss_node)
+                    elif gloss_node.template_name.endswith("."):
+                        raw_tag = clean_node(
+                            wxr, sense_data, gloss_node
+                        ).removesuffix(":")
+                        sense_data.raw_tags.append(raw_tag)
                     elif gloss_node.template_name in (
                         "QS Herkunft",
                         "QS Bedeutungen",
                     ):
                         continue
+                    else:
+                        gloss_nodes.append(gloss_node)
                 elif (
                     isinstance(gloss_node, WikiNode)
                     and gloss_node.kind == NodeKind.ITALIC
                 ):
-                    raw_tag = clean_node(wxr, None, gloss_node).removesuffix(
-                        ":"
-                    )
-                    sense_data.raw_tags.append(raw_tag)
+                    italic_text = clean_node(wxr, None, gloss_node)
+                    if italic_text.endswith(":"):
+                        for raw_tag in italic_text.removesuffix(":").split(
+                            ", "
+                        ):
+                            raw_tag = raw_tag.strip()
+                            if len(raw_tag) > 0:
+                                sense_data.raw_tags.append(raw_tag)
+                    else:
+                        gloss_nodes.append(italic_text)
                 elif not (
                     isinstance(gloss_node, WikiNode)
                     and gloss_node.kind == NodeKind.LIST
@@ -95,7 +107,8 @@ def process_gloss_list_item(
                 )
 
             if len(gloss_text) > 0:
-                sense_data.glosses.append(gloss_text)
+                sense_data.glosses.append(gloss_text.removeprefix(", "))
+                translate_raw_tags(sense_data)
                 word_entry.senses.append(sense_data)
 
             for sub_list_node in list_item_node.find_child(NodeKind.LIST):
@@ -113,16 +126,3 @@ def process_gloss_list_item(
             )
             continue
     return parent_sense
-
-
-def extract_tags_from_gloss_text(sense_data: Sense, gloss_text: str) -> None:
-    parts = gloss_text.split(":", 1)
-    if len(parts) > 1:
-        tags_part = parts[0].strip()
-
-        categories = [c.strip() for c in re.split(",", tags_part)]
-        if all(c.isalnum() for c in categories):
-            sense_data.raw_tags.extend(categories)
-            return parts[1].strip()
-
-    return gloss_text
diff --git a/src/wiktextract/extractor/de/models.py b/src/wiktextract/extractor/de/models.py
@@ -166,3 +166,4 @@ class WordEntry(BaseModelWrap):
     synonyms: list[Linkage] = []
     tags: list[str] = []
     raw_tags: list[str] = []
+    categories: list[str] = []
diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py
@@ -10,7 +10,7 @@
 from .example import extract_examples
 from .gloss import extract_glosses
 from .linkage import extract_linkages
-from .models import WordEntry
+from .models import Sense, WordEntry
 from .pronunciation import extract_pronunciation
 from .section_titles import LINKAGE_TITLES, POS_SECTIONS
 from .translation import extract_translation
@@ -200,4 +200,7 @@ def parse_page(
                 for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                     parse_section(wxr, page_data, base_data, level3_node)
 
+    for data in page_data:
+        if len(data.senses) == 0:
+            data.senses.append(Sense(tags=["no-gloss"]))
     return [d.model_dump(exclude_defaults=True) for d in page_data]
diff --git a/src/wiktextract/extractor/de/tags.py b/src/wiktextract/extractor/de/tags.py
@@ -0,0 +1,130 @@
+from .models import WordEntry
+
+# https://de.wiktionary.org/wiki/Vorlage:K
+K_TEMPLATE_TAGS = {
+    "Abl.": "ablative",
+    "Ablativ": "ablative",
+    "abw.": "derogatory",
+    "AE": "US",
+    "AmE": "US",
+    "adv.": "adverbial",
+    "Akkusativ": "accusative",
+    "alemann.": "Alemannic",
+    "alemannisch": "Alemannic",
+    "allg.": "general",
+    "allgemein": "general",
+    "alltagsspr.": "colloquial",
+    "amtsspr.": "officialese",
+    # "ansonsten": "otherwise",  # combined with other text
+    "attr.": "attributive",
+    # "auch": "also",
+    "bair.": "Bavarian",
+    "bairisch": "Bavarian",
+    "bar.": "Bavarian",
+    "BE": "British",
+    "BrE": "British",
+    "Bedva.": "outdated",
+    "Bedvatd.": "outdated",
+    # "bei": "",
+    # "bes.": "especially",
+    # "besonders": "especially",
+    # "beziehungsweise": "",
+    # "bzw.": "",
+    # "bildungsspr.": "",
+    # "bis": "",
+    # "bisweilen": "",
+    # "das": "",
+    "Dativ": "dative",
+    # "DDR": "",
+    # "der": "",
+    "dichter.": "poetic",
+    # "die": "",
+    "Dim.": "diminutive",
+    "Dimin.": "diminutive",
+    "Diminutiv": "diminutive",
+    # "eher": "",
+    "erzg.": "Erzgebirgisch",
+    "erzgeb.": "Erzgebirgisch",
+    "erzgebirgisch": "Erzgebirgisch",
+    "euph.": "euphemistic",
+    "fachspr.": "jargon",
+    "fam.": "familiär",
+    "fig": "figurative",
+    "fig.": "figurative",
+    # "früher": "",
+    # "gegenwartslateinisch": "",
+    "geh.": "gehoben",
+    "Genitiv": "genitive",
+    "gsm": "Swiss German",
+    "häufig": "often",
+    "haben": "auxiliary",
+    "hebben": "auxiliary",
+    "hauptsächlich": "primarily",
+    "hist.": "historical",
+    "ieS": "narrowly",
+    "i.e.S.": "narrowly",
+    "i. e. S.": "narrowly",
+    # "im": "",
+    # "in": "",
+    # "in Bezug auf": "relational",
+    "indekl.": "indeclinable",
+    # "insbes.": "",
+    "Instrumental": "instrumental",
+    "intrans.": "intransitive",
+    "intransitiv": "intransitive",
+    # "iPl": "in plural",
+    "iron.": "ironic",
+    # "iwS": "",
+    # "jugendspr.": "",
+    "kinderspr.": "childish",
+    "kirchenlateinisch": "Church Latin",
+    "klasslat.": "Classical Latin",
+    "klassischlateinisch": "Classical Latin",
+    "kPl.": "no-plural",
+    "kSg.": "no-singulative",
+    "kSt.": "no-comparative",
+    "landsch.": "regional",
+    "lautm.": "onomatopoeic",
+    "Ling.": "linguistics",
+    "mA": "accusative",
+    "md.": "Central German",
+    "mdal.": "dialectal",
+    "Med.": "medicine",  # topic
+    # "meist": "mostly",
+    # "meistens": "mostly",
+    "metaphor.": "metaphoric",
+    "meton.": "metonymically",
+    "mG": "genitive",
+    "mitteld.": "Central German",
+    # "mitunter": "",
+    "mlat.": "Medieval Latin",
+    "mittellateinisch": "Medieval Latin",
+    "mundartl.": "dialectal",
+    "nDu.": "only-dual",
+    "nigr.": "Niger",
+    "nigrisch": "Niger",
+    "nkLat.": "post-Classical Latin",
+    "nachklassischlateinisch": "post-Classical Latin",
+    "nlat.": "New Latin",
+    "neulateinisch": "New Latin",
+    "nordd.": "North German",
+    "norddeutsch": "North German",
+    "nordwestd.": "Northwestern Germany",
+    "nPl.": "plural-only",
+    "Österreich": "Austrian German",
+    "österr.": "Austrian German",
+    "österreichisch": "Austrian German",
+    "ostfränkisch": "East Franconian German",
+    "pej.": "pejorative",
+    "poet.": "poetic",
+}
+
+
+def translate_raw_tags(data: WordEntry) -> None:
+    raw_tags = []
+    for raw_tag in data.raw_tags:
+        if raw_tag in K_TEMPLATE_TAGS:
+            data.tags.append(K_TEMPLATE_TAGS[raw_tag])
+        else:
+            raw_tags.append(raw_tag)
+    data.raw_tags = raw_tags
diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py
@@ -4,9 +4,7 @@
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.de.gloss import (
     extract_glosses,
-    extract_tags_from_gloss_text,
 )
-from wiktextract.extractor.de.models import Sense
 from wiktextract.extractor.es.models import WordEntry
 from wiktextract.wxr_context import WiktextractContext
 
@@ -144,55 +142,15 @@ def test_k_template_multiple_tags(self):
                         "Verb transitiv (Deutsch)",
                         "Österreichisches Deutsch",
                     ],
-                    "raw_tags": ["trans.", "besonders", "bayrisch", "österr."],
+                    "tags": ["Austrian German"],
+                    "raw_tags": ["trans.", "besonders", "bayrisch"],
                     "glosses": ["Vieh auf der Alm halten"],
                     "senseid": "1",
                 },
             ],
         )
 
-    def test_de_extract_tags_from_gloss_text(self):
-        test_cases = [
-            # https://de.wiktionary.org/wiki/Hengst
-            {
-                "input": "Zoologie: männliches Tier aus der Familie der Einhufer und Kamele",
-                "expected_tags": ["Zoologie"],
-                "expected_gloss": "männliches Tier aus der Familie der Einhufer und Kamele",
-            },
-            # https://de.wiktionary.org/wiki/ARD
-            {
-                "input": "umgangssprachlich, Kurzwort, Akronym: für das erste Fernsehprogramm der ARD",
-                "expected_tags": ["umgangssprachlich", "Kurzwort", "Akronym"],
-                "expected_gloss": "für das erste Fernsehprogramm der ARD",
-            },
-            # https://de.wiktionary.org/wiki/Endspiel
-            {
-                "input": "Drama von Samuel Beckett: Menschliche Existenz in der Endphase des Verfalls und der vergeblichen Suche nach einem Ausweg",
-                "expected_tags": None,
-                "expected_gloss": "Drama von Samuel Beckett: Menschliche Existenz in der Endphase des Verfalls und der vergeblichen Suche nach einem Ausweg",
-            },
-            # Add more test cases as needed
-        ]
-        for case in test_cases:
-            with self.subTest(case=case):
-                sense_data = Sense()
-
-                gloss_text = extract_tags_from_gloss_text(
-                    sense_data, case["input"]
-                )
-
-                if case["expected_tags"] is None:
-                    self.assertEqual(
-                        sense_data.model_dump(exclude_defaults=True), {}
-                    )
-                else:
-                    self.assertEqual(
-                        sense_data.raw_tags,
-                        case["expected_tags"],
-                    )
-                self.assertEqual(gloss_text, case["expected_gloss"])
-
-    def test_handle_sense_modifier(self):
+    def test_italic_sense_modifier(self):
         # https://de.wiktionary.org/wiki/habitare
         wikitext = """
 * {{trans.}}
@@ -231,14 +189,47 @@ def test_handle_sense_modifier(self):
                     "senseid": "2.2",
                 },
                 {
-                    "raw_tags": ["intransitiv", "sich befinden"],
+                    "tags": ["intransitive"],
+                    "raw_tags": ["sich befinden"],
                     "glosses": ["wohnen"],
                     "senseid": "3",
                 },
                 {
-                    "raw_tags": ["intransitiv", "übertragen"],
+                    "tags": ["intransitive"],
+                    "raw_tags": ["übertragen"],
                     "glosses": ["sich aufhalten, heimisch sein, zu Hause sein"],
                     "senseid": "4",
                 },
             ],
         )
+
+    def test_italit_node_multiple_raw_tags(self):
+        self.wxr.wtp.add_page(
+            "Vorlage:K", 10, "<i>[[Deutschland]],&#32;[[Fernsehen]]&#58;</i>"
+        )
+        self.wxr.wtp.add_page("Vorlage:ugs.", 10, "''[[umgangssprachlich]]''")
+        self.wxr.wtp.start_page("ARD")
+        root = self.wxr.wtp.parse(
+            """===Bedeutungen===
+:[2] {{K|Deutschland|Fernsehen}} {{ugs.}}, ''[[Kurzwort]], [[Akronym]]:'' für das erste Fernsehprogramm der ARD"""
+        )
+        word_entry = WordEntry(
+            lang="Deutsch", lang_code="de", word="ARD", pos="noun"
+        )
+        extract_glosses(self.wxr, word_entry, root.children[0])
+        self.assertEqual(
+            [s.model_dump(exclude_defaults=True) for s in word_entry.senses],
+            [
+                {
+                    "raw_tags": [
+                        "Deutschland",
+                        "Fernsehen",
+                        "umgangssprachlich",
+                        "Kurzwort",
+                        "Akronym",
+                    ],
+                    "glosses": ["für das erste Fernsehprogramm der ARD"],
+                    "senseid": "2",
+                },
+            ],
+        )