Process K template in German Wiktionary glosses

This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
tatuylonen · Oct 16, 2023 · 433dd7c · 433dd7c
1 parent 7da1f49
commit 433dd7c
Show file tree

Hide file tree

Showing 2 changed files with 162 additions and 14 deletions.
diff --git a/src/wiktextract/extractor/de/gloss.py b/src/wiktextract/extractor/de/gloss.py
@@ -8,6 +8,7 @@
 
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
+import copy
 
 
 def extract_glosses(
@@ -63,7 +64,7 @@ def process_gloss_list_item(
             raw_gloss = clean_node(wxr, {}, list_item_node.children)
             gloss_data["raw_glosses"] = [raw_gloss]
 
-            extract_categories_from_gloss_node(wxr, gloss_data, list_item_node)
+            process_K_template(wxr, gloss_data, list_item_node)
 
             gloss_text = clean_node(wxr, gloss_data, list_item_node.children)
 
@@ -117,17 +118,37 @@ def handle_sense_modifier(wxr, list_item_node):
     pass
 
 
-def extract_categories_from_gloss_node(
+def process_K_template(
     wxr: WiktextractContext,
     gloss_data: defaultdict(list),
     list_item_node: NodeKind.LIST_ITEM,
 ) -> None:
     for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
         if template_node.template_name == "K":
-            categories = template_node.template_parameters.values()
-
-            categories = [clean_node(wxr, {}, [c]) for c in categories]
-
+            categories = []
+
+            temp_node = copy.deepcopy(template_node)
+            for key, value in template_node.template_parameters.items():
+                if isinstance(key, int):
+                    temp_node.largs = temp_node.largs[:1] + [[value]]
+                    # Cleaned K template will always end with ":". Remove it.
+                    category = clean_node(wxr, {}, temp_node)[:-1]
+                    if category:
+                        categories.append(category)
+                if key == "ft":
+                    # ft (free text) is used liberally to modify the context
+                    # template. Sometimes it seems to belong rather to the
+                    # gloss itself. Most of the time it is not useful.
+                    # XXX Treat free text in K templates.
+                    continue
+
+            prep = template_node.template_parameters.get("Prä")
+            case = template_node.template_parameters.get("Kas")
+            category = (prep if prep else "") + (" + " + case if case else "")
+            if category:
+                categories.append(category)
+
+            # Remove the template_node from the children of list_item_node
             list_item_node.children = [
                 c for c in list_item_node.children if c != template_node
             ]

diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py
@@ -1,18 +1,35 @@
 import unittest
 from collections import defaultdict
+from unittest.mock import patch
 
 from wikitextprocessor import Wtp
 
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.de.gloss import (
     extract_glosses,
-    extract_categories_from_gloss_node,
+    process_K_template,
     extract_categories_from_gloss_text,
 )
 from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
 
 
+def mock_clean_node(wxr, sense_data, value):
+    param = value.largs[1][0]
+    map = {
+        "trans.": "transitiv",
+        "intrans.": "intransitiv",
+        "refl.": "reflexiv",
+        "kPl.": "kein Plural",
+        "ugs.": "umgangssprachlich",
+        "sein": "Hilfsverb sein",
+        "österr.": "österreichisch",
+    }
+    if param in map:
+        return map[param] + ":"
+    return param + ":"
+
+
 class TestGlossList(unittest.TestCase):
     maxDiff = None
 
@@ -92,7 +109,7 @@ def test_de_extract_glosses_with_subglosses(self):
 
     def test_de_extract_glosses_with_only_subglosses(self):
         self.wxr.wtp.start_page("")
-        self.wxr.wtp.add_page("Vorlage:K", 10, "")
+        self.wxr.wtp.add_page("Vorlage:K", 10, "category:")
         root = self.wxr.wtp.parse(
             ":[1] {{K|category}}\n::[a] subglossA\n::[1b] subglossB"
         )
@@ -123,16 +140,17 @@ def test_de_extract_glosses_with_only_subglosses(self):
             ],
         )
 
-    def test_de_extract_categories_from_gloss_node(self):
+    @patch("wiktextract.extractor.de.gloss.clean_node", mock_clean_node)
+    def test_process_K_template_removes_K_template_nodes(self):
         self.wxr.wtp.start_page("")
-        self.wxr.wtp.add_page("Vorlage:K", 10, "")
-        root = self.wxr.wtp.parse(":[1] {{K|category1|category2}} gloss1")
-
-        list_item_node = root.children[0].children[0]
+        # self.wxr.wtp.add_page("Vorlage:K", 10, "")
+        root = self.wxr.wtp.parse("{{K|category1|category2}} gloss1")
 
         gloss_data = defaultdict(list)
 
-        extract_categories_from_gloss_node(self.wxr, gloss_data, list_item_node)
+        self.assertEqual(len(root.children), 2)
+
+        process_K_template(self.wxr, gloss_data, root)
 
         self.assertEqual(
             gloss_data,
@@ -141,6 +159,115 @@ def test_de_extract_categories_from_gloss_node(self):
             },
         )
 
+        self.assertEqual(len(root.children), 1)
+
+    @patch("wiktextract.extractor.de.gloss.clean_node", mock_clean_node)
+    def test_process_K_template(self):
+        # Test cases chosen from:
+        # https://de.wiktionary.org/wiki/Vorlage:K/Doku
+        test_cases = [
+            # https://de.wiktionary.org/wiki/delektieren
+            {"input": "{{K|refl.}}", "expected_categories": ["reflexiv"]},
+            # https://de.wiktionary.org/wiki/delektieren
+            {"input": "{{K|trans.}}", "expected_categories": ["transitiv"]},
+            # https://de.wiktionary.org/wiki/abbreviare
+            {
+                "input": "{{K|trans.|ft=etwas in seinem [[räumlich]]en oder [[zeitlich]]en [[Ausmaß]] verringern|spr=it}}",
+                "expected_categories": ["transitiv"],
+            },
+            # https://de.wiktionary.org/wiki/abbreviare
+            {
+                "input": "{{K|trans.|Linguistik|Wortbildung|spr=it}}",
+                "expected_categories": [
+                    "transitiv",
+                    "Linguistik",
+                    "Wortbildung",
+                ],
+            },
+            # https://de.wiktionary.org/wiki/Bakterie
+            {"input": "{{K|Biologie}}", "expected_categories": ["Biologie"]},
+            # https://de.wiktionary.org/wiki/Kraut
+            {
+                "input": "{{K|kPl.|ugs.}}",
+                "expected_categories": ["kein Plural", "umgangssprachlich"],
+            },
+            # https://de.wiktionary.org/wiki/almen
+            # Ideally we would filter out "besonders", but there doesn't seem
+            # to be a general rule which categories are semantically relevant
+            {
+                "input": "{{K|trans.|t1=;|besonders|t2=_|bayrisch|österr.}}",
+                "expected_categories": [
+                    "transitiv",
+                    "besonders",
+                    "bayrisch",
+                    "österreichisch",
+                ],
+            },
+            # https://de.wiktionary.org/wiki/Agentur
+            {
+                "input": "{{K|Behörde|ft=seit etwa 2000 in Deutschland}}",
+                "expected_categories": ["Behörde"],
+            },
+            # https://de.wiktionary.org/wiki/Objekt
+            {
+                "input": "{{K|Astronomie|ft=kurz für}}",
+                "expected_categories": ["Astronomie"],
+            },
+            # https://de.wiktionary.org/wiki/einlaufen
+            {
+                "input": "{{K|intrans.|Nautik|t7=_|ft=(von Schiffen)}}",
+                "expected_categories": ["intransitiv", "Nautik"],
+            },
+            # https://de.wiktionary.org/wiki/Pfund
+            {
+                "input": "{{K|veraltet|veraltend|t1=;|t7=_|ft=(in Deutschland)}}",
+                "expected_categories": ["veraltet", "veraltend"],
+            },
+            # https://de.wiktionary.org/wiki/umkippen
+            {"input": "{{K|sein}}", "expected_categories": ["Hilfsverb sein"]},
+            # https://de.wiktionary.org/wiki/umkippen
+            {
+                "input": "{{K|sein|salopp}}",
+                "expected_categories": ["Hilfsverb sein", "salopp"],
+            },
+            # https://de.wiktionary.org/wiki/Hasskommentar
+            {
+                "input": "{{K|Internet|ft=[[soziales Netzwerk{{!}}soziale Netzwerke]]}}",
+                "expected_categories": ["Internet"],
+            },
+            # https://de.wiktionary.org/wiki/abominabilis
+            {
+                "input": "{{K|spätlateinisch|spr=la}}",
+                "expected_categories": ["spätlateinisch"],
+            },
+            # https://de.wiktionary.org/wiki/zählen
+            {
+                "input": "{{K|intrans.|Prä=auf|Kas=Akk.|ft=(auf jemanden/etwas zählen)}}",
+                "expected_categories": ["intransitiv", "auf + Akk."],
+            },
+            # https://de.wiktionary.org/wiki/bojovat
+            {
+                "input": "{{K|intrans.|Prä=proti|Kas=Dativ||ft=bojovat [[proti]] + [[Dativ]]|spr=cs}}",
+                "expected_categories": ["intransitiv", "proti + Dativ"],
+            },
+        ]
+
+        for case in test_cases:
+            with self.subTest(case=case):
+                gloss_data = defaultdict(list)
+
+                self.wxr.wtp.start_page("")
+
+                root = self.wxr.wtp.parse(case["input"])
+
+                process_K_template(self.wxr, gloss_data, root)
+                self.assertEqual(
+                    gloss_data,
+                    {
+                        "categories": case["expected_categories"],
+                    },
+                )
+
     def test_de_extract_categories_from_gloss_text(self):
         test_cases = [
             {