Skip to content

Commit

Permalink
Extract semantic relations from German Wiktionary
Browse files Browse the repository at this point in the history
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.

Fix types for python3.9
  • Loading branch information
empiriker committed Oct 20, 2023
1 parent bcef60e commit 551b95b
Show file tree
Hide file tree
Showing 6 changed files with 303 additions and 10 deletions.
20 changes: 16 additions & 4 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
from wikitextprocessor.parser import LevelNode

from wiktextract.datautils import append_base_data
from wiktextract.extractor.de.pronunciation import extract_pronunciation
from wiktextract.wxr_context import WiktextractContext

from .example import extract_examples
from .gloss import extract_glosses
from .pronunciation import extract_pronunciation
from .semantic_relations import extract_semantic_relations
from .translation import extract_translation

# Templates that are used to form panels on pages and that should be ignored in
Expand Down Expand Up @@ -67,12 +67,24 @@ def parse_section(
wxr.wtp.start_subsection(section_name)
if section_name == "Bedeutungen":
extract_glosses(wxr, page_data, level_node)
if section_name == "Aussprache":
elif section_name == "Aussprache":
extract_pronunciation(wxr, page_data, level_node)
if section_name == "Beispiele":
elif section_name == "Beispiele":
extract_examples(wxr, page_data, level_node)
if section_name == "Übersetzungen":
elif section_name == "Übersetzungen":
extract_translation(wxr, page_data, level_node)
elif section_name in [
"Gegenwörter",
"Holonyme",
"Oberbegriffe",
"Redewendungen",
"Sinnverwandte Wörter",
"Sprichwörter",
"Synonyme",
"Unterbegriffe",
"Wortbildungen",
]:
extract_semantic_relations(wxr, page_data, level_node)


FORM_POS = {
Expand Down
97 changes: 97 additions & 0 deletions src/wiktextract/extractor/de/semantic_relations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import re
from typing import Dict, List

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode

from wiktextract.extractor.de.utils import split_senseids
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

# Maps German Wiktionary section headings to the English relation keys
# under which the extracted links are stored in the page data.
RELATION_TYPES = {
    "Gegenwörter": "antonyms",
    "Holonyme": "holonyms",
    "Oberbegriffe": "hypernyms",
    "Redewendungen": "expressions",
    "Sinnverwandte Wörter": "coordinate_terms",
    "Sprichwörter": "proverbs",
    "Synonyme": "synonyms",
    "Unterbegriffe": "hyponyms",
    "Wortbildungen": "derived",
}


def extract_semantic_relations(
    wxr: WiktextractContext, page_data: List[Dict], level_node: LevelNode
):
    """Extract semantic relations (synonyms, hypernyms, expressions, ...)
    from a German Wiktionary section and attach them to ``page_data``.

    The section heading determines the relation key (see RELATION_TYPES).
    Each list item may begin with sense ids such as "[1, 3-5]" assigning the
    following links to specific senses; with exactly one sense everything is
    attached to it, and without matching ids links go to the page level.
    """
    relation_key = RELATION_TYPES.get(level_node.largs[0][0])
    if relation_key is None:
        # Unknown heading: bail out instead of silently storing links
        # under a None key in the defaultdict-based page data.
        return
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            # Get the senseids from the leading text node, e.g. "[1, 2]".
            senseids = (
                split_senseids(list_item.children[0])
                if (
                    len(list_item.children) > 0
                    and isinstance(list_item.children[0], str)
                )
                else []
            )

            # Extract links
            semantic_links: List[str] = []
            if relation_key == "expressions":
                for child in list_item.children:
                    if isinstance(child, str) and contains_dash(child):
                        # XXX Capture the part after the dash as an explanatory note to the expression, e.g.:
                        # https://de.wiktionary.org/wiki/Beispiel
                        # ":[[ein gutes Beispiel geben]] – als [[Vorbild]] zur [[Nachahmung]] [[dienen]]/[[herausfordern]]"
                        break
                    elif (
                        isinstance(child, WikiNode)
                        and child.kind == NodeKind.LINK
                    ):
                        process_link(wxr, semantic_links, child)
            else:
                for link in list_item.find_child(NodeKind.LINK):
                    process_link(wxr, semantic_links, link)

            # Add links to the page data. NOTE: ids that match no sense
            # are dropped silently in the senseids branch.
            if len(page_data[-1]["senses"]) == 1:
                page_data[-1]["senses"][0][relation_key].extend(semantic_links)
            elif len(senseids) > 0:
                for senseid in senseids:
                    for sense in page_data[-1]["senses"]:
                        if sense["senseid"] == senseid:
                            sense[relation_key].extend(semantic_links)
            else:
                page_data[-1][relation_key].extend(semantic_links)

            # Check for potentially missed data: log any non-link content
            # that is not a senseid prefix or trivial punctuation.
            for non_link in list_item.invert_find_child(NodeKind.LINK):
                if (
                    relation_key == "expressions"
                    and isinstance(non_link, str)
                    and contains_dash(non_link)
                ):
                    # Explanatory text after the dash is skipped on purpose.
                    break
                elif isinstance(non_link, str) and (
                    non_link.startswith("[") or len(non_link.strip()) <= 3
                ):
                    continue
                wxr.wtp.debug(
                    f"Found unexpected non-link node '{non_link}' in: {list_item}",
                    sortid="extractor/de/semantic_relations/extract_semantic_relations/84",
                )


def process_link(
    wxr: WiktextractContext, semantic_links: List[str], link: WikiNode
):
    """Append the cleaned text of *link* to *semantic_links*.

    Links into the "Verzeichnis:" (directory) namespace are skipped.
    """
    cleaned = clean_node(wxr, {}, link)
    if not cleaned.startswith("Verzeichnis:"):
        semantic_links.append(cleaned)


def contains_dash(text: str) -> bool:
    """Return True if *text* contains a dash-like character.

    Covers en dash, em dash, horizontal bar, figure dash and the plain
    hyphen-minus, which German Wiktionary uses to separate an expression
    from its explanatory note. Returning an actual bool (instead of the
    Optional[Match] the bare re.search gives) makes the predicate's type
    honest; all callers only test truthiness, so behavior is unchanged.
    """
    return re.search(r"[–—―‒-]", text) is not None
27 changes: 21 additions & 6 deletions src/wiktextract/extractor/de/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,12 @@ def process_translation_list(
if node.template_name[-1] == "?":
translation_data["uncertain"] = True

translation_data["word"] = node.template_parameters.get(2)
translation_data["word"] = clean_node(
wxr, {}, node.template_parameters.get(2)
)

if node.template_name.removesuffix("?") == "Ü":
process_Ü_template(translation_data, node)
process_Ü_template(wxr, translation_data, node)

if node.template_name.removesuffix("?") == "Üt":
process_Üt_template(wxr, translation_data, node)
Expand All @@ -134,12 +136,13 @@ def is_translation_template(node: any) -> bool:


def process_Ü_template(
    wxr: WiktextractContext,
    translation_data: Dict[str, Union[str, List, bool]],
    template_node: TemplateNode,
):
    """Handle a Ü translation template.

    Template parameter 3, when present, overrides the translated word.
    """
    override_nodes = template_node.template_parameters.get(3)
    overwrite_word(wxr, translation_data, override_nodes)


def process_Üt_template(
Expand All @@ -158,7 +161,19 @@ def process_Üt_template(
if match:
translation_data["roman"] = match.group(1)

overwrite_word = template_node.template_parameters.get(4)
overwrite_word(
wxr, translation_data, template_node.template_parameters.get(4)
)


def overwrite_word(
    wxr: WiktextractContext,
    translation_data: Dict[str, Union[str, List, bool]],
    nodes: Union[List[Union[WikiNode, str]], WikiNode, str, None],
):
    """Replace translation_data["word"] with the cleaned text of *nodes*.

    Does nothing when *nodes* is None (the template parameter was absent)
    or when cleaning yields an empty string.
    """
    # Fix: identity check instead of `== None` (PEP 8), and don't shadow
    # this function's own name with the local result variable.
    if nodes is None:
        return
    cleaned_word = clean_node(wxr, {}, nodes).strip()
    if cleaned_word:
        translation_data["word"] = cleaned_word

Expand Down
29 changes: 29 additions & 0 deletions src/wiktextract/extractor/de/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
from typing import List

from wikitextprocessor import NodeKind, WikiNode

Expand All @@ -23,3 +24,31 @@ def find_and_remove_child(node: WikiNode, kind: NodeKind, cb=None):
del node.children[idx]
children.append(child)
return reversed(children)


def split_senseids(senseids_str: str) -> List[str]:
    """Split a sense-id prefix like "[1, 3-5]" into individual sense ids.

    Numeric ranges ("3-5") are expanded; letter suffixes (e.g. "2a") are
    kept on single ids but stripped from range bounds so they parse as
    integers. Ranges with non-numeric bounds are silently skipped.
    """
    senseids: List[str] = []
    raw_ids = (
        senseids_str.strip().removeprefix("[").removesuffix("]").split(",")
    )
    for raw_id in raw_ids:
        range_split = raw_id.split("-")
        if len(range_split) == 1:
            senseids.append(raw_id.strip())
        elif len(range_split) == 2:
            try:
                # Drop letter suffixes so "2a-3" expands like "2-3".
                start = re.sub(r"[a-z]", "", range_split[0].strip())
                end = re.sub(r"[a-z]", "", range_split[1].strip())
                senseids.extend(
                    str(sense_num)
                    for sense_num in range(int(start), int(end) + 1)
                )
            except ValueError:
                # Fix: catch only int() failures instead of a bare except
                # that would also swallow KeyboardInterrupt/SystemExit.
                pass

    return senseids
116 changes: 116 additions & 0 deletions tests/test_de_semantic_relations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import unittest
from collections import defaultdict

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.semantic_relations import (
extract_semantic_relations,
)
from wiktextract.wxr_context import WiktextractContext


# NOTE(review): class name looks copy-pasted from the translation tests;
# it exercises extract_semantic_relations — consider renaming.
class TestDETranslation(unittest.TestCase):
    """Tests for extracting semantic relations from German Wiktionary."""

    maxDiff = None

    def setUp(self) -> None:
        # Fresh parser context per test; torn down in tearDown.
        self.wxr = WiktextractContext(
            Wtp(lang_code="de"), WiktionaryConfig(dump_file_lang_code="de")
        )

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def test_de_extract_semantic_relations(self):
        # Each case: wikitext input, pre-populated page_data, expected
        # page_data after extraction (mutated in place).
        test_cases = [
            # https://de.wiktionary.org/wiki/Beispiel
            # Extracts linkages and places them in the correct sense.
            {
                "input": "==== Sinnverwandte Wörter ====\n:[1] [[Beleg]], [[Exempel]]\n:[2] [[Muster]], [[Vorbild]]",
                "page_data": [
                    defaultdict(
                        list,
                        {
                            "senses": [
                                defaultdict(list, {"senseid": "1"}),
                                defaultdict(list, {"senseid": "2"}),
                            ]
                        },
                    )
                ],
                "expected": [
                    {
                        "senses": [
                            {
                                "senseid": "1",
                                "coordinate_terms": ["Beleg", "Exempel"],
                            },
                            {
                                "senseid": "2",
                                "coordinate_terms": ["Muster", "Vorbild"],
                            },
                        ]
                    }
                ],
            },
            # https://de.wiktionary.org/wiki/Beispiel
            # Cleans explanatory text from expressions.
            {
                "input": "====Redewendungen====\n:[[ein gutes Beispiel geben|ein gutes ''Beispiel'' geben]] – als [[Vorbild]] zur [[Nachahmung]] [[dienen]]/[[herausfordern]]",
                "page_data": [defaultdict(list)],
                "expected": [
                    {
                        "expressions": ["ein gutes Beispiel geben"],
                        "senses": [],
                    },
                ],
            },
            # Always places relations in first sense if just one sense.
            {
                "input": "====Synonyme====\n:[[Synonym1]]",
                "page_data": [
                    defaultdict(
                        list, {"senses": [defaultdict(list, {"senseid": "1"})]}
                    )
                ],
                "expected": [
                    {
                        "senses": [{"senseid": "1", "synonyms": ["Synonym1"]}],
                    },
                ],
            },
            # https://de.wiktionary.org/wiki/Kokospalme
            # Ignores modifiers of relations and all other text.
            {
                "input": "====Synonyme====\n:[1] [[Kokosnusspalme]], ''wissenschaftlich:'' [[Cocos nucifera]]",
                "page_data": [
                    defaultdict(
                        list, {"senses": [defaultdict(list, {"senseid": "1"})]}
                    )
                ],
                "expected": [
                    {
                        "senses": [
                            {
                                "senseid": "1",
                                "synonyms": [
                                    "Kokosnusspalme",
                                    "Cocos nucifera",
                                ],
                            }
                        ],
                    },
                ],
            },
        ]

        for case in test_cases:
            with self.subTest(case=case):
                self.wxr.wtp.start_page("")
                root = self.wxr.wtp.parse(case["input"])

                # root.children[0] is the level-4 heading node.
                extract_semantic_relations(
                    self.wxr, case["page_data"], root.children[0]
                )

                self.assertEqual(case["page_data"], case["expected"])
24 changes: 24 additions & 0 deletions tests/test_de_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import unittest

from wiktextract.extractor.de.utils import split_senseids


class TestDEUtils(unittest.TestCase):
    """Tests for the German extractor utility helpers."""

    maxDiff = None

    def test_split_senseids(self):
        # (raw senseid string, expected parsed list)
        cases = [
            ("[1]", ["1"]),
            ("[1,2]", ["1", "2"]),
            ("[1, 2]", ["1", "2"]),
            ("[1, 2 ]", ["1", "2"]),
            ("[1-3]", ["1", "2", "3"]),
            ("[1, 3-5]", ["1", "3", "4", "5"]),
            ("[1, 3-4, 6]", ["1", "3", "4", "6"]),
            ("[1a]", ["1a"]),
            ("[1, 2a]", ["1", "2a"]),
            ("[1, 2a-3]", ["1", "2", "3"]),
        ]

        for raw, expected in cases:
            self.assertEqual(split_senseids(raw), expected)

0 comments on commit 551b95b

Please sign in to comment.