From 551b95bc8045cddf7ade56c6d6a6e6470c7950c3 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Fri, 20 Oct 2023 12:55:48 +0300 Subject: [PATCH] Extract semantic relations from German Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. Fix types for python3.9 --- src/wiktextract/extractor/de/page.py | 20 ++- .../extractor/de/semantic_relations.py | 97 +++++++++++++++ src/wiktextract/extractor/de/translation.py | 27 +++- src/wiktextract/extractor/de/utils.py | 29 +++++ tests/test_de_semantic_relations.py | 116 ++++++++++++++++++ tests/test_de_utils.py | 24 ++++ 6 files changed, 303 insertions(+), 10 deletions(-) create mode 100644 src/wiktextract/extractor/de/semantic_relations.py create mode 100644 tests/test_de_semantic_relations.py create mode 100644 tests/test_de_utils.py diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py index dbac69d95..96b9e209b 100644 --- a/src/wiktextract/extractor/de/page.py +++ b/src/wiktextract/extractor/de/page.py @@ -7,12 +7,12 @@ from wikitextprocessor.parser import LevelNode from wiktextract.datautils import append_base_data -from wiktextract.extractor.de.pronunciation import extract_pronunciation from wiktextract.wxr_context import WiktextractContext from .example import extract_examples from .gloss import extract_glosses from .pronunciation import extract_pronunciation +from .semantic_relations import extract_semantic_relations from .translation import extract_translation # Templates that are used to form panels on pages and that should be ignored in @@ -67,12 +67,24 @@ def parse_section( wxr.wtp.start_subsection(section_name) if section_name == "Bedeutungen": extract_glosses(wxr, 
import re
from typing import Dict, List

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode

from wiktextract.extractor.de.utils import split_senseids
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

# Maps German Wiktionary section headings to wiktextract relation keys.
RELATION_TYPES = {
    "Gegenwörter": "antonyms",
    "Holonyme": "holonyms",
    "Oberbegriffe": "hypernyms",
    "Redewendungen": "expressions",
    "Sinnverwandte Wörter": "coordinate_terms",
    "Sprichwörter": "proverbs",
    "Synonyme": "synonyms",
    "Unterbegriffe": "hyponyms",
    "Wortbildungen": "derived",
}


def extract_semantic_relations(
    wxr: WiktextractContext, page_data: List[Dict], level_node: LevelNode
) -> None:
    """Extract semantic relations (synonyms, hypernyms, expressions, ...)
    from a German Wiktionary section and attach them to ``page_data``.

    The heading text (``level_node.largs[0][0]``) selects the relation key
    via ``RELATION_TYPES``. Each list item may start with a sense-id marker
    such as ``[1, 3-5]``; the extracted links are then attached to the
    matching sense(s), or to the page entry itself when no sense matches.

    :param wxr: extraction context (used for cleaning nodes and debug logs)
    :param page_data: accumulated page entries; links go to the last entry
    :param level_node: the parsed section node to process
    """
    relation_key = RELATION_TYPES.get(level_node.largs[0][0])
    if relation_key is None:
        # Robustness fix: an unmapped heading would otherwise be used as a
        # dict key (None), silently corrupting page_data. Log and bail out.
        wxr.wtp.debug(
            f"Unknown semantic relation section: {level_node.largs}",
            sortid="extractor/de/semantic_relations/extract_semantic_relations/unknown",
        )
        return
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            # A leading plain-text child holds the sense-id marker, e.g.
            # ":[1, 2] [[Beleg]]" -> senseids ["1", "2"].
            senseids = (
                split_senseids(list_item.children[0])
                if (
                    len(list_item.children) > 0
                    and isinstance(list_item.children[0], str)
                )
                else []
            )

            # Extract links
            semantic_links: List[str] = []
            if relation_key == "expressions":
                # Expressions may carry an explanatory note after a dash;
                # stop collecting links once the dash is seen.
                for child in list_item.children:
                    if isinstance(child, str) and contains_dash(child):
                        # XXX Capture the part after the dash as an explanatory note to the expression, e.g.:
                        # https://de.wiktionary.org/wiki/Beispiel
                        # ":[[ein gutes Beispiel geben]] – als [[Vorbild]] zur [[Nachahmung]] [[dienen]]/[[herausfordern]]"
                        break
                    elif (
                        isinstance(child, WikiNode)
                        and child.kind == NodeKind.LINK
                    ):
                        process_link(wxr, semantic_links, child)
            else:
                for link in list_item.find_child(NodeKind.LINK):
                    process_link(wxr, semantic_links, link)

            # Add links to the page data. With exactly one sense, attach
            # there unconditionally; otherwise match by sense-id; failing
            # that, attach at the entry level.
            if len(page_data[-1]["senses"]) == 1:
                page_data[-1]["senses"][0][relation_key].extend(semantic_links)
            elif len(senseids) > 0:
                for senseid in senseids:
                    for sense in page_data[-1]["senses"]:
                        if sense["senseid"] == senseid:
                            sense[relation_key].extend(semantic_links)
            else:
                page_data[-1][relation_key].extend(semantic_links)

            # Check for potentially missed data: report non-link content
            # that is neither a sense-id marker nor trivial punctuation.
            for non_link in list_item.invert_find_child(NodeKind.LINK):
                if (
                    relation_key == "expressions"
                    and isinstance(non_link, str)
                    and contains_dash(non_link)
                ):
                    break
                elif isinstance(non_link, str) and (
                    non_link.startswith("[") or len(non_link.strip()) <= 3
                ):
                    continue
                wxr.wtp.debug(
                    f"Found unexpected non-link node '{non_link}' in: {list_item}",
                    sortid="extractor/de/semantic_relations/extract_semantic_relations/84",
                )


def process_link(
    wxr: WiktextractContext, semantic_links: List[str], link: WikiNode
) -> None:
    """Clean *link* to plain text and append it to *semantic_links*.

    Links into the "Verzeichnis:" (directory) namespace are skipped —
    they point to index pages, not related words.
    """
    clean_link = clean_node(wxr, {}, link)
    if clean_link.startswith("Verzeichnis:"):
        return
    semantic_links.append(clean_link)


def contains_dash(text: str) -> bool:
    """Return True if *text* contains any dash-like character.

    Fix: return an actual bool instead of Optional[re.Match]; all callers
    use the result only as a truth value, so this is backward-compatible.
    """
    return re.search(r"[–—―‒-]", text) is not None
def process_Ü_template(
    wxr: WiktextractContext,
    translation_data: Dict[str, Union[str, List, bool]],
    template_node: TemplateNode,
) -> None:
    """Handle a {{Ü}} translation template.

    Parameter 3 of the template, when present, overrides the translated
    word already stored in ``translation_data["word"]``.
    """
    overwrite_word(
        wxr, translation_data, template_node.template_parameters.get(3)
    )


def overwrite_word(
    wxr: WiktextractContext,
    translation_data: Dict[str, Union[str, List, bool]],
    nodes: Union[List[Union[WikiNode, str]], WikiNode, str, None],
) -> None:
    """Replace ``translation_data["word"]`` with the cleaned text of *nodes*.

    Does nothing when *nodes* is absent or cleans to an empty string, so an
    empty override parameter never clobbers the existing word.
    """
    # Fix: identity comparison per PEP 8 ("== None" -> "is None").
    if nodes is None:
        return
    # Fix: local renamed from "overwrite_word", which shadowed this
    # function's own name and made recursion/readability confusing.
    word = clean_node(wxr, {}, nodes).strip()
    if word:
        translation_data["word"] = word
def split_senseids(senseids_str: str) -> List[str]:
    """Split a German Wiktionary sense-id marker into individual ids.

    Examples: ``"[1]"`` -> ``["1"]``; ``"[1, 3-5]"`` -> ``["1", "3", "4",
    "5"]``; ``"[1a]"`` -> ``["1a"]``. Numeric ranges are expanded; letter
    suffixes are kept for single ids but stripped when expanding a range
    (a range endpoint like "2a" is only meaningful numerically). Ranges
    whose endpoints are not numeric after stripping are silently skipped.

    :param senseids_str: raw marker text, typically bracketed
    :return: flat list of sense-id strings
    """
    senseids: List[str] = []
    raw_ids = (
        senseids_str.strip().removeprefix("[").removesuffix("]").split(",")
    )
    for raw_id in raw_ids:
        range_split = raw_id.split("-")
        if len(range_split) == 1:
            senseids.append(raw_id.strip())
        elif len(range_split) == 2:
            # Strip letter suffixes ("2a" -> "2") before numeric expansion.
            start = re.sub(r"[a-z]", "", range_split[0].strip())
            end = re.sub(r"[a-z]", "", range_split[1].strip())
            # Fix: catch only ValueError from int() instead of a bare
            # "except:" that would also swallow KeyboardInterrupt etc.
            try:
                senseids.extend(
                    str(senseid) for senseid in range(int(start), int(end) + 1)
                )
            except ValueError:
                # Malformed range (non-numeric endpoint): skip it.
                pass

    return senseids
+ { + "input": "==== Sinnverwandte Wörter ====\n:[1] [[Beleg]], [[Exempel]]\n:[2] [[Muster]], [[Vorbild]]", + "page_data": [ + defaultdict( + list, + { + "senses": [ + defaultdict(list, {"senseid": "1"}), + defaultdict(list, {"senseid": "2"}), + ] + }, + ) + ], + "expected": [ + { + "senses": [ + { + "senseid": "1", + "coordinate_terms": ["Beleg", "Exempel"], + }, + { + "senseid": "2", + "coordinate_terms": ["Muster", "Vorbild"], + }, + ] + } + ], + }, + # https://de.wiktionary.org/wiki/Beispiel + # Cleans explanatory text from expressions. + { + "input": "====Redewendungen====\n:[[ein gutes Beispiel geben|ein gutes ''Beispiel'' geben]] – als [[Vorbild]] zur [[Nachahmung]] [[dienen]]/[[herausfordern]]", + "page_data": [defaultdict(list)], + "expected": [ + { + "expressions": ["ein gutes Beispiel geben"], + "senses": [], + }, + ], + }, + # Always places relations in first sense if just one sense. + { + "input": "====Synonyme====\n:[[Synonym1]]", + "page_data": [ + defaultdict( + list, {"senses": [defaultdict(list, {"senseid": "1"})]} + ) + ], + "expected": [ + { + "senses": [{"senseid": "1", "synonyms": ["Synonym1"]}], + }, + ], + }, + # https://de.wiktionary.org/wiki/Kokospalme + # Ignores modifiers of relations and all other text. 
+ { + "input": "====Synonyme====\n:[1] [[Kokosnusspalme]], ''wissenschaftlich:'' [[Cocos nucifera]]", + "page_data": [ + defaultdict( + list, {"senses": [defaultdict(list, {"senseid": "1"})]} + ) + ], + "expected": [ + { + "senses": [ + { + "senseid": "1", + "synonyms": [ + "Kokosnusspalme", + "Cocos nucifera", + ], + } + ], + }, + ], + }, + ] + + for case in test_cases: + with self.subTest(case=case): + self.wxr.wtp.start_page("") + root = self.wxr.wtp.parse(case["input"]) + + extract_semantic_relations( + self.wxr, case["page_data"], root.children[0] + ) + + self.assertEqual(case["page_data"], case["expected"]) diff --git a/tests/test_de_utils.py b/tests/test_de_utils.py new file mode 100644 index 000000000..4888f0cb4 --- /dev/null +++ b/tests/test_de_utils.py @@ -0,0 +1,24 @@ +import unittest + +from wiktextract.extractor.de.utils import split_senseids + + +class TestDEUtils(unittest.TestCase): + maxDiff = None + + def test_split_senseids(self): + test_cases = [ + ("[1]", ["1"]), + ("[1,2]", ["1", "2"]), + ("[1, 2]", ["1", "2"]), + ("[1, 2 ]", ["1", "2"]), + ("[1-3]", ["1", "2", "3"]), + ("[1, 3-5]", ["1", "3", "4", "5"]), + ("[1, 3-4, 6]", ["1", "3", "4", "6"]), + ("[1a]", ["1a"]), + ("[1, 2a]", ["1", "2a"]), + ("[1, 2a-3]", ["1", "2", "3"]), + ] + + for test_case in test_cases: + self.assertEqual(split_senseids(test_case[0]), test_case[1])