From 433dd7cff04475f404a734aff79d5e10f7281e31 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Mon, 16 Oct 2023 17:31:40 +0300 Subject: [PATCH] Process K template in German Wiktionary glosses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. --- src/wiktextract/extractor/de/gloss.py | 33 ++++-- tests/test_de_gloss.py | 143 ++++++++++++++++++++++++-- 2 files changed, 162 insertions(+), 14 deletions(-) diff --git a/src/wiktextract/extractor/de/gloss.py b/src/wiktextract/extractor/de/gloss.py index 6315259d2..db1d1fdad 100644 --- a/src/wiktextract/extractor/de/gloss.py +++ b/src/wiktextract/extractor/de/gloss.py @@ -8,6 +8,7 @@ from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext +import copy def extract_glosses( @@ -63,7 +64,7 @@ def process_gloss_list_item( raw_gloss = clean_node(wxr, {}, list_item_node.children) gloss_data["raw_glosses"] = [raw_gloss] - extract_categories_from_gloss_node(wxr, gloss_data, list_item_node) + process_K_template(wxr, gloss_data, list_item_node) gloss_text = clean_node(wxr, gloss_data, list_item_node.children) @@ -117,17 +118,37 @@ def handle_sense_modifier(wxr, list_item_node): pass -def extract_categories_from_gloss_node( +def process_K_template( wxr: WiktextractContext, gloss_data: defaultdict(list), list_item_node: NodeKind.LIST_ITEM, ) -> None: for template_node in list_item_node.find_child(NodeKind.TEMPLATE): if template_node.template_name == "K": - categories = template_node.template_parameters.values() - - categories = [clean_node(wxr, {}, [c]) for c in categories] - + categories = [] + + temp_node = copy.deepcopy(template_node) + for key, value in template_node.template_parameters.items(): 
+ if isinstance(key, int): + temp_node.largs = temp_node.largs[:1] + [[value]] + # Cleaned K template will always end with ":". Remove it. + category = clean_node(wxr, {}, temp_node)[:-1] + if category: + categories.append(category) + if key == "ft": + # ft (free text) is used liberally to modify the context + # template. Sometimes it seems to belong rather to the + # gloss itself. Most of the time it is not useful. + # XXX Treat free text in K templates. + continue + + prep = template_node.template_parameters.get("Prä") + case = template_node.template_parameters.get("Kas") + category = (prep if prep else "") + (" + " + case if case else "") + if category: + categories.append(category) + + # Remove the template_node from the children of list_item_node list_item_node.children = [ c for c in list_item_node.children if c != template_node ] diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py index e43cd4d67..35a08f700 100644 --- a/tests/test_de_gloss.py +++ b/tests/test_de_gloss.py @@ -1,18 +1,35 @@ import unittest from collections import defaultdict +from unittest.mock import patch from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig from wiktextract.extractor.de.gloss import ( extract_glosses, - extract_categories_from_gloss_node, + process_K_template, extract_categories_from_gloss_text, ) from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext +def mock_clean_node(wxr, sense_data, value): + param = value.largs[1][0] + map = { + "trans.": "transitiv", + "intrans.": "intransitiv", + "refl.": "reflexiv", + "kPl.": "kein Plural", + "ugs.": "umgangssprachlich", + "sein": "Hilfsverb sein", + "österr.": "österreichisch", + } + if param in map: + return map[param] + ":" + return param + ":" + + class TestGlossList(unittest.TestCase): maxDiff = None @@ -92,7 +109,7 @@ def test_de_extract_glosses_with_subglosses(self): def test_de_extract_glosses_with_only_subglosses(self): 
self.wxr.wtp.start_page("") - self.wxr.wtp.add_page("Vorlage:K", 10, "") + self.wxr.wtp.add_page("Vorlage:K", 10, "category:") root = self.wxr.wtp.parse( ":[1] {{K|category}}\n::[a] subglossA\n::[1b] subglossB" ) @@ -123,16 +140,17 @@ def test_de_extract_glosses_with_only_subglosses(self): ], ) - def test_de_extract_categories_from_gloss_node(self): + @patch("wiktextract.extractor.de.gloss.clean_node", mock_clean_node) + def test_process_K_template_removes_K_template_nodes(self): self.wxr.wtp.start_page("") - self.wxr.wtp.add_page("Vorlage:K", 10, "") - root = self.wxr.wtp.parse(":[1] {{K|category1|category2}} gloss1") - - list_item_node = root.children[0].children[0] + # self.wxr.wtp.add_page("Vorlage:K", 10, "") + root = self.wxr.wtp.parse("{{K|category1|category2}} gloss1") gloss_data = defaultdict(list) - extract_categories_from_gloss_node(self.wxr, gloss_data, list_item_node) + self.assertEqual(len(root.children), 2) + + process_K_template(self.wxr, gloss_data, root) self.assertEqual( gloss_data, @@ -141,6 +159,115 @@ def test_de_extract_categories_from_gloss_node(self): }, ) + self.assertEqual(len(root.children), 1) + + @patch("wiktextract.extractor.de.gloss.clean_node", mock_clean_node) + def test_process_K_template(self): + # Test cases chosen from: + # https://de.wiktionary.org/wiki/Vorlage:K/Doku + test_cases = [ + # https://de.wiktionary.org/wiki/delektieren + {"input": "{{K|refl.}}", "expected_categories": ["reflexiv"]}, + # https://de.wiktionary.org/wiki/delektieren + {"input": "{{K|trans.}}", "expected_categories": ["transitiv"]}, + # https://de.wiktionary.org/wiki/abbreviare + { + "input": "{{K|trans.|ft=etwas in seinem [[räumlich]]en oder [[zeitlich]]en [[Ausmaß]] verringern|spr=it}}", + "expected_categories": ["transitiv"], + }, + # https://de.wiktionary.org/wiki/abbreviare + { + "input": "{{K|trans.|Linguistik|Wortbildung|spr=it}}", + "expected_categories": [ + "transitiv", + "Linguistik", + "Wortbildung", + ], + }, + # 
https://de.wiktionary.org/wiki/Bakterie + {"input": "{{K|Biologie}}", "expected_categories": ["Biologie"]}, + # https://de.wiktionary.org/wiki/Kraut + { + "input": "{{K|kPl.|ugs.}}", + "expected_categories": ["kein Plural", "umgangssprachlich"], + }, + # https://de.wiktionary.org/wiki/almen + # Ideally we would filter out "besonders" but there doesn't seem + # to be a general rule which categories are semantically relevant + { + "input": "{{K|trans.|t1=;|besonders|t2=_|bayrisch|österr.}}", + "expected_categories": [ + "transitiv", + "besonders", + "bayrisch", + "österreichisch", + ], + }, + # https://de.wiktionary.org/wiki/Agentur + { + "input": "{{K|Behörde|ft=seit etwa 2000 in Deutschland}}", + "expected_categories": ["Behörde"], + }, + # https://de.wiktionary.org/wiki/Objekt + { + "input": "{{K|Astronomie|ft=kurz für}}", + "expected_categories": ["Astronomie"], + }, + # https://de.wiktionary.org/wiki/einlaufen + { + "input": "{{K|intrans.|Nautik|t7=_|ft=(von Schiffen)}}", + "expected_categories": ["intransitiv", "Nautik"], + }, + # https://de.wiktionary.org/wiki/Pfund + { + "input": "{{K|veraltet|veraltend|t1=;|t7=_|ft=(in Deutschland)}}", + "expected_categories": ["veraltet", "veraltend"], + }, + # https://de.wiktionary.org/wiki/umkippen + {"input": "{{K|sein}}", "expected_categories": ["Hilfsverb sein"]}, + # https://de.wiktionary.org/wiki/umkippen + { + "input": "{{K|sein|salopp}}", + "expected_categories": ["Hilfsverb sein", "salopp"], + }, + # https://de.wiktionary.org/wiki/Hasskommentar + { + "input": "{{K|Internet|ft=[[soziales Netzwerk{{!}}soziale Netzwerke]]}}", + "expected_categories": ["Internet"], + }, + # https://de.wiktionary.org/wiki/abominabilis + { + "input": "{{K|spätlateinisch|spr=la}}", + "expected_categories": ["spätlateinisch"], + }, + # https://de.wiktionary.org/wiki/zählen + { + "input": "{{K|intrans.|Prä=auf|Kas=Akk.|ft=(auf jemanden/etwas zählen)}}", + "expected_categories": ["intransitiv", "auf + Akk."], + }, + # 
https://de.wiktionary.org/wiki/bojovat + { + "input": "{{K|intrans.|Prä=proti|Kas=Dativ||ft=bojovat [[proti]] + [[Dativ]]|spr=cs}}", + "expected_categories": ["intransitiv", "proti + Dativ"], + }, + ] + + for case in test_cases: + with self.subTest(case=case): + gloss_data = defaultdict(list) + + self.wxr.wtp.start_page("") + + root = self.wxr.wtp.parse(case["input"]) + + process_K_template(self.wxr, gloss_data, root) + self.assertEqual( + gloss_data, + { + "categories": case["expected_categories"], + }, + ) + def test_de_extract_categories_from_gloss_text(self): test_cases = [ {