From 454e95d4f63c0e9d9c60c6693ef6af7b47b4a826 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 29 Mar 2024 13:20:50 +0800 Subject: [PATCH 1/6] Add "categories" field to de edition's `WordEntry` pydantic model Fix `AttributeError` exception in some pages --- src/wiktextract/extractor/de/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/wiktextract/extractor/de/models.py b/src/wiktextract/extractor/de/models.py index caadd143..9a9640b6 100644 --- a/src/wiktextract/extractor/de/models.py +++ b/src/wiktextract/extractor/de/models.py @@ -166,3 +166,4 @@ class WordEntry(BaseModelWrap): synonyms: list[Linkage] = [] tags: list[str] = [] raw_tags: list[str] = [] + categories: list[str] = [] From 714f659ba956537457e7e28e5fd23081b5ca7694 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 29 Mar 2024 13:30:53 +0800 Subject: [PATCH 2/6] Add empty sense with "no-gloss" tag for de pages don't have gloss --- src/wiktextract/extractor/de/page.py | 5 ++++- tests/test_de_page.py | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py index 4e512b5f..8eb8e75f 100644 --- a/src/wiktextract/extractor/de/page.py +++ b/src/wiktextract/extractor/de/page.py @@ -10,7 +10,7 @@ from .example import extract_examples from .gloss import extract_glosses from .linkage import extract_linkages -from .models import WordEntry +from .models import Sense, WordEntry from .pronunciation import extract_pronunciation from .section_titles import LINKAGE_TITLES, POS_SECTIONS from .translation import extract_translation @@ -200,4 +200,7 @@ def parse_page( for level3_node in level2_node.find_child(NodeKind.LEVEL3): parse_section(wxr, page_data, base_data, level3_node) + for data in page_data: + if len(data.senses) == 0: + data.senses.append(Sense(tags=["no-gloss"])) return [d.model_dump(exclude_defaults=True) for d in page_data] diff --git a/tests/test_de_page.py b/tests/test_de_page.py index 23ec2750..d9d4250d 100644 --- a/tests/test_de_page.py +++ b/tests/test_de_page.py @@ -39,6 +39,7 @@ def test_de_parse_page(self): "lang_code": "de", "word": "Beispiel", "pos": "noun", + "senses": [{"tags": ["no-gloss"]}], } ], ) @@ -65,6 +66,7 @@ def test_de_parse_page_skipping_head_templates(self): "lang_code": "de", "word": "Beispiel", "pos": "noun", + "senses": [{"tags": ["no-gloss"]}], } ], ) From dc79940955468d611889f26824f33a1c3bec0b66 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 29 Mar 2024 13:51:52 +0800 Subject: [PATCH 3/6] Don't add empty example data in de edition code --- src/wiktextract/extractor/de/example.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/wiktextract/extractor/de/example.py b/src/wiktextract/extractor/de/example.py index 63d8e9a5..2b76d710 100644 --- a/src/wiktextract/extractor/de/example.py +++ b/src/wiktextract/extractor/de/example.py @@ -51,27 +51,25 @@ def extract_examples( for ref_node in ref_nodes: extract_reference(wxr, example_data, ref_node) - example_text = clean_node(wxr, {}, list_item_node.children) + example_text = clean_node(wxr, None, list_item_node.children) senseid, example_text = match_senseid(example_text) - if example_text: + if len(example_text) > 0: example_data.text = example_text - - if senseid: - for sense in word_entry.senses: - if sense.senseid == senseid: - sense.examples.append(copy.deepcopy(example_data)) - - else: - if example_data: + if len(senseid) > 0: + for sense in word_entry.senses: + if sense.senseid == senseid: + sense.examples.append(copy.deepcopy(example_data)) + else: wxr.wtp.debug( - f"Found example data without senseid and text: {example_data}", + f"Found example data without senseid: {example_data}", sortid="extractor/de/examples/extract_examples/28", ) + for non_list_node in level_node.invert_find_child(NodeKind.LIST): wxr.wtp.debug( - f"Found unexpected non-list node in example section: {non_list_node}", + f"Found unexpected non-list node in examples: {non_list_node}", sortid="extractor/de/examples/extract_examples/33", ) @@ -79,7 +77,7 @@ def extract_examples( def extract_reference( wxr: WiktextractContext, example_data: Example, ref_node: WikiNode ): - example_data.raw_ref = clean_node(wxr, {}, ref_node.children) + example_data.raw_ref = clean_node(wxr, None, ref_node.children) template_nodes = list(ref_node.find_child(NodeKind.TEMPLATE)) From 227100ed8729482dcf9e808d8cd4fcf2b487d370 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 29 Mar 2024 15:21:59 +0800 Subject: [PATCH 4/6] Split italic sense tags in de edition pages --- src/wiktextract/extractor/de/gloss.py | 37 +++++++------ tests/test_de_gloss.py | 76 +++++++++++---------------- 2 files changed, 50 insertions(+), 63 deletions(-) diff --git a/src/wiktextract/extractor/de/gloss.py b/src/wiktextract/extractor/de/gloss.py index 7cb27953..922b6e4f 100644 --- a/src/wiktextract/extractor/de/gloss.py +++ b/src/wiktextract/extractor/de/gloss.py @@ -1,4 +1,3 @@ -import re from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode, TemplateNode @@ -59,19 +58,32 @@ def process_gloss_list_item( raw_tag = clean_node(wxr, None, k_arg_value) sense_data.raw_tags.append(raw_tag) clean_node(wxr, sense_data, gloss_node) + elif gloss_node.template_name.endswith("."): + raw_tag = clean_node( + wxr, sense_data, gloss_node + ).removesuffix(":") + sense_data.raw_tags.append(raw_tag) elif gloss_node.template_name in ( "QS Herkunft", "QS Bedeutungen", ): continue + else: + gloss_nodes.append(gloss_node) elif ( isinstance(gloss_node, WikiNode) and gloss_node.kind == NodeKind.ITALIC ): - raw_tag = clean_node(wxr, None, gloss_node).removesuffix( - ":" - ) - sense_data.raw_tags.append(raw_tag) + italic_text = clean_node(wxr, None, gloss_node) + if italic_text.endswith(":"): + for raw_tag in italic_text.removesuffix(":").split( + ", " + ): + raw_tag = raw_tag.strip() + if len(raw_tag) > 0: + sense_data.raw_tags.append(raw_tag) + else: + gloss_nodes.append(italic_text) elif not ( isinstance(gloss_node, WikiNode) and gloss_node.kind == NodeKind.LIST @@ -95,7 +107,7 @@ def process_gloss_list_item( ) if len(gloss_text) > 0: - sense_data.glosses.append(gloss_text) + sense_data.glosses.append(gloss_text.removeprefix(", ")) word_entry.senses.append(sense_data) for sub_list_node in list_item_node.find_child(NodeKind.LIST): @@ -113,16 +125,3 @@ def process_gloss_list_item( ) continue return parent_sense - - -def extract_tags_from_gloss_text(sense_data: Sense, gloss_text: str) -> None: - parts = gloss_text.split(":", 1) - if len(parts) > 1: - tags_part = parts[0].strip() - - categories = [c.strip() for c in re.split(",", tags_part)] - if all(c.isalnum() for c in categories): - sense_data.raw_tags.extend(categories) - return parts[1].strip() - - return gloss_text diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py index eab94d8e..bc450e22 100644 --- a/tests/test_de_gloss.py +++ b/tests/test_de_gloss.py @@ -4,9 +4,7 @@ from wiktextract.config import WiktionaryConfig from wiktextract.extractor.de.gloss import ( extract_glosses, - extract_tags_from_gloss_text, ) -from wiktextract.extractor.de.models import Sense from wiktextract.extractor.es.models import WordEntry from wiktextract.wxr_context import WiktextractContext @@ -151,48 +149,7 @@ def test_k_template_multiple_tags(self): ], ) - def test_de_extract_tags_from_gloss_text(self): - test_cases = [ - # https://de.wiktionary.org/wiki/Hengst - { - "input": "Zoologie: männliches Tier aus der Familie der Einhufer und Kamele", - "expected_tags": ["Zoologie"], - "expected_gloss": "männliches Tier aus der Familie der Einhufer und Kamele", - }, - # https://de.wiktionary.org/wiki/ARD - { - "input": "umgangssprachlich, Kurzwort, Akronym: für das erste Fernsehprogramm der ARD", - "expected_tags": ["umgangssprachlich", "Kurzwort", "Akronym"], - "expected_gloss": "für das erste Fernsehprogramm der ARD", - }, - # https://de.wiktionary.org/wiki/Endspiel - { - "input": "Drama von Samuel Beckett: Menschliche Existenz in der Endphase des Verfalls und der vergeblichen Suche nach einem Ausweg", - "expected_tags": None, - "expected_gloss": "Drama von Samuel Beckett: Menschliche Existenz in der Endphase des Verfalls und der vergeblichen Suche nach einem Ausweg", - }, - # Add more test cases as needed - ] - for case in test_cases: - with self.subTest(case=case): - sense_data = Sense() - - gloss_text = extract_tags_from_gloss_text( - sense_data, case["input"] - ) - - if case["expected_tags"] is None: - self.assertEqual( - sense_data.model_dump(exclude_defaults=True), {} - ) - else: - self.assertEqual( - sense_data.raw_tags, - case["expected_tags"], - ) - self.assertEqual(gloss_text, case["expected_gloss"]) - - def test_handle_sense_modifier(self): + def test_italic_sense_modifier(self): # https://de.wiktionary.org/wiki/habitare wikitext = """ * {{trans.}} @@ -242,3 +199,34 @@ def test_handle_sense_modifier(self): }, ], ) + + def test_italit_node_multiple_raw_tags(self): + self.wxr.wtp.add_page( + "Vorlage:K", 10, "[[Deutschland]], [[Fernsehen]]:" + ) + self.wxr.wtp.add_page("Vorlage:ugs.", 10, "''[[umgangssprachlich]]''") + self.wxr.wtp.start_page("ARD") + root = self.wxr.wtp.parse( + """===Bedeutungen=== +:[2] {{K|Deutschland|Fernsehen}} {{ugs.}}, ''[[Kurzwort]], [[Akronym]]:'' für das erste Fernsehprogramm der ARD""" + ) + word_entry = WordEntry( + lang="Deutsch", lang_code="de", word="ARD", pos="noun" + ) + extract_glosses(self.wxr, word_entry, root.children[0]) + self.assertEqual( + [s.model_dump(exclude_defaults=True) for s in word_entry.senses], + [ + { + "raw_tags": [ + "Deutschland", + "Fernsehen", + "umgangssprachlich", + "Kurzwort", + "Akronym", + ], + "glosses": ["für das erste Fernsehprogramm der ARD"], + "senseid": "2", + }, + ], + ) From 4d75df16fd0b4a0d18ca81256e4f6fe3a908a79d Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 29 Mar 2024 17:32:53 +0800 Subject: [PATCH 5/6] Translate some raw tags extracted from the "K" template arguments --- src/wiktextract/extractor/de/gloss.py | 3 +- src/wiktextract/extractor/de/tags.py | 130 ++++++++++++++++++++++++++ tests/test_de_gloss.py | 9 +- 3 files changed, 138 insertions(+), 4 deletions(-) create mode 100644 src/wiktextract/extractor/de/tags.py diff --git a/src/wiktextract/extractor/de/gloss.py b/src/wiktextract/extractor/de/gloss.py index 922b6e4f..1e88e882 100644 --- a/src/wiktextract/extractor/de/gloss.py +++ b/src/wiktextract/extractor/de/gloss.py @@ -1,10 +1,10 @@ - from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode, TemplateNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext from .models import Sense, WordEntry +from .tags import translate_raw_tags from .utils import match_senseid @@ -108,6 +108,7 @@ def process_gloss_list_item( if len(gloss_text) > 0: sense_data.glosses.append(gloss_text.removeprefix(", ")) + translate_raw_tags(sense_data) word_entry.senses.append(sense_data) for sub_list_node in list_item_node.find_child(NodeKind.LIST): diff --git a/src/wiktextract/extractor/de/tags.py b/src/wiktextract/extractor/de/tags.py new file mode 100644 index 00000000..fa9ed520 --- /dev/null +++ b/src/wiktextract/extractor/de/tags.py @@ -0,0 +1,130 @@ +from .models import WordEntry + +# https://de.wiktionary.org/wiki/Vorlage:K +K_TEMPLATE_TAGS = { + "Abl.": "ablative", + "Ablativ": "ablative", + "abw.": "derogatory", + "AE": "US", + "AmE": "US", + "adv.": "adverbial", + "Akkusativ": "accusative", + "alemann.": "Alemannic", + "alemannisch": "Alemannic", + "allg.": "general", + "allgemein": "general", + "alltagsspr.": "colloquial", + "amtsspr.": "officialese", + # "ansonsten": "otherwise", # combined with other text + "attr.": "attributive", + # "auch": "also", + "bair.": "Bavarian", + "bairisch": "Bavarian", + "bar.": "Bavarian", + "BE": "British", + "BrE": "British", + "Bedva.": "outdated", + "Bedvatd.": "outdated", + # "bei": "", + # "bes.": "especially", + # "besonders": "especially", + # "beziehungsweise": "", + # "bzw.": "", + # "bildungsspr.": "", + # "bis": "", + # "bisweilen": "", + # "das": "", + "Dativ": "dative", + # "DDR": "", + # "der": "", + "dichter.": "poetic", + # "die": "", + "Dim.": "diminutive", + "Dimin.": "diminutive", + "Diminutiv": "diminutive", + # "eher": "", + "erzg.": "Erzgebirgisch", + "erzgeb.": "Erzgebirgisch", + "erzgebirgisch": "Erzgebirgisch", + "euph.": "euphemistic", + "fachspr.": "jargon", + "fam.": "familiär", + "fig": "figurative", + "fig.": "figurative", + # "früher": "", + # "gegenwartslateinisch": "", + "geh.": "gehoben", + "Genitiv": "genitive", + "gsm": "Swiss German", + "häufig": "often", + "haben": "auxiliary", + "hebben": "auxiliary", + "hauptsächlich": "primarily", + "hist.": "historical", + "ieS": "narrowly", + "i.e.S.": "narrowly", + "i. e. S.": "narrowly", + # "im": "", + # "in": "", + # "in Bezug auf": "relational", + "indekl.": "indeclinable", + # "insbes.": "", + "Instrumental": "instrumental", + "intrans.": "intransitive", + "intransitiv": "intransitive", + # "iPl": "in plural", + "iron.": "ironic", + # "iwS": "", + # "jugendspr.": "", + "kinderspr.": "childish", + "kirchenlateinisch": "Church Latin", + "klasslat.": "Classical Latin", + "klassischlateinisch": "Classical Latin", + "kPl.": "no-plural", + "kSg.": "no-singulative", + "kSt.": "no-comparative", + "landsch.": "regional", + "lautm.": "onomatopoeic", + "Ling.": "linguistics", + "mA": "accusative", + "md.": "Central German", + "mdal.": "dialectal", + "Med.": "medicine", # topic + # "meist": "mostly", + # "meistens": "mostly", + "metaphor.": "metaphoric", + "meton.": "metonymically", + "mG": "genitive", + "mitteld.": "Central German", + # "mitunter": "", + "mlat.": "Medieval Latin", + "mittellateinisch": "Medieval Latin", + "mundartl.": "dialectal", + "nDu.": "only-dual", + "nigr.": "Niger", + "nigrisch": "Niger", + "nkLat.": "post-Classical Latin", + "nachklassischlateinisch": "post-Classical Latin", + "nlat.": "New Latin", + "neulateinisch": "New Latin", + "nordd.": "North German", + "norddeutsch": "North German", + "nordwestd.": "Northwestern Germany", + "nPl.": "plural-only", + "Österreich": "Austrian German", + "österr.": "Austrian German", + "österreichisch": "Austrian German", + "ostfränkisch": "East Franconian German", + "pej.": "pejorative", + "poet.": "poetic", +} + + +def translate_raw_tags(data: WordEntry) -> None: + raw_tags = [] + for raw_tag in data.raw_tags: + if raw_tag in K_TEMPLATE_TAGS: + data.tags.append(K_TEMPLATE_TAGS[raw_tag]) + else: + raw_tags.append(raw_tag) + data.raw_tags = raw_tags diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py index bc450e22..1fcbdf7b 100644 --- a/tests/test_de_gloss.py +++ b/tests/test_de_gloss.py @@ -142,7 +142,8 @@ def test_k_template_multiple_tags(self): "Verb transitiv (Deutsch)", "Österreichisches Deutsch", ], - "raw_tags": ["trans.", "besonders", "bayrisch", "österr."], + "tags": ["Austrian German"], + "raw_tags": ["trans.", "besonders", "bayrisch"], "glosses": ["Vieh auf der Alm halten"], "senseid": "1", }, @@ -188,12 +189,14 @@ def test_italic_sense_modifier(self): "senseid": "2.2", }, { - "raw_tags": ["intransitiv", "sich befinden"], + "tags": ["intransitive"], + "raw_tags": ["sich befinden"], "glosses": ["wohnen"], "senseid": "3", }, { - "raw_tags": ["intransitiv", "übertragen"], + "tags": ["intransitive"], + "raw_tags": ["übertragen"], "glosses": ["sich aufhalten, heimisch sein, zu Hause sein"], "senseid": "4", }, From 41751155abaa94ea9cc4abcdd13cc0734a832767 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 29 Mar 2024 17:53:13 +0800 Subject: [PATCH 6/6] Update `test_tr24` test result This test failed because the latest mediawiki_langcodes package has the "Puxian Min" language name. --- tests/test_translations.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_translations.py b/tests/test_translations.py index 0973e196..c4704a68 100644 --- a/tests/test_translations.py +++ b/tests/test_translations.py @@ -289,9 +289,13 @@ def test_tr23(self): def test_tr24(self): data = self.runtr("Puxian Min: foo", lang="Chinese") self.assertEqual(self.wxr.wtp.debugs, []) - self.assertEqual(data, {"translations": [ - {"word": "foo", "lang": "Chinese", "code": "zh", - "tags": ["Puxian-Min"]}]}) + self.assertEqual( + data, + { + "translations": [ + {"code": "cpx", "lang": "Puxian Min Chinese", "word": "foo"} + ], + }) def test_tr25(self): data = self.runtr("Hallig and Mooring: foo", lang="Danish")