diff --git a/src/wiktextract/extractor/de/gloss.py b/src/wiktextract/extractor/de/gloss.py
index 18f898a5..34ffcd15 100644
--- a/src/wiktextract/extractor/de/gloss.py
+++ b/src/wiktextract/extractor/de/gloss.py
@@ -3,11 +3,12 @@
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import LevelNode
 
-from wiktextract.extractor.de.models import Sense, WordEntry
-from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
+from .models import Sense, WordEntry
+from .utils import find_and_remove_child, match_senseid
+
 
 def extract_glosses(
     wxr: WiktextractContext,
@@ -37,7 +38,7 @@ def process_gloss_list_item(
     item_type = list_item_node.sarg
     if item_type == "*":
         handle_sense_modifier(wxr, base_sense, list_item_node)
-    elif item_type in [":", "::"]:
+    elif item_type.endswith(":"):
         if any(
             [
                 template_node.template_name
@@ -60,9 +61,6 @@ def process_gloss_list_item(
             find_and_remove_child(list_item_node, NodeKind.LIST)
         )
 
-        raw_gloss = clean_node(wxr, {}, list_item_node.children)
-        sense_data.raw_glosses = [raw_gloss]
-
         process_K_template(wxr, sense_data, list_item_node)
 
         gloss_text = clean_node(wxr, sense_data, list_item_node.children)
@@ -81,8 +79,8 @@ def process_gloss_list_item(
         # XXX: Extract tags from nodes instead using Italic and Template
         gloss_text = extract_tags_from_gloss_text(sense_data, gloss_text)
 
-        if gloss_text or not sub_glosses_list_nodes:
-            sense_data.glosses = [gloss_text]
+        if len(gloss_text) > 0:
+            sense_data.glosses.append(gloss_text)
             word_entry.senses.append(sense_data)
 
         for sub_list_node in sub_glosses_list_nodes:
@@ -92,7 +90,7 @@ def process_gloss_list_item(
                 base_sense,
                 sub_list_node,
                 senseid,
-                sense_data if not gloss_text else None,
+                sense_data,
             )
 
     else:
@@ -112,7 +110,7 @@ def handle_sense_modifier(
             f"Found more than one child in sense modifier: {list_item_node.children}",
             sortid="extractor/de/gloss/handle_sense_modifier/114",
         )
-    modifier = clean_node(wxr, None, list_item_node.children)
+    modifier = clean_node(wxr, None, list_item_node.children).removesuffix(":")
     if modifier != "":
         sense.raw_tags = [modifier]
 
@@ -120,7 +118,7 @@
 def process_K_template(
     wxr: WiktextractContext,
     sense_data: Sense,
-    list_item_node: NodeKind.LIST_ITEM,
+    list_item_node: WikiNode,
 ) -> None:
     for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
         if template_node.template_name == "K":
@@ -128,7 +126,9 @@ def process_K_template(
             categories = {"categories": []}
             text = clean_node(wxr, categories, template_node).removesuffix(":")
             sense_data.categories.extend(categories["categories"])
             tags = re.split(r";|,", text)
-            sense_data.raw_tags.extend([t.strip() for t in tags])
+            sense_data.raw_tags.extend(
+                [t.strip() for t in tags if len(t.strip()) > 0]
+            )
             # Prepositional and case information is sometimes only expanded to
             # category links and not present in cleaned node. We still want it
diff --git a/src/wiktextract/extractor/de/models.py b/src/wiktextract/extractor/de/models.py
index bcaa930d..98738b59 100644
--- a/src/wiktextract/extractor/de/models.py
+++ b/src/wiktextract/extractor/de/models.py
@@ -85,10 +85,6 @@ class Sense(BaseModelWrap):
         default=[],
         description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
     )
-    raw_glosses: list[str] = Field(
-        default=[],
-        description="list of uncleaned raw glosses for the word sense (usually only one).",
-    )
     raw_tags: list[str] = Field(
         default=[],
         description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py
index e43efb5f..13f904a5 100644
--- a/tests/test_de_gloss.py
+++ b/tests/test_de_gloss.py
@@ -45,81 +45,81 @@ def test_de_extract_glosses(self):
             [
                 {
                     "glosses": ["gloss1"],
-                    "raw_glosses": ["[1] gloss1"],
                     "senseid": "1",
                 },
                 {
                     "glosses": ["gloss2"],
-                    "raw_glosses": ["[2] gloss2"],
                     "senseid": "2",
                 },
             ],
         )
 
-    def test_de_extract_glosses_with_subglosses(self):
-        self.wxr.wtp.start_page("")
+    def test_nested_gloss(self):
+        self.wxr.wtp.start_page("Keim")
+        self.wxr.wtp.add_page("Vorlage:K", 10, "{{{1|}}}, {{{2|}}}:")
         root = self.wxr.wtp.parse(
-            ":[1] gloss1\n::[a] subglossA\n::[b] subglossB"
+            """===Bedeutungen===
+:[2] das erste [[Entwicklungsstadium]]
+::[a] {{K|Botanik}} erster [[Trieb]] einer Pflanze
+::[b] {{K|Biologie|Medizin}} befruchtete [[Eizelle]], [[Embryo]]"""
         )
-
-        word_entry = self.get_default_word_entry()
-
-        extract_glosses(self.wxr, word_entry, root)
-
-        senses = [
-            s.model_dump(exclude_defaults=True) for s in word_entry.senses
-        ]
-
+        word_entry = WordEntry(
+            lang="Deutsch", lang_code="de", word="Keim", pos="noun"
+        )
+        extract_glosses(self.wxr, word_entry, root.children[0])
         self.assertEqual(
-            senses,
+            [s.model_dump(exclude_defaults=True) for s in word_entry.senses],
             [
                 {
-                    "glosses": ["gloss1"],
-                    "raw_glosses": ["[1] gloss1"],
-                    "senseid": "1",
+                    "glosses": ["das erste Entwicklungsstadium"],
+                    "senseid": "2",
                 },
                 {
-                    "glosses": ["subglossA"],
-                    "raw_glosses": ["[a] subglossA"],
-                    "senseid": "1a",
+                    "glosses": [
+                        "das erste Entwicklungsstadium",
+                        "erster Trieb einer Pflanze",
+                    ],
+                    "raw_tags": ["Botanik"],
+                    "senseid": "2a",
                 },
                 {
-                    "glosses": ["subglossB"],
-                    "raw_glosses": ["[b] subglossB"],
-                    "senseid": "1b",
+                    "glosses": [
+                        "das erste Entwicklungsstadium",
+                        "befruchtete Eizelle, Embryo",
+                    ],
+                    "raw_tags": ["Biologie", "Medizin"],
+                    "senseid": "2b",
                 },
             ],
         )
 
-    def test_de_extract_glosses_with_only_subglosses(self):
-        self.wxr.wtp.add_page("Vorlage:K", 10, "tag")
-        self.wxr.wtp.start_page("")
+    def test_nested_gloss_without_parent_gloss(self):
+        self.wxr.wtp.add_page("Vorlage:K", 10, "{{{1}}}:")
+        self.wxr.wtp.start_page("eingeben")
         root = self.wxr.wtp.parse(
-            ":[1] {{K|tag}}\n::[a] subglossA\n::[1b] subglossB"
+            """===Bedeutungen===
+*{{K|fachsprachlich}}
+:[4] {{K|Technik}} etwas, was eine Maschine bearbeiten soll, an diese übergeben
+:[5] {{K|EDV}} etwas in einen Computer übertragen"""
         )
-
-        word_entry = self.get_default_word_entry()
-
-        extract_glosses(self.wxr, word_entry, root)
-
-        senses = [
-            s.model_dump(exclude_defaults=True) for s in word_entry.senses
-        ]
-
+        word_entry = WordEntry(
+            lang="Deutsch", lang_code="de", word="eingeben", pos="verb"
+        )
+        extract_glosses(self.wxr, word_entry, root.children[0])
         self.assertEqual(
-            senses,
+            [s.model_dump(exclude_defaults=True) for s in word_entry.senses],
             [
                 {
-                    "raw_tags": ["tag"],
-                    "glosses": ["subglossA"],
-                    "raw_glosses": ["[a] subglossA"],
-                    "senseid": "1a",
+                    "raw_tags": ["fachsprachlich", "Technik"],
+                    "glosses": [
+                        "etwas, was eine Maschine bearbeiten soll, an diese übergeben"
+                    ],
+                    "senseid": "4",
                 },
                 {
-                    "raw_tags": ["tag"],
-                    "glosses": ["subglossB"],
-                    "raw_glosses": ["[1b] subglossB"],
-                    "senseid": "1b",
+                    "raw_tags": ["fachsprachlich", "EDV"],
+                    "glosses": ["etwas in einen Computer übertragen"],
+                    "senseid": "5",
                 },
             ],
         )
diff --git a/tests/test_de_page.py b/tests/test_de_page.py
index 6a282c05..e206386f 100644
--- a/tests/test_de_page.py
+++ b/tests/test_de_page.py
@@ -91,9 +91,6 @@ def test_multiple_pos(self):
                         "glosses": [
                            "Staat in Südosteuropa, im Süden der Balkanhalbinsel"
                         ],
-                        "raw_glosses": [
-                            "[1] Staat in Südosteuropa, im Süden der Balkanhalbinsel"
-                        ],
                         "senseid": "1",
                     }
                 ],