Skip to content

Commit

Permalink
Merge pull request #563 from xxyzz/de
Browse files Browse the repository at this point in the history
Some changes to the de edition's `page.py` and `gloss.py` files
  • Loading branch information
xxyzz authored Mar 27, 2024
2 parents f65abdf + 35ac365 commit 39ac701
Show file tree
Hide file tree
Showing 7 changed files with 138 additions and 219 deletions.
24 changes: 12 additions & 12 deletions src/wiktextract/extractor/de/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.de.models import Sense, WordEntry
from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .models import Sense, WordEntry
from .utils import find_and_remove_child, match_senseid


def extract_glosses(
wxr: WiktextractContext,
Expand Down Expand Up @@ -37,7 +38,7 @@ def process_gloss_list_item(
item_type = list_item_node.sarg
if item_type == "*":
handle_sense_modifier(wxr, base_sense, list_item_node)
elif item_type in [":", "::"]:
elif item_type.endswith(":"):
if any(
[
template_node.template_name
Expand All @@ -60,9 +61,6 @@ def process_gloss_list_item(
find_and_remove_child(list_item_node, NodeKind.LIST)
)

raw_gloss = clean_node(wxr, {}, list_item_node.children)
sense_data.raw_glosses = [raw_gloss]

process_K_template(wxr, sense_data, list_item_node)

gloss_text = clean_node(wxr, sense_data, list_item_node.children)
Expand All @@ -81,8 +79,8 @@ def process_gloss_list_item(
# XXX: Extract tags from nodes instead using Italic and Template
gloss_text = extract_tags_from_gloss_text(sense_data, gloss_text)

if gloss_text or not sub_glosses_list_nodes:
sense_data.glosses = [gloss_text]
if len(gloss_text) > 0:
sense_data.glosses.append(gloss_text)
word_entry.senses.append(sense_data)

for sub_list_node in sub_glosses_list_nodes:
Expand All @@ -92,7 +90,7 @@ def process_gloss_list_item(
base_sense,
sub_list_node,
senseid,
sense_data if not gloss_text else None,
sense_data,
)

else:
Expand All @@ -112,23 +110,25 @@ def handle_sense_modifier(
f"Found more than one child in sense modifier: {list_item_node.children}",
sortid="extractor/de/gloss/handle_sense_modifier/114",
)
modifier = clean_node(wxr, None, list_item_node.children)
modifier = clean_node(wxr, None, list_item_node.children).removesuffix(":")
if modifier != "":
sense.raw_tags = [modifier]


def process_K_template(
wxr: WiktextractContext,
sense_data: Sense,
list_item_node: NodeKind.LIST_ITEM,
list_item_node: WikiNode,
) -> None:
for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
if template_node.template_name == "K":
categories = {"categories": []}
text = clean_node(wxr, categories, template_node).removesuffix(":")
sense_data.categories.extend(categories["categories"])
tags = re.split(r";|,", text)
sense_data.raw_tags.extend([t.strip() for t in tags])
sense_data.raw_tags.extend(
[t.strip() for t in tags if len(t.strip()) > 0]
)

# Prepositional and case information is sometimes only expanded to
# category links and not present in cleaned node. We still want it
Expand Down
5 changes: 1 addition & 4 deletions src/wiktextract/extractor/de/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,6 @@ class Sense(BaseModelWrap):
default=[],
description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
)
raw_glosses: list[str] = Field(
default=[],
description="list of uncleaned raw glosses for the word sense (usually only one).",
)
raw_tags: list[str] = Field(
default=[],
description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
Expand Down Expand Up @@ -144,6 +140,7 @@ class WordEntry(BaseModelWrap):

word: str = Field(description="word string")
pos: str = Field(default="", description="Part of speech type")
other_pos: list[str] = []
# pos_title: str = Field(default=None, description="Original POS title")
lang_code: str = Field(
description="Wiktionary language code", examples=["es"]
Expand Down
116 changes: 47 additions & 69 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
import logging
from typing import Union
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.de.models import WordEntry
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .example import extract_examples
from .gloss import extract_glosses
from .linkage import extract_linkages
from .models import WordEntry
from .pronunciation import extract_pronunciation
from .section_titles import LINKAGE_TITLES, POS_SECTIONS
from .translation import extract_translation
Expand All @@ -31,65 +32,36 @@ def parse_section(
wxr: WiktextractContext,
page_data: list[WordEntry],
base_data: WordEntry,
level_node_or_children: Union[WikiNode, list[Union[WikiNode, str]]],
level_node: WikiNode,
) -> None:
# Page structure: https://de.wiktionary.org/wiki/Hilfe:Formatvorlage
if isinstance(level_node_or_children, list):
for x in level_node_or_children:
parse_section(wxr, page_data, base_data, x)
return

elif not isinstance(level_node_or_children, WikiNode):
if (
not isinstance(level_node_or_children, str)
or not level_node_or_children.strip() == ""
):
wxr.wtp.debug(
f"Unexpected node type in parse_section: {level_node_or_children}",
sortid="extractor/de/page/parse_section/31",
)
return

# Level 3 headings are used to start POS sections like
# === {{Wortart|Verb|Deutsch}} ===
elif level_node_or_children.kind == NodeKind.LEVEL3:
for template_node in level_node_or_children.find_content(
NodeKind.TEMPLATE
):
# German Wiktionary uses a `Wortart` template to define the POS
if template_node.template_name == "Wortart":
process_pos_section(
wxr,
page_data,
base_data,
level_node_or_children,
template_node,
)
return

if level_node.kind == NodeKind.LEVEL3:
process_pos_section(wxr, page_data, base_data, level_node)
# Level 4 headings were introduced by overriding the default templates.
# See overrides/de.json for details.
elif level_node_or_children.kind == NodeKind.LEVEL4:
section_name = level_node_or_children.largs[0][0]
elif level_node.kind == NodeKind.LEVEL4:
section_name = clean_node(wxr, None, level_node.largs)
wxr.wtp.start_subsection(section_name)
if not len(page_data) > 0:
wxr.wtp.debug(
f"Reached section without extracting some page data first: {level_node_or_children}",
f"Reached section without extracting some page data first: {level_node}",
sortid="extractor/de/page/parse_section/55",
)
return
if section_name == "Bedeutungen":
extract_glosses(wxr, page_data[-1], level_node_or_children)
extract_glosses(wxr, page_data[-1], level_node)
elif wxr.config.capture_pronunciation and section_name == "Aussprache":
extract_pronunciation(wxr, page_data[-1], level_node_or_children)
extract_pronunciation(wxr, page_data[-1], level_node)
elif wxr.config.capture_examples and section_name == "Beispiele":
extract_examples(wxr, page_data[-1], level_node_or_children)
extract_examples(wxr, page_data[-1], level_node)
elif (
wxr.config.capture_translations and section_name == "Übersetzungen"
):
extract_translation(wxr, page_data[-1], level_node_or_children)
extract_translation(wxr, page_data[-1], level_node)
elif wxr.config.capture_linkages and section_name in LINKAGE_TITLES:
extract_linkages(wxr, page_data[-1], level_node_or_children)
extract_linkages(wxr, page_data[-1], level_node)


FORM_POS = {
Expand All @@ -116,29 +88,36 @@ def process_pos_section(
page_data: list[WordEntry],
base_data: WordEntry,
level_node: LevelNode,
pos_template_node: WikiNode,
) -> None:
# Extract the POS
pos_argument = pos_template_node.template_parameters.get(1)
if pos_argument in IGNORE_POS:
return
if pos_argument in FORM_POS:
# XXX: Extract form from form pages. Investigate first if this is needed
# at all or redundant with form tables.
pos_arguments = []
for template_node in level_node.find_content(NodeKind.TEMPLATE):
if template_node.template_name == "Wortart":
pos_argument = template_node.template_parameters.get(1, "")
if pos_argument in IGNORE_POS:
continue
if pos_argument in FORM_POS:
# XXX: Extract form from form pages. Investigate first if this is needed
# at all or redundant with form tables.
continue
if pos_argument in POS_SECTIONS:
pos_arguments.append(pos_argument)
else:
wxr.wtp.debug(
f"Unknown Wortart template POS argument: {pos_argument}",
sortid="extractor/de/page/process_pos_section/55",
)
if len(pos_arguments) == 0:
return

pos = ""
if pos_argument in POS_SECTIONS:
for pos_index, pos_argument in enumerate(pos_arguments):
pos = POS_SECTIONS[pos_argument]["pos"]
else:
wxr.wtp.debug(
f"Unknown POS type: {pos_argument}",
sortid="extractor/de/page/process_pos_section/55",
)
base_data.pos = pos
pos_tags = POS_SECTIONS[pos_argument].get("tags", [])
base_data.tags.extend(pos_tags)
if pos_index == 0:
base_data.pos = pos
else:
base_data.other_pos.append(pos)
page_data.append(base_data.model_copy(deep=True))

wxr.wtp.start_section(page_data[-1].lang_code + "_" + pos)
wxr.wtp.start_subsection(clean_node(wxr, None, level_node.largs))

# There might be other templates in the level node that define grammatical
# features other than the POS. Extract them here.
Expand Down Expand Up @@ -242,12 +221,11 @@ def process_pos_section(
f"Unexpected node in pos section: {non_l4_node}",
sortid="extractor/de/page/process_pos_section/41",
)
return


def parse_page(
wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, any]]:
) -> list[dict[str, Any]]:
if wxr.config.verbose:
logging.info(f"Parsing page: {page_title}")

Expand All @@ -270,22 +248,22 @@ def parse_page(
# where <title> is the title of the page and <lang> is the
# German name of the language of the section.
if subtitle_template.template_name == "Sprache":
lang = subtitle_template.template_parameters.get(1)
lang_code = name_to_code(lang, "de")
lang_name = subtitle_template.template_parameters.get(1, "")
lang_code = name_to_code(lang_name, "de")
if lang_code == "":
wxr.wtp.warning(
f"Unknown language: {lang}",
f"Unknown language: {lang_name}",
sortid="extractor/de/page/parse_page/76",
)
if (
wxr.config.capture_language_codes is not None
and lang_code not in wxr.config.capture_language_codes
):
continue

base_data = WordEntry(
lang=lang, lang_code=lang_code, word=wxr.wtp.title
lang=lang_name, lang_code=lang_code, word=page_title
)
parse_section(wxr, page_data, base_data, level2_node.children)
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
parse_section(wxr, page_data, base_data, level3_node)

return [d.model_dump(exclude_defaults=True) for d in page_data]
16 changes: 8 additions & 8 deletions src/wiktextract/extractor/de/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

# argument of title template https://de.wiktionary.org/wiki/Vorlage:Wortart
POS_SECTIONS: POSSubtitleData = {
"Abkürzung (Deutsch)": {"pos": "abbrev"},
"Abkürzung": {"pos": "abbrev"},
"Abkürzung (Deutsch)": {"pos": "abbrev", "tags": ["abbreviation"]},
"Abkürzung": {"pos": "abbrev", "tags": ["abbreviation"]},
"Abtönungspartikel": {"pos": "particle"},
"Adjektiv ": {"pos": "adj"},
"Adjektiv": {"pos": "adj"},
Expand All @@ -17,7 +17,7 @@
"Demonstrativpronomen": {"pos": "pron"},
"Eigenname ": {"pos": "name"},
"Eigenname": {"pos": "name"},
"Enklitikon": {"pos": "suffix"},
"Enklitikon": {"pos": "suffix", "tags": ["morpheme"]},
"Fokuspartikel": {"pos": "particle"},
"Formel": {"pos": "phrase"},
"Gebundenes Lexem": {"pos": "lexeme"},
Expand Down Expand Up @@ -59,8 +59,8 @@
"Possessivpronomen ": {"pos": "pron"},
"Possessivpronomen": {"pos": "pron"},
"Postposition": {"pos": "postp"},
"Präfix": {"pos": "prefix"},
"Präfixoid": {"pos": "prefix"},
"Präfix": {"pos": "prefix", "tags": ["morpheme"]},
"Präfixoid": {"pos": "prefix", "tags": ["morpheme"]},
"Präposition ": {"pos": "prep"},
"Präposition": {"pos": "prep"},
"Pronomen": {"pos": "pron"},
Expand All @@ -75,8 +75,8 @@
"Straßenname": {"pos": "name"},
"Subjunktion": {"pos": "conj"},
"Substantiv": {"pos": "noun"},
"Suffix": {"pos": "suffix"},
"Suffixoid": {"pos": "suffix"},
"Suffix": {"pos": "suffix", "tags": ["morpheme"]},
"Suffixoid": {"pos": "suffix", "tags": ["morpheme"]},
"Symbol": {"pos": "symbol"},
"Temporaladverb": {"pos": "adv"},
"Temporaldverb": {"pos": "adv"},
Expand All @@ -89,7 +89,7 @@
"Wortverbindung": {"pos": "phrase"},
"Zahlklassifikator": {"pos": "noun"},
"Zahlzeichen": {"pos": "num"},
"Zirkumfix": {"pos": "circumfix"},
"Zirkumfix": {"pos": "circumfix", "tags": ["morpheme"]},
"Zirkumposition": {"pos": "circumpos"},
}

Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/de/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def extract_translation(
)
else:
sense_translations = []
sense_id = level_node_child.template_parameters.get(1, "")
sense_id = str(level_node_child.template_parameters.get(1, ""))
base_translation_data = Translation(sense_id=sense_id)
if sense_id == "":
# XXX: Sense-disambiguate where senseids are in Ü-Liste (ca. 0.03% of pages), e.g.:
Expand Down
Loading

0 comments on commit 39ac701

Please sign in to comment.