Commit

Merge pull request #512 from xxyzz/de
Move de edition title JSON files to Python code
xxyzz authored Feb 22, 2024
2 parents dde8c4c + fbad2f3 commit efa31c5
Showing 12 changed files with 150 additions and 137 deletions.
11 changes: 0 additions & 11 deletions src/wiktextract/data/de/linkage_subtitles.json

This file was deleted.

4 changes: 0 additions & 4 deletions src/wiktextract/data/de/other_subtitles.json

This file was deleted.

91 changes: 0 additions & 91 deletions src/wiktextract/data/de/pos_subtitles.json

This file was deleted.

12 changes: 4 additions & 8 deletions src/wiktextract/extractor/de/gloss.py
@@ -68,15 +68,11 @@ def process_gloss_list_item(
     gloss_text = clean_node(wxr, sense_data, list_item_node.children)
 
     senseid, gloss_text = match_senseid(gloss_text)
-
-    if senseid:
-        senseid = (
-            senseid
-            if senseid[0].isnumeric()
-            else parent_senseid + senseid
-        )
+    if senseid != "":
+        if not senseid[0].isnumeric():
+            senseid = parent_senseid + senseid
         sense_data.senseid = senseid
-    elif gloss_text.strip():
+    elif len(gloss_text.strip()) > 0:
         wxr.wtp.debug(
             f"Failed to extract sense number from gloss node: {list_item_node}",
             sortid="extractor/de/glosses/extract_glosses/28",
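For illustration only (not part of this commit): with the empty-string sentinel, the new branch composes hierarchical sense IDs — a non-numeric marker such as "a" is appended to the parent's ID, while a numeric marker stands on its own. The helper below is hypothetical and simply mirrors the logic shown in the diff:

```python
def combine_senseid(parent_senseid: str, senseid: str) -> str:
    # Mirrors the new branch in process_gloss_list_item: non-numeric markers
    # (e.g. "a") are prefixed with the parent's ID, numeric ones stand alone.
    if senseid != "" and not senseid[0].isnumeric():
        return parent_senseid + senseid
    return senseid


assert combine_senseid("1", "a") == "1a"  # sub-sense under sense 1
assert combine_senseid("1", "2") == "2"   # numeric marker kept as-is
assert combine_senseid("1", "") == ""     # no marker found
```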
4 changes: 3 additions & 1 deletion src/wiktextract/extractor/de/linkage.py
@@ -7,11 +7,13 @@
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
+from .section_titles import LINKAGE_TITLES
+
 
 def extract_linkages(
     wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
 ):
-    linkage_type = wxr.config.LINKAGE_SUBTITLES.get(level_node.largs[0][0])
+    linkage_type = LINKAGE_TITLES.get(level_node.largs[0][0])
     for list_node in level_node.find_child(NodeKind.LIST):
         for list_item in list_node.find_child(NodeKind.LIST_ITEM):
             # Get the senseids
19 changes: 7 additions & 12 deletions src/wiktextract/extractor/de/page.py
@@ -1,4 +1,3 @@
-import copy
 import logging
 from typing import Union
 
@@ -12,6 +11,7 @@
 from .gloss import extract_glosses
 from .linkage import extract_linkages
 from .pronunciation import extract_pronunciation
+from .section_titles import LINKAGE_TITLES, POS_SECTIONS
 from .translation import extract_translation
 
 # Templates that are used to form panels on pages and that should be ignored in
@@ -88,10 +88,7 @@ def parse_section(
         wxr.config.capture_translations and section_name == "Übersetzungen"
     ):
         extract_translation(wxr, page_data[-1], level_node_or_children)
-    elif (
-        wxr.config.capture_linkages
-        and section_name in wxr.config.LINKAGE_SUBTITLES
-    ):
+    elif wxr.config.capture_linkages and section_name in LINKAGE_TITLES:
         extract_linkages(wxr, page_data[-1], level_node_or_children)
 
 
@@ -130,18 +127,16 @@ def process_pos_section(
         # at all or redundant with form tables.
         return
 
-    pos_type = wxr.config.POS_SUBTITLES.get(pos_argument)
-
-    if pos_type is None:
+    pos = ""
+    if pos_argument in POS_SECTIONS:
+        pos = POS_SECTIONS[pos_argument]["pos"]
+    else:
         wxr.wtp.debug(
             f"Unknown POS type: {pos_argument}",
             sortid="extractor/de/page/process_pos_section/55",
         )
         return
-    pos = pos_type["pos"]
 
     base_data.pos = pos
-    page_data.append(copy.deepcopy(base_data))
+    page_data.append(base_data.model_copy(deep=True))
 
     wxr.wtp.start_section(page_data[-1].lang_code + "_" + pos)
 
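A minimal sketch of the deep-copy change, assuming a Pydantic v2 model along the lines of the extractor's WordEntry (the Entry class here is a hypothetical stand-in): model_copy(deep=True) replaces copy.deepcopy, so nested fields are not shared between entries.

```python
from pydantic import BaseModel


class Entry(BaseModel):  # hypothetical stand-in for the extractor's models
    word: str = ""
    senses: list[str] = []


base = Entry(word="Haus", senses=["building"])
copied = base.model_copy(deep=True)  # Pydantic v2 equivalent of copy.deepcopy
copied.senses.append("household")

print(base.senses)    # ['building'] – the nested list is not shared
print(copied.senses)  # ['building', 'household']
```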
110 changes: 110 additions & 0 deletions src/wiktextract/extractor/de/section_titles.py
@@ -0,0 +1,110 @@
from wiktextract.config import POSSubtitleData

# argument of title template https://de.wiktionary.org/wiki/Vorlage:Wortart
POS_SECTIONS: dict[str, POSSubtitleData] = {
"Abkürzung (Deutsch)": {"pos": "abbrev"},
"Abkürzung": {"pos": "abbrev"},
"Abtönungspartikel": {"pos": "particle"},
"Adjektiv ": {"pos": "adj"},
"Adjektiv": {"pos": "adj"},
"Adverb ": {"pos": "adv"},
"Adverb": {"pos": "adv"},
"Affix": {"pos": "affix"},
"Antwortpartikel": {"pos": "particle"},
"Artikel": {"pos": "det"},
"Bruchzahlwort": {"pos": "num"},
"Buchstabe": {"pos": "character"},
"Demonstrativpronomen": {"pos": "pron"},
"Eigenname ": {"pos": "name"},
"Eigenname": {"pos": "name"},
"Enklitikon": {"pos": "suffix"},
"Fokuspartikel": {"pos": "particle"},
"Formel": {"pos": "phrase"},
"Gebundenes Lexem": {"pos": "lexeme"},
"Geflügeltes Wort": {"pos": "phrase"},
"Gentilname": {"pos": "name"},
"Gradpartikel": {"pos": "particle"},
"Grußformel": {"pos": "phrase"},
"Hilfsverb": {"pos": "aux"},
"Hiragana": {"pos": "character"},
"Indefinitpronomen": {"pos": "pron"},
"Infinitiv ": {"pos": "verb"},
"Infinitiv": {"pos": "verb"},
"Infix": {"pos": "infix"},
"Interfix": {"pos": "interfix"},
"Interjektion": {"pos": "intj"},
"Interrogativadverb": {"pos": "adv"},
"Interrogativpronomen": {"pos": "pron"},
"Kardinalzahl": {"pos": "num"},
"Kausaladverb": {"pos": "adv"},
"Kognomen": {"pos": "nomen"},
"Konjunktion": {"pos": "conj"},
"Konjunktionaladverb": {"pos": "adv"},
"Kontraktion": {"pos": "abbrev"},
"Lokaladverb": {"pos": "adv"},
"Merkspruch": {"pos": "phrase"},
"Modaladverb": {"pos": "adv"},
"Modalpartikel": {"pos": "particle"},
"Nachname": {"pos": "name"},
"Negationspartikel": {"pos": "particle"},
"Numerale": {"pos": "num"},
"Onomatopoetikum": {"pos": "intj"},
"Ortsnamengrundwort": {"pos": "name"},
"Ordinalzahl": {"pos": "num"},
"Partikel": {"pos": "particle"},
"Partikelverb": {"pos": "verb"},
"Patronym": {"pos": "name"},
"Personalpronomen ": {"pos": "pron"},
"Personalpronomen": {"pos": "pron"},
"Possessivpronomen ": {"pos": "pron"},
"Possessivpronomen": {"pos": "pron"},
"Postposition": {"pos": "postp"},
"Präfix": {"pos": "prefix"},
"Präfixoid": {"pos": "prefix"},
"Präposition ": {"pos": "prep"},
"Präposition": {"pos": "prep"},
"Pronomen": {"pos": "pron"},
"Pronominaladverb": {"pos": "adv"},
"Redewendung": {"pos": "phrase"},
"Reflexives Personalpronomen": {"pos": "pron"},
"Reflexivpronomen": {"pos": "pron"},
"Relativpronomen": {"pos": "pron"},
"Reziprokpronomen": {"pos": "pron"},
"Schriftzeichen": {"pos": "character"},
"Sprichwort": {"pos": "phrase"},
"Straßenname": {"pos": "name"},
"Subjunktion": {"pos": "conj"},
"Substantiv": {"pos": "noun"},
"Suffix": {"pos": "suffix"},
"Suffixoid": {"pos": "suffix"},
"Symbol": {"pos": "symbol"},
"Temporaladverb": {"pos": "adv"},
"Temporaldverb": {"pos": "adv"},
"Toponym": {"pos": "name"},
"Verb": {"pos": "verb"},
"Vergleichspartikel": {"pos": "particle"},
"Vervielfältigungszahlwort": {"pos": "num"},
"Vorname": {"pos": "name"},
"Wiederholungszahlwort": {"pos": "num"},
"Wortverbindung": {"pos": "phrase"},
"Zahlklassifikator": {"pos": "noun"},
"Zahlzeichen": {"pos": "num"},
"Zirkumfix": {"pos": "circumfix"},
"Zirkumposition": {"pos": "circumpos"},
}

LINKAGE_TITLES: dict[str, str] = {
"Gegenwörter": "antonyms",
"Holonyme": "holonyms",
"Oberbegriffe": "hypernyms",
"Redewendungen": "expressions",
"Sinnverwandte Wörter": "coordinate_terms",
"Sprichwörter": "proverbs",
"Synonyme": "synonyms",
"Unterbegriffe": "hyponyms",
"Wortbildungen": "derived",
}

ETYMOLOGY_TITLES: frozenset[str] = frozenset(["Herkunft"])

PRONUNCIATION_TITLES: frozenset[str] = frozenset(["Aussprache"])
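A few illustrative lookups against the new tables, mirroring how page.py and linkage.py use them (the section names below are just examples):

```python
from wiktextract.extractor.de.section_titles import LINKAGE_TITLES, POS_SECTIONS

pos_argument = "Substantiv"
if pos_argument in POS_SECTIONS:
    print(POS_SECTIONS[pos_argument]["pos"])  # noun

print(LINKAGE_TITLES.get("Synonyme"))    # synonyms
print(LINKAGE_TITLES.get("Aussprache"))  # None – a pronunciation section, not a linkage section
```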
5 changes: 3 additions & 2 deletions src/wiktextract/extractor/de/translation.py
@@ -101,7 +101,7 @@ def process_translation_list(
             translation_data.uncertain = True
 
         translation_data.word = clean_node(
-            wxr, {}, node.template_parameters.get(2)
+            wxr, None, node.template_parameters.get(2, "")
         )
 
         if node.template_name.removesuffix("?") == "Ü":
@@ -110,7 +110,8 @@
         if node.template_name.removesuffix("?") == "Üt":
             process_Üt_template(wxr, translation_data, node)
 
-        sense_translations.append(translation_data)
+        if len(translation_data.word) > 0:
+            sense_translations.append(translation_data)
     # Process modifiers at the end of the list
     process_modifiers(wxr, sense_translations, Translation(), modifiers)
 
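For illustration (hypothetical data, not from the commit): a {{Ü|fr|}} template with no translation now yields an empty word, which the new check drops instead of appending an empty Translation:

```python
from dataclasses import dataclass


@dataclass
class Translation:  # simplified stand-in for the extractor's Translation model
    word: str = ""


sense_translations = []
for word in ["maison", ""]:  # "" mimics clean_node() on an empty {{Ü|fr|}}
    translation_data = Translation(word=word)
    if len(translation_data.word) > 0:
        sense_translations.append(translation_data)

print([t.word for t in sense_translations])  # ['maison']
```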
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/de/utils.py
@@ -3,14 +3,14 @@
 from wikitextprocessor import NodeKind, WikiNode
 
 
-def match_senseid(node_text: str):
+def match_senseid(node_text: str) -> tuple[str, str]:
     match = re.match(r"\[(\d*(?:[a-z]|(?:\.\d+))?)\]", node_text)
 
     if match:
         senseid = match.group(1)
         node_text = node_text[match.end() :].strip()
     else:
-        senseid = None
+        senseid = ""
 
     return senseid, node_text
 
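A few illustrative calls showing the new empty-string sentinel; match_senseid is reproduced here (same regex as above) so the sketch is self-contained:

```python
import re


def match_senseid(node_text: str) -> tuple[str, str]:
    match = re.match(r"\[(\d*(?:[a-z]|(?:\.\d+))?)\]", node_text)
    if match:
        senseid = match.group(1)
        node_text = node_text[match.end():].strip()
    else:
        senseid = ""  # previously None; callers can now compare against ""
    return senseid, node_text


print(match_senseid("[1] Gebäude"))     # ('1', 'Gebäude')
print(match_senseid("[1a] Untersinn"))  # ('1a', 'Untersinn')
print(match_senseid("kein Marker"))     # ('', 'kein Marker')
```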
2 changes: 1 addition & 1 deletion tests/test_de_linkages.py
@@ -3,7 +3,7 @@
 from wikitextprocessor import Wtp
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.de.linkage import extract_linkages
-from wiktextract.extractor.de.models import Sense, WordEntry
+from wiktextract.extractor.de.models import WordEntry
 from wiktextract.wxr_context import WiktextractContext
 
 
15 changes: 15 additions & 0 deletions tests/test_de_translation.py
@@ -302,3 +302,18 @@ def test_de_process_translation_list_with_modifiers(self):
         self.assertEqual(
             translations, case["expected_sense_translations"]
         )
+
+    def test_empty_translation(self):
+        self.wxr.wtp.start_page("AM")
+        word_entry = WordEntry(word="AM", lang="English", lang_code="en")
+        root = self.wxr.wtp.parse(
+            """==== {{Übersetzungen}} ====
+{{Ü-Tabelle|Ü-Liste=
+*{{fr}}: [1] {{Ü|fr|}}
+}}"""
+        )
+        extract_translation(self.wxr, word_entry, root)
+        self.assertEqual(
+            word_entry.model_dump(exclude_defaults=True),
+            {"word": "AM", "lang": "English", "lang_code": "en"},
+        )