Commit

Merge pull request #512 from xxyzz/de
Move de edition title JSON files to Python code
xxyzz authored Feb 22, 2024
2 parents dde8c4c + fbad2f3 commit efa31c5
Showing 12 changed files with 150 additions and 137 deletions.
11 changes: 0 additions & 11 deletions src/wiktextract/data/de/linkage_subtitles.json

This file was deleted.

4 changes: 0 additions & 4 deletions src/wiktextract/data/de/other_subtitles.json

This file was deleted.

91 changes: 0 additions & 91 deletions src/wiktextract/data/de/pos_subtitles.json

This file was deleted.

12 changes: 4 additions & 8 deletions src/wiktextract/extractor/de/gloss.py
@@ -68,15 +68,11 @@ def process_gloss_list_item(
     gloss_text = clean_node(wxr, sense_data, list_item_node.children)
 
     senseid, gloss_text = match_senseid(gloss_text)
-
-    if senseid:
-        senseid = (
-            senseid
-            if senseid[0].isnumeric()
-            else parent_senseid + senseid
-        )
+    if senseid != "":
+        if not senseid[0].isnumeric():
+            senseid = parent_senseid + senseid
         sense_data.senseid = senseid
-    elif gloss_text.strip():
+    elif len(gloss_text.strip()) > 0:
         wxr.wtp.debug(
             f"Failed to extract sense number from gloss node: {list_item_node}",
             sortid="extractor/de/glosses/extract_glosses/28",
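For illustration only (not part of this commit): with the empty-string sentinel, the new branch composes hierarchical sense IDs — a non-numeric marker such as "a" is appended to the parent's ID, while a numeric marker stands on its own. The helper below is hypothetical and simply mirrors the logic shown in the diff:

```python
def combine_senseid(parent_senseid: str, senseid: str) -> str:
    # Mirrors the new branch in process_gloss_list_item: non-numeric markers
    # (e.g. "a") are prefixed with the parent's ID, numeric ones stand alone.
    if senseid != "" and not senseid[0].isnumeric():
        return parent_senseid + senseid
    return senseid


assert combine_senseid("1", "a") == "1a"  # sub-sense under sense 1
assert combine_senseid("1", "2") == "2"   # numeric marker kept as-is
assert combine_senseid("1", "") == ""     # no marker found
```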
4 changes: 3 additions & 1 deletion src/wiktextract/extractor/de/linkage.py
@@ -7,11 +7,13 @@
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
+from .section_titles import LINKAGE_TITLES
+
 
 def extract_linkages(
     wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
 ):
-    linkage_type = wxr.config.LINKAGE_SUBTITLES.get(level_node.largs[0][0])
+    linkage_type = LINKAGE_TITLES.get(level_node.largs[0][0])
     for list_node in level_node.find_child(NodeKind.LIST):
         for list_item in list_node.find_child(NodeKind.LIST_ITEM):
             # Get the senseids
19 changes: 7 additions & 12 deletions src/wiktextract/extractor/de/page.py
@@ -1,4 +1,3 @@
-import copy
 import logging
 from typing import Union
 
@@ -12,6 +11,7 @@
 from .gloss import extract_glosses
 from .linkage import extract_linkages
 from .pronunciation import extract_pronunciation
+from .section_titles import LINKAGE_TITLES, POS_SECTIONS
 from .translation import extract_translation
 
 # Templates that are used to form panels on pages and that should be ignored in
@@ -88,10 +88,7 @@ def parse_section(
         wxr.config.capture_translations and section_name == "Übersetzungen"
     ):
         extract_translation(wxr, page_data[-1], level_node_or_children)
-    elif (
-        wxr.config.capture_linkages
-        and section_name in wxr.config.LINKAGE_SUBTITLES
-    ):
+    elif wxr.config.capture_linkages and section_name in LINKAGE_TITLES:
         extract_linkages(wxr, page_data[-1], level_node_or_children)
 
 
@@ -130,18 +127,16 @@ def process_pos_section(
         # at all or redundant with form tables.
         return
 
-    pos_type = wxr.config.POS_SUBTITLES.get(pos_argument)
-
-    if pos_type is None:
+    pos = ""
+    if pos_argument in POS_SECTIONS:
+        pos = POS_SECTIONS[pos_argument]["pos"]
+    else:
         wxr.wtp.debug(
             f"Unknown POS type: {pos_argument}",
             sortid="extractor/de/page/process_pos_section/55",
         )
         return
-    pos = pos_type["pos"]
 
     base_data.pos = pos
-    page_data.append(copy.deepcopy(base_data))
+    page_data.append(base_data.model_copy(deep=True))
 
     wxr.wtp.start_section(page_data[-1].lang_code + "_" + pos)
 
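A minimal sketch of the deep-copy change, assuming a Pydantic v2 model along the lines of the extractor's WordEntry (the Entry class here is a hypothetical stand-in): model_copy(deep=True) replaces copy.deepcopy, so nested fields are not shared between entries.

```python
from pydantic import BaseModel


class Entry(BaseModel):  # hypothetical stand-in for the extractor's models
    word: str = ""
    senses: list[str] = []


base = Entry(word="Haus", senses=["building"])
copied = base.model_copy(deep=True)  # Pydantic v2 equivalent of copy.deepcopy
copied.senses.append("household")

print(base.senses)    # ['building'] – the nested list is not shared
print(copied.senses)  # ['building', 'household']
```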
110 changes: 110 additions & 0 deletions src/wiktextract/extractor/de/section_titles.py
@@ -0,0 +1,110 @@
from wiktextract.config import POSSubtitleData

# argument of title template https://de.wiktionary.org/wiki/Vorlage:Wortart
POS_SECTIONS: dict[str, POSSubtitleData] = {
"Abkürzung (Deutsch)": {"pos": "abbrev"},
"Abkürzung": {"pos": "abbrev"},
"Abtönungspartikel": {"pos": "particle"},
"Adjektiv ": {"pos": "adj"},
"Adjektiv": {"pos": "adj"},
"Adverb ": {"pos": "adv"},
"Adverb": {"pos": "adv"},
"Affix": {"pos": "affix"},
"Antwortpartikel": {"pos": "particle"},
"Artikel": {"pos": "det"},
"Bruchzahlwort": {"pos": "num"},
"Buchstabe": {"pos": "character"},
"Demonstrativpronomen": {"pos": "pron"},
"Eigenname ": {"pos": "name"},
"Eigenname": {"pos": "name"},
"Enklitikon": {"pos": "suffix"},
"Fokuspartikel": {"pos": "particle"},
"Formel": {"pos": "phrase"},
"Gebundenes Lexem": {"pos": "lexeme"},
"Geflügeltes Wort": {"pos": "phrase"},
"Gentilname": {"pos": "name"},
"Gradpartikel": {"pos": "particle"},
"Grußformel": {"pos": "phrase"},
"Hilfsverb": {"pos": "aux"},
"Hiragana": {"pos": "character"},
"Indefinitpronomen": {"pos": "pron"},
"Infinitiv ": {"pos": "verb"},
"Infinitiv": {"pos": "verb"},
"Infix": {"pos": "infix"},
"Interfix": {"pos": "interfix"},
"Interjektion": {"pos": "intj"},
"Interrogativadverb": {"pos": "adv"},
"Interrogativpronomen": {"pos": "pron"},
"Kardinalzahl": {"pos": "num"},
"Kausaladverb": {"pos": "adv"},
"Kognomen": {"pos": "nomen"},
"Konjunktion": {"pos": "conj"},
"Konjunktionaladverb": {"pos": "adv"},
"Kontraktion": {"pos": "abbrev"},
"Lokaladverb": {"pos": "adv"},
"Merkspruch": {"pos": "phrase"},
"Modaladverb": {"pos": "adv"},
"Modalpartikel": {"pos": "particle"},
"Nachname": {"pos": "name"},
"Negationspartikel": {"pos": "particle"},
"Numerale": {"pos": "num"},
"Onomatopoetikum": {"pos": "intj"},
"Ortsnamengrundwort": {"pos": "name"},
"Ordinalzahl": {"pos": "num"},
"Partikel": {"pos": "particle"},
"Partikelverb": {"pos": "verb"},
"Patronym": {"pos": "name"},
"Personalpronomen ": {"pos": "pron"},
"Personalpronomen": {"pos": "pron"},
"Possessivpronomen ": {"pos": "pron"},
"Possessivpronomen": {"pos": "pron"},
"Postposition": {"pos": "postp"},
"Präfix": {"pos": "prefix"},
"Präfixoid": {"pos": "prefix"},
"Präposition ": {"pos": "prep"},
"Präposition": {"pos": "prep"},
"Pronomen": {"pos": "pron"},
"Pronominaladverb": {"pos": "adv"},
"Redewendung": {"pos": "phrase"},
"Reflexives Personalpronomen": {"pos": "pron"},
"Reflexivpronomen": {"pos": "pron"},
"Relativpronomen": {"pos": "pron"},
"Reziprokpronomen": {"pos": "pron"},
"Schriftzeichen": {"pos": "character"},
"Sprichwort": {"pos": "phrase"},
"Straßenname": {"pos": "name"},
"Subjunktion": {"pos": "conj"},
"Substantiv": {"pos": "noun"},
"Suffix": {"pos": "suffix"},
"Suffixoid": {"pos": "suffix"},
"Symbol": {"pos": "symbol"},
"Temporaladverb": {"pos": "adv"},
"Temporaldverb": {"pos": "adv"},
"Toponym": {"pos": "name"},
"Verb": {"pos": "verb"},
"Vergleichspartikel": {"pos": "particle"},
"Vervielfältigungszahlwort": {"pos": "num"},
"Vorname": {"pos": "name"},
"Wiederholungszahlwort": {"pos": "num"},
"Wortverbindung": {"pos": "phrase"},
"Zahlklassifikator": {"pos": "noun"},
"Zahlzeichen": {"pos": "num"},
"Zirkumfix": {"pos": "circumfix"},
"Zirkumposition": {"pos": "circumpos"},
}

LINKAGE_TITLES: dict[str, str] = {
"Gegenwörter": "antonyms",
"Holonyme": "holonyms",
"Oberbegriffe": "hypernyms",
"Redewendungen": "expressions",
"Sinnverwandte Wörter": "coordinate_terms",
"Sprichwörter": "proverbs",
"Synonyme": "synonyms",
"Unterbegriffe": "hyponyms",
"Wortbildungen": "derived",
}

ETYMOLOGY_TITLES: frozenset[str] = frozenset(["Herkunft"])

PRONUNCIATION_TITLES: frozenset[str] = frozenset(["Aussprache"])
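A few illustrative lookups against the new tables, mirroring how page.py and linkage.py use them (the section names below are just examples):

```python
from wiktextract.extractor.de.section_titles import LINKAGE_TITLES, POS_SECTIONS

pos_argument = "Substantiv"
if pos_argument in POS_SECTIONS:
    print(POS_SECTIONS[pos_argument]["pos"])  # noun

print(LINKAGE_TITLES.get("Synonyme"))    # synonyms
print(LINKAGE_TITLES.get("Aussprache"))  # None – a pronunciation section, not a linkage section
```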
5 changes: 3 additions & 2 deletions src/wiktextract/extractor/de/translation.py
@@ -101,7 +101,7 @@ def process_translation_list(
             translation_data.uncertain = True
 
         translation_data.word = clean_node(
-            wxr, {}, node.template_parameters.get(2)
+            wxr, None, node.template_parameters.get(2, "")
         )
 
         if node.template_name.removesuffix("?") == "Ü":
@@ -110,7 +110,8 @@
         if node.template_name.removesuffix("?") == "Üt":
             process_Üt_template(wxr, translation_data, node)
 
-        sense_translations.append(translation_data)
+        if len(translation_data.word) > 0:
+            sense_translations.append(translation_data)
     # Process modifiers at the end of the list
     process_modifiers(wxr, sense_translations, Translation(), modifiers)
 
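For illustration (hypothetical data, not from the commit): a {{Ü|fr|}} template with no translation now yields an empty word, which the new check drops instead of appending an empty Translation:

```python
from dataclasses import dataclass


@dataclass
class Translation:  # simplified stand-in for the extractor's Translation model
    word: str = ""


sense_translations = []
for word in ["maison", ""]:  # "" mimics clean_node() on an empty {{Ü|fr|}}
    translation_data = Translation(word=word)
    if len(translation_data.word) > 0:
        sense_translations.append(translation_data)

print([t.word for t in sense_translations])  # ['maison']
```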
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/de/utils.py
@@ -3,14 +3,14 @@
 from wikitextprocessor import NodeKind, WikiNode
 
 
-def match_senseid(node_text: str):
+def match_senseid(node_text: str) -> tuple[str, str]:
     match = re.match(r"\[(\d*(?:[a-z]|(?:\.\d+))?)\]", node_text)
 
     if match:
         senseid = match.group(1)
         node_text = node_text[match.end() :].strip()
     else:
-        senseid = None
+        senseid = ""
 
     return senseid, node_text
 
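A few illustrative calls showing the new empty-string sentinel; match_senseid is reproduced here (same regex as above) so the sketch is self-contained:

```python
import re


def match_senseid(node_text: str) -> tuple[str, str]:
    match = re.match(r"\[(\d*(?:[a-z]|(?:\.\d+))?)\]", node_text)
    if match:
        senseid = match.group(1)
        node_text = node_text[match.end():].strip()
    else:
        senseid = ""  # previously None; callers can now compare against ""
    return senseid, node_text


print(match_senseid("[1] Gebäude"))     # ('1', 'Gebäude')
print(match_senseid("[1a] Untersinn"))  # ('1a', 'Untersinn')
print(match_senseid("kein Marker"))     # ('', 'kein Marker')
```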
2 changes: 1 addition & 1 deletion tests/test_de_linkages.py
@@ -3,7 +3,7 @@
 from wikitextprocessor import Wtp
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.de.linkage import extract_linkages
-from wiktextract.extractor.de.models import Sense, WordEntry
+from wiktextract.extractor.de.models import WordEntry
 from wiktextract.wxr_context import WiktextractContext
 
 
15 changes: 15 additions & 0 deletions tests/test_de_translation.py
@@ -302,3 +302,18 @@ def test_de_process_translation_list_with_modifiers(self):
         self.assertEqual(
             translations, case["expected_sense_translations"]
         )
+
+    def test_empty_translation(self):
+        self.wxr.wtp.start_page("AM")
+        word_entry = WordEntry(word="AM", lang="English", lang_code="en")
+        root = self.wxr.wtp.parse(
+            """==== {{Übersetzungen}} ====
+{{Ü-Tabelle|Ü-Liste=
+*{{fr}}: [1] {{Ü|fr|}}
+}}"""
+        )
+        extract_translation(self.wxr, word_entry, root)
+        self.assertEqual(
+            word_entry.model_dump(exclude_defaults=True),
+            {"word": "AM", "lang": "English", "lang_code": "en"},
+        )