
Merge pull request #565 from xxyzz/de
Fix exceptions/warnings and translate some raw tags in de edition
xxyzz authored Mar 29, 2024
2 parents e15ea73 + 4175115 commit 8944339
Showing 8 changed files with 213 additions and 84 deletions.
24 changes: 11 additions & 13 deletions src/wiktextract/extractor/de/example.py
@@ -51,35 +51,33 @@ def extract_examples(
for ref_node in ref_nodes:
extract_reference(wxr, example_data, ref_node)

example_text = clean_node(wxr, {}, list_item_node.children)
example_text = clean_node(wxr, None, list_item_node.children)

senseid, example_text = match_senseid(example_text)

if example_text:
if len(example_text) > 0:
example_data.text = example_text

if senseid:
for sense in word_entry.senses:
if sense.senseid == senseid:
sense.examples.append(copy.deepcopy(example_data))

else:
if example_data:
if len(senseid) > 0:
for sense in word_entry.senses:
if sense.senseid == senseid:
sense.examples.append(copy.deepcopy(example_data))
else:
wxr.wtp.debug(
f"Found example data without senseid and text: {example_data}",
f"Found example data without senseid: {example_data}",
sortid="extractor/de/examples/extract_examples/28",
)

for non_list_node in level_node.invert_find_child(NodeKind.LIST):
wxr.wtp.debug(
f"Found unexpected non-list node in example section: {non_list_node}",
f"Found unexpected non-list node in examples: {non_list_node}",
sortid="extractor/de/examples/extract_examples/33",
)


def extract_reference(
wxr: WiktextractContext, example_data: Example, ref_node: WikiNode
):
example_data.raw_ref = clean_node(wxr, {}, ref_node.children)
example_data.raw_ref = clean_node(wxr, None, ref_node.children)

template_nodes = list(ref_node.find_child(NodeKind.TEMPLATE))

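For context on the `senseid` checks above: `match_senseid` is imported from `utils.py` and is not part of this diff; it splits a leading sense marker off the example line and returns a `(senseid, text)` pair. Below is a rough, self-contained approximation, assuming markers of the form `[1]` or `[2.2]`; the actual pattern in `utils.py` may differ.

```python
import re

# Hypothetical stand-in for utils.match_senseid: strip a leading "[1]"-style
# marker from an example line; returns ("", text) when no marker is present.
def match_senseid_sketch(text: str) -> tuple[str, str]:
    m = re.match(r"\[([\da-z.]+)\]\s*", text)
    if m is None:
        return "", text
    return m.group(1), text[m.end():]

print(match_senseid_sketch("[2.2] Beispielsatz."))  # ('2.2', 'Beispielsatz.')
print(match_senseid_sketch("Beispielsatz."))        # ('', 'Beispielsatz.')
```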
40 changes: 20 additions & 20 deletions src/wiktextract/extractor/de/gloss.py
@@ -1,11 +1,10 @@
import re

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode, TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .models import Sense, WordEntry
from .tags import translate_raw_tags
from .utils import match_senseid


@@ -59,19 +58,32 @@ def process_gloss_list_item(
raw_tag = clean_node(wxr, None, k_arg_value)
sense_data.raw_tags.append(raw_tag)
clean_node(wxr, sense_data, gloss_node)
elif gloss_node.template_name.endswith("."):
raw_tag = clean_node(
wxr, sense_data, gloss_node
).removesuffix(":")
sense_data.raw_tags.append(raw_tag)
elif gloss_node.template_name in (
"QS Herkunft",
"QS Bedeutungen",
):
continue
else:
gloss_nodes.append(gloss_node)
elif (
isinstance(gloss_node, WikiNode)
and gloss_node.kind == NodeKind.ITALIC
):
raw_tag = clean_node(wxr, None, gloss_node).removesuffix(
":"
)
sense_data.raw_tags.append(raw_tag)
italic_text = clean_node(wxr, None, gloss_node)
if italic_text.endswith(":"):
for raw_tag in italic_text.removesuffix(":").split(
", "
):
raw_tag = raw_tag.strip()
if len(raw_tag) > 0:
sense_data.raw_tags.append(raw_tag)
else:
gloss_nodes.append(italic_text)
elif not (
isinstance(gloss_node, WikiNode)
and gloss_node.kind == NodeKind.LIST
@@ -95,7 +107,8 @@ def process_gloss_list_item(
)

if len(gloss_text) > 0:
sense_data.glosses.append(gloss_text)
sense_data.glosses.append(gloss_text.removeprefix(", "))
translate_raw_tags(sense_data)
word_entry.senses.append(sense_data)

for sub_list_node in list_item_node.find_child(NodeKind.LIST):
@@ -113,16 +126,3 @@
)
continue
return parent_sense


def extract_tags_from_gloss_text(sense_data: Sense, gloss_text: str) -> None:
parts = gloss_text.split(":", 1)
if len(parts) > 1:
tags_part = parts[0].strip()

categories = [c.strip() for c in re.split(",", tags_part)]
if all(c.isalnum() for c in categories):
sense_data.raw_tags.extend(categories)
return parts[1].strip()

return gloss_text
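The removed `extract_tags_from_gloss_text` helper appears to be superseded by the italic-node handling added above: text from an italic node that ends with `:` is split on `, ` into raw tags, while any other italic text stays part of the gloss. A minimal standalone sketch of that rule (the helper name is hypothetical; in the diff the logic is inline in `process_gloss_list_item`):

```python
# Hypothetical helper mirroring the new italic handling: text from an italic
# node ending with ":" becomes a list of raw tags, otherwise it is kept as
# gloss text.
def split_italic_modifier(italic_text: str) -> tuple[list[str], str]:
    if italic_text.endswith(":"):
        raw_tags = [
            t.strip()
            for t in italic_text.removesuffix(":").split(", ")
            if len(t.strip()) > 0
        ]
        return raw_tags, ""
    return [], italic_text

print(split_italic_modifier("Kurzwort, Akronym:"))  # (['Kurzwort', 'Akronym'], '')
print(split_italic_modifier("übertragen"))          # ([], 'übertragen')
```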
1 change: 1 addition & 0 deletions src/wiktextract/extractor/de/models.py
@@ -166,3 +166,4 @@ class WordEntry(BaseModelWrap):
synonyms: list[Linkage] = []
tags: list[str] = []
raw_tags: list[str] = []
categories: list[str] = []
5 changes: 4 additions & 1 deletion src/wiktextract/extractor/de/page.py
@@ -10,7 +10,7 @@
from .example import extract_examples
from .gloss import extract_glosses
from .linkage import extract_linkages
from .models import WordEntry
from .models import Sense, WordEntry
from .pronunciation import extract_pronunciation
from .section_titles import LINKAGE_TITLES, POS_SECTIONS
from .translation import extract_translation
@@ -200,4 +200,7 @@ def parse_page(
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
parse_section(wxr, page_data, base_data, level3_node)

for data in page_data:
if len(data.senses) == 0:
data.senses.append(Sense(tags=["no-gloss"]))
return [d.model_dump(exclude_defaults=True) for d in page_data]
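The new loop at the end of `parse_page` gives every entry that ends up with no senses a single placeholder sense tagged `no-gloss`. With `model_dump(exclude_defaults=True)`, such an entry would look roughly like the sketch below; the headword and other field values are illustrative, not taken from a real page.

```python
# Illustrative shape of an entry whose section produced no glosses; only the
# "senses" value comes from this diff, the other fields depend on the page.
no_gloss_entry = {
    "word": "Beispielwort",  # hypothetical headword
    "lang": "Deutsch",
    "lang_code": "de",
    "pos": "noun",
    "senses": [{"tags": ["no-gloss"]}],
}
```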
130 changes: 130 additions & 0 deletions src/wiktextract/extractor/de/tags.py
@@ -0,0 +1,130 @@
from .models import WordEntry

# https://de.wiktionary.org/wiki/Vorlage:K
K_TEMPLATE_TAGS = {
"Abl.": "ablative",
"Ablativ": "ablative",
"abw.": "derogatory",
"AE": "US",
"AmE": "US",
"adv.": "adverbial",
"Akkusativ": "accusative",
"alemann.": "Alemannic",
"alemannisch": "Alemannic",
"allg.": "general",
"allgemein": "general",
"alltagsspr.": "colloquial",
"amtsspr.": "officialese",
# "ansonsten": "otherwise", # combined with other text
"attr.": "attributive",
# "auch": "also",
"bair.": "Bavarian",
"bairisch": "Bavarian",
"bar.": "Bavarian",
"BE": "British",
"BrE": "British",
"Bedva.": "outdated",
"Bedvatd.": "outdated",
# "bei": "",
# "bes.": "especially",
# "besonders": "especially",
# "beziehungsweise": "",
# "bzw.": "",
# "bildungsspr.": "",
# "bis": "",
# "bisweilen": "",
# "das": "",
"Dativ": "dative",
# "DDR": "",
# "der": "",
"dichter.": "poetic",
# "die": "",
"Dim.": "diminutive",
"Dimin.": "diminutive",
"Diminutiv": "diminutive",
# "eher": "",
"erzg.": "Erzgebirgisch",
"erzgeb.": "Erzgebirgisch",
"erzgebirgisch": "Erzgebirgisch",
"euph.": "euphemistic",
"fachspr.": "jargon",
"fam.": "familiär",
"fig": "figurative",
"fig.": "figurative",
# "früher": "",
# "gegenwartslateinisch": "",
"geh.": "gehoben",
"Genitiv": "genitive",
"gsm": "Swiss German",
"häufig": "often",
"haben": "auxiliary",
"hebben": "auxiliary",
"hauptsächlich": "primarily",
"hist.": "historical",
"ieS": "narrowly",
"i.e.S.": "narrowly",
"i. e. S.": "narrowly",
# "im": "",
# "in": "",
# "in Bezug auf": "relational",
"indekl.": "indeclinable",
# "insbes.": "",
"Instrumental": "instrumental",
"intrans.": "intransitive",
"intransitiv": "intransitive",
# "iPl": "in plural",
"iron.": "ironic",
# "iwS": "",
# "jugendspr.": "",
"kinderspr.": "childish",
"kirchenlateinisch": "Church Latin",
"klasslat.": "Classical Latin",
"klassischlateinisch": "Classical Latin",
"kPl.": "no-plural",
"kSg.": "no-singulative",
"kSt.": "no-comparative",
"landsch.": "regional",
"lautm.": "onomatopoeic",
"Ling.": "linguistics",
"mA": "accusative",
"md.": "Central German",
"mdal.": "dialectal",
"Med.": "medicine", # topic
# "meist": "mostly",
# "meistens": "mostly",
"metaphor.": "metaphoric",
"meton.": "metonymically",
"mG": "genitive",
"mitteld.": "Central German",
# "mitunter": "",
"mlat.": "Medieval Latin",
"mittellateinisch": "Medieval Latin",
"mundartl.": "dialectal",
"nDu.": "only-dual",
"nigr.": "Niger",
"nigrisch": "Niger",
"nkLat.": "post-Classical Latin",
"nachklassischlateinisch": "post-Classical Latin",
"nlat.": "New Latin",
"neulateinisch": "New Latin",
"nordd.": "North German",
"norddeutsch": "North German",
"nordwestd.": "Northwestern Germany",
"nPl.": "plural-only",
"Österreich": "Austrian German",
"österr.": "Austrian German",
"österreichisch": "Austrian German",
"ostfränkisch": "East Franconian German",
"pej.": "pejorative",
"poet.": "poetic",
}


def translate_raw_tags(data: WordEntry) -> None:
raw_tags = []
for raw_tag in data.raw_tags:
if raw_tag in K_TEMPLATE_TAGS:
data.tags.append(K_TEMPLATE_TAGS[raw_tag])
else:
raw_tags.append(raw_tag)
data.raw_tags = raw_tags
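A short usage sketch of `translate_raw_tags`: abbreviations found in `K_TEMPLATE_TAGS` are moved into `tags`, everything else stays in `raw_tags`. Although the annotation says `WordEntry`, gloss.py calls it on sense data, and both models expose `tags` and `raw_tags`, so the sketch uses `Sense`:

```python
from wiktextract.extractor.de.models import Sense
from wiktextract.extractor.de.tags import translate_raw_tags

# "intransitiv" has a mapping in K_TEMPLATE_TAGS, "sich befinden" does not.
sense = Sense(raw_tags=["intransitiv", "sich befinden"])
translate_raw_tags(sense)

print(sense.tags)      # ['intransitive']
print(sense.raw_tags)  # ['sich befinden']
```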
85 changes: 38 additions & 47 deletions tests/test_de_gloss.py
@@ -4,9 +4,7 @@
from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.gloss import (
extract_glosses,
extract_tags_from_gloss_text,
)
from wiktextract.extractor.de.models import Sense
from wiktextract.extractor.es.models import WordEntry
from wiktextract.wxr_context import WiktextractContext

@@ -144,55 +142,15 @@ def test_k_template_multiple_tags(self):
"Verb transitiv (Deutsch)",
"Österreichisches Deutsch",
],
"raw_tags": ["trans.", "besonders", "bayrisch", "österr."],
"tags": ["Austrian German"],
"raw_tags": ["trans.", "besonders", "bayrisch"],
"glosses": ["Vieh auf der Alm halten"],
"senseid": "1",
},
],
)

def test_de_extract_tags_from_gloss_text(self):
test_cases = [
# https://de.wiktionary.org/wiki/Hengst
{
"input": "Zoologie: männliches Tier aus der Familie der Einhufer und Kamele",
"expected_tags": ["Zoologie"],
"expected_gloss": "männliches Tier aus der Familie der Einhufer und Kamele",
},
# https://de.wiktionary.org/wiki/ARD
{
"input": "umgangssprachlich, Kurzwort, Akronym: für das erste Fernsehprogramm der ARD",
"expected_tags": ["umgangssprachlich", "Kurzwort", "Akronym"],
"expected_gloss": "für das erste Fernsehprogramm der ARD",
},
# https://de.wiktionary.org/wiki/Endspiel
{
"input": "Drama von Samuel Beckett: Menschliche Existenz in der Endphase des Verfalls und der vergeblichen Suche nach einem Ausweg",
"expected_tags": None,
"expected_gloss": "Drama von Samuel Beckett: Menschliche Existenz in der Endphase des Verfalls und der vergeblichen Suche nach einem Ausweg",
},
# Add more test cases as needed
]
for case in test_cases:
with self.subTest(case=case):
sense_data = Sense()

gloss_text = extract_tags_from_gloss_text(
sense_data, case["input"]
)

if case["expected_tags"] is None:
self.assertEqual(
sense_data.model_dump(exclude_defaults=True), {}
)
else:
self.assertEqual(
sense_data.raw_tags,
case["expected_tags"],
)
self.assertEqual(gloss_text, case["expected_gloss"])

def test_handle_sense_modifier(self):
def test_italic_sense_modifier(self):
# https://de.wiktionary.org/wiki/habitare
wikitext = """
* {{trans.}}
@@ -231,14 +189,47 @@ def test_handle_sense_modifier(self):
"senseid": "2.2",
},
{
"raw_tags": ["intransitiv", "sich befinden"],
"tags": ["intransitive"],
"raw_tags": ["sich befinden"],
"glosses": ["wohnen"],
"senseid": "3",
},
{
"raw_tags": ["intransitiv", "übertragen"],
"tags": ["intransitive"],
"raw_tags": ["übertragen"],
"glosses": ["sich aufhalten, heimisch sein, zu Hause sein"],
"senseid": "4",
},
],
)

def test_italit_node_multiple_raw_tags(self):
self.wxr.wtp.add_page(
"Vorlage:K", 10, "<i>[[Deutschland]],&#32;[[Fernsehen]]&#58;</i>"
)
self.wxr.wtp.add_page("Vorlage:ugs.", 10, "''[[umgangssprachlich]]''")
self.wxr.wtp.start_page("ARD")
root = self.wxr.wtp.parse(
"""===Bedeutungen===
:[2] {{K|Deutschland|Fernsehen}} {{ugs.}}, ''[[Kurzwort]], [[Akronym]]:'' für das erste Fernsehprogramm der ARD"""
)
word_entry = WordEntry(
lang="Deutsch", lang_code="de", word="ARD", pos="noun"
)
extract_glosses(self.wxr, word_entry, root.children[0])
self.assertEqual(
[s.model_dump(exclude_defaults=True) for s in word_entry.senses],
[
{
"raw_tags": [
"Deutschland",
"Fernsehen",
"umgangssprachlich",
"Kurzwort",
"Akronym",
],
"glosses": ["für das erste Fernsehprogramm der ARD"],
"senseid": "2",
},
],
)