Skip to content

Commit

Permalink
Process K template in German Wiktionary glosses
Browse files Browse the repository at this point in the history
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
  • Loading branch information
empiriker committed Oct 16, 2023
1 parent 7da1f49 commit 433dd7c
Show file tree
Hide file tree
Showing 2 changed files with 162 additions and 14 deletions.
33 changes: 27 additions & 6 deletions src/wiktextract/extractor/de/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
import copy


def extract_glosses(
Expand Down Expand Up @@ -63,7 +64,7 @@ def process_gloss_list_item(
raw_gloss = clean_node(wxr, {}, list_item_node.children)
gloss_data["raw_glosses"] = [raw_gloss]

extract_categories_from_gloss_node(wxr, gloss_data, list_item_node)
process_K_template(wxr, gloss_data, list_item_node)

gloss_text = clean_node(wxr, gloss_data, list_item_node.children)

Expand Down Expand Up @@ -117,17 +118,37 @@ def handle_sense_modifier(wxr, list_item_node):
pass


def extract_categories_from_gloss_node(
def process_K_template(
wxr: WiktextractContext,
gloss_data: defaultdict(list),
list_item_node: NodeKind.LIST_ITEM,
) -> None:
for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
if template_node.template_name == "K":
categories = template_node.template_parameters.values()

categories = [clean_node(wxr, {}, [c]) for c in categories]

categories = []

temp_node = copy.deepcopy(template_node)
for key, value in template_node.template_parameters.items():
if isinstance(key, int):
temp_node.largs = temp_node.largs[:1] + [[value]]
# Cleaned K template will always end with ":". Remove it.
category = clean_node(wxr, {}, temp_node)[:-1]
if category:
categories.append(category)
if key == "ft":
# ft (free text) is used liberally to modify the context
# template. Sometimes it seems to belong rather to the
# gloss itself. Most of the time it is not useful.
# XXX Treat free text in K templates.
continue

prep = template_node.template_parameters.get("Prä")
case = template_node.template_parameters.get("Kas")
category = (prep if prep else "") + (" + " + case if case else "")
if category:
categories.append(category)

# Remove the template_node from the children of list_item_node
list_item_node.children = [
c for c in list_item_node.children if c != template_node
]
Expand Down
143 changes: 135 additions & 8 deletions tests/test_de_gloss.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,35 @@
import unittest
from collections import defaultdict
from unittest.mock import patch

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.gloss import (
extract_glosses,
extract_categories_from_gloss_node,
process_K_template,
extract_categories_from_gloss_text,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


def mock_clean_node(wxr, sense_data, value):
    """Stand-in for ``clean_node`` used when patching the gloss module in tests.

    Instead of expanding the K template, it reads the first element of the
    template's first positional argument (``value.largs[1][0]``) and returns
    the spelled-out label for known abbreviations, or the argument unchanged
    otherwise. A trailing ":" is always appended.

    ``wxr`` and ``sense_data`` are accepted only to match the patched
    function's call signature; they are not used.
    """
    param = value.largs[1][0]
    # Named ``expansions`` rather than ``map`` so the builtin is not shadowed.
    expansions = {
        "trans.": "transitiv",
        "intrans.": "intransitiv",
        "refl.": "reflexiv",
        "kPl.": "kein Plural",
        "ugs.": "umgangssprachlich",
        "sein": "Hilfsverb sein",
        "österr.": "österreichisch",
    }
    # Unknown abbreviations fall through unchanged.
    return expansions.get(param, param) + ":"


class TestGlossList(unittest.TestCase):
maxDiff = None

Expand Down Expand Up @@ -92,7 +109,7 @@ def test_de_extract_glosses_with_subglosses(self):

def test_de_extract_glosses_with_only_subglosses(self):
self.wxr.wtp.start_page("")
self.wxr.wtp.add_page("Vorlage:K", 10, "")
self.wxr.wtp.add_page("Vorlage:K", 10, "category:")
root = self.wxr.wtp.parse(
":[1] {{K|category}}\n::[a] subglossA\n::[1b] subglossB"
)
Expand Down Expand Up @@ -123,16 +140,17 @@ def test_de_extract_glosses_with_only_subglosses(self):
],
)

def test_de_extract_categories_from_gloss_node(self):
@patch("wiktextract.extractor.de.gloss.clean_node", mock_clean_node)
def test_process_K_template_removes_K_template_nodes(self):
self.wxr.wtp.start_page("")
self.wxr.wtp.add_page("Vorlage:K", 10, "")
root = self.wxr.wtp.parse(":[1] {{K|category1|category2}} gloss1")

list_item_node = root.children[0].children[0]
# self.wxr.wtp.add_page("Vorlage:K", 10, "")
root = self.wxr.wtp.parse("{{K|category1|category2}} gloss1")

gloss_data = defaultdict(list)

extract_categories_from_gloss_node(self.wxr, gloss_data, list_item_node)
self.assertEqual(len(root.children), 2)

process_K_template(self.wxr, gloss_data, root)

self.assertEqual(
gloss_data,
Expand All @@ -141,6 +159,115 @@ def test_de_extract_categories_from_gloss_node(self):
},
)

self.assertEqual(len(root.children), 1)

@patch("wiktextract.extractor.de.gloss.clean_node", mock_clean_node)
def test_process_K_template(self):
# Test cases chosen from:
# https://de.wiktionary.org/wiki/Vorlage:K/Doku
test_cases = [
# https://de.wiktionary.org/wiki/delektieren
{"input": "{{K|refl.}}", "expected_categories": ["reflexiv"]},
# https://de.wiktionary.org/wiki/delektieren
{"input": "{{K|trans.}}", "expected_categories": ["transitiv"]},
# https://de.wiktionary.org/wiki/abbreviare
{
"input": "{{K|trans.|ft=etwas in seinem [[räumlich]]en oder [[zeitlich]]en [[Ausmaß]] verringern|spr=it}}",
"expected_categories": ["transitiv"],
},
# https://de.wiktionary.org/wiki/abbreviare
{
"input": "{{K|trans.|Linguistik|Wortbildung|spr=it}}",
"expected_categories": [
"transitiv",
"Linguistik",
"Wortbildung",
],
},
# https://de.wiktionary.org/wiki/Bakterie
{"input": "{{K|Biologie}}", "expected_categories": ["Biologie"]},
# https://de.wiktionary.org/wiki/Kraut
{
"input": "{{K|kPl.|ugs.}}",
"expected_categories": ["kein Plural", "umgangssprachlich"],
},
# https://de.wiktionary.org/wiki/almen
# Ideally we would filter out "besonders" but there doesn't seem
# to be a general rule for which categories are semantically relevant
{
"input": "{{K|trans.|t1=;|besonders|t2=_|bayrisch|österr.}}",
"expected_categories": [
"transitiv",
"besonders",
"bayrisch",
"österreichisch",
],
},
# https://de.wiktionary.org/wiki/Agentur
{
"input": "{{K|Behörde|ft=seit etwa 2000 in Deutschland}}",
"expected_categories": ["Behörde"],
},
# https://de.wiktionary.org/wiki/Objekt
{
"input": "{{K|Astronomie|ft=kurz für}}",
"expected_categories": ["Astronomie"],
},
# https://de.wiktionary.org/wiki/einlaufen
{
"input": "{{K|intrans.|Nautik|t7=_|ft=(von Schiffen)}}",
"expected_categories": ["intransitiv", "Nautik"],
},
# https://de.wiktionary.org/wiki/Pfund
{
"input": "{{K|veraltet|veraltend|t1=;|t7=_|ft=(in Deutschland)}}",
"expected_categories": ["veraltet", "veraltend"],
},
# https://de.wiktionary.org/wiki/umkippen
{"input": "{{K|sein}}", "expected_categories": ["Hilfsverb sein"]},
# https://de.wiktionary.org/wiki/umkippen
{
"input": "{{K|sein|salopp}}",
"expected_categories": ["Hilfsverb sein", "salopp"],
},
# https://de.wiktionary.org/wiki/Hasskommentar
{
"input": "{{K|Internet|ft=[[soziales Netzwerk{{!}}soziale Netzwerke]]}}",
"expected_categories": ["Internet"],
},
# https://de.wiktionary.org/wiki/abominabilis
{
"input": "{{K|spätlateinisch|spr=la}}",
"expected_categories": ["spätlateinisch"],
},
# https://de.wiktionary.org/wiki/zählen
{
"input": "{{K|intrans.|Prä=auf|Kas=Akk.|ft=(auf jemanden/etwas zählen)}}",
"expected_categories": ["intransitiv", "auf + Akk."],
},
# https://de.wiktionary.org/wiki/bojovat
{
"input": "{{K|intrans.|Prä=proti|Kas=Dativ||ft=bojovat [[proti]] + [[Dativ]]|spr=cs}}",
"expected_categories": ["intransitiv", "proti + Dativ"],
},
]

for case in test_cases:
with self.subTest(case=case):
gloss_data = defaultdict(list)

self.wxr.wtp.start_page("")

root = self.wxr.wtp.parse(case["input"])

process_K_template(self.wxr, gloss_data, root)
self.assertEqual(
gloss_data,
{
"categories": case["expected_categories"],
},
)

def test_de_extract_categories_from_gloss_text(self):
test_cases = [
{
Expand Down

0 comments on commit 433dd7c

Please sign in to comment.