Skip to content

Commit

Permalink
Process K templates in German Wiktionary glosses
Browse files Browse the repository at this point in the history
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
  • Loading branch information
empiriker committed Oct 16, 2023
1 parent 7da1f49 commit ed17506
Show file tree
Hide file tree
Showing 2 changed files with 184 additions and 38 deletions.
43 changes: 31 additions & 12 deletions src/wiktextract/extractor/de/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
import copy


def extract_glosses(
Expand Down Expand Up @@ -63,7 +64,7 @@ def process_gloss_list_item(
raw_gloss = clean_node(wxr, {}, list_item_node.children)
gloss_data["raw_glosses"] = [raw_gloss]

extract_categories_from_gloss_node(wxr, gloss_data, list_item_node)
process_K_template(wxr, gloss_data, list_item_node)

gloss_text = clean_node(wxr, gloss_data, list_item_node.children)

Expand All @@ -82,9 +83,7 @@ def process_gloss_list_item(
sortid="extractor/de/glosses/extract_glosses/28",
)

gloss_text = extract_categories_from_gloss_text(
gloss_data, gloss_text
)
gloss_text = extract_tags_from_gloss_text(gloss_data, gloss_text)

if gloss_text or not sub_glosses_list_nodes:
gloss_data["glosses"] = [gloss_text]
Expand Down Expand Up @@ -117,25 +116,45 @@ def handle_sense_modifier(wxr, list_item_node):
pass


def extract_categories_from_gloss_node(
def process_K_template(
    wxr: WiktextractContext,
    gloss_data: defaultdict(list),
    list_item_node: NodeKind.LIST_ITEM,
) -> None:
    """Turn the ``{{K|...}}`` templates of a gloss into tags.

    Every ``K`` template that is a direct child of ``list_item_node`` is
    cleaned once per positional argument, so that each argument yields its
    own label. A preposition/case pair given through the ``Prä`` and ``Kas``
    parameters is added as one extra tag. The processed templates are removed
    from ``list_item_node.children`` and the collected labels are appended to
    ``gloss_data["tags"]``.

    Args:
        wxr: Context that is passed through to ``clean_node``.
        gloss_data: Sense data; its ``"tags"`` list is extended in place.
        list_item_node: Node whose direct children are searched for ``K``
            templates; its ``children`` list is replaced by one without them.
    """
    # Collect the matches up front so that reassigning ``children`` below
    # cannot interfere with the traversal.
    k_templates = [
        node
        for node in list_item_node.find_child(NodeKind.TEMPLATE)
        if node.template_name == "K"
    ]

    for template_node in k_templates:
        params = template_node.template_parameters
        tags = []

        # Work on a copy: its arguments are overwritten for every positional
        # parameter, and the original node must stay untouched.
        probe = copy.deepcopy(template_node)
        for key, value in params.items():
            # Only positional arguments carry labels. Named parameters are
            # either handled separately below (Prä, Kas) or ignored.
            # ft (free text) is used liberally to modify the context
            # template. Sometimes it seems to belong rather to the gloss
            # itself. Most of the time it is not useful.
            # XXX Treat free text in K templates.
            if not isinstance(key, int):
                continue

            probe.largs = probe.largs[:1] + [[value]]
            label = clean_node(wxr, {}, probe)
            # A cleaned K template is expected to end with ":". Strip it
            # only when it is really there, so that a label is never
            # truncated by accident.
            if label.endswith(":"):
                label = label[:-1]
            if label:
                tags.append(label)

        # Join preposition and case with " + ", without a dangling
        # separator when only one of the two is given.
        government = " + ".join(
            part for part in (params.get("Prä"), params.get("Kas")) if part
        )
        if government:
            tags.append(government)

        # Remove exactly this template node from the children of
        # list_item_node.
        list_item_node.children = [
            child
            for child in list_item_node.children
            if child is not template_node
        ]

        gloss_data["tags"].extend(tags)


def extract_categories_from_gloss_text(
def extract_tags_from_gloss_text(
gloss_data: defaultdict(list), gloss_text: str
) -> None:
parts = gloss_text.split(":", 1)
Expand All @@ -144,7 +163,7 @@ def extract_categories_from_gloss_text(

categories = [c.strip() for c in re.split(",|and", categories_part)]
if all(c.isalnum() for c in categories):
gloss_data["categories"].extend(categories)
gloss_data["tags"].extend(categories)
return parts[1].strip()

return gloss_text
179 changes: 153 additions & 26 deletions tests/test_de_gloss.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,35 @@
import unittest
from collections import defaultdict
from unittest.mock import patch

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.gloss import (
extract_glosses,
extract_categories_from_gloss_node,
extract_categories_from_gloss_text,
process_K_template,
extract_tags_from_gloss_text,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


def mock_clean_node(wxr, sense_data, value):
    """Stand-in for ``clean_node`` when a ``K`` template node is cleaned.

    Reads the first element of the node's second argument
    (``value.largs[1][0]``), expands the abbreviations listed below and
    returns the result followed by ``":"``. Arguments without an entry are
    returned unchanged, again followed by ``":"``. ``wxr`` and
    ``sense_data`` are accepted only to match the patched signature.
    """
    expansions = {
        "trans.": "transitiv",
        "intrans.": "intransitiv",
        "refl.": "reflexiv",
        "kPl.": "kein Plural",
        "ugs.": "umgangssprachlich",
        "sein": "Hilfsverb sein",
        "österr.": "österreichisch",
    }
    param = value.largs[1][0]
    return expansions.get(param, param) + ":"


class TestGlossList(unittest.TestCase):
maxDiff = None

Expand Down Expand Up @@ -92,9 +109,9 @@ def test_de_extract_glosses_with_subglosses(self):

def test_de_extract_glosses_with_only_subglosses(self):
self.wxr.wtp.start_page("")
self.wxr.wtp.add_page("Vorlage:K", 10, "")
self.wxr.wtp.add_page("Vorlage:K", 10, "tag:")
root = self.wxr.wtp.parse(
":[1] {{K|category}}\n::[a] subglossA\n::[1b] subglossB"
":[1] {{K|tag}}\n::[a] subglossA\n::[1b] subglossB"
)

page_data = [defaultdict(list)]
Expand All @@ -107,13 +124,13 @@ def test_de_extract_glosses_with_only_subglosses(self):
{
"senses": [
{
"categories": ["category"],
"tags": ["tag"],
"glosses": ["subglossA"],
"raw_glosses": ["[a] subglossA"],
"senseid": "1a",
},
{
"categories": ["category"],
"tags": ["tag"],
"glosses": ["subglossB"],
"raw_glosses": ["[1b] subglossB"],
"senseid": "1b",
Expand All @@ -123,49 +140,159 @@ def test_de_extract_glosses_with_only_subglosses(self):
],
)

def test_de_extract_categories_from_gloss_node(self):
@patch("wiktextract.extractor.de.gloss.clean_node", mock_clean_node)
def test_process_K_template_removes_K_template_nodes(self):
self.wxr.wtp.start_page("")
self.wxr.wtp.add_page("Vorlage:K", 10, "")
root = self.wxr.wtp.parse(":[1] {{K|category1|category2}} gloss1")

list_item_node = root.children[0].children[0]
# self.wxr.wtp.add_page("Vorlage:K", 10, "")
root = self.wxr.wtp.parse("{{K|tag1|tag2}} gloss1")

gloss_data = defaultdict(list)

extract_categories_from_gloss_node(self.wxr, gloss_data, list_item_node)
self.assertEqual(len(root.children), 2)

process_K_template(self.wxr, gloss_data, root)

self.assertEqual(
gloss_data,
{
"categories": ["category1", "category2"],
"tags": ["tag1", "tag2"],
},
)

def test_de_extract_categories_from_gloss_text(self):
self.assertEqual(len(root.children), 1)

@patch("wiktextract.extractor.de.gloss.clean_node", mock_clean_node)
def test_process_K_template(self):
# Test cases chosen from:
# https://de.wiktionary.org/wiki/Vorlage:K/Doku
test_cases = [
# https://de.wiktionary.org/wiki/delektieren
{"input": "{{K|refl.}}", "expected_tags": ["reflexiv"]},
# https://de.wiktionary.org/wiki/delektieren
{"input": "{{K|trans.}}", "expected_tags": ["transitiv"]},
# https://de.wiktionary.org/wiki/abbreviare
{
"input": "{{K|trans.|ft=etwas in seinem [[räumlich]]en oder [[zeitlich]]en [[Ausmaß]] verringern|spr=it}}",
"expected_tags": ["transitiv"],
},
# https://de.wiktionary.org/wiki/abbreviare
{
"input": "{{K|trans.|Linguistik|Wortbildung|spr=it}}",
"expected_tags": [
"transitiv",
"Linguistik",
"Wortbildung",
],
},
# https://de.wiktionary.org/wiki/Bakterie
{"input": "{{K|Biologie}}", "expected_tags": ["Biologie"]},
# https://de.wiktionary.org/wiki/Kraut
{
"input": "{{K|kPl.|ugs.}}",
"expected_tags": ["kein Plural", "umgangssprachlich"],
},
# https://de.wiktionary.org/wiki/almen
# Ideally we would filter out "besonders" but there doesn't seem
# to be a general rule for which tags are semantically relevant.
{
"input": "{{K|trans.|t1=;|besonders|t2=_|bayrisch|österr.}}",
"expected_tags": [
"transitiv",
"besonders",
"bayrisch",
"österreichisch",
],
},
# https://de.wiktionary.org/wiki/Agentur
{
"input": "{{K|Behörde|ft=seit etwa 2000 in Deutschland}}",
"expected_tags": ["Behörde"],
},
# https://de.wiktionary.org/wiki/Objekt
{
"input": "{{K|Astronomie|ft=kurz für}}",
"expected_tags": ["Astronomie"],
},
# https://de.wiktionary.org/wiki/einlaufen
{
"input": "{{K|intrans.|Nautik|t7=_|ft=(von Schiffen)}}",
"expected_tags": ["intransitiv", "Nautik"],
},
# https://de.wiktionary.org/wiki/Pfund
{
"input": "{{K|veraltet|veraltend|t1=;|t7=_|ft=(in Deutschland)}}",
"expected_tags": ["veraltet", "veraltend"],
},
# https://de.wiktionary.org/wiki/umkippen
{"input": "{{K|sein}}", "expected_tags": ["Hilfsverb sein"]},
# https://de.wiktionary.org/wiki/umkippen
{
"input": "{{K|sein|salopp}}",
"expected_tags": ["Hilfsverb sein", "salopp"],
},
# https://de.wiktionary.org/wiki/Hasskommentar
{
"input": "{{K|Internet|ft=[[soziales Netzwerk{{!}}soziale Netzwerke]]}}",
"expected_tags": ["Internet"],
},
# https://de.wiktionary.org/wiki/abominabilis
{
"input": "{{K|spätlateinisch|spr=la}}",
"expected_tags": ["spätlateinisch"],
},
# https://de.wiktionary.org/wiki/zählen
{
"input": "{{K|intrans.|Prä=auf|Kas=Akk.|ft=(auf jemanden/etwas zählen)}}",
"expected_tags": ["intransitiv", "auf + Akk."],
},
# https://de.wiktionary.org/wiki/bojovat
{
"input": "{{K|intrans.|Prä=proti|Kas=Dativ||ft=bojovat [[proti]] + [[Dativ]]|spr=cs}}",
"expected_tags": ["intransitiv", "proti + Dativ"],
},
]

for case in test_cases:
with self.subTest(case=case):
gloss_data = defaultdict(list)

self.wxr.wtp.start_page("")

root = self.wxr.wtp.parse(case["input"])

process_K_template(self.wxr, gloss_data, root)
self.assertEqual(
gloss_data,
{
"tags": case["expected_tags"],
},
)

def test_de_extract_tags_from_gloss_text(self):
test_cases = [
{
"input": "category1: gloss1",
"expected_categories": ["category1"],
"input": "tag1: gloss1",
"expected_tags": ["tag1"],
"expected_gloss": "gloss1",
},
{
"input": "category1, category2: gloss1",
"expected_categories": ["category1", "category2"],
"input": "tag1, tag2: gloss1",
"expected_tags": ["tag1", "tag2"],
"expected_gloss": "gloss1",
},
{
"input": "category1 and category2: gloss1",
"expected_categories": ["category1", "category2"],
"input": "tag1 and tag2: gloss1",
"expected_tags": ["tag1", "tag2"],
"expected_gloss": "gloss1",
},
{
"input": "category1, category2 and category3: gloss1",
"expected_categories": ["category1", "category2", "category3"],
"input": "tag1, tag2 and tag3: gloss1",
"expected_tags": ["tag1", "tag2", "tag3"],
"expected_gloss": "gloss1",
},
{
"input": "Beginning of gloss: second part of gloss",
"expected_categories": None,
"expected_tags": None,
"expected_gloss": "Beginning of gloss: second part of gloss",
}
# Add more test cases as needed
Expand All @@ -174,17 +301,17 @@ def test_de_extract_categories_from_gloss_text(self):
with self.subTest(case=case):
gloss_data = defaultdict(list)

gloss_text = extract_categories_from_gloss_text(
gloss_text = extract_tags_from_gloss_text(
gloss_data, case["input"]
)

if case["expected_categories"] is None:
if case["expected_tags"] is None:
self.assertEqual(gloss_data, {})
else:
self.assertEqual(
gloss_data,
{
"categories": case["expected_categories"],
"tags": case["expected_tags"],
},
)
self.assertEqual(gloss_text, case["expected_gloss"])

0 comments on commit ed17506

Please sign in to comment.