Skip to content

Commit

Permalink
Extract semantic relations from German Wiktionary
Browse files Browse the repository at this point in the history
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.

Fix types for python3.9
  • Loading branch information
empiriker committed Oct 20, 2023
1 parent bcef60e commit 551b95b
Show file tree
Hide file tree
Showing 6 changed files with 303 additions and 10 deletions.
20 changes: 16 additions & 4 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
from wikitextprocessor.parser import LevelNode

from wiktextract.datautils import append_base_data
from wiktextract.extractor.de.pronunciation import extract_pronunciation
from wiktextract.wxr_context import WiktextractContext

from .example import extract_examples
from .gloss import extract_glosses
from .pronunciation import extract_pronunciation
from .semantic_relations import extract_semantic_relations
from .translation import extract_translation

# Templates that are used to form panels on pages and that should be ignored in
Expand Down Expand Up @@ -67,12 +67,24 @@ def parse_section(
wxr.wtp.start_subsection(section_name)
if section_name == "Bedeutungen":
extract_glosses(wxr, page_data, level_node)
if section_name == "Aussprache":
elif section_name == "Aussprache":
extract_pronunciation(wxr, page_data, level_node)
if section_name == "Beispiele":
elif section_name == "Beispiele":
extract_examples(wxr, page_data, level_node)
if section_name == "Übersetzungen":
elif section_name == "Übersetzungen":
extract_translation(wxr, page_data, level_node)
elif section_name in [
"Gegenwörter",
"Holonyme",
"Oberbegriffe",
"Redewendungen",
"Sinnverwandte Wörter",
"Sprichwörter",
"Synonyme",
"Unterbegriffe",
"Wortbildungen",
]:
extract_semantic_relations(wxr, page_data, level_node)


FORM_POS = {
Expand Down
97 changes: 97 additions & 0 deletions src/wiktextract/extractor/de/semantic_relations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import re
from typing import Dict, List

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode

from wiktextract.extractor.de.utils import split_senseids
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

# Maps German Wiktionary section headings to the English relation keys
# under which the extracted links are stored in the page data.
RELATION_TYPES = {
    "Gegenwörter": "antonyms",
    "Holonyme": "holonyms",
    "Oberbegriffe": "hypernyms",
    "Redewendungen": "expressions",
    "Sinnverwandte Wörter": "coordinate_terms",
    "Sprichwörter": "proverbs",
    "Synonyme": "synonyms",
    "Unterbegriffe": "hyponyms",
    "Wortbildungen": "derived",
}


def extract_semantic_relations(
    wxr: WiktextractContext, page_data: List[Dict], level_node: LevelNode
):
    """Extract semantic relations (synonyms, hypernyms, expressions, ...)
    from a German Wiktionary section and attach them to ``page_data``.

    The section heading determines the relation key (see RELATION_TYPES).
    Each list item may begin with sense ids such as "[1, 3-5]" assigning the
    following links to specific senses; with exactly one sense everything is
    attached to it, and without matching ids links go to the page level.
    """
    relation_key = RELATION_TYPES.get(level_node.largs[0][0])
    if relation_key is None:
        # Unknown heading: bail out instead of silently storing links
        # under a None key in the defaultdict-based page data.
        return
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            # Get the senseids from the leading text node, e.g. "[1, 2]".
            senseids = (
                split_senseids(list_item.children[0])
                if (
                    len(list_item.children) > 0
                    and isinstance(list_item.children[0], str)
                )
                else []
            )

            # Extract links
            semantic_links: List[str] = []
            if relation_key == "expressions":
                for child in list_item.children:
                    if isinstance(child, str) and contains_dash(child):
                        # XXX Capture the part after the dash as an explanatory note to the expression, e.g.:
                        # https://de.wiktionary.org/wiki/Beispiel
                        # ":[[ein gutes Beispiel geben]] – als [[Vorbild]] zur [[Nachahmung]] [[dienen]]/[[herausfordern]]"
                        break
                    elif (
                        isinstance(child, WikiNode)
                        and child.kind == NodeKind.LINK
                    ):
                        process_link(wxr, semantic_links, child)
            else:
                for link in list_item.find_child(NodeKind.LINK):
                    process_link(wxr, semantic_links, link)

            # Add links to the page data. NOTE: ids that match no sense
            # are dropped silently in the senseids branch.
            if len(page_data[-1]["senses"]) == 1:
                page_data[-1]["senses"][0][relation_key].extend(semantic_links)
            elif len(senseids) > 0:
                for senseid in senseids:
                    for sense in page_data[-1]["senses"]:
                        if sense["senseid"] == senseid:
                            sense[relation_key].extend(semantic_links)
            else:
                page_data[-1][relation_key].extend(semantic_links)

            # Check for potentially missed data: log any non-link content
            # that is not a senseid prefix or trivial punctuation.
            for non_link in list_item.invert_find_child(NodeKind.LINK):
                if (
                    relation_key == "expressions"
                    and isinstance(non_link, str)
                    and contains_dash(non_link)
                ):
                    # Explanatory text after the dash is skipped on purpose.
                    break
                elif isinstance(non_link, str) and (
                    non_link.startswith("[") or len(non_link.strip()) <= 3
                ):
                    continue
                wxr.wtp.debug(
                    f"Found unexpected non-link node '{non_link}' in: {list_item}",
                    sortid="extractor/de/semantic_relations/extract_semantic_relations/84",
                )


def process_link(
    wxr: WiktextractContext, semantic_links: List[str], link: WikiNode
):
    """Append the cleaned text of *link* to *semantic_links*.

    Links into the "Verzeichnis:" (directory) namespace are skipped.
    """
    cleaned = clean_node(wxr, {}, link)
    if not cleaned.startswith("Verzeichnis:"):
        semantic_links.append(cleaned)


def contains_dash(text: str) -> bool:
    """Return True if *text* contains a dash-like character.

    Covers en dash, em dash, horizontal bar, figure dash and the plain
    hyphen-minus, which German Wiktionary uses to separate an expression
    from its explanatory note. Returning an actual bool (instead of the
    Optional[Match] the bare re.search gives) makes the predicate's type
    honest; all callers only test truthiness, so behavior is unchanged.
    """
    return re.search(r"[–—―‒-]", text) is not None
27 changes: 21 additions & 6 deletions src/wiktextract/extractor/de/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,12 @@ def process_translation_list(
if node.template_name[-1] == "?":
translation_data["uncertain"] = True

translation_data["word"] = node.template_parameters.get(2)
translation_data["word"] = clean_node(
wxr, {}, node.template_parameters.get(2)
)

if node.template_name.removesuffix("?") == "Ü":
process_Ü_template(translation_data, node)
process_Ü_template(wxr, translation_data, node)

if node.template_name.removesuffix("?") == "Üt":
process_Üt_template(wxr, translation_data, node)
Expand All @@ -134,12 +136,13 @@ def is_translation_template(node: any) -> bool:


def process_Ü_template(
    wxr: WiktextractContext,
    translation_data: Dict[str, Union[str, List, bool]],
    template_node: TemplateNode,
):
    """Handle a Ü translation template.

    Template parameter 3, when present, overrides the translated word.
    """
    override_nodes = template_node.template_parameters.get(3)
    overwrite_word(wxr, translation_data, override_nodes)


def process_Üt_template(
Expand All @@ -158,7 +161,19 @@ def process_Üt_template(
if match:
translation_data["roman"] = match.group(1)

overwrite_word = template_node.template_parameters.get(4)
overwrite_word(
wxr, translation_data, template_node.template_parameters.get(4)
)


def overwrite_word(
    wxr: WiktextractContext,
    translation_data: Dict[str, Union[str, List, bool]],
    nodes: Union[List[Union[WikiNode, str]], WikiNode, str, None],
):
    """Replace translation_data["word"] with the cleaned text of *nodes*.

    Does nothing when *nodes* is None (the template parameter was absent)
    or when cleaning yields an empty string.
    """
    # Fix: identity check instead of `== None` (PEP 8), and don't shadow
    # this function's own name with the local result variable.
    if nodes is None:
        return
    cleaned_word = clean_node(wxr, {}, nodes).strip()
    if cleaned_word:
        translation_data["word"] = cleaned_word

Expand Down
29 changes: 29 additions & 0 deletions src/wiktextract/extractor/de/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
from typing import List

from wikitextprocessor import NodeKind, WikiNode

Expand All @@ -23,3 +24,31 @@ def find_and_remove_child(node: WikiNode, kind: NodeKind, cb=None):
del node.children[idx]
children.append(child)
return reversed(children)


def split_senseids(senseids_str: str) -> List[str]:
    """Split a sense-id prefix like "[1, 3-5]" into individual sense ids.

    Numeric ranges ("3-5") are expanded; letter suffixes (e.g. "2a") are
    kept on single ids but stripped from range bounds so they parse as
    integers. Ranges with non-numeric bounds are silently skipped.
    """
    senseids: List[str] = []
    raw_ids = (
        senseids_str.strip().removeprefix("[").removesuffix("]").split(",")
    )
    for raw_id in raw_ids:
        range_split = raw_id.split("-")
        if len(range_split) == 1:
            senseids.append(raw_id.strip())
        elif len(range_split) == 2:
            try:
                # Drop letter suffixes so "2a-3" expands like "2-3".
                start = re.sub(r"[a-z]", "", range_split[0].strip())
                end = re.sub(r"[a-z]", "", range_split[1].strip())
                senseids.extend(
                    str(sense_num)
                    for sense_num in range(int(start), int(end) + 1)
                )
            except ValueError:
                # Fix: catch only int() failures instead of a bare except
                # that would also swallow KeyboardInterrupt/SystemExit.
                pass

    return senseids
116 changes: 116 additions & 0 deletions tests/test_de_semantic_relations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import unittest
from collections import defaultdict

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.semantic_relations import (
extract_semantic_relations,
)
from wiktextract.wxr_context import WiktextractContext


# NOTE(review): class name looks copy-pasted from the translation tests;
# it exercises extract_semantic_relations — consider renaming.
class TestDETranslation(unittest.TestCase):
    """Tests for extracting semantic relations from German Wiktionary."""

    maxDiff = None

    def setUp(self) -> None:
        # Fresh parser context per test; torn down in tearDown.
        self.wxr = WiktextractContext(
            Wtp(lang_code="de"), WiktionaryConfig(dump_file_lang_code="de")
        )

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def test_de_extract_semantic_relations(self):
        # Each case: wikitext input, pre-populated page_data, expected
        # page_data after extraction (mutated in place).
        test_cases = [
            # https://de.wiktionary.org/wiki/Beispiel
            # Extracts linkages and places them in the correct sense.
            {
                "input": "==== Sinnverwandte Wörter ====\n:[1] [[Beleg]], [[Exempel]]\n:[2] [[Muster]], [[Vorbild]]",
                "page_data": [
                    defaultdict(
                        list,
                        {
                            "senses": [
                                defaultdict(list, {"senseid": "1"}),
                                defaultdict(list, {"senseid": "2"}),
                            ]
                        },
                    )
                ],
                "expected": [
                    {
                        "senses": [
                            {
                                "senseid": "1",
                                "coordinate_terms": ["Beleg", "Exempel"],
                            },
                            {
                                "senseid": "2",
                                "coordinate_terms": ["Muster", "Vorbild"],
                            },
                        ]
                    }
                ],
            },
            # https://de.wiktionary.org/wiki/Beispiel
            # Cleans explanatory text from expressions.
            {
                "input": "====Redewendungen====\n:[[ein gutes Beispiel geben|ein gutes ''Beispiel'' geben]] – als [[Vorbild]] zur [[Nachahmung]] [[dienen]]/[[herausfordern]]",
                "page_data": [defaultdict(list)],
                "expected": [
                    {
                        "expressions": ["ein gutes Beispiel geben"],
                        "senses": [],
                    },
                ],
            },
            # Always places relations in first sense if just one sense.
            {
                "input": "====Synonyme====\n:[[Synonym1]]",
                "page_data": [
                    defaultdict(
                        list, {"senses": [defaultdict(list, {"senseid": "1"})]}
                    )
                ],
                "expected": [
                    {
                        "senses": [{"senseid": "1", "synonyms": ["Synonym1"]}],
                    },
                ],
            },
            # https://de.wiktionary.org/wiki/Kokospalme
            # Ignores modifiers of relations and all other text.
            {
                "input": "====Synonyme====\n:[1] [[Kokosnusspalme]], ''wissenschaftlich:'' [[Cocos nucifera]]",
                "page_data": [
                    defaultdict(
                        list, {"senses": [defaultdict(list, {"senseid": "1"})]}
                    )
                ],
                "expected": [
                    {
                        "senses": [
                            {
                                "senseid": "1",
                                "synonyms": [
                                    "Kokosnusspalme",
                                    "Cocos nucifera",
                                ],
                            }
                        ],
                    },
                ],
            },
        ]

        for case in test_cases:
            with self.subTest(case=case):
                self.wxr.wtp.start_page("")
                root = self.wxr.wtp.parse(case["input"])

                # root.children[0] is the level-4 heading node.
                extract_semantic_relations(
                    self.wxr, case["page_data"], root.children[0]
                )

                self.assertEqual(case["page_data"], case["expected"])
24 changes: 24 additions & 0 deletions tests/test_de_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import unittest

from wiktextract.extractor.de.utils import split_senseids


class TestDEUtils(unittest.TestCase):
    """Tests for the German extractor utility helpers."""

    maxDiff = None

    def test_split_senseids(self):
        # (raw senseid string, expected parsed list)
        cases = [
            ("[1]", ["1"]),
            ("[1,2]", ["1", "2"]),
            ("[1, 2]", ["1", "2"]),
            ("[1, 2 ]", ["1", "2"]),
            ("[1-3]", ["1", "2", "3"]),
            ("[1, 3-5]", ["1", "3", "4", "5"]),
            ("[1, 3-4, 6]", ["1", "3", "4", "6"]),
            ("[1a]", ["1a"]),
            ("[1, 2a]", ["1", "2a"]),
            ("[1, 2a-3]", ["1", "2", "3"]),
        ]

        for raw, expected in cases:
            self.assertEqual(split_senseids(raw), expected)

0 comments on commit 551b95b

Please sign in to comment.