Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add linkage_subtitles.json for German Wiktionary #376

Merged
merged 2 commits into from
Oct 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions src/wiktextract/data/de/linkage_subtitles.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"Gegenwörter": "antonyms",
"Holonyme": "holonyms",
"Oberbegriffe": "hypernyms",
"Redewendungen": "expressions",
"Sinnverwandte Wörter": "coordinate_terms",
"Sprichwörter": "proverbs",
"Synonyme": "synonyms",
"Unterbegriffe": "hyponyms",
"Wortbildungen": "derived"
}
71 changes: 35 additions & 36 deletions src/wiktextract/data/fr/linkage_subtitles.json
Original file line number Diff line number Diff line change
@@ -1,47 +1,46 @@
{
"synonymes": "synonyms",
"syn": "synonyms",
"hyponymes": "hyponyms",
"hypo": "hyponyms",
"hyperonymes": "hypernyms",
"hyper": "hypernyms",
"holonymes": "holonyms",
"holo": "holonyms",
"méronymes": "meronyms",
"méro": "meronyms",
"dérivés": "derived",
"drv": "derived",
"troponymes": "troponyms",
"tropo": "troponyms",
"paronymes": "paronyms",
"paro": "paronyms",
"apparentés": "related",
"apr": "related",
"abrév": "abbreviation",
"abréviations": "abbreviation",
"app": "related",
"apparentés": "related",
"étymologiques": "related",
"quasi-synonymes": "synonyms",
"quasi-syn": "synonyms",
"q-syn": "synonyms",
"apr": "related",
"dérivés autres langues": "derived",
"dérivés int": "derived",
"dérivés": "derived",
"dial": "related",
"dialectes": "related",
"drv-int": "derived",
"variantes": "related",
"var": "related",
"variantes orthographiques": "related",
"variantes ortho": "related",
"drv": "derived",
"étymologiques": "related",
"holo": "holonyms",
"holonymes": "holonyms",
"hyper": "hypernyms",
"hyperonymes": "hypernyms",
"hypo": "hyponyms",
"hyponymes": "hyponyms",
"méro": "meronyms",
"méronymes": "meronyms",
"paro": "paronyms",
"paronymes": "paronyms",
"phrases": "proverbs",
"q-syn": "synonyms",
"quasi-syn": "synonyms",
"quasi-synonymes": "synonyms",
"syn": "synonyms",
"synonymes": "synonyms",
"tropo": "troponyms",
"troponymes": "troponyms",
"var-dial": "related",
"var-ortho": "related",
"variantes dialectales": "related",
"var": "related",
"variantes dial": "related",
"var-dial": "related",
"dial": "related",
"variantes dialectales": "related",
"variantes dialectes": "related",
"dialectes": "related",
"abréviations": "abbreviation",
"abrév": "abbreviation",
"phrases": "proverbs",
"vocabulaire": "related",
"vocabulaire apparenté": "related",
"variantes ortho": "related",
"variantes orthographiques": "related",
"variantes": "related",
"voc": "related",
"vocabulaire proche": "related"
"vocabulaire apparenté": "related",
"vocabulaire proche": "related",
"vocabulaire": "related"
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,11 @@
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

SEMANTIC_RELATIONS = {
"Gegenwörter": "antonyms",
"Holonyme": "holonyms",
"Oberbegriffe": "hypernyms",
"Redewendungen": "expressions",
"Sinnverwandte Wörter": "coordinate_terms",
"Sprichwörter": "proverbs",
"Synonyme": "synonyms",
"Unterbegriffe": "hyponyms",
"Wortbildungen": "derived",
}


def extract_semantic_relations(
def extract_linkages(
wxr: WiktextractContext, page_data: List[Dict], level_node: LevelNode
):
relation_key = SEMANTIC_RELATIONS.get(level_node.largs[0][0])
linkage_type = wxr.config.LINKAGE_SUBTITLES.get(level_node.largs[0][0])
for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
# Get the senseids
Expand All @@ -38,8 +26,8 @@ def extract_semantic_relations(
)

# Extract links
semantic_links = []
if relation_key == "expressions":
linkages = []
if linkage_type == "expressions":
for child in list_item.children:
if isinstance(child, str) and contains_dash(child):
# XXX Capture the part after the dash as an explanatory note to the expression, e.g.:
Expand All @@ -50,26 +38,26 @@ def extract_semantic_relations(
isinstance(child, WikiNode)
and child.kind == NodeKind.LINK
):
process_link(wxr, semantic_links, child)
process_link(wxr, linkages, child)
else:
for link in list_item.find_child(NodeKind.LINK):
process_link(wxr, semantic_links, link)
process_link(wxr, linkages, link)

# Add links to the page data
if len(page_data[-1]["senses"]) == 1:
page_data[-1]["senses"][0][relation_key].extend(semantic_links)
page_data[-1]["senses"][0][linkage_type].extend(linkages)
elif len(senseids) > 0:
for senseid in senseids:
for sense in page_data[-1]["senses"]:
if sense["senseid"] == senseid:
sense[relation_key].extend(semantic_links)
sense[linkage_type].extend(linkages)
else:
page_data[-1][relation_key].extend(semantic_links)
page_data[-1][linkage_type].extend(linkages)

# Check for potentially missed data
for non_link in list_item.invert_find_child(NodeKind.LINK):
if (
relation_key == "expressions"
linkage_type == "expressions"
and isinstance(non_link, str)
and contains_dash(non_link)
):
Expand All @@ -80,7 +68,7 @@ def extract_semantic_relations(
continue
wxr.wtp.debug(
f"Found unexpected non-link node '{non_link}' in: {list_item}",
sortid="extractor/de/semantic_relations/extract_semantic_relations/84",
sortid="extractor/de/linkages/extract_linkages/84",
)


Expand Down
17 changes: 11 additions & 6 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@

from .example import extract_examples
from .gloss import extract_glosses
from .linkage import extract_linkages
from .pronunciation import extract_pronunciation
from .semantic_relations import SEMANTIC_RELATIONS, extract_semantic_relations
from .translation import extract_translation

# Templates that are used to form panels on pages and that should be ignored in
Expand Down Expand Up @@ -67,14 +67,19 @@ def parse_section(
wxr.wtp.start_subsection(section_name)
if section_name == "Bedeutungen":
extract_glosses(wxr, page_data, level_node)
elif section_name == "Aussprache":
elif wxr.config.capture_pronunciation and section_name == "Aussprache":
extract_pronunciation(wxr, page_data, level_node)
elif section_name == "Beispiele":
elif wxr.config.capture_examples and section_name == "Beispiele":
extract_examples(wxr, page_data, level_node)
elif section_name == "Übersetzungen":
elif (
wxr.config.capture_translations and section_name == "Übersetzungen"
):
extract_translation(wxr, page_data, level_node)
elif section_name in SEMANTIC_RELATIONS:
extract_semantic_relations(wxr, page_data, level_node)
elif (
wxr.config.capture_linkages
and section_name in wxr.config.LINKAGE_SUBTITLES
):
extract_linkages(wxr, page_data, level_node)


FORM_POS = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,11 @@
from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.semantic_relations import (
extract_semantic_relations,
)
from wiktextract.extractor.de.linkage import extract_linkages
from wiktextract.wxr_context import WiktextractContext


class TestDETranslation(unittest.TestCase):
class TestDELinkages(unittest.TestCase):
maxDiff = None

def setUp(self) -> None:
Expand All @@ -21,7 +19,7 @@ def setUp(self) -> None:
def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()

def test_de_extract_semantic_relations(self):
def test_de_extract_linkages(self):
test_cases = [
# https://de.wiktionary.org/wiki/Beispiel
# Extracts linkages and places them in the correct sense.
Expand Down Expand Up @@ -109,8 +107,6 @@ def test_de_extract_semantic_relations(self):
self.wxr.wtp.start_page("")
root = self.wxr.wtp.parse(case["input"])

extract_semantic_relations(
self.wxr, case["page_data"], root.children[0]
)
extract_linkages(self.wxr, case["page_data"], root.children[0])

self.assertEqual(case["page_data"], case["expected"])