Skip to content

Commit

Permalink
Merge pull request #376 from empiriker/de
Browse files Browse the repository at this point in the history
Add linkage_subtitles.json for German Wiktionary
  • Loading branch information
xxyzz authored Oct 21, 2023
2 parents b7d8d2d + 4e6e295 commit b2ab827
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 73 deletions.
11 changes: 11 additions & 0 deletions src/wiktextract/data/de/linkage_subtitles.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"Gegenwörter": "antonyms",
"Holonyme": "holonyms",
"Oberbegriffe": "hypernyms",
"Redewendungen": "expressions",
"Sinnverwandte Wörter": "coordinate_terms",
"Sprichwörter": "proverbs",
"Synonyme": "synonyms",
"Unterbegriffe": "hyponyms",
"Wortbildungen": "derived"
}
71 changes: 35 additions & 36 deletions src/wiktextract/data/fr/linkage_subtitles.json
Original file line number Diff line number Diff line change
@@ -1,47 +1,46 @@
{
"synonymes": "synonyms",
"syn": "synonyms",
"hyponymes": "hyponyms",
"hypo": "hyponyms",
"hyperonymes": "hypernyms",
"hyper": "hypernyms",
"holonymes": "holonyms",
"holo": "holonyms",
"méronymes": "meronyms",
"méro": "meronyms",
"dérivés": "derived",
"drv": "derived",
"troponymes": "troponyms",
"tropo": "troponyms",
"paronymes": "paronyms",
"paro": "paronyms",
"apparentés": "related",
"apr": "related",
"abrév": "abbreviation",
"abréviations": "abbreviation",
"app": "related",
"apparentés": "related",
"étymologiques": "related",
"quasi-synonymes": "synonyms",
"quasi-syn": "synonyms",
"q-syn": "synonyms",
"apr": "related",
"dérivés autres langues": "derived",
"dérivés int": "derived",
"dérivés": "derived",
"dial": "related",
"dialectes": "related",
"drv-int": "derived",
"variantes": "related",
"var": "related",
"variantes orthographiques": "related",
"variantes ortho": "related",
"drv": "derived",
"étymologiques": "related",
"holo": "holonyms",
"holonymes": "holonyms",
"hyper": "hypernyms",
"hyperonymes": "hypernyms",
"hypo": "hyponyms",
"hyponymes": "hyponyms",
"méro": "meronyms",
"méronymes": "meronyms",
"paro": "paronyms",
"paronymes": "paronyms",
"phrases": "proverbs",
"q-syn": "synonyms",
"quasi-syn": "synonyms",
"quasi-synonymes": "synonyms",
"syn": "synonyms",
"synonymes": "synonyms",
"tropo": "troponyms",
"troponymes": "troponyms",
"var-dial": "related",
"var-ortho": "related",
"variantes dialectales": "related",
"var": "related",
"variantes dial": "related",
"var-dial": "related",
"dial": "related",
"variantes dialectales": "related",
"variantes dialectes": "related",
"dialectes": "related",
"abréviations": "abbreviation",
"abrév": "abbreviation",
"phrases": "proverbs",
"vocabulaire": "related",
"vocabulaire apparenté": "related",
"variantes ortho": "related",
"variantes orthographiques": "related",
"variantes": "related",
"voc": "related",
"vocabulaire proche": "related"
"vocabulaire apparenté": "related",
"vocabulaire proche": "related",
"vocabulaire": "related"
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,11 @@
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

SEMANTIC_RELATIONS = {
"Gegenwörter": "antonyms",
"Holonyme": "holonyms",
"Oberbegriffe": "hypernyms",
"Redewendungen": "expressions",
"Sinnverwandte Wörter": "coordinate_terms",
"Sprichwörter": "proverbs",
"Synonyme": "synonyms",
"Unterbegriffe": "hyponyms",
"Wortbildungen": "derived",
}


def extract_semantic_relations(
def extract_linkages(
wxr: WiktextractContext, page_data: List[Dict], level_node: LevelNode
):
relation_key = SEMANTIC_RELATIONS.get(level_node.largs[0][0])
linkage_type = wxr.config.LINKAGE_SUBTITLES.get(level_node.largs[0][0])
for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
# Get the senseids
Expand All @@ -38,8 +26,8 @@ def extract_semantic_relations(
)

# Extract links
semantic_links = []
if relation_key == "expressions":
linkages = []
if linkage_type == "expressions":
for child in list_item.children:
if isinstance(child, str) and contains_dash(child):
# XXX Capture the part after the dash as an explanatory note to the expression, e.g.:
Expand All @@ -50,26 +38,26 @@ def extract_semantic_relations(
isinstance(child, WikiNode)
and child.kind == NodeKind.LINK
):
process_link(wxr, semantic_links, child)
process_link(wxr, linkages, child)
else:
for link in list_item.find_child(NodeKind.LINK):
process_link(wxr, semantic_links, link)
process_link(wxr, linkages, link)

# Add links to the page data
if len(page_data[-1]["senses"]) == 1:
page_data[-1]["senses"][0][relation_key].extend(semantic_links)
page_data[-1]["senses"][0][linkage_type].extend(linkages)
elif len(senseids) > 0:
for senseid in senseids:
for sense in page_data[-1]["senses"]:
if sense["senseid"] == senseid:
sense[relation_key].extend(semantic_links)
sense[linkage_type].extend(linkages)
else:
page_data[-1][relation_key].extend(semantic_links)
page_data[-1][linkage_type].extend(linkages)

# Check for potentially missed data
for non_link in list_item.invert_find_child(NodeKind.LINK):
if (
relation_key == "expressions"
linkage_type == "expressions"
and isinstance(non_link, str)
and contains_dash(non_link)
):
Expand All @@ -80,7 +68,7 @@ def extract_semantic_relations(
continue
wxr.wtp.debug(
f"Found unexpected non-link node '{non_link}' in: {list_item}",
sortid="extractor/de/semantic_relations/extract_semantic_relations/84",
sortid="extractor/de/linkages/extract_linkages/84",
)


Expand Down
17 changes: 11 additions & 6 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@

from .example import extract_examples
from .gloss import extract_glosses
from .linkage import extract_linkages
from .pronunciation import extract_pronunciation
from .semantic_relations import SEMANTIC_RELATIONS, extract_semantic_relations
from .translation import extract_translation

# Templates that are used to form panels on pages and that should be ignored in
Expand Down Expand Up @@ -67,14 +67,19 @@ def parse_section(
wxr.wtp.start_subsection(section_name)
if section_name == "Bedeutungen":
extract_glosses(wxr, page_data, level_node)
elif section_name == "Aussprache":
elif wxr.config.capture_pronunciation and section_name == "Aussprache":
extract_pronunciation(wxr, page_data, level_node)
elif section_name == "Beispiele":
elif wxr.config.capture_examples and section_name == "Beispiele":
extract_examples(wxr, page_data, level_node)
elif section_name == "Übersetzungen":
elif (
wxr.config.capture_translations and section_name == "Übersetzungen"
):
extract_translation(wxr, page_data, level_node)
elif section_name in SEMANTIC_RELATIONS:
extract_semantic_relations(wxr, page_data, level_node)
elif (
wxr.config.capture_linkages
and section_name in wxr.config.LINKAGE_SUBTITLES
):
extract_linkages(wxr, page_data, level_node)


FORM_POS = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,11 @@
from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.semantic_relations import (
extract_semantic_relations,
)
from wiktextract.extractor.de.linkage import extract_linkages
from wiktextract.wxr_context import WiktextractContext


class TestDETranslation(unittest.TestCase):
class TestDELinkages(unittest.TestCase):
maxDiff = None

def setUp(self) -> None:
Expand All @@ -21,7 +19,7 @@ def setUp(self) -> None:
def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()

def test_de_extract_semantic_relations(self):
def test_de_extract_linkages(self):
test_cases = [
# https://de.wiktionary.org/wiki/Beispiel
# Extracts linkages and places them in the correct sense.
Expand Down Expand Up @@ -109,8 +107,6 @@ def test_de_extract_semantic_relations(self):
self.wxr.wtp.start_page("")
root = self.wxr.wtp.parse(case["input"])

extract_semantic_relations(
self.wxr, case["page_data"], root.children[0]
)
extract_linkages(self.wxr, case["page_data"], root.children[0])

self.assertEqual(case["page_data"], case["expected"])

0 comments on commit b2ab827

Please sign in to comment.