Skip to content

Commit

Permalink
Merge pull request #369 from empiriker/de
Browse files Browse the repository at this point in the history
Extract translations from German Wiktionary
  • Loading branch information
xxyzz authored Oct 19, 2023
2 parents 2e936fa + 361db67 commit a8787ef
Show file tree
Hide file tree
Showing 11 changed files with 552 additions and 13 deletions.
3 changes: 1 addition & 2 deletions src/wiktextract/extractor/de/example.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from collections import defaultdict
from typing import Dict, List


from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid

from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/de/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid

from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down
5 changes: 4 additions & 1 deletion src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@

from wiktextract.datautils import append_base_data
from wiktextract.extractor.de.pronunciation import extract_pronunciation
from wiktextract.extractor.de.translation import extract_translation
from wiktextract.wxr_context import WiktextractContext

from .gloss import extract_glosses
from .example import extract_examples
from .gloss import extract_glosses

# Templates that are used to form panels on pages and that should be ignored in
# various positions
Expand Down Expand Up @@ -76,6 +77,8 @@ def parse_section(
extract_pronunciation(wxr, page_data, level_node)
if section_name == "Beispiele":
extract_examples(wxr, page_data, level_node)
if section_name == "Übersetzungen":
extract_translation(wxr, page_data, level_node)


FORM_POS = {
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/de/pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.share import create_audio_url_dict

from wiktextract.extractor.share import create_audio_url_dict
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down
213 changes: 213 additions & 0 deletions src/wiktextract/extractor/de/translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
import re
from collections import defaultdict
from typing import Dict, List, Union

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext


def extract_translation(
    wxr: WiktextractContext, page_data: List[Dict], level_node: WikiNode
) -> None:
    """Extract translations from the ``Ü-Tabelle`` templates of a section.

    Each direct child of ``level_node`` that is an ``Ü-Tabelle`` template is
    processed; any other child only triggers a debug message. The collected
    translations are appended to every sense of ``page_data[-1]`` whose
    ``senseid`` equals the template's first positional argument. If no sense
    matches (or the template has no sense id), they are appended to
    ``page_data[-1]["translations"]`` instead.

    Args:
        wxr: Context used for cleaning nodes and emitting debug messages.
        page_data: Page entries gathered so far; only the last one is modified.
        level_node: Section node whose children are scanned.
    """
    for level_node_child in level_node.filter_empty_str_child():
        is_translation_table = (
            isinstance(level_node_child, WikiNode)
            and level_node_child.kind == NodeKind.TEMPLATE
            and level_node_child.template_name == "Ü-Tabelle"
        )
        if not is_translation_table:
            wxr.wtp.debug(
                f"Unexpected node type in extract_translation: {level_node_child}",
                sortid="extractor/de/translation/extract_translation/31",
            )
            continue

        sense_translations = []
        base_translation_data = defaultdict(list)
        senseid = level_node_child.template_parameters.get(1)
        # XXX: When senseid is None, sense-disambiguate where senseids are in
        # Ü-Liste (ca. 0.03% of pages), e.g.:
        # https://de.wiktionary.org/wiki/Beitrag
        # """
        # {{Ü-Tabelle|Ü-Liste=
        # *{{en}}: [1] {{Ü|en|subscription}}; [1a] {{Ü|en|dues}}, {{Ü|en|membership fee}}; [1, 2] {{Ü|en|contribution}}; [3] {{Ü|en|article}}}}
        # """

        sense_text = level_node_child.template_parameters.get("G")
        if sense_text:
            sense_text = clean_node(wxr, {}, sense_text).strip()
            if sense_text == "Übersetzungen umgeleitet":
                # XXX: Handle cases where translations are in a separate page
                # (ca. 1.1% of pages), e.g.:
                # https://de.wiktionary.org/wiki/Pöpke
                # """
                # {{Ü-Tabelle|*|G=Übersetzungen umgeleitet|Ü-Liste=
                # :{{Übersetzungen umleiten|1|Poppe}}
                # }}
                # """
                continue

            # sense_text has already been cleaned above; the second pass is
            # kept so the stored value stays identical to previous output.
            base_translation_data["sense"] = clean_node(wxr, {}, sense_text)

        translation_list = level_node_child.template_parameters.get("Ü-Liste")
        if translation_list:
            process_translation_list(
                wxr,
                sense_translations,
                base_translation_data,
                translation_list,
            )

        dialect_table = level_node_child.template_parameters.get(
            "Dialekttabelle"
        )
        if dialect_table:
            process_dialect_table(wxr, base_translation_data, dialect_table)

        # Attach the translations to every sense carrying this sense id.
        matched_senseid = False
        if senseid:
            for sense in page_data[-1]["senses"]:
                if sense["senseid"] == senseid.strip():
                    sense["translations"].extend(sense_translations)
                    matched_senseid = True

        if not matched_senseid:
            # Fall back to page-level translations so that no data is lost.
            wxr.wtp.debug(
                f"Unknown senseid: {senseid}.",
                sortid="extractor/de/translation/extract_translation/65",
            )
            page_data[-1]["translations"].extend(sense_translations)


def process_translation_list(
    wxr: WiktextractContext,
    sense_translations: List[Dict],
    base_translation_data: Dict[str, List],
    translation_list: List[Union[WikiNode, str]],
) -> None:
    """Convert the nodes of an ``Ü-Liste`` argument into translation entries.

    Nodes that are not translation templates are buffered as modifiers. Each
    translation template starts a new entry that is a shallow copy of
    ``base_translation_data``; the buffered modifiers are handed to
    ``process_modifiers`` before the entry is filled in and appended to
    ``sense_translations``.

    Args:
        wxr: Context used for language lookup and debug messages.
        sense_translations: Output list; new entries are appended in place.
        base_translation_data: Data copied into every new entry.
        translation_list: Nodes and strings of the ``Ü-Liste`` argument.
    """
    modifiers = []
    for node in translation_list:
        if not is_translation_template(node):
            modifiers.append(node)
            continue

        translation_data = base_translation_data.copy()
        process_modifiers(wxr, sense_translations, translation_data, modifiers)

        lang_code = node.template_parameters.get(1)
        translation_data["code"] = lang_code
        languages = wxr.wtp.LANGUAGES_BY_CODE.get(lang_code)
        if languages:
            translation_data["lang"] = languages[0]
        else:
            wxr.wtp.debug(
                f"Unknown language code: {lang_code}",
                sortid="extractor/de/translation/process_translation_list/70",
            )

        template_name = node.template_name
        # A trailing "?" on the template name ("Ü?", "Üt?") flags the entry
        # as uncertain.
        if template_name.endswith("?"):
            translation_data["uncertain"] = True

        translation_data["word"] = node.template_parameters.get(2)

        base_name = template_name.removesuffix("?")
        if base_name == "Ü":
            process_Ü_template(translation_data, node)
        elif base_name == "Üt":
            process_Üt_template(wxr, translation_data, node)

        sense_translations.append(translation_data)

    # Process modifiers at the end of the list. There is no upcoming
    # translation, so a fresh throwaway mapping is passed; it must be an
    # instance (not the defaultdict class) because process_modifiers may
    # call translation_data["tags"].extend(...).
    process_modifiers(wxr, sense_translations, defaultdict(list), modifiers)


def is_translation_template(node: object) -> bool:
    """Return True if ``node`` is a ``Ü``, ``Üt``, ``Ü?`` or ``Üt?`` template node.

    ``node`` may be any value (e.g. a plain string from a node list); anything
    that is not a ``WikiNode`` of kind ``TEMPLATE`` yields False.
    """
    return (
        isinstance(node, WikiNode)
        and node.kind == NodeKind.TEMPLATE
        and node.template_name in ("Ü", "Üt", "Ü?", "Üt?")
    )


def process_Ü_template(
    translation_data: Dict[str, Union[str, List, bool]],
    template_node: TemplateNode,
):
    """Apply the optional third positional argument of the template.

    When that argument is present and truthy it replaces
    ``translation_data["word"]``; otherwise ``translation_data`` is left
    untouched.
    """
    if replacement := template_node.template_parameters.get(3):
        translation_data["word"] = replacement


def process_Üt_template(
    wxr: WiktextractContext,
    translation_data: Dict[str, Union[str, List, bool]],
    template_node: TemplateNode,
):
    """Fill in ``roman`` and, if overridden, ``word`` from the template.

    The third positional argument is used as the romanization when it is
    truthy. Otherwise the cleaned template text is searched for a
    parenthesised group ending in ``^☆`` and the text before that marker is
    used. A truthy fourth positional argument replaces
    ``translation_data["word"]``.
    """
    roman = template_node.template_parameters.get(3)
    if not roman:
        # Look for automatic transcription
        rendered = clean_node(wxr, {}, template_node)
        found = re.search(r"\(([^)]+?)\^\☆\)", rendered)
        roman = found.group(1) if found else None
    if roman:
        translation_data["roman"] = roman

    replacement = template_node.template_parameters.get(4)
    if replacement:
        translation_data["word"] = replacement


def process_modifiers(
    wxr: WiktextractContext,
    sense_translations: List[Dict],
    translation_data: Dict[str, Union[str, List, bool]],
    modifiers,
):
    """Turn buffered modifier nodes into tags and empty the buffer.

    Everything from the first string containing ``"*"`` onwards is dropped
    from ``modifiers``. The remaining nodes are cleaned to text and split on
    ``;``, ``,``, ``(``, ``)`` and ``:``. If the cleaned text ends with ``:``
    the resulting tags are added to ``translation_data``; otherwise they are
    added to the last entry of ``sense_translations``, if there is one.
    ``modifiers`` is always cleared before returning.
    """
    # Get rid of the "*" and language template nodes that start each translation
    star_index = next(
        (
            index
            for index, item in enumerate(modifiers)
            if isinstance(item, str) and "*" in item
        ),
        None,
    )
    if star_index is not None:
        del modifiers[star_index:]

    text = clean_node(wxr, {}, modifiers).strip()
    tags = []
    if text:
        pieces = (piece.strip() for piece in re.split(r";|,|\(|\)|:", text))
        tags = [piece for piece in pieces if piece]

    if tags:
        if text.endswith(":"):
            # A trailing colon means the tags qualify the next translation.
            translation_data["tags"].extend(tags)
        elif sense_translations:
            sense_translations[-1]["tags"].extend(tags)

    # Reset modifiers
    modifiers.clear()


def process_dialect_table(
    wxr: WiktextractContext,
    base_translation_data: Dict[str, Union[str, List, bool]],
    dialect_table: List[Union[WikiNode, str]],
):
    """Placeholder for ``Dialekttabelle`` handling.

    Nothing is extracted yet: the function only emits a debug message and
    leaves ``base_translation_data`` and ``dialect_table`` untouched.
    """
    # XXX: Extract dialect information (ca. 0.12% of pages), e.g.:
    # https://de.wiktionary.org/wiki/Bein
    # """
    # {{Ü-Tabelle|4|G=in der Medizin nur in zusammengesetzten Wörtern: Knochen|Ü-Liste=...
    # |Dialekttabelle=
    # *Berlinerisch: Been
    # *Kölsch:
    # *Mitteldeutsch:
    # **{{pfl}}: {{Lautschrift|bɛː}}, {{Lautschrift|bɛ̃ː}}
    # *Oberdeutsch:
    # **{{als}}: [1] Fuëß
    # ***Schwäbisch: [1, 2] Fuaß; [4] Boi, Boa
    # **{{bar}}: [1, 2] Fuaß; [4] Boan
    # *Thüringisch-Obersächsisch: Been, Knoche
    # }}"""
    wxr.wtp.debug("Dialect table not implemented yet.", sortid="TODO")
1 change: 1 addition & 0 deletions src/wiktextract/extractor/de/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re

from wikitextprocessor import NodeKind, WikiNode


Expand Down
1 change: 0 additions & 1 deletion tests/test_de_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.example import extract_examples, extract_reference

from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext

Expand Down
2 changes: 1 addition & 1 deletion tests/test_de_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from wiktextract.wxr_context import WiktextractContext


class TestGlossList(unittest.TestCase):
class TestDEGloss(unittest.TestCase):
maxDiff = None

def setUp(self) -> None:
Expand Down
7 changes: 2 additions & 5 deletions tests/test_de_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,12 @@
from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.page import (
parse_page,
parse_section,
)
from wiktextract.extractor.de.page import parse_page, parse_section
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


class DePageTests(unittest.TestCase):
class TestDEPage(unittest.TestCase):
def setUp(self):
conf1 = WiktionaryConfig(
dump_file_lang_code="de",
Expand Down
2 changes: 1 addition & 1 deletion tests/test_de_pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.pronunciation import (
process_ipa,
process_hoerbeispiele,
process_ipa,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext
Expand Down
Loading

0 comments on commit a8787ef

Please sign in to comment.