Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract translations from German Wiktionary #369

Merged
merged 2 commits into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions src/wiktextract/extractor/de/example.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from collections import defaultdict
from typing import Dict, List


from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid

from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/de/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid

from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down
5 changes: 4 additions & 1 deletion src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@

from wiktextract.datautils import append_base_data
from wiktextract.extractor.de.pronunciation import extract_pronunciation
from wiktextract.extractor.de.translation import extract_translation
from wiktextract.wxr_context import WiktextractContext

from .gloss import extract_glosses
from .example import extract_examples
from .gloss import extract_glosses

# Templates that are used to form panels on pages and that should be ignored in
# various positions
Expand Down Expand Up @@ -76,6 +77,8 @@ def parse_section(
extract_pronunciation(wxr, page_data, level_node)
if section_name == "Beispiele":
extract_examples(wxr, page_data, level_node)
if section_name == "Übersetzungen":
extract_translation(wxr, page_data, level_node)


FORM_POS = {
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/de/pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wiktextract.extractor.share import create_audio_url_dict

from wiktextract.extractor.share import create_audio_url_dict
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down
213 changes: 213 additions & 0 deletions src/wiktextract/extractor/de/translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
import re
from collections import defaultdict
from typing import Dict, List, Union

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext


def extract_translation(
    wxr: WiktextractContext, page_data: List[Dict], level_node: WikiNode
) -> None:
    """Collect translations from the "Ü-Tabelle" templates of a section.

    Every non-empty child of ``level_node`` is expected to be an
    "Ü-Tabelle" template; anything else is reported through
    ``wxr.wtp.debug`` and skipped.

    For each table the translations found in its "Ü-Liste" parameter are
    appended to the ``"translations"`` list of every sense in
    ``page_data[-1]["senses"]`` whose ``"senseid"`` equals the table's first
    positional parameter.  When no sense matches (or the table carries no
    sense id), a debug message is logged and the translations are appended
    to ``page_data[-1]["translations"]`` instead.

    Args:
        wxr: Extraction context, used for text cleaning and debug logging.
        page_data: Word entries collected so far; only the last entry is
            modified.
        level_node: The section node whose children are scanned.
    """
    for level_node_child in level_node.filter_empty_str_child():
        if not (
            isinstance(level_node_child, WikiNode)
            and level_node_child.kind == NodeKind.TEMPLATE
            and level_node_child.template_name == "Ü-Tabelle"
        ):
            wxr.wtp.debug(
                f"Unexpected node type in extract_translation: {level_node_child}",
                sortid="extractor/de/translation/extract_translation/31",
            )
            continue

        template_parameters = level_node_child.template_parameters
        sense_translations = []
        base_translation_data = defaultdict(list)

        senseid = template_parameters.get(1)
        # XXX: Sense-disambiguate where senseids are in Ü-Liste instead of
        # the first positional parameter (ca. 0.03% of pages), e.g.:
        # https://de.wiktionary.org/wiki/Beitrag
        #   {{Ü-Tabelle|Ü-Liste=
        #   *{{en}}: [1] {{Ü|en|subscription}}; [1a] {{Ü|en|dues}},
        #   {{Ü|en|membership fee}}; [1, 2] {{Ü|en|contribution}};
        #   [3] {{Ü|en|article}}}}

        sense_text = template_parameters.get("G")
        if sense_text:
            sense_text = clean_node(wxr, {}, sense_text).strip()
            if sense_text == "Übersetzungen umgeleitet":
                # XXX: Handle cases where translations are in a separate
                # page (ca. 1.1% of pages), e.g.:
                # https://de.wiktionary.org/wiki/Pöpke
                #   {{Ü-Tabelle|*|G=Übersetzungen umgeleitet|Ü-Liste=
                #   :{{Übersetzungen umleiten|1|Poppe}}
                #   }}
                continue

            # The text was cleaned above; cleaning it a second time is
            # redundant.
            base_translation_data["sense"] = sense_text

        translation_list = template_parameters.get("Ü-Liste")
        if translation_list:
            process_translation_list(
                wxr,
                sense_translations,
                base_translation_data,
                translation_list,
            )

        dialect_table = template_parameters.get("Dialekttabelle")
        if dialect_table:
            process_dialect_table(wxr, base_translation_data, dialect_table)

        matched_senseid = False
        if senseid:
            # A template parameter is not guaranteed to be a plain string;
            # reduce anything else to text before comparing.  The value is
            # computed once instead of once per sense.
            if isinstance(senseid, str):
                senseid_text = senseid.strip()
            else:
                senseid_text = clean_node(wxr, {}, senseid).strip()

            for sense in page_data[-1]["senses"]:
                if sense["senseid"] == senseid_text:
                    sense["translations"].extend(sense_translations)
                    matched_senseid = True

        if not matched_senseid:
            wxr.wtp.debug(
                f"Unknown senseid: {senseid}.",
                sortid="extractor/de/translation/extract_translation/65",
            )
            # Fall back to attaching the translations to the word entry.
            page_data[-1]["translations"].extend(sense_translations)


def process_translation_list(
    wxr: WiktextractContext,
    sense_translations: List[Dict],
    base_translation_data: Dict[str, List],
    translation_list: List[Union[WikiNode, str]],
):
    """Turn the nodes of an "Ü-Liste" parameter into translation entries.

    The nodes are walked in order.  Nodes that are not translation templates
    are buffered as "modifiers"; when a translation template ("Ü", "Üt",
    "Ü?" or "Üt?") is reached, the buffered modifiers are handed to
    ``process_modifiers`` and a new entry, copied from
    ``base_translation_data``, is filled in and appended to
    ``sense_translations``.

    Args:
        wxr: Extraction context, used for language lookup and debug logging.
        sense_translations: Output list; one dict is appended per
            translation template.
        base_translation_data: Data shared by all translations of the table.
        translation_list: Nodes of the "Ü-Liste" template parameter.
    """
    modifiers = []
    for node in translation_list:
        if not is_translation_template(node):
            modifiers.append(node)
            continue

        translation_data = base_translation_data.copy()
        process_modifiers(wxr, sense_translations, translation_data, modifiers)

        lang_code = node.template_parameters.get(1)
        translation_data["code"] = lang_code
        languages = wxr.wtp.LANGUAGES_BY_CODE.get(lang_code)
        if languages:
            translation_data["lang"] = languages[0]
        else:
            wxr.wtp.debug(
                f"Unknown language code: {lang_code}",
                sortid="extractor/de/translation/process_translation_list/70",
            )

        template_name = node.template_name
        # The "?"-suffixed template variants mark the entry as uncertain.
        if template_name.endswith("?"):
            translation_data["uncertain"] = True

        translation_data["word"] = node.template_parameters.get(2)

        base_name = template_name.removesuffix("?")
        if base_name == "Ü":
            process_Ü_template(translation_data, node)
        elif base_name == "Üt":
            process_Üt_template(wxr, translation_data, node)

        sense_translations.append(translation_data)

    # Process modifiers at the end of the list.  A fresh defaultdict
    # *instance* is required here: passing the ``defaultdict`` class itself
    # makes ``translation_data["tags"].extend(...)`` fail whenever the
    # trailing text ends with ":".
    process_modifiers(wxr, sense_translations, defaultdict(list), modifiers)


def is_translation_template(node: object) -> bool:
    """Return True if ``node`` is an "Ü", "Üt", "Ü?" or "Üt?" template node.

    Any other value, including plain strings and non-template nodes, yields
    False.  The parameter is annotated ``object`` because arbitrary values
    are accepted and narrowed with ``isinstance``.
    """
    return (
        isinstance(node, WikiNode)
        and node.kind == NodeKind.TEMPLATE
        and node.template_name in ("Ü", "Üt", "Ü?", "Üt?")
    )


def process_Ü_template(
    translation_data: Dict[str, Union[str, List, bool]],
    template_node: TemplateNode,
):
    """Apply the optional third positional parameter of an "Ü" template.

    When positional parameter 3 is present and truthy, it replaces
    ``translation_data["word"]``; otherwise ``translation_data`` is left
    untouched.
    """
    replacement = template_node.template_parameters.get(3)
    if not replacement:
        return
    translation_data["word"] = replacement


def process_Üt_template(
    wxr: WiktextractContext,
    translation_data: Dict[str, Union[str, List, bool]],
    template_node: TemplateNode,
):
    """Fill in romanization and word override from an "Üt" template.

    Positional parameter 3, when truthy, is stored as
    ``translation_data["roman"]``.  Otherwise the template is rendered to
    text and a parenthesised group ending in "^☆" is searched for; if one is
    found, its content is stored as the romanization (an automatically
    generated transcription, going by the original comment).

    Positional parameter 4, when truthy, replaces
    ``translation_data["word"]``.
    """
    params = template_node.template_parameters

    explicit_roman = params.get(3)
    if explicit_roman:
        translation_data["roman"] = explicit_roman
    else:
        # Look for automatic transcription in the rendered template text.
        rendered = clean_node(wxr, {}, template_node)
        found = re.search(r"\(([^)]+?)\^\☆\)", rendered)
        if found is not None:
            translation_data["roman"] = found.group(1)

    replacement = params.get(4)
    if replacement:
        translation_data["word"] = replacement


def process_modifiers(
    wxr: WiktextractContext,
    sense_translations: List[Dict],
    translation_data: Dict[str, Union[str, List, bool]],
    modifiers,
):
    """Convert buffered non-template nodes into tags and empty the buffer.

    Everything from the first string element containing "*" onwards is
    discarded (the "*" and language template nodes that start each
    translation line).  The remaining nodes are cleaned to text and split on
    ";", ",", "(", ")" and ":" into tags.  If the cleaned text ends with
    ":", the tags are added to ``translation_data["tags"]``; otherwise they
    are added to the ``"tags"`` of the last entry in ``sense_translations``,
    if there is one.

    ``modifiers`` is mutated in place and is always empty on return.
    """
    # Locate the first "*" marker, if any, and drop it together with
    # everything that follows it.
    cut_at = next(
        (
            position
            for position, item in enumerate(modifiers)
            if isinstance(item, str) and "*" in item
        ),
        None,
    )
    if cut_at is not None:
        del modifiers[cut_at:]

    text = clean_node(wxr, {}, modifiers).strip()
    if text:
        pieces = (piece.strip() for piece in re.split(r";|,|\(|\)|:", text))
        tags = [piece for piece in pieces if piece]
        if tags:
            if text.endswith(":"):
                translation_data["tags"].extend(tags)
            elif sense_translations:
                sense_translations[-1]["tags"].extend(tags)

    # Reset modifiers so the caller can reuse the same list.
    modifiers.clear()


def process_dialect_table(
    wxr: WiktextractContext,
    base_translation_data: Dict[str, Union[str, List, bool]],
    dialect_table: List[Union[WikiNode, str]],
):
    """Placeholder for "Dialekttabelle" extraction; only logs a debug note.

    Neither ``base_translation_data`` nor ``dialect_table`` is read or
    modified.
    """
    # XXX: Extract dialect information (ca. 0.12% of pages), e.g.:
    # https://de.wiktionary.org/wiki/Bein
    #   {{Ü-Tabelle|4|G=in der Medizin nur in zusammengesetzten Wörtern: Knochen|Ü-Liste=...
    #   |Dialekttabelle=
    #   *Berlinerisch: Been
    #   *Kölsch:
    #   *Mitteldeutsch:
    #   **{{pfl}}: {{Lautschrift|bɛː}}, {{Lautschrift|bɛ̃ː}}
    #   *Oberdeutsch:
    #   **{{als}}: [1] Fuëß
    #   ***Schwäbisch: [1, 2] Fuaß; [4] Boi, Boa
    #   **{{bar}}: [1, 2] Fuaß; [4] Boan
    #   *Thüringisch-Obersächsisch: Been, Knoche
    #   }}
    wxr.wtp.debug("Dialect table not implemented yet.", sortid="TODO")
1 change: 1 addition & 0 deletions src/wiktextract/extractor/de/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re

from wikitextprocessor import NodeKind, WikiNode


Expand Down
1 change: 0 additions & 1 deletion tests/test_de_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.example import extract_examples, extract_reference

from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext

Expand Down
2 changes: 1 addition & 1 deletion tests/test_de_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from wiktextract.wxr_context import WiktextractContext


class TestGlossList(unittest.TestCase):
class TestDEGloss(unittest.TestCase):
maxDiff = None

def setUp(self) -> None:
Expand Down
7 changes: 2 additions & 5 deletions tests/test_de_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,12 @@
from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.page import (
parse_page,
parse_section,
)
from wiktextract.extractor.de.page import parse_page, parse_section
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


class DePageTests(unittest.TestCase):
class TestDEPage(unittest.TestCase):
def setUp(self):
conf1 = WiktionaryConfig(
dump_file_lang_code="de",
Expand Down
2 changes: 1 addition & 1 deletion tests/test_de_pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.pronunciation import (
process_ipa,
process_hoerbeispiele,
process_ipa,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext
Expand Down
Loading