From aded61e5155acf301b95314dbda2a90c16a51836 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Thu, 19 Oct 2023 12:57:36 +0300 Subject: [PATCH 1/2] Reorder imports and unify naming of tests in German Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. --- src/wiktextract/extractor/de/example.py | 3 +-- src/wiktextract/extractor/de/gloss.py | 2 +- src/wiktextract/extractor/de/pronunciation.py | 2 +- src/wiktextract/extractor/de/utils.py | 1 + tests/test_de_example.py | 1 - tests/test_de_gloss.py | 2 +- tests/test_de_page.py | 7 ++----- tests/test_de_pronunciation.py | 2 +- 8 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/wiktextract/extractor/de/example.py b/src/wiktextract/extractor/de/example.py index 8a3a97f6..da3268d3 100644 --- a/src/wiktextract/extractor/de/example.py +++ b/src/wiktextract/extractor/de/example.py @@ -1,11 +1,10 @@ from collections import defaultdict from typing import Dict, List - from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode -from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid +from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext diff --git a/src/wiktextract/extractor/de/gloss.py b/src/wiktextract/extractor/de/gloss.py index ea2761c2..3caa7252 100644 --- a/src/wiktextract/extractor/de/gloss.py +++ b/src/wiktextract/extractor/de/gloss.py @@ -4,8 +4,8 @@ from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode -from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid +from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py index b5fb1d0d..e55adb13 100644 --- a/src/wiktextract/extractor/de/pronunciation.py +++ b/src/wiktextract/extractor/de/pronunciation.py @@ -3,8 +3,8 @@ from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode -from wiktextract.extractor.share import create_audio_url_dict +from wiktextract.extractor.share import create_audio_url_dict from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext diff --git a/src/wiktextract/extractor/de/utils.py b/src/wiktextract/extractor/de/utils.py index 73416645..a50ab640 100644 --- a/src/wiktextract/extractor/de/utils.py +++ b/src/wiktextract/extractor/de/utils.py @@ -1,4 +1,5 @@ import re + from wikitextprocessor import NodeKind, WikiNode diff --git a/tests/test_de_example.py b/tests/test_de_example.py index 980a0be6..edfa7c1d 100644 --- a/tests/test_de_example.py +++ b/tests/test_de_example.py @@ -5,7 +5,6 @@ from wiktextract.config import WiktionaryConfig from wiktextract.extractor.de.example import extract_examples, extract_reference - from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py index 02a41751..8c00426a 100644 --- a/tests/test_de_gloss.py +++ b/tests/test_de_gloss.py @@ -14,7 +14,7 @@ from wiktextract.wxr_context import WiktextractContext -class TestGlossList(unittest.TestCase): +class TestDEGloss(unittest.TestCase): maxDiff = None def setUp(self) -> None: diff --git a/tests/test_de_page.py b/tests/test_de_page.py index 4ac9e3cb..53a95be3 100644 --- a/tests/test_de_page.py +++ b/tests/test_de_page.py @@ -6,15 +6,12 @@ from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig -from wiktextract.extractor.de.page import ( - parse_page, - parse_section, -) +from wiktextract.extractor.de.page import parse_page, parse_section from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext -class DePageTests(unittest.TestCase): +class TestDEPage(unittest.TestCase): def setUp(self): conf1 = WiktionaryConfig( dump_file_lang_code="de", diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py index 6fae64eb..db5eb676 100644 --- a/tests/test_de_pronunciation.py +++ b/tests/test_de_pronunciation.py @@ -5,8 +5,8 @@ from wiktextract.config import WiktionaryConfig from wiktextract.extractor.de.pronunciation import ( - process_ipa, process_hoerbeispiele, + process_ipa, ) from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext From 361db67e3745c634b873efd186447a40fbd6fab0 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Thu, 19 Oct 2023 13:15:03 +0300 Subject: [PATCH 2/2] Extract translations from German Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. --- src/wiktextract/extractor/de/page.py | 5 +- src/wiktextract/extractor/de/translation.py | 213 +++++++++++++ tests/test_de_translation.py | 327 ++++++++++++++++++++ 3 files changed, 544 insertions(+), 1 deletion(-) create mode 100644 src/wiktextract/extractor/de/translation.py create mode 100644 tests/test_de_translation.py diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py index 9c0aab1a..6c33ffc5 100644 --- a/src/wiktextract/extractor/de/page.py +++ b/src/wiktextract/extractor/de/page.py @@ -8,10 +8,11 @@ from wiktextract.datautils import append_base_data from wiktextract.extractor.de.pronunciation import extract_pronunciation +from wiktextract.extractor.de.translation import extract_translation from wiktextract.wxr_context import WiktextractContext -from .gloss import extract_glosses from .example import extract_examples +from .gloss import extract_glosses # Templates that are used to form panels on pages and that should be ignored in # various positions @@ -76,6 +77,8 @@ def parse_section( extract_pronunciation(wxr, page_data, level_node) if section_name == "Beispiele": extract_examples(wxr, page_data, level_node) + if section_name == "Übersetzungen": + extract_translation(wxr, page_data, level_node) FORM_POS = { diff --git a/src/wiktextract/extractor/de/translation.py b/src/wiktextract/extractor/de/translation.py new file mode 100644 index 00000000..da6cc92d --- /dev/null +++ b/src/wiktextract/extractor/de/translation.py @@ -0,0 +1,213 @@ +import re +from collections import defaultdict +from typing import Dict, List, Union + +from wikitextprocessor import NodeKind, WikiNode +from wikitextprocessor.parser import TemplateNode + +from wiktextract.page import clean_node +from wiktextract.wxr_context import WiktextractContext + + +def extract_translation( + wxr: WiktextractContext, page_data: List[Dict], level_node: WikiNode +) -> None: + for level_node_child in level_node.filter_empty_str_child(): + if not ( + isinstance(level_node_child, WikiNode) + and level_node_child.kind == NodeKind.TEMPLATE + and level_node_child.template_name == "Ü-Tabelle" + ): + wxr.wtp.debug( + f"Unexpected node type in extract_translation: {level_node_child}", + sortid="extractor/de/translation/extract_translation/31", + ) + else: + sense_translations = [] + base_translation_data = defaultdict(list) + senseid = level_node_child.template_parameters.get(1) + if senseid == None: + # XXX: Sense-disambiguate where senseids are in Ü-Liste (ca. 0.03% of pages), e.g.: + # https://de.wiktionary.org/wiki/Beitrag + # """ + # {{Ü-Tabelle|Ü-Liste= + # *{{en}}: [1] {{Ü|en|subscription}}; [1a] {{Ü|en|dues}}, {{Ü|en|membership fee}}; [1, 2] {{Ü|en|contribution}}; [3] {{Ü|en|article}}}} + pass + + sense_text = level_node_child.template_parameters.get("G") + + if sense_text: + sense_text = clean_node(wxr, {}, sense_text).strip() + if sense_text == "Übersetzungen umgeleitet": + # XXX: Handle cases where translations are in a separate page (ca. 1.1% of pages), e.g.: + # https://de.wiktionary.org/wiki/Pöpke + # """ + # {{Ü-Tabelle|*|G=Übersetzungen umgeleitet|Ü-Liste= + # :{{Übersetzungen umleiten|1|Poppe}} + # }} + # """ + continue + + base_translation_data["sense"] = clean_node(wxr, {}, sense_text) + + translation_list = level_node_child.template_parameters.get( + "Ü-Liste" + ) + if translation_list: + process_translation_list( + wxr, + sense_translations, + base_translation_data, + translation_list, + ) + + dialect_table = level_node_child.template_parameters.get( + "Dialekttabelle" + ) + if dialect_table: + process_dialect_table(wxr, base_translation_data, dialect_table) + + matched_senseid = False + if senseid: + for sense in page_data[-1]["senses"]: + if sense["senseid"] == senseid.strip(): + sense["translations"].extend(sense_translations) + matched_senseid = True + + if not matched_senseid: + wxr.wtp.debug( + f"Unknown senseid: {senseid}.", + sortid="extractor/de/translation/extract_translation/65", + ) + page_data[-1]["translations"].extend(sense_translations) + + +def process_translation_list( + wxr: WiktextractContext, + sense_translations: List[Dict], + base_translation_data: Dict[str, List], + translation_list: List[Union[WikiNode, str]], +): + modifiers = [] + for node in translation_list: + if not is_translation_template(node): + modifiers.append(node) + + else: + translation_data = base_translation_data.copy() + process_modifiers( + wxr, sense_translations, translation_data, modifiers + ) + + lang_code = node.template_parameters.get(1) + translation_data["code"] = lang_code + languages = wxr.wtp.LANGUAGES_BY_CODE.get(lang_code) + if languages: + translation_data["lang"] = languages[0] + else: + wxr.wtp.debug( + f"Unknown language code: {lang_code}", + sortid="extractor/de/translation/process_translation_list/70", + ) + if node.template_name[-1] == "?": + translation_data["uncertain"] = True + + translation_data["word"] = node.template_parameters.get(2) + + if node.template_name.removesuffix("?") == "Ü": + process_Ü_template(translation_data, node) + + if node.template_name.removesuffix("?") == "Üt": + process_Üt_template(wxr, translation_data, node) + + sense_translations.append(translation_data) + # Process modifiers at the end of the list + process_modifiers(wxr, sense_translations, defaultdict, modifiers) + + +def is_translation_template(node: any) -> bool: + return ( + isinstance(node, WikiNode) + and node.kind == NodeKind.TEMPLATE + and node.template_name in ["Ü", "Üt", "Ü?", "Üt?"] + ) + + +def process_Ü_template( + translation_data: Dict[str, Union[str, List, bool]], + template_node: TemplateNode, +): + overwrite_word = template_node.template_parameters.get(3) + if overwrite_word: + translation_data["word"] = overwrite_word + + +def process_Üt_template( + wxr: WiktextractContext, + translation_data: Dict[str, Union[str, List, bool]], + template_node: TemplateNode, +): + transcription = template_node.template_parameters.get(3) + if transcription: + translation_data["roman"] = transcription + # Look for automatic transcription + else: + cleaned_node = clean_node(wxr, {}, template_node) + match = re.search(r"\(([^)]+?)\^\☆\)", cleaned_node) + + if match: + translation_data["roman"] = match.group(1) + + overwrite_word = template_node.template_parameters.get(4) + if overwrite_word: + translation_data["word"] = overwrite_word + + +def process_modifiers( + wxr: WiktextractContext, + sense_translations: List[Dict], + translation_data: Dict[str, Union[str, List, bool]], + modifiers, +): + # Get rid of the "*" and language template nodes that start each translation + for i, elem in enumerate(modifiers): + if isinstance(elem, str) and "*" in elem: + del modifiers[i:] + break + + clean_text = clean_node(wxr, {}, modifiers).strip() + if clean_text: + tags = re.split(r";|,|\(|\)|:", clean_text) + tags = [tag.strip() for tag in tags if tag.strip()] + if tags: + if clean_text.endswith(":"): + translation_data["tags"].extend(tags) + elif sense_translations: + sense_translations[-1]["tags"].extend(tags) + # Reset modifiers + modifiers.clear() + + +def process_dialect_table( + wxr: WiktextractContext, + base_translation_data: Dict[str, Union[str, List, bool]], + dialect_table: List[Union[WikiNode, str]], +): + wxr.wtp.debug("Dialect table not implemented yet.", sortid="TODO") + # XXX: Extract dialect information (ca. 0.12% of pages), e.g.: + # https://de.wiktionary.org/wiki/Bein + # """ + # {{Ü-Tabelle|4|G=in der Medizin nur in zusammengesetzten Wörtern: Knochen|Ü-Liste=... + # |Dialekttabelle= + # *Berlinerisch: Been + # *Kölsch: + # *Mitteldeutsch: + # **{{pfl}}: {{Lautschrift|bɛː}}, {{Lautschrift|bɛ̃ː}} + # *Oberdeutsch: + # **{{als}}: [1] Fuëß + # ***Schwäbisch: [1, 2] Fuaß; [4] Boi, Boa + # **{{bar}}: [1, 2] Fuaß; [4] Boan + # *Thüringisch-Obersächsisch: Been, Knoche + # }}""" + + return diff --git a/tests/test_de_translation.py b/tests/test_de_translation.py new file mode 100644 index 00000000..d1007865 --- /dev/null +++ b/tests/test_de_translation.py @@ -0,0 +1,327 @@ +import unittest +from collections import defaultdict + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.de.translation import ( + extract_translation, + process_translation_list, +) +from wiktextract.thesaurus import close_thesaurus_db +from wiktextract.wxr_context import WiktextractContext + + +class TestDETranslation(unittest.TestCase): + maxDiff = None + + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="de"), WiktionaryConfig(dump_file_lang_code="de") + ) + self.wxr.wtp.LANGUAGES_BY_CODE["en"] = ["Englisch"] + self.wxr.wtp.LANGUAGES_BY_CODE["hy"] = ["Armenisch"] + self.wxr.wtp.LANGUAGES_BY_CODE["ru"] = ["Russisch"] + self.wxr.wtp.LANGUAGES_BY_CODE["fr"] = ["Französisch"] + self.wxr.wtp.LANGUAGES_BY_CODE["ar"] = ["Arabisch"] + self.wxr.wtp.LANGUAGES_BY_CODE["la"] = ["Latein"] + + def tearDown(self) -> None: + self.wxr.wtp.close_db_conn() + close_thesaurus_db( + self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn + ) + + def test_de_extract_translation(self): + test_cases = [ + # Adds sense data to correct sense + { + "input": "{{Ü-Tabelle|1|G=Beispiel|Ü-Liste=*{{en}}: {{Ü|en|example}}}}", + "page_data": [ + defaultdict( + list, {"senses": [defaultdict(list, {"senseid": "1"})]} + ) + ], + "expected": [ + { + "senses": [ + { + "senseid": "1", + "translations": [ + { + "sense": "Beispiel", + "code": "en", + "lang": "Englisch", + "word": "example", + } + ], + } + ] + } + ], + }, + # Adds sense data to page_data root if no senseid is given + { + "input": "{{Ü-Tabelle||G=Beispiel|Ü-Liste=*{{en}}: {{Ü|en|example}}}}", + "page_data": [ + defaultdict( + list, {"senses": [defaultdict(list, {"senseid": "1"})]} + ) + ], + "expected": [ + { + "senses": [ + { + "senseid": "1", + } + ], + "translations": [ + { + "sense": "Beispiel", + "code": "en", + "lang": "Englisch", + "word": "example", + } + ], + } + ], + }, + # Adds sense data to page_data root if senseid could not be matched + { + "input": "{{Ü-Tabelle|2|G=Beispiel|Ü-Liste=*{{en}}: {{Ü|en|example}}}}", + "page_data": [ + defaultdict( + list, {"senses": [defaultdict(list, {"senseid": "1"})]} + ) + ], + "expected": [ + { + "senses": [ + { + "senseid": "1", + } + ], + "translations": [ + { + "sense": "Beispiel", + "code": "en", + "lang": "Englisch", + "word": "example", + } + ], + } + ], + }, + ] + + for case in test_cases: + with self.subTest(case=case): + self.wxr.wtp.start_page("") + root = self.wxr.wtp.parse(case["input"]) + + page_data = case["page_data"] + + extract_translation(self.wxr, page_data, root) + + self.assertEqual(page_data, case["expected"]) + + def test_de_process_translation_list(self): + test_cases = [ + # https://de.wiktionary.org/wiki/Beispiel + # Ü template + { + "input": "{{Ü-Tabelle|||Ü-Liste=\n*{{en}}: {{Ü|en|example}}}}", + "expected_sense_translations": [ + {"code": "en", "lang": "Englisch", "word": "example"} + ], + }, + # https://de.wiktionary.org/wiki/Beispiel + # Üt template with manual transcription + { + "input": "{{Ü-Tabelle|||Ü-Liste=\n*{{hy}}: {{Üt|hy|օրինակ|orinak}}}}", + "expected_sense_translations": [ + { + "code": "hy", + "lang": "Armenisch", + "word": "օրինակ", + "roman": "orinak", + } + ], + }, + # https://de.wiktionary.org/wiki/Beispiel + # Üt template with automatic transcription + { + "pages": [("Vorlage:Üt", 10, "пример (primer^☆) → ru")], + "input": "{{Ü-Tabelle|||Ü-Liste=\n*{{ru}}: {{Üt|ru|пример}}}}", + "expected_sense_translations": [ + { + "code": "ru", + "lang": "Russisch", + "word": "пример", + "roman": "primer", + } + ], + }, + # https://de.wiktionary.org/wiki/Schrift + # Üt? template + { + "pages": [("Vorlage:Üt", 10, "عريضة ? () → ar")], + "input": "{{Ü-Tabelle|||Ü-Liste=\n*{{ar}}: {{Üt?|ar|عريضة|}}}}", + "expected_sense_translations": [ + { + "code": "ar", + "lang": "Arabisch", + "word": "عريضة", + "uncertain": True, + } + ], + }, + ] + + for case in test_cases: + with self.subTest(case=case): + self.wxr.wtp.start_page("") + if "pages" in case: + for page in case["pages"]: + self.wxr.wtp.add_page(*page) + + root = self.wxr.wtp.parse(case["input"]) + + sense_translations = [] + base_translation_data = defaultdict(list) + + translation_list = root.children[0].template_parameters.get( + "Ü-Liste" + ) + + process_translation_list( + self.wxr, + sense_translations, + base_translation_data, + translation_list, + ) + self.assertEqual( + sense_translations, case["expected_sense_translations"] + ) + + def test_de_process_translation_list_with_modifiers(self): + test_cases = [ + # https://de.wiktionary.org/wiki/Beispiel + # Modifying the following translation + { + "input": "{{Ü-Tabelle|||Ü-Liste=\n*{{en}}: {{Ü|en|instance}}, ''Vorbild:'' {{Ü|en|model}}}}", + "expected_sense_translations": [ + {"code": "en", "lang": "Englisch", "word": "instance"}, + { + "code": "en", + "lang": "Englisch", + "word": "model", + "tags": ["Vorbild"], + }, + ], + }, + # https://de.wiktionary.org/wiki/Beispiel + # Modifying the previous translation + { + "pages": [("Vorlage:m", 10, "m")], + "input": "{{Ü-Tabelle|||Ü-Liste=\n**{{fr}}: {{Ü|fr|exemple}} {{m}}}}", + "expected_sense_translations": [ + { + "code": "fr", + "lang": "Französisch", + "word": "exemple", + "tags": ["m"], + } + ], + }, + # https://de.wiktionary.org/wiki/Bein + # Multiple modifiers + { + "pages": [("Vorlage:f", 10, "f")], + "input": "{{Ü-Tabelle|||Ü-Liste=\n*{{la}}: {{Ü|la|crus}} {{f}}, {{Ü|la|camba}} (vulgärlateinisch) {{f}}, {{Ü|la|gamba}} (vulgärlateinisch) {{f}}}}", + "expected_sense_translations": [ + { + "code": "la", + "lang": "Latein", + "word": "crus", + "tags": ["f"], + }, + { + "code": "la", + "lang": "Latein", + "word": "camba", + "tags": ["vulgärlateinisch", "f"], + }, + { + "code": "la", + "lang": "Latein", + "word": "gamba", + "tags": ["vulgärlateinisch", "f"], + }, + ], + }, + # https://de.wiktionary.org/wiki/Beitrag + # With senseids in the modifiers + # This is just to document the current behaviour. When these cases + # get sense disambiguated, update this test case. + { + "pages": [("Vorlage:f", 10, "f")], + "input": "{{Ü-Tabelle|||Ü-Liste=\n*{{en}}: [1] {{Ü|en|subscription}}; [1a] {{Ü|en|dues}}, {{Ü|en|membership fee}}; [1, 2] {{Ü|en|contribution}}; [3] {{Ü|en|article}}}}", + "expected_sense_translations": [ + { + "code": "en", + "lang": "Englisch", + "word": "subscription", + "tags": ["[1a]"], + }, + { + "code": "en", + "lang": "Englisch", + "word": "dues", + }, + { + "code": "en", + "lang": "Englisch", + "word": "membership fee", + "tags": ["[1", "2]"], + }, + { + "code": "en", + "lang": "Englisch", + "word": "contribution", + "tags": ["[3]"], + }, + { + "code": "en", + "lang": "Englisch", + "word": "article", + }, + ], + }, + ] + + for case in test_cases: + with self.subTest(case=case): + self.wxr.wtp.start_page("") + if "pages" in case: + for page in case["pages"]: + self.wxr.wtp.add_page(*page) + + root = self.wxr.wtp.parse(case["input"]) + + sense_translations = [] + base_translation_data = defaultdict(list) + + translation_list = root.children[0].template_parameters.get( + "Ü-Liste" + ) + + process_translation_list( + self.wxr, + sense_translations, + base_translation_data, + translation_list, + ) + self.assertEqual( + sense_translations, case["expected_sense_translations"] + )