From ea34867b325d59afb5805aff845b59f9a29773e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Tue, 30 Jan 2024 10:20:39 +0200 Subject: [PATCH 1/4] Fields should not default to lists For example "ipa" should not be a list inside a larger "sounds" list entry; if there are several "ipa" entries, make them separate "sounds" list entries, with their own tags etc. if necessary. --- src/wiktextract/extractor/de/models.py | 25 ++++------- src/wiktextract/extractor/de/pronunciation.py | 42 +++++++++++-------- tests/test_de_pronunciation.py | 31 +++++++------- 3 files changed, 50 insertions(+), 48 deletions(-) diff --git a/src/wiktextract/extractor/de/models.py b/src/wiktextract/extractor/de/models.py index a5ebfbcc..1a0d9a7b 100644 --- a/src/wiktextract/extractor/de/models.py +++ b/src/wiktextract/extractor/de/models.py @@ -39,9 +39,6 @@ class Translation(BaseModelWrap): description="Tags specifying the translated term, usually gender information", ) notes: list[str] = Field(default=[], description="A list of notes") - roman: str = Field( - default="", description="Transliteration in roman characters" - ) class Example(BaseModelWrap): @@ -123,22 +120,18 @@ class Sense(BaseModelWrap): class Sound(BaseModelWrap): - ipa: list[str] = Field( - default=[], description="International Phonetic Alphabet" - ) + ipa: str = Field(default="", description="International Phonetic Alphabet") # phonetic_transcription: list[str] = Field( # default=[], description="Phonetic transcription, less exact than IPA." # ) - audio: list[str] = Field(default=[], description="Audio file name") - wav_url: list[str] = Field(default=[]) - ogg_url: list[str] = Field(default=[]) - mp3_url: list[str] = Field(default=[]) - oga_url: list[str] = Field(default=[]) - flac_url: list[str] = Field(default=[]) - lang_code: list[str] = Field( - default=[], description="Wiktionary language code" - ) - lang: list[str] = Field(default=[], description="Localized language name") + audio: str = Field(default="", description="Audio file name") + wav_url: str = Field(default="") + ogg_url: str = Field(default="") + mp3_url: str = Field(default="") + oga_url: str = Field(default="") + flac_url: str = Field(default="") + lang_code: str = Field(default="", description="Wiktionary language code") + lang: str = Field(default="", description="Localized language name") # roman: list[str] = Field( # default=[], description="Translitaration to Roman characters" # ) diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py index 7c3e0919..90b1d171 100644 --- a/src/wiktextract/extractor/de/pronunciation.py +++ b/src/wiktextract/extractor/de/pronunciation.py @@ -21,7 +21,8 @@ def extract_pronunciation( NodeKind.LIST_ITEM ): wxr.wtp.debug( - f"Found unexpected non-list-item node in pronunciation section: {not_list_item_node}", + f"Found unexpected non-list-item node in pronunciation " + f"section: {not_list_item_node}", sortid="extractor/de/pronunciation/extract_pronunciation/28", ) @@ -37,8 +38,10 @@ def extract_pronunciation( or not rest ): wxr.wtp.debug( - f"Found unexpected non-template node in pronunciation section: {head_template}", - sortid="extractor/de/pronunciation/extract_pronunciation/37", + f"Found unexpected non-template node in pronunciation " + f"section: {head_template}", + sortid="extractor/de/pronunciation/" + "extract_pronunciation/37", ) continue if head_template.template_name == "IPA": @@ -91,7 +94,7 @@ def process_ipa( def process_lautschrift_template( wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode -): +) -> None: template_parameters = node.template_parameters ipa = template_parameters.get(1) @@ -99,21 +102,26 @@ def process_lautschrift_template( lang_code = template_parameters.get("spr") if lang_code: lang = code_to_name(lang_code, "de") - add_sound_data_without_appending_to_existing_properties( - wxr, - sound_data, - { - "ipa": [ipa], - "lang_code": lang_code, - "lang": lang, - }, - ) + new_data = { + "lang_code": lang_code, + "lang": lang, + } else: - sound_data[-1].ipa.append(ipa) + new_data = dict() + + new_data["ipa"] = ipa + + add_sound_data_without_appending_to_existing_properties( + wxr, + sound_data, + new_data, + ) def process_hoerbeispiele( - wxr: WiktextractContext, sound_data: list[Sound], nodes: list[WikiNode] + wxr: WiktextractContext, + sound_data: list[Sound], + nodes: list[Union[str, WikiNode]], ): for node in nodes: if is_template_node_with_name(node, "Audio"): @@ -157,7 +165,7 @@ def is_template_node_with_name(node: Union[WikiNode, str], template_name: str): def add_sound_data_without_appending_to_existing_properties( wxr: WiktextractContext, sound_data: list[Sound], - new_sound_data: list[dict], + new_sound_data: dict, ): """Creates a new IPA data entry if properties exist in previous entry.""" if any( @@ -171,7 +179,7 @@ def add_sound_data_without_appending_to_existing_properties( for key, value in new_sound_data.items(): if key in sound_data[-1].model_fields: if isinstance(value, str): - getattr(sound_data[-1], key).append(value) + setattr(sound_data[-1], key, value) else: getattr(sound_data[-1], key).extend(value) else: diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py index 773d9b30..995e1adb 100644 --- a/tests/test_de_pronunciation.py +++ b/tests/test_de_pronunciation.py @@ -27,7 +27,7 @@ def test_de_process_ipa(self): "input": "{{Lautschrift|ipa1}}", "expected": [ { - "ipa": ["ipa1"], + "ipa": "ipa1", } ], }, @@ -35,28 +35,29 @@ def test_de_process_ipa(self): "input": "{{Lautschrift|ipa1|spr=de}}", "expected": [ { - "ipa": ["ipa1"], - "lang": ["Deutsch"], - "lang_code": ["de"], + "ipa": "ipa1", + "lang": "Deutsch", + "lang_code": "de", } ], }, { "input": "{{Lautschrift|ipa1}} {{Lautschrift|ipa2}}{{Lautschrift|ipa3|spr=de}}", "expected": [ - {"ipa": ["ipa1", "ipa2"]}, + {"ipa": "ipa1"}, + {"ipa": "ipa2"}, { - "ipa": ["ipa3"], - "lang": ["Deutsch"], - "lang_code": ["de"], + "ipa": "ipa3", + "lang": "Deutsch", + "lang_code": "de", }, ], }, { "input": "{{Lautschrift|ipa1}}, ''tag1'' {{Lautschrift|ipa2}}", "expected": [ - {"ipa": ["ipa1"]}, - {"ipa": ["ipa2"], "tags": ["tag1"]}, + {"ipa": "ipa1"}, + {"ipa": "ipa2", "tags": ["tag1"]}, ], }, ] @@ -90,7 +91,7 @@ def test_de_process_hoerbeispiele(self): "input": "{{Audio|" + filename1 + "}}", "expected": [ { - "audio": [filename1], + "audio": filename1, "mp3_url": None, # None indicates we don't care about the exact value "ogg_url": None, } @@ -104,12 +105,12 @@ def test_de_process_hoerbeispiele(self): + "}}", "expected": [ { - "audio": [filename1], + "audio": filename1, "mp3_url": None, "ogg_url": None, }, { - "audio": [filename2], + "audio": filename2, "ogg_url": None, "mp3_url": None, "wav_url": None, @@ -124,13 +125,13 @@ def test_de_process_hoerbeispiele(self): + "}}", "expected": [ { - "audio": [filename1], + "audio": filename1, "mp3_url": None, "ogg_url": None, "tags": ["tag1"], }, { - "audio": [filename2], + "audio": filename2, "mp3_url": None, "ogg_url": None, "wav_url": None, From 197fc8bbe2e75f7e0e08cfda9ae3fa99445db409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Tue, 30 Jan 2024 14:08:44 +0200 Subject: [PATCH 2/4] Es-edition: "lang"-field required --- src/wiktextract/extractor/es/models.py | 1 + src/wiktextract/extractor/es/translation.py | 9 +++++- tests/test_es_translation.py | 34 ++++++++++++++++++--- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py index afcb2795..c5e12c7b 100644 --- a/src/wiktextract/extractor/es/models.py +++ b/src/wiktextract/extractor/es/models.py @@ -23,6 +23,7 @@ class Translation(BaseModelWrap): lang_code: str = Field( description="Wiktionary language code of the translation term" ) + lang: str = Field(description="Name of the language of translation") senseids: list[str] = Field( default=[], description="List of senseids where this translation applies", diff --git a/src/wiktextract/extractor/es/translation.py b/src/wiktextract/extractor/es/translation.py index 9da20996..e9532d9a 100644 --- a/src/wiktextract/extractor/es/translation.py +++ b/src/wiktextract/extractor/es/translation.py @@ -1,5 +1,6 @@ from typing import Optional +from mediawiki_langcodes import code_to_name from wikitextprocessor import WikiNode from wiktextract.extractor.es.models import Translation, WordEntry from wiktextract.extractor.share import split_senseids @@ -15,6 +16,9 @@ def extract_translation( # Documentation: https://es.wiktionary.org/wiki/Plantilla:t+ lang_code = template_node.template_parameters.get(1) # Language code + lang = code_to_name(lang_code, "es") + if not lang: + lang = f"Unknown({lang_code})" # Initialize variables current_translation: Optional[Translation] = None @@ -82,7 +86,10 @@ def extract_translation( else: current_translation = Translation( - word=value, lang_code=lang_code, senseids=list(senseids) + word=value, + lang_code=lang_code, + lang=lang, + senseids=list(senseids), ) elif isinstance(key, str): if key == "tr": diff --git a/tests/test_es_translation.py b/tests/test_es_translation.py index 49c49982..9cee527b 100644 --- a/tests/test_es_translation.py +++ b/tests/test_es_translation.py @@ -32,13 +32,19 @@ def test_es_extract_translation(self): { "input": "{{t+|af|1|kat}}", "expected": [ - {"lang_code": "af", "word": "kat", "senseids": ["1"]} + { + "lang": "afrikáans", + "lang_code": "af", + "word": "kat", + "senseids": ["1"], + } ], }, { "input": "{{t+|de|1, 2|Katze|f|,|1|Kater|m|nota|gato macho|,|8|Tic Tac Toe}}", "expected": [ { + "lang": "alemán", "lang_code": "de", "word": "Katze", "senseids": ["1", "2"], @@ -46,6 +52,7 @@ def test_es_extract_translation(self): }, { "lang_code": "de", + "lang": "alemán", "word": "Kater", "senseids": ["1"], "tags": ["m"], @@ -53,6 +60,7 @@ def test_es_extract_translation(self): }, { "lang_code": "de", + "lang": "alemán", "word": "Tic Tac Toe", "senseids": ["8"], }, @@ -62,6 +70,7 @@ def test_es_extract_translation(self): "input": "{{t+|fr|1|profession|nl|de|bateleur}}", "expected": [ { + "lang": "francés", "lang_code": "fr", "word": "profession de bateleur", "senseids": ["1"], @@ -72,6 +81,7 @@ def test_es_extract_translation(self): "input": "{{t+|hy|1|կատու|tr|katu}}", "expected": [ { + "lang": "armenio", "lang_code": "hy", "word": "կատու", "roman": "katu", @@ -83,6 +93,7 @@ def test_es_extract_translation(self): "input": "{{t+|hy|1|կատու|tr=katu}}", "expected": [ { + "lang": "armenio", "lang_code": "hy", "word": "կատու", "roman": "katu", @@ -93,9 +104,24 @@ def test_es_extract_translation(self): { "input": "{{t+|de|amphibisch|adj|,|Amphibie|sust|,|Amphibium|sust}}", "expected": [ - {"lang_code": "de", "word": "amphibisch", "tags": ["adj"]}, - {"lang_code": "de", "word": "Amphibie", "tags": ["sust"]}, - {"lang_code": "de", "word": "Amphibium", "tags": ["sust"]}, + { + "lang": "alemán", + "lang_code": "de", + "word": "amphibisch", + "tags": ["adj"], + }, + { + "lang": "alemán", + "lang_code": "de", + "word": "Amphibie", + "tags": ["sust"], + }, + { + "lang": "alemán", + "lang_code": "de", + "word": "Amphibium", + "tags": ["sust"], + }, ], }, ] From 55e843dab7c9c018a2282955e1e8cfa9cd8d1d06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Wed, 31 Jan 2024 09:21:42 +0200 Subject: [PATCH 3/4] Es-edition: skip certain etymology templates There's a lot of kludge here, so the next commit is probably just... going to nuke this completely. Committing this to keep the history, just in case there's something useful here. --- src/wiktextract/extractor/es/etymology.py | 45 ++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/es/etymology.py b/src/wiktextract/extractor/es/etymology.py index 5460dc32..6adfb995 100644 --- a/src/wiktextract/extractor/es/etymology.py +++ b/src/wiktextract/extractor/es/etymology.py @@ -1,9 +1,19 @@ from wikitextprocessor import NodeKind, WikiNode from wiktextract.extractor.es.models import EtymologyTemplate, WordEntry -from wiktextract.page import clean_node +from wiktextract.page import clean_node, LEVEL_KINDS from wiktextract.wxr_context import WiktextractContext +IGNORED_NODE_LEVELS = [NodeKind.HTML] + list(LEVEL_KINDS) + +SKIPPED_TEMPLATES = ( + "ampliable", + "arcoiris", + "clear", + "cita requerida", +) + + def process_etymology_block( wxr: WiktextractContext, entry: WordEntry, @@ -23,7 +33,27 @@ def process_etymology_block( """ has_etymology_info = False + ignore_these_templates: list[WikiNode] = [] + for ignored_node in level_node.find_child_recursively(IGNORED_NODE_LEVELS): + # stuff inside should be ignored, and obvious mistakes + # like a sub-section to etymology (wrong level kind, in es.wiktionary + # these should all apparently be sibligns, so Etymology sections + # shouldn't have other levels as their children (example: + # calzado around 2024-01-30 + if ignored_node.kind == NodeKind.HTML and ignored_node.sarg == "reg": + # HTML nodes other than REG should be fine + continue + ignore_these_templates.extend( + ignored_node.find_child_recursively(NodeKind.TEMPLATE) + ) + for template_node in level_node.find_child_recursively(NodeKind.TEMPLATE): + if template_node in ignore_these_templates: + continue + + if template_node.template_name in SKIPPED_TEMPLATES: + continue + entry.etymology_templates = entry.etymology_templates or [] etymology_template = EtymologyTemplate( @@ -31,6 +61,15 @@ def process_etymology_block( expansion=clean_node(wxr, None, template_node), ) + if etymology_template.expansion in ( + # "Please fill in this etymology, thank you..." + "Si puedes, incorpórala: ver cómo", + "Préstamo no adaptado.", + "Este lema en este idioma es ampliable. " + "Retira este aviso si la mayor parte de las acepciones ya están incluidas.", + ): + continue + args = {} for index, param in template_node.template_parameters.items(): args[str(index)] = ( @@ -42,6 +81,10 @@ def process_etymology_block( if args: etymology_template.args = args + # DEBUG + if not args: + print(f"EMPTY ARGS in {entry.word}, {etymology_template}") + entry.etymology_templates.append(etymology_template) if has_etymology_info: From 372b1172f392b82bdc9302dc77bd45b695769fbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Wed, 31 Jan 2024 09:46:08 +0200 Subject: [PATCH 4/4] Es-edition: Remove kludges, use substring "whitelist" Really, all the previous kludges in the previous commit are unnecessary if I'd just realized we only want to keep etymology template data for *etymology templates*, not just any template. So a simple substring check should suffice. It is unlikely that there are any conflicting templates with "etim" in their name, but possible. --- src/wiktextract/extractor/es/etymology.py | 47 +++++++---------------- tests/test_es_etymology.py | 12 +----- 2 files changed, 14 insertions(+), 45 deletions(-) diff --git a/src/wiktextract/extractor/es/etymology.py b/src/wiktextract/extractor/es/etymology.py index 6adfb995..9d9d020a 100644 --- a/src/wiktextract/extractor/es/etymology.py +++ b/src/wiktextract/extractor/es/etymology.py @@ -1,19 +1,11 @@ +from typing import cast from wikitextprocessor import NodeKind, WikiNode +from wikitextprocessor.parser import TemplateNode from wiktextract.extractor.es.models import EtymologyTemplate, WordEntry -from wiktextract.page import clean_node, LEVEL_KINDS +from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext -IGNORED_NODE_LEVELS = [NodeKind.HTML] + list(LEVEL_KINDS) - -SKIPPED_TEMPLATES = ( - "ampliable", - "arcoiris", - "clear", - "cita requerida", -) - - def process_etymology_block( wxr: WiktextractContext, entry: WordEntry, @@ -33,25 +25,14 @@ def process_etymology_block( """ has_etymology_info = False - ignore_these_templates: list[WikiNode] = [] - for ignored_node in level_node.find_child_recursively(IGNORED_NODE_LEVELS): - # stuff inside should be ignored, and obvious mistakes - # like a sub-section to etymology (wrong level kind, in es.wiktionary - # these should all apparently be sibligns, so Etymology sections - # shouldn't have other levels as their children (example: - # calzado around 2024-01-30 - if ignored_node.kind == NodeKind.HTML and ignored_node.sarg == "reg": - # HTML nodes other than REG should be fine - continue - ignore_these_templates.extend( - ignored_node.find_child_recursively(NodeKind.TEMPLATE) - ) for template_node in level_node.find_child_recursively(NodeKind.TEMPLATE): - if template_node in ignore_these_templates: - continue - - if template_node.template_name in SKIPPED_TEMPLATES: + # no-op type-annotation cast; we softly assert template_node is a + # TemplateNode, which has .template_name, to quiet the type-checker. + template_node = cast(TemplateNode, template_node) + if "etim" not in template_node.template_name: + # We don't want to keep any other template data other than + # the main etymology templates (and maybe Plantilla:etim) continue entry.etymology_templates = entry.etymology_templates or [] @@ -63,7 +44,7 @@ def process_etymology_block( if etymology_template.expansion in ( # "Please fill in this etymology, thank you..." - "Si puedes, incorpórala: ver cómo", + "Si puedes, incorpórala: ver cómo.", "Préstamo no adaptado.", "Este lema en este idioma es ampliable. " "Retira este aviso si la mayor parte de las acepciones ya están incluidas.", @@ -77,14 +58,12 @@ def process_etymology_block( if isinstance(param, str) else clean_node(wxr, None, param) ) + # if any other index other than "leng" is encountered, + # has_etymology => True has_etymology_info = has_etymology_info or index != "leng" - if args: + if args and not (len(args) == 1 and "leng" in args): etymology_template.args = args - # DEBUG - if not args: - print(f"EMPTY ARGS in {entry.word}, {etymology_template}") - entry.etymology_templates.append(etymology_template) if has_etymology_info: diff --git a/tests/test_es_etymology.py b/tests/test_es_etymology.py index 7527f143..f80a7f37 100644 --- a/tests/test_es_etymology.py +++ b/tests/test_es_etymology.py @@ -34,17 +34,7 @@ def test_es_extract_etymology(self): { # https://es.wiktionary.org/wiki/Schreck "input": "{{etimología|leng=de}}", - "expected": { - "etymology_templates": [ - { - "args": { - "leng": "de", - }, - "name": "etimología", - "expansion": "Si puedes, incorpórala: ver cómo.", - }, - ], - }, + "expected": dict(), }, { # https://es.wiktionary.org/wiki/bagre