From 201c4d567c8768808c84e583de4684d3a3674d3c Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 29 Nov 2023 14:44:01 +0800 Subject: [PATCH 1/2] Don't use `append_base_data()` in French extractor This function is for the Chinese Wiktionary extractor. --- src/wiktextract/extractor/fr/page.py | 28 +++++++++++----------------- tests/test_fr_gloss.py | 4 ++-- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py index 0f8b1d04..6a83f00b 100644 --- a/src/wiktextract/extractor/fr/page.py +++ b/src/wiktextract/extractor/fr/page.py @@ -5,7 +5,6 @@ from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode -from wiktextract.datautils import append_base_data from wiktextract.page import LEVEL_KINDS, clean_node from wiktextract.wxr_context import WiktextractContext @@ -45,7 +44,11 @@ def parse_section( # https://fr.wiktionary.org/wiki/Modèle:S # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections section_type = level_node_template.template_parameters.get(1) - subtitle = clean_node(wxr, page_data[-1], level_node.largs) + subtitle = clean_node( + wxr, + page_data[-1] if len(page_data) > 0 else base_data, + level_node.largs, + ) wxr.wtp.start_subsection(subtitle) if section_type in wxr.config.OTHER_SUBTITLES["ignored_sections"]: pass @@ -104,7 +107,8 @@ def process_pos_block( pos_title: str, ): pos_type = wxr.config.POS_SUBTITLES[pos_argument]["pos"] - append_base_data(page_data, "pos", pos_type, base_data) + page_data.append(copy.deepcopy(base_data)) + page_data[-1]["pos"] = pos_type page_data[-1]["pos_title"] = pos_title child_nodes = list(pos_title_node.filter_empty_str_child()) form_line_start = 0 # Ligne de forme @@ -163,27 +167,17 @@ def parse_page( # https://fr.wiktionary.org/wiki/Modèle:langue # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues if subtitle_template.template_name == "langue": - categories_and_links = defaultdict(list) + base_data = defaultdict(list, {"word": wxr.wtp.title}) lang_code = subtitle_template.template_parameters.get(1) if ( wxr.config.capture_language_codes is not None and lang_code not in wxr.config.capture_language_codes ): continue - lang_name = clean_node( - wxr, categories_and_links, subtitle_template - ) + lang_name = clean_node(wxr, base_data, subtitle_template) wxr.wtp.start_section(lang_name) - base_data = defaultdict( - list, - { - "lang_name": lang_name, - "lang_code": lang_code, - "word": wxr.wtp.title, - }, - ) - base_data.update(categories_and_links) - page_data.append(copy.deepcopy(base_data)) + base_data["lang_name"] = lang_name + base_data["lang_code"] = lang_code etymology_data: Optional[EtymologyData] = None for level3_node in level2_node.find_child(NodeKind.LEVEL3): new_etymology_data = parse_section( diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py index b559a4d6..dbcfa7c9 100644 --- a/tests/test_fr_gloss.py +++ b/tests/test_fr_gloss.py @@ -9,7 +9,7 @@ from wiktextract.wxr_context import WiktextractContext -class TestFormLine(unittest.TestCase): +class TestFrGloss(unittest.TestCase): def setUp(self) -> None: self.wxr = WiktextractContext( Wtp(lang_code="fr"), WiktionaryConfig(dump_file_lang_code="fr") @@ -115,7 +115,7 @@ def test_zh_exemple_template(self): root = self.wxr.wtp.parse( "=== {{S|nom|zh}} ===\n# Cheval.\n{{zh-exemple|这匹'''马'''很大。|Ce cheval est grand.|Zhè pǐ '''mǎ''' hěn dà.
⠌⠢⠆ ⠏⠊⠄ ⠍⠔⠄ ⠓⠴⠄ ⠙⠔⠆⠐⠆}}" ) - page_data = [defaultdict(list)] + page_data = [] process_pos_block( self.wxr, page_data, From b8ff77135595991ecc38933aacf93e3c0cabbe49 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 29 Nov 2023 15:03:07 +0800 Subject: [PATCH 2/2] Fix index out of range error in fr/pronunciation.py --- src/wiktextract/extractor/fr/page.py | 2 +- src/wiktextract/extractor/fr/pronunciation.py | 16 +++++++++++----- tests/test_fr_pronunciation.py | 17 ++++++++++++----- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py index 6a83f00b..aa3ebc5d 100644 --- a/src/wiktextract/extractor/fr/page.py +++ b/src/wiktextract/extractor/fr/page.py @@ -72,7 +72,7 @@ def parse_section( wxr.config.capture_pronunciation and section_type in wxr.config.OTHER_SUBTITLES["pronunciation"] ): - extract_pronunciation(wxr, page_data, level_node) + extract_pronunciation(wxr, page_data, level_node, base_data) elif ( wxr.config.capture_linkages and section_type in wxr.config.LINKAGE_SUBTITLES diff --git a/src/wiktextract/extractor/fr/pronunciation.py b/src/wiktextract/extractor/fr/pronunciation.py index 67894ea4..319ef5ae 100644 --- a/src/wiktextract/extractor/fr/pronunciation.py +++ b/src/wiktextract/extractor/fr/pronunciation.py @@ -10,19 +10,25 @@ def extract_pronunciation( - wxr: WiktextractContext, page_data: List[Dict], level_node: WikiNode + wxr: WiktextractContext, + page_data: List[Dict], + level_node: WikiNode, + base_data: Dict[str, str], ) -> None: sound_data = [] + lang_code = base_data.get("lang_code") for list_node in level_node.find_child(NodeKind.LIST): for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): sound_data.extend( process_pron_list_item( - wxr, list_item_node, page_data, defaultdict(list) + wxr, list_item_node, defaultdict(list), lang_code ) ) if len(sound_data) == 0: return + if len(page_data) == 0: + page_data.append(deepcopy(base_data)) if level_node.kind == NodeKind.LEVEL3: # Add extracted sound data to all sense dictionaries that have the same @@ -53,10 +59,10 @@ def extract_pronunciation( def process_pron_list_item( wxr: WiktextractContext, list_item_node: WikiNode, - page_data: List[Dict], sound_data: Dict[str, Union[str, List[str]]], + lang_code: str, ) -> List[Dict[str, Union[str, List[str]]]]: - pron_key = "zh-pron" if page_data[-1].get("lang_code") == "zh" else "ipa" + pron_key = "zh-pron" if lang_code == "zh" else "ipa" for template_node in list_item_node.find_child(NodeKind.TEMPLATE): if template_node.template_name in PRON_TEMPLATES: @@ -81,7 +87,7 @@ def process_pron_list_item( ): new_sound_data = deepcopy(sound_data) process_pron_list_item( - wxr, nest_list_item, page_data, new_sound_data + wxr, nest_list_item, new_sound_data, lang_code ) if pron_key in new_sound_data: returned_data.append(new_sound_data) diff --git a/tests/test_fr_pronunciation.py b/tests/test_fr_pronunciation.py index 2311a068..3473a2b7 100644 --- a/tests/test_fr_pronunciation.py +++ b/tests/test_fr_pronunciation.py @@ -27,7 +27,7 @@ def test_pron_list(self): root = self.wxr.wtp.parse( "=== Prononciation ===\n* {{pron|bɔ̃.ʒuʁ|fr}}\n** {{écouter|France (Paris)|bõ.ʒuːʁ|audio=Fr-bonjour.ogg|lang=fr}}" ) - extract_pronunciation(self.wxr, page_data, root.children[0]) + extract_pronunciation(self.wxr, page_data, root.children[0], {}) self.assertEqual( page_data, [ @@ -60,13 +60,18 @@ def test_pron_list(self): ) def test_str_pron(self): - page_data = [defaultdict(list, {"lang_code": "zh"})] + page_data = [] self.wxr.wtp.add_page("Modèle:Yale-zh", 10, body="Yale") self.wxr.wtp.start_page("") root = self.wxr.wtp.parse( "=== {{S|prononciation}} ===\n* '''cantonais''' {{pron||yue}}\n** {{Yale-zh}} : nei⁵hou²" ) - extract_pronunciation(self.wxr, page_data, root.children[0]) + extract_pronunciation( + self.wxr, + page_data, + root.children[0], + defaultdict(list, {"lang_code": "zh"}), + ) self.assertEqual( page_data[0].get("sounds"), [{"tags": ["cantonais", "Yale"], "zh-pron": "nei⁵hou²"}], @@ -78,14 +83,16 @@ def test_no_ipa(self): files. Test wikitext from https://fr.wiktionary.org/wiki/mars """ - page_data = [defaultdict(list)] + page_data = [] self.wxr.wtp.start_page("") root = self.wxr.wtp.parse( """=== {{S|prononciation}} === {{ébauche-pron|sv}} * {{écouter|lang=sv|Suède||audio=LL-Q9027 (swe)-Moonhouse-mars.wav}}""" ) - extract_pronunciation(self.wxr, page_data, root.children[0]) + extract_pronunciation( + self.wxr, page_data, root.children[0], defaultdict(list) + ) self.assertEqual( page_data[0].get("sounds"), [