From 201c4d567c8768808c84e583de4684d3a3674d3c Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Wed, 29 Nov 2023 14:44:01 +0800
Subject: [PATCH 1/2] Don't use `append_base_data()` in French extractor

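`append_base_data()` was written for the Chinese Wiktionary extractor. In the
French extractor, append a deep copy of `base_data` to `page_data` directly
and set the `pos` field on the new entry instead.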
---
 src/wiktextract/extractor/fr/page.py | 28 +++++++++++-----------------
 tests/test_fr_gloss.py               |  4 ++--
 2 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py
index 0f8b1d04..6a83f00b 100644
--- a/src/wiktextract/extractor/fr/page.py
+++ b/src/wiktextract/extractor/fr/page.py
@@ -5,7 +5,6 @@
 
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import TemplateNode
-from wiktextract.datautils import append_base_data
 from wiktextract.page import LEVEL_KINDS, clean_node
 from wiktextract.wxr_context import WiktextractContext
 
@@ -45,7 +44,11 @@ def parse_section(
             # https://fr.wiktionary.org/wiki/Modèle:S
             # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections
             section_type = level_node_template.template_parameters.get(1)
-            subtitle = clean_node(wxr, page_data[-1], level_node.largs)
+            subtitle = clean_node(
+                wxr,
+                page_data[-1] if len(page_data) > 0 else base_data,
+                level_node.largs,
+            )
             wxr.wtp.start_subsection(subtitle)
             if section_type in wxr.config.OTHER_SUBTITLES["ignored_sections"]:
                 pass
@@ -104,7 +107,8 @@ def process_pos_block(
     pos_title: str,
 ):
     pos_type = wxr.config.POS_SUBTITLES[pos_argument]["pos"]
-    append_base_data(page_data, "pos", pos_type, base_data)
+    page_data.append(copy.deepcopy(base_data))
+    page_data[-1]["pos"] = pos_type
     page_data[-1]["pos_title"] = pos_title
     child_nodes = list(pos_title_node.filter_empty_str_child())
     form_line_start = 0  # Ligne de forme
@@ -163,27 +167,17 @@ def parse_page(
             # https://fr.wiktionary.org/wiki/Modèle:langue
             # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
             if subtitle_template.template_name == "langue":
-                categories_and_links = defaultdict(list)
+                base_data = defaultdict(list, {"word": wxr.wtp.title})
                 lang_code = subtitle_template.template_parameters.get(1)
                 if (
                     wxr.config.capture_language_codes is not None
                     and lang_code not in wxr.config.capture_language_codes
                 ):
                     continue
-                lang_name = clean_node(
-                    wxr, categories_and_links, subtitle_template
-                )
+                lang_name = clean_node(wxr, base_data, subtitle_template)
                 wxr.wtp.start_section(lang_name)
-                base_data = defaultdict(
-                    list,
-                    {
-                        "lang_name": lang_name,
-                        "lang_code": lang_code,
-                        "word": wxr.wtp.title,
-                    },
-                )
-                base_data.update(categories_and_links)
-                page_data.append(copy.deepcopy(base_data))
+                base_data["lang_name"] = lang_name
+                base_data["lang_code"] = lang_code
                 etymology_data: Optional[EtymologyData] = None
                 for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                     new_etymology_data = parse_section(
diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
index b559a4d6..dbcfa7c9 100644
--- a/tests/test_fr_gloss.py
+++ b/tests/test_fr_gloss.py
@@ -9,7 +9,7 @@
 from wiktextract.wxr_context import WiktextractContext
 
 
-class TestFormLine(unittest.TestCase):
+class TestFrGloss(unittest.TestCase):
     def setUp(self) -> None:
         self.wxr = WiktextractContext(
             Wtp(lang_code="fr"), WiktionaryConfig(dump_file_lang_code="fr")
@@ -115,7 +115,7 @@ def test_zh_exemple_template(self):
         root = self.wxr.wtp.parse(
             "=== {{S|nom|zh}} ===\n# Cheval.\n{{zh-exemple|这匹'''马'''很大。|Ce cheval est grand.|Zhè pǐ '''mǎ''' hěn dà.<br/>⠌⠢⠆ ⠏⠊⠄ ⠍⠔⠄ ⠓⠴⠄ ⠙⠔⠆⠐⠆}}"
         )
-        page_data = [defaultdict(list)]
+        page_data = []
         process_pos_block(
             self.wxr,
             page_data,

From b8ff77135595991ecc38933aacf93e3c0cabbe49 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Wed, 29 Nov 2023 15:03:07 +0800
Subject: [PATCH 2/2] Fix index out of range error in fr/pronunciation.py

---
 src/wiktextract/extractor/fr/page.py          |  2 +-
 src/wiktextract/extractor/fr/pronunciation.py | 16 +++++++++++-----
 tests/test_fr_pronunciation.py                | 17 ++++++++++++-----
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py
index 6a83f00b..aa3ebc5d 100644
--- a/src/wiktextract/extractor/fr/page.py
+++ b/src/wiktextract/extractor/fr/page.py
@@ -72,7 +72,7 @@ def parse_section(
                 wxr.config.capture_pronunciation
                 and section_type in wxr.config.OTHER_SUBTITLES["pronunciation"]
             ):
-                extract_pronunciation(wxr, page_data, level_node)
+                extract_pronunciation(wxr, page_data, level_node, base_data)
             elif (
                 wxr.config.capture_linkages
                 and section_type in wxr.config.LINKAGE_SUBTITLES
diff --git a/src/wiktextract/extractor/fr/pronunciation.py b/src/wiktextract/extractor/fr/pronunciation.py
index 67894ea4..319ef5ae 100644
--- a/src/wiktextract/extractor/fr/pronunciation.py
+++ b/src/wiktextract/extractor/fr/pronunciation.py
@@ -10,19 +10,25 @@
 
 
 def extract_pronunciation(
-    wxr: WiktextractContext, page_data: List[Dict], level_node: WikiNode
+    wxr: WiktextractContext,
+    page_data: List[Dict],
+    level_node: WikiNode,
+    base_data: Dict[str, str],
 ) -> None:
     sound_data = []
+    lang_code = base_data.get("lang_code")
     for list_node in level_node.find_child(NodeKind.LIST):
         for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
             sound_data.extend(
                 process_pron_list_item(
-                    wxr, list_item_node, page_data, defaultdict(list)
+                    wxr, list_item_node, defaultdict(list), lang_code
                 )
             )
 
     if len(sound_data) == 0:
         return
+    if len(page_data) == 0:
+        page_data.append(deepcopy(base_data))
 
     if level_node.kind == NodeKind.LEVEL3:
         # Add extracted sound data to all sense dictionaries that have the same
@@ -53,10 +59,10 @@ def extract_pronunciation(
 def process_pron_list_item(
     wxr: WiktextractContext,
     list_item_node: WikiNode,
-    page_data: List[Dict],
     sound_data: Dict[str, Union[str, List[str]]],
+    lang_code: str,
 ) -> List[Dict[str, Union[str, List[str]]]]:
-    pron_key = "zh-pron" if page_data[-1].get("lang_code") == "zh" else "ipa"
+    pron_key = "zh-pron" if lang_code == "zh" else "ipa"
 
     for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
         if template_node.template_name in PRON_TEMPLATES:
@@ -81,7 +87,7 @@ def process_pron_list_item(
         ):
             new_sound_data = deepcopy(sound_data)
             process_pron_list_item(
-                wxr, nest_list_item, page_data, new_sound_data
+                wxr, nest_list_item, new_sound_data, lang_code
             )
             if pron_key in new_sound_data:
                 returned_data.append(new_sound_data)
diff --git a/tests/test_fr_pronunciation.py b/tests/test_fr_pronunciation.py
index 2311a068..3473a2b7 100644
--- a/tests/test_fr_pronunciation.py
+++ b/tests/test_fr_pronunciation.py
@@ -27,7 +27,7 @@ def test_pron_list(self):
         root = self.wxr.wtp.parse(
             "=== Prononciation ===\n* {{pron|bɔ̃.ʒuʁ|fr}}\n** {{écouter|France (Paris)|bõ.ʒuːʁ|audio=Fr-bonjour.ogg|lang=fr}}"
         )
-        extract_pronunciation(self.wxr, page_data, root.children[0])
+        extract_pronunciation(self.wxr, page_data, root.children[0], {})
         self.assertEqual(
             page_data,
             [
@@ -60,13 +60,18 @@ def test_pron_list(self):
         )
 
     def test_str_pron(self):
-        page_data = [defaultdict(list, {"lang_code": "zh"})]
+        page_data = []
         self.wxr.wtp.add_page("Modèle:Yale-zh", 10, body="Yale")
         self.wxr.wtp.start_page("")
         root = self.wxr.wtp.parse(
             "=== {{S|prononciation}} ===\n* '''cantonais''' {{pron||yue}}\n** {{Yale-zh}} : nei⁵hou²"
         )
-        extract_pronunciation(self.wxr, page_data, root.children[0])
+        extract_pronunciation(
+            self.wxr,
+            page_data,
+            root.children[0],
+            defaultdict(list, {"lang_code": "zh"}),
+        )
         self.assertEqual(
             page_data[0].get("sounds"),
             [{"tags": ["cantonais", "Yale"], "zh-pron": "nei⁵hou²"}],
@@ -78,14 +83,16 @@ def test_no_ipa(self):
         files.
         Test wikitext from https://fr.wiktionary.org/wiki/mars
         """
-        page_data = [defaultdict(list)]
+        page_data = []
         self.wxr.wtp.start_page("")
         root = self.wxr.wtp.parse(
             """=== {{S|prononciation}} ===
 {{ébauche-pron|sv}}
 * {{écouter|lang=sv|Suède||audio=LL-Q9027 (swe)-Moonhouse-mars.wav}}"""
         )
-        extract_pronunciation(self.wxr, page_data, root.children[0])
+        extract_pronunciation(
+            self.wxr, page_data, root.children[0], defaultdict(list)
+        )
         self.assertEqual(
             page_data[0].get("sounds"),
             [