Extract French Wiktionary etymology lists

Unlike English Wiktionary, French Wiktionary writes the etymology data of all POS types inside the same section, and the data for each POS uses a list ("*") or an indent (":").
tatuylonen · Sep 25, 2023 · 152cc43 · 152cc43
1 parent 2c4523e
commit 152cc43
Show file tree

Hide file tree

Showing 7 changed files with 346 additions and 119 deletions.
diff --git a/tests/test_fr_etymology.py b/tests/test_fr_etymology.py
@@ -4,7 +4,10 @@
 from wikitextprocessor import Wtp
 
 from wiktextract.config import WiktionaryConfig
-from wiktextract.extractor.fr.page import extract_etymology
+from wiktextract.extractor.fr.etymology import (
+    extract_etymology,
+    insert_etymology_data,
+)
 from wiktextract.thesaurus import close_thesaurus_db
 from wiktextract.wxr_context import WiktextractContext
 
@@ -23,9 +26,150 @@ def tearDown(self) -> None:
 
     def test_ebauche_etym(self):
         # https://fr.wiktionary.org/wiki/Hörsaal
+        # missing etymology template "ébauche-étym" should be ignored
         self.wxr.wtp.start_page("")
         root = self.wxr.wtp.parse(": {{ébauche-étym|de}}")
-        base_data = defaultdict(list, {"lang_code": "de"})
-        page_data = [base_data]
-        extract_etymology(self.wxr, page_data, base_data, root.children)
-        self.assertEqual(page_data, [{"lang_code": "de"}])
+        etymology_data = extract_etymology(self.wxr, root.children)
+        self.assertIsNone(etymology_data)
+
+    def test_list_etymologies(self):
+        # https://fr.wiktionary.org/wiki/lenn
+        self.wxr.wtp.start_page("lenn")
+        root = self.wxr.wtp.parse(
+            """* [[#br-nom-1|Nom commun 1 :]]
+: Du vieux breton lin (« lac, étang ; liquide, humeur »).
+: Du moyen breton lenn.
+* [[#br-nom-2|Nom commun 2 :]]
+:Du vieux breton lenn (« pièce de toile, voile, manteau, rideau »)."""
+        )
+        etymology_data = extract_etymology(self.wxr, root.children)
+        self.assertEqual(
+            etymology_data,
+            {
+                "Nom commun 1": [
+                    "Du vieux breton lin (« lac, étang ; liquide, humeur »).",
+                    "Du moyen breton lenn.",
+                ],
+                "Nom commun 2": [
+                    "Du vieux breton lenn (« pièce de toile, voile, manteau, rideau »)."
+                ],
+            },
+        )
+        page_data = [
+            defaultdict(
+                list,
+                {"lang_code": "fr", "pos": "noun", "pos_title": "Nom commun 1"},
+            ),
+            defaultdict(
+                list,
+                {"lang_code": "fr", "pos": "noun", "pos_title": "Nom commun 2"},
+            ),
+        ]
+        insert_etymology_data("fr", page_data, etymology_data)
+        self.assertEqual(
+            page_data,
+            [
+                {
+                    "lang_code": "fr",
+                    "pos": "noun",
+                    "pos_title": "Nom commun 1",
+                    "etymology_texts": [
+                        "Du vieux breton lin (« lac, étang ; liquide, humeur »).",
+                        "Du moyen breton lenn.",
+                    ],
+                },
+                {
+                    "lang_code": "fr",
+                    "pos": "noun",
+                    "pos_title": "Nom commun 2",
+                    "etymology_texts": [
+                        "Du vieux breton lenn (« pièce de toile, voile, manteau, rideau »)."
+                    ],
+                },
+            ],
+        )
+
+    def test_indent_etymology_with_pos_template(self):
+        # https://fr.wiktionary.org/wiki/dame
+        self.wxr.wtp.start_page("damn")
+        self.wxr.wtp.add_page("Modèle:lien-ancre-étym", 10, "({{{2}}} {{{3}}})")
+        root = self.wxr.wtp.parse(
+            """: {{lien-ancre-étym|fr|Nom commun|1}} Du latin domina (« maîtresse de maison »).
+: {{lien-ancre-étym|fr|Nom commun|2}} Du moyen néerlandais dam (« digue »).
+: {{lien-ancre-étym|fr|Interjection|1}} Abréviation de « [[Notre-Dame]] ! » ou de « dame Dieu ! » (« [[Seigneur Dieu]] ! »).
+"""
+        )
+        etymology_data = extract_etymology(self.wxr, root.children)
+        self.assertEqual(
+            etymology_data,
+            {
+                "Nom commun 1": ["Du latin domina (« maîtresse de maison »)."],
+                "Nom commun 2": ["Du moyen néerlandais dam (« digue »)."],
+                "Interjection 1": [
+                    "Abréviation de « Notre-Dame ! » ou de « dame Dieu ! » (« Seigneur Dieu ! »)."
+                ],
+            },
+        )
+        page_data = [
+            defaultdict(
+                list,
+                {"lang_code": "fr", "pos": "noun", "pos_title": "Nom commun 1"},
+            ),
+            defaultdict(
+                list,
+                {"lang_code": "fr", "pos": "noun", "pos_title": "Nom commun 2"},
+            ),
+            defaultdict(
+                list,
+                {"lang_code": "fr", "pos": "intj", "pos_title": "Interjection"},
+            ),
+        ]
+        insert_etymology_data("fr", page_data, etymology_data)
+        self.assertEqual(
+            page_data,
+            [
+                {
+                    "lang_code": "fr",
+                    "pos": "noun",
+                    "pos_title": "Nom commun 1",
+                    "etymology_texts": [
+                        "Du latin domina (« maîtresse de maison »)."
+                    ],
+                },
+                {
+                    "lang_code": "fr",
+                    "pos": "noun",
+                    "pos_title": "Nom commun 2",
+                    "etymology_texts": [
+                        "Du moyen néerlandais dam (« digue »)."
+                    ],
+                },
+                {
+                    "lang_code": "fr",
+                    "pos": "intj",
+                    "pos_title": "Interjection",
+                    "etymology_texts": [
+                        "Abréviation de « Notre-Dame ! » ou de « dame Dieu ! » (« Seigneur Dieu ! »)."
+                    ],
+                },
+            ],
+        )
+
+    def test_indent_etymology_with_italic_pos(self):
+        # https://fr.wiktionary.org/wiki/hélas
+        self.wxr.wtp.start_page("hélas")
+        root = self.wxr.wtp.parse(
+            """: (''[[#Interjection|Interjection]]'') XIIe siècle, elas ; composé de hé et de las, au sens ancien de « malheureux ».
+: (''[[#fr-nom|Nom]]'') Par [[substantivation]] de l’interjection.
+"""
+        )
+        etymology_data = extract_etymology(self.wxr, root.children)
+        self.assertEqual(
+            etymology_data,
+            {
+                "Interjection": [
+                    "XIIe siècle, elas ; composé de hé et de las, au sens ancien de « malheureux »."
+                ],
+                "Nom commun": ["Par substantivation de l’interjection."],
+            },
+        )
diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
@@ -122,13 +122,19 @@ def test_zh_exemple_template(self):
         )
         page_data = [defaultdict(list)]
         process_pos_block(
-            self.wxr, page_data, defaultdict(list), root.children[0], "nom"
+            self.wxr,
+            page_data,
+            defaultdict(list),
+            root.children[0],
+            "nom",
+            "Nom commun",
         )
         self.assertEqual(
             page_data,
             [
                 {
                     "pos": "noun",
+                    "pos_title": "Nom commun",
                     "senses": [
                         {
                             "glosses": ["Cheval."],

diff --git a/tests/test_fr_page.py b/tests/test_fr_page.py
@@ -18,12 +18,6 @@ def setUp(self):
         conf1 = WiktionaryConfig(
             dump_file_lang_code="fr",
             capture_language_codes=None,
-            capture_translations=True,
-            capture_pronunciation=True,
-            capture_linkages=True,
-            capture_compounds=True,
-            capture_redirects=True,
-            capture_examples=True,
         )
         self.wxr = WiktextractContext(Wtp(lang_code="fr"), conf1)
 
@@ -52,6 +46,7 @@ def test_fr_parse_page(self):
                     "lang": "Français",
                     "lang_code": "fr",
                     "pos": "noun",
+                    "pos_title": "Nom commun",
                     "word": "exemple",
                 }
             ],

diff --git a/wiktextract/datautils.py b/wiktextract/datautils.py
@@ -259,6 +259,7 @@ def append_base_data(
             # append new dictionary if the last dictionary has sense data and
             # also has the same key
             page_data.append(copy.deepcopy(base_data))
+            page_data[-1][field] = value
         elif isinstance(page_data[-1].get(field), list):
             page_data[-1][field] += value
     else:

diff --git a/wiktextract/extractor/fr/etymology.py b/wiktextract/extractor/fr/etymology.py
@@ -0,0 +1,118 @@
+from collections import defaultdict
+from typing import Dict, List, Optional, Tuple, Union
+
+from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import LevelNode, TemplateNode
+
+from wiktextract.page import LEVEL_KINDS, clean_node
+from wiktextract.wxr_context import WiktextractContext
+
+EtymologyData = Dict[str, List[str]]
+
+
+def extract_etymology(
+    wxr: WiktextractContext,
+    nodes: List[Union[WikiNode, str]],
+) -> Optional[EtymologyData]:
+    etymology_dict: EtymologyData = defaultdict(list)
+    level_node_index = len(nodes)
+    # find nodes after the etymology subtitle and before the next level node
+    for index, node in enumerate(nodes):
+        if isinstance(node, WikiNode) and node.kind in LEVEL_KINDS:
+            level_node_index = index
+            break
+
+    pos_title: Optional[str] = None
+    for etymology_node in nodes[:level_node_index]:
+        if (
+            isinstance(etymology_node, WikiNode)
+            and etymology_node.kind == NodeKind.LIST
+        ):
+            if etymology_node.sarg == "*":
+                pos_title = clean_node(wxr, None, etymology_node)
+                pos_title = pos_title.removeprefix("* ").removesuffix(" :")
+            elif etymology_node.sarg == ":":
+                # ignore missing etymology template "ébauche-étym"
+                for template_node in etymology_node.find_child_recursively(
+                    NodeKind.TEMPLATE
+                ):
+                    if template_node.template_name == "ébauche-étym":
+                        return
+
+                for etymology_item in etymology_node.find_child(
+                    NodeKind.LIST_ITEM
+                ):
+                    etymology_data = find_pos_in_etymology_list(
+                        wxr, etymology_item
+                    )
+                    if etymology_data is not None:
+                        new_pos_title, new_etymology_text = etymology_data
+                        etymology_dict[new_pos_title].append(new_etymology_text)
+                    else:
+                        etymology_text = clean_node(
+                            wxr, None, etymology_item.children
+                        )
+                        etymology_dict[pos_title].append(etymology_text)
+
+    return etymology_dict
+
+
+def find_pos_in_etymology_list(
+    wxr: WiktextractContext, list_item_node: WikiNode
+) -> Optional[Tuple[str, str]]:
+    """
+    Return a tuple of POS title and etymology text if the passed list item
+    node starts with an italic POS node or POS template, otherwise return None.
+    """
+    child_nodes = list(list_item_node.filter_empty_str_child())
+    for index, node in enumerate(child_nodes):
+        if (
+            index == 0
+            and isinstance(node, TemplateNode)
+            and node.template_name == "lien-ancre-étym"
+        ):
+            return clean_node(wxr, None, node).strip("()"), clean_node(
+                wxr, None, child_nodes[index + 1 :]
+            )
+        if (
+            index == 1
+            and isinstance(node, WikiNode)
+            and node.kind == NodeKind.ITALIC
+            and isinstance(child_nodes[0], str)
+            and child_nodes[0].endswith("(")
+            and isinstance(child_nodes[2], str)
+            and child_nodes[2].startswith(")")
+        ):
+            # italic pos
+            pos_title = clean_node(wxr, None, node)
+            if pos_title == "Nom":
+                pos_title = "Nom commun"
+            return pos_title, clean_node(
+                wxr, None, child_nodes[index + 1 :]
+            ).removeprefix(") ")
+
+
+def insert_etymology_data(
+    lang_code: str, page_data: List[Dict], etymology_data: EtymologyData
+) -> None:
+    """
+    Insert the etymology data extracted from the level 3 node into each sense
+    dictionary that matches the language and POS.
+    """
+    sense_dict = {}  # group by pos title
+    for sense_data in page_data:
+        if sense_data.get("lang_code") == lang_code:
+            sense_dict[sense_data.get("pos_title")] = sense_data
+
+    for pos_title, etymology_texts in etymology_data.items():
+        if pos_title is None:  # add to all sense dictionaries
+            for sense_data in sense_dict.values():
+                sense_data["etymology_texts"] = etymology_texts
+        elif pos_title in sense_dict:
+            sense_dict[pos_title]["etymology_texts"] = etymology_texts
+        elif pos_title.removesuffix(" 1") in sense_dict:
+            # an index number is added in the etymology section but not added in
+            # POS title
+            sense_dict[pos_title.removesuffix(" 1")][
+                "etymology_texts"
+            ] = etymology_texts
diff --git a/wiktextract/extractor/fr/inflection.py b/wiktextract/extractor/fr/inflection.py
@@ -6,7 +6,7 @@
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
-from .pronunciation import is_ipa_text, insert_ipa
+from .pronunciation import insert_ipa, is_ipa_text
 
 
 def extract_inflection(