Merge pull request #378 from xxyzz/fr

Update French extractor's linkage and form line code
tatuylonen · Oct 24, 2023 · 435627b · 435627b
2 parents b2ab827 + 694649d
commit 435627b
Show file tree

Hide file tree

Showing 8 changed files with 142 additions and 76 deletions.
diff --git a/json_schema/fr.json b/json_schema/fr.json
@@ -315,6 +315,14 @@
         "translation": {
           "description": "French translation",
           "type": "string"
+        },
+        "sense": {
+          "description": "Definition of the word",
+          "type": "string"
+        },
+        "sense_index": {
+          "description": "Number of the definition, starting from 1",
+          "type": "integer"
         }
       }
     }

diff --git a/src/wiktextract/extractor/fr/form_line.py b/src/wiktextract/extractor/fr/form_line.py
@@ -2,7 +2,7 @@
 from typing import Dict, List, Union
 
 from wikitextprocessor import NodeKind, WikiNode
-from wikitextprocessor.parser import TemplateNode
+from wikitextprocessor.parser import TemplateNode, HTMLNode
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
@@ -56,19 +56,22 @@ def process_equiv_pour_template(
     wxr: WiktextractContext, node: TemplateNode, page_data: List[Dict]
 ) -> None:
     # equivalent form: https://fr.wiktionary.org/wiki/Modèle:équiv-pour
-    form_type = node.template_parameters.get(1)
-    for template_arg_index in range(2, 8):
-        form = clean_node(
-            wxr, None, node.template_parameters.get(template_arg_index, "")
-        )
-        if len(form) > 0:
-            page_data[-1]["forms"].append(
-                {
-                    "form": form,
-                    "tags": [f"pour {form_type}"],
-                    "source": "form line template 'équiv-pour'",
-                }
-            )
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(node), expand_all=True
+    )
+    form_tag = ""
+    for child in expanded_node.find_child(NodeKind.ITALIC | NodeKind.HTML):
+        if child.kind == NodeKind.ITALIC:
+            form_tag = clean_node(wxr, None, child).strip("() ")
+        elif isinstance(child, HTMLNode) and child.tag == "bdi":
+            form_data = {
+                "form": clean_node(wxr, None, child),
+                "source": "form line template 'équiv-pour'",
+            }
+            if len(form_tag) > 0:
+                form_data["tags"] = [form_tag]
+            if len(form_data["form"]) > 0:
+                page_data[-1]["forms"].append(form_data)
 
 
 def process_zh_mot_template(

diff --git a/src/wiktextract/extractor/fr/linkage.py b/src/wiktextract/extractor/fr/linkage.py
@@ -15,11 +15,34 @@ def extract_linkage(
     level_node: WikiNode,
     linkage_type: str,
 ) -> None:
-    for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
+    sense_text = ""
+    sense_index = 0
+    for template_or_list_node in level_node.find_child_recursively(
+        NodeKind.LIST_ITEM | NodeKind.TEMPLATE
+    ):
+        # list table start template: https://fr.wiktionary.org/wiki/Modèle:(
+        if (
+            isinstance(template_or_list_node, TemplateNode)
+            and template_or_list_node.template_name == "("
+        ):
+            sense_text = clean_node(
+                wxr, None, template_or_list_node.template_parameters.get(1, "")
+            )
+            sense_index_text = template_or_list_node.template_parameters.get(
+                2, "0"
+            )
+            if sense_index_text.isdigit():
+                sense_index = int(sense_index_text)
+            continue
+
         linkage_data = defaultdict(list)
+        if len(sense_text) > 0:
+            linkage_data["sense"] = sense_text
+        if sense_index != 0:
+            linkage_data["sense_index"] = sense_index
         pending_tag = ""
         for index, child_node in enumerate(  # remove nested lists
-            list_item_node.invert_find_child(NodeKind.LIST)
+            template_or_list_node.invert_find_child(NodeKind.LIST)
         ):
             if index == 0 or "word" not in linkage_data:
                 if isinstance(child_node, TemplateNode):
@@ -57,7 +80,8 @@ def extract_linkage(
                     elif len(tag) > 0:
                         linkage_data["tags"].append(tag)
 
-        page_data[-1][linkage_type].append(linkage_data)
+        if "word" in linkage_data:
+            page_data[-1][linkage_type].append(linkage_data)
 
 
 def process_linkage_template(

diff --git a/tests/test_fr_form_line.py b/tests/test_fr_form_line.py
@@ -42,38 +42,6 @@ def test_gender(self, mock_clean_node):
         extract_form_line(self.wxr, page_data, root.children)
         self.assertEqual(page_data, [{"tags": ["masculin"]}])
 
-    def test_equiv_pour(self):
-        self.wxr.wtp.start_page("")
-        root = self.wxr.wtp.parse(
-            "{{équiv-pour|une femme|autrice|auteure|auteuse|lang=fr}}"
-        )
-        page_data = [defaultdict(list)]
-        extract_form_line(self.wxr, page_data, root.children)
-        self.assertEqual(
-            page_data,
-            [
-                {
-                    "forms": [
-                        {
-                            "form": "autrice",
-                            "tags": ["pour une femme"],
-                            "source": "form line template 'équiv-pour'",
-                        },
-                        {
-                            "form": "auteure",
-                            "tags": ["pour une femme"],
-                            "source": "form line template 'équiv-pour'",
-                        },
-                        {
-                            "form": "auteuse",
-                            "tags": ["pour une femme"],
-                            "source": "form line template 'équiv-pour'",
-                        },
-                    ]
-                }
-            ],
-        )
-
     def test_zh_mot(self):
         self.wxr.wtp.start_page("")
         self.wxr.wtp.add_page("Modèle:zh-mot", 10, body="{{lang}} {{pron}}")
@@ -134,3 +102,51 @@ def test_template_in_pron_argument(self):
             page_data,
             [{"sounds": [{"ipa": "mi.ne.ʁa.l‿aʁ.ʒi.lø"}]}],
         )
+
+    @patch(
+        "wikitextprocessor.Wtp.node_to_wikitext",
+        return_value="''(pour un homme, on dit'' : <bdi lang=\"fr\" xml:lang=\"fr\" class=\"lang-fr\">[[auteur#fr|auteur]]</bdi> ; ''pour une personne non-binaire, on peut dire'' : <bdi lang=\"fr\" xml:lang=\"fr\" class=\"lang-fr\">[[autaire#fr|autaire]]</bdi>, <bdi lang=\"fr\" xml:lang=\"fr\" class=\"lang-fr\">[[auteurice#fr|auteurice]]</bdi>, <bdi lang=\"fr\" xml:lang=\"fr\" class=\"lang-fr\">[[auteur·ice#fr|auteur·ice]]</bdi>'')''"
+    )
+    def test_equiv_pour_template(self, mock_node_to_wikitext):
+        self.maxDiff = None
+        self.wxr.wtp.start_page("autrice")
+        root = self.wxr.wtp.parse(
+            "{{équiv-pour|un homme|auteur|2egenre=une personne non-binaire|2egenre1=autaire|2egenre2=auteurice|2egenre3=auteur·ice|lang=fr}}"
+        )
+        page_data = [defaultdict(list)]
+        extract_form_line(self.wxr, page_data, root.children)
+        self.assertEqual(
+            page_data,
+            [
+                {
+                    "forms": [
+                        {
+                            "form": "auteur",
+                            "tags": ["pour un homme, on dit"],
+                            "source": "form line template 'équiv-pour'",
+                        },
+                        {
+                            "form": "autaire",
+                            "tags": [
+                                "pour une personne non-binaire, on peut dire"
+                            ],
+                            "source": "form line template 'équiv-pour'",
+                        },
+                        {
+                            "form": "auteurice",
+                            "tags": [
+                                "pour une personne non-binaire, on peut dire"
+                            ],
+                            "source": "form line template 'équiv-pour'",
+                        },
+                        {
+                            "form": "auteur·ice",
+                            "tags": [
+                                "pour une personne non-binaire, on peut dire"
+                            ],
+                            "source": "form line template 'équiv-pour'",
+                        },
+                    ]
+                }
+            ],
+        )
diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
@@ -210,7 +210,11 @@ def test_not_italic_tag(self):
             [
                 {
                     "senses": [
-                        {"glosses": ["Oiseau aquatique de taille moyenne du genre Rhynchops."]}
+                        {
+                            "glosses": [
+                                "Oiseau aquatique de taille moyenne du genre Rhynchops."
+                            ]
+                        }
                     ]
                 }
             ],
@@ -220,18 +224,10 @@ def test_preserve_space_between_tags(self):
         # https://fr.wiktionary.org/wiki/becs-en-ciseaux
         # the space between italic node and the link node should be preserved
         self.wxr.wtp.start_page("becs-en-ciseaux")
-        root = self.wxr.wtp.parse(
-            "# ''Pluriel de'' [[bec-en-ciseaux]]."
-        )
+        root = self.wxr.wtp.parse("# ''Pluriel de'' [[bec-en-ciseaux]].")
         page_data = [defaultdict(list)]
         extract_gloss(self.wxr, page_data, root.children[0])
         self.assertEqual(
             page_data,
-            [
-                {
-                    "senses": [
-                        {"glosses": ["Pluriel de bec-en-ciseaux."]}
-                    ]
-                }
-            ],
+            [{"senses": [{"glosses": ["Pluriel de bec-en-ciseaux."]}]}],
         )
diff --git a/tests/test_fr_linkage.py b/tests/test_fr_linkage.py
@@ -126,3 +126,27 @@ def test_sub_list(self):
                 }
             ],
         )
+
+    def test_sense(self):
+        page_data = [defaultdict(list)]
+        self.wxr.wtp.start_page("autrice")
+        root = self.wxr.wtp.parse(
+            """{{(|Celle qui est à l’origine de quelque chose|1}}
+* [[artisane]]
+"""
+        )
+        extract_linkage(self.wxr, page_data, root, "synonyms")
+        self.assertEqual(
+            page_data,
+            [
+                {
+                    "synonyms": [
+                        {
+                            "word": "artisane",
+                            "sense": "Celle qui est à l’origine de quelque chose",
+                            "sense_index": 1,
+                        },
+                    ]
+                }
+            ],
+        )
diff --git a/tests/test_fr_note.py b/tests/test_fr_note.py
@@ -20,19 +20,17 @@ def tearDown(self) -> None:
     def test_list_notes(self):
         # list created from template "note-féminisation"
         # https://fr.wiktionary.org/wiki/autrice
-        self.wxr.wtp.add_page("Modèle:note-féminisation", 10, "* list 1\n* list 2")
+        self.wxr.wtp.add_page(
+            "Modèle:note-féminisation", 10, "* list 1\n* list 2"
+        )
         self.wxr.wtp.start_page("autrice")
-        nodes = self.wxr.wtp.parse("""==== {{S|notes}} ====
+        nodes = self.wxr.wtp.parse(
+            """==== {{S|notes}} ====
 paragrapy 1
-{{note-féminisation}}""")
+{{note-féminisation}}"""
+        )
         page_data = [defaultdict(list)]
         extract_note(self.wxr, page_data, nodes.children[0])
-        self.assertEqual(page_data, [
-            {
-                "notes": [
-                    "paragrapy 1",
-                    "list 1",
-                    "list 2"
-                ]
-            }
-        ])
+        self.assertEqual(
+            page_data, [{"notes": ["paragrapy 1", "list 1", "list 2"]}]
+        )
diff --git a/tests/test_inflection_en.py b/tests/test_inflection_en.py
@@ -14,7 +14,7 @@
 
 class InflTests(unittest.TestCase):
     def setUp(self):
-        self.maxDiff = 100000
+        self.maxDiff = None
         self.wxr = WiktextractContext(Wtp(), WiktionaryConfig())
         self.wxr.wtp.start_page("testpage")
         self.wxr.wtp.start_section("English")
@@ -203,7 +203,6 @@ def test_English_verb1(self):
                 "form": "wanderest",
                 "source": "Conjugation",
                 "tags": [
-                  "archaic",
                   "present",
                   "second-person",
                   "singular"
@@ -222,7 +221,6 @@ def test_English_verb1(self):
                 "form": "wanderedst",
                 "source": "Conjugation",
                 "tags": [
-                  "archaic",
                   "past",
                   "second-person",
                   "singular"
@@ -241,7 +239,6 @@ def test_English_verb1(self):
                 "form": "wandereth",
                 "source": "Conjugation",
                 "tags": [
-                  "archaic",
                   "present",
                   "singular",
                   "third-person"