Merge pull request #436 from xxyzz/fr

Change find gloss tags strategy of French extractor
tatuylonen · Dec 26, 2023 · 87921d1 · 87921d1
2 parents f6b8340 + cc9796b
commit 87921d1
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 13 deletions.
diff --git a/src/wiktextract/extractor/fr/gloss.py b/src/wiktextract/extractor/fr/gloss.py
@@ -21,47 +21,41 @@ def extract_gloss(
             )
         )
         gloss_data = Sense()
-        gloss_start = 0
         # process modifier, theme templates before gloss text
         # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
+        tag_indexes = set()
         for index, gloss_node in enumerate(gloss_nodes):
             if isinstance(gloss_node, TemplateNode):
                 categories_data = defaultdict(list)
                 expanded_text = clean_node(wxr, categories_data, gloss_node)
                 if expanded_text.startswith("(") and expanded_text.endswith(
                     ")"
                 ):
-                    gloss_start = index + 1
                     tag = expanded_text.strip("() \n")
                     if len(tag) > 0:
                         gloss_data.tags.append(tag)
                     if "categories" in categories_data:
                         gloss_data.categories.extend(
                             categories_data["categories"]
                         )
-
-        gloss_only_nodes = []
-        tag_indexes = set()
-        for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start):
+                    tag_indexes.add(index)
             # if an italic node is between parentheses then it's a tag, also
             # don't add the parenthesis strings to `gloss_only_nodes`
-            if (
-                isinstance(node, WikiNode)
-                and node.kind == NodeKind.ITALIC
-                and index > gloss_start
+            elif (
+                isinstance(gloss_node, WikiNode)
+                and gloss_node.kind == NodeKind.ITALIC
                 and isinstance(gloss_nodes[index - 1], str)
                 and gloss_nodes[index - 1].strip() == "("
                 and index + 1 < len(gloss_nodes)
                 and isinstance(gloss_nodes[index + 1], str)
                 and gloss_nodes[index + 1].strip() == ")"
             ):
-                gloss_data.tags.append(clean_node(wxr, None, node))
+                gloss_data.tags.append(clean_node(wxr, None, gloss_node))
                 tag_indexes |= {index - 1, index, index + 1}
-                continue
 
         gloss_only_nodes = [
             node
-            for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start)
+            for index, node in enumerate(gloss_nodes)
             if index not in tag_indexes
         ]
         gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes)

diff --git a/src/wiktextract/extractor/fr/translation.py b/src/wiktextract/extractor/fr/translation.py
@@ -85,6 +85,8 @@ def process_translation_templates(
         )
     elif template_node.template_name.startswith("trad"):
         # Translation term: https://fr.wiktionary.org/wiki/Modèle:trad
+        if 2 not in template_node.template_parameters:  # required parameter
+            return
         translation_term = clean_node(
             wxr,
             None,

diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
@@ -280,3 +280,29 @@ def test_nest_gloss(self):
                 },
             ],
         )
+
+    def test_sandwich_tag(self):
+        # https://fr.wiktionary.org/wiki/autrice#Nom_commun_4
+        self.wxr.wtp.start_page("autrice")
+        self.wxr.wtp.add_page("Modèle:lexique", 10, "''(Littérature)''")
+        self.wxr.wtp.add_page("Modèle:rare", 10, "''(Rare)''")
+        self.wxr.wtp.add_page("Modèle:lien", 10, "Autrice")
+        self.wxr.wtp.add_page("Modèle:absolument", 10, "''(Absolument)''")
+        root = self.wxr.wtp.parse(
+            "# {{lexique|littérature|nl}} {{rare|nl}} {{lien|autrice|fr|dif=Autrice}}, femme qui a créé une œuvre littéraire. {{absolument}} [[écrivaine|Écrivaine]]."
+        )
+        page_data = [
+            WordEntry(word="autrice", lang_code="nl", lang_name="Néerlandais")
+        ]
+        extract_gloss(self.wxr, page_data, root.children[0])
+        self.assertEqual(
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
+            [
+                {
+                    "glosses": [
+                        "Autrice, femme qui a créé une œuvre littéraire. Écrivaine."
+                    ],
+                    "tags": ["Littérature", "Rare", "Absolument"]
+                }
+            ],
+        )