Combine synonym tags that are split across plain strings and templates

Also ignore empty-string tags.
tatuylonen · Sep 26, 2023 · 15b754f · 15b754f
1 parent be3fd6f
commit 15b754f
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 6 deletions.
diff --git a/tests/test_fr_linkage.py b/tests/test_fr_linkage.py
@@ -23,7 +23,7 @@ def tearDown(self) -> None:
 
     def test_tags(self):
         page_data = [defaultdict(list)]
-        self.wxr.wtp.start_page("")
+        self.wxr.wtp.start_page("bonjour")
         self.wxr.wtp.add_page("Modèle:Canada", 10, body="(Canada)")
         self.wxr.wtp.add_page("Modèle:Louisiane", 10, body="(Louisiane)")
         root = self.wxr.wtp.parse(
@@ -43,7 +43,7 @@ def test_tags(self):
 
     def test_zh_synonyms(self):
         page_data = [defaultdict(list)]
-        self.wxr.wtp.start_page("")
+        self.wxr.wtp.start_page("你好")
         root = self.wxr.wtp.parse(
             "==== {{S|synonymes}} ====\n* {{zh-lien|你们好|nǐmen hǎo|你們好}} — Bonjour (au pluriel)."
         )
@@ -63,3 +63,24 @@ def test_zh_synonyms(self):
                 }
             ],
         )
+
+    def test_template_as_partial_tag(self):
+        page_data = [defaultdict(list)]
+        self.wxr.wtp.start_page("bonjour")
+        self.wxr.wtp.add_page("Modèle:lien", 10, body="kwei")
+        self.wxr.wtp.add_page("Modèle:Canada", 10, body="(Canada)")
+        self.wxr.wtp.add_page("Modèle:L", 10, body="Atikamekw")
+        root = self.wxr.wtp.parse(
+            "==== {{S|synonymes}} ====\n* {{lien|kwei|fr}} {{Canada|nocat=1}} (mot {{L|atj}})"
+        )
+        extract_linkage(self.wxr, page_data, root.children[0], "synonyms")
+        self.assertEqual(
+            page_data,
+            [
+                {
+                    "synonyms": [
+                        {"word": "kwei", "tags": ["Canada", "mot Atikamekw"]}
+                    ]
+                }
+            ],
+        )
diff --git a/wiktextract/extractor/fr/linkage.py b/wiktextract/extractor/fr/linkage.py
@@ -16,6 +16,7 @@ def extract_linkage(
 ) -> None:
     for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
         linkage_data = defaultdict(list)
+        pending_tag = ""
         for index, child_node in enumerate(
             list_item_node.filter_empty_str_child()
         ):
@@ -28,10 +29,29 @@ def extract_linkage(
                 else:
                     linkage_data["word"] = clean_node(wxr, None, child_node)
             else:
-                tag = clean_node(wxr, page_data[-1], child_node).strip("()")
+                tag = (
+                    child_node
+                    if isinstance(child_node, str)
+                    else clean_node(wxr, page_data[-1], child_node)
+                )
+                if tag.strip().startswith("(") and not tag.strip().endswith(
+                    ")"
+                ):
+                    pending_tag = tag
+                    continue
+                elif not tag.strip().startswith("(") and tag.strip().endswith(
+                    ")"
+                ):
+                    tag = pending_tag + tag
+                    pending_tag = ""
+                elif len(pending_tag) > 0:
+                    pending_tag += tag
+                    continue
+
+                tag = tag.strip("() \n")
                 if tag.startswith("— "):
                     linkage_data["translation"] = tag.removeprefix("— ")
-                else:
+                elif len(tag) > 0:
                     linkage_data["tags"].append(tag)
 
         page_data[-1][linkage_type].append(linkage_data)
@@ -53,8 +73,8 @@ def process_lien_template(
     node: TemplateNode,
     linkage_data: Dict[str, Union[str, List[str]]],
 ) -> None:
-    # https://fr.wiktionary.org/wiki/Modèle:lien
-    if "dif" in node.template_parameters:
+    # link word template: https://fr.wiktionary.org/wiki/Modèle:lien
+    if "dif" in node.template_parameters:  # displayed word
         word = clean_node(wxr, None, node.template_parameters.get("dif"))
     else:
         word = clean_node(wxr, None, node.template_parameters.get(1))