Merge pull request #366 from xxyzz/fr

Exclude sublist child node from linkage list item node
tatuylonen · Oct 18, 2023 · b1cb0dd · b1cb0dd
2 parents b0038bd + 1ed25c3
commit b1cb0dd
Show file tree

Hide file tree

Showing 4 changed files with 66 additions and 26 deletions.
diff --git a/src/wiktextract/extractor/fr/linkage.py b/src/wiktextract/extractor/fr/linkage.py
@@ -7,6 +7,8 @@
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
+from ..share import split_tag_text
+
 
 def extract_linkage(
     wxr: WiktextractContext,
@@ -17,44 +19,44 @@ def extract_linkage(
     for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
         linkage_data = defaultdict(list)
         pending_tag = ""
-        for index, child_node in enumerate(
-            list_item_node.filter_empty_str_child()
+        for index, child_node in enumerate(  # remove nested lists
+            list_item_node.invert_find_child(NodeKind.LIST)
         ):
             if index == 0 or "word" not in linkage_data:
                 if isinstance(child_node, TemplateNode):
                     process_linkage_template(wxr, child_node, linkage_data)
                 else:
                     linkage_data["word"] = clean_node(wxr, None, child_node)
             else:
-                tag = (
+                tag_text = (
                     child_node
                     if isinstance(child_node, str)
                     else clean_node(wxr, page_data[-1], child_node)
                 )
-                if tag.strip().startswith("(") and not tag.strip().endswith(
-                    ")"
-                ):
-                    pending_tag = tag
+                if tag_text.strip().startswith(
+                    "("
+                ) and not tag_text.strip().endswith(")"):
+                    pending_tag = tag_text
                     continue
-                elif not tag.strip().startswith("(") and tag.strip().endswith(
-                    ")"
-                ):
-                    tag = pending_tag + tag
+                elif not tag_text.strip().startswith(
+                    "("
+                ) and tag_text.strip().endswith(")"):
+                    tag_text = pending_tag + tag_text
                     pending_tag = ""
-                elif tag.strip() == ",":
+                elif tag_text.strip() == ",":
                     # list item has more than one word
                     page_data[-1][linkage_type].append(linkage_data)
                     linkage_data = defaultdict(list)
                     continue
                 elif len(pending_tag) > 0:
-                    pending_tag += tag
+                    pending_tag += tag_text
                     continue
 
-                tag = tag.strip("() \n")
-                if tag.startswith("— "):
-                    linkage_data["translation"] = tag.removeprefix("— ")
-                elif len(tag) > 0:
-                    linkage_data["tags"].append(tag)
+                for tag in split_tag_text(tag_text):
+                    if tag.startswith("— "):
+                        linkage_data["translation"] = tag.removeprefix("— ")
+                    elif len(tag) > 0:
+                        linkage_data["tags"].append(tag)
 
         page_data[-1][linkage_type].append(linkage_data)
 

diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py
@@ -49,6 +49,7 @@ def parse_section(
             wxr.wtp.start_subsection(subtitle)
             if section_type in wxr.config.OTHER_SUBTITLES["ignored_sections"]:
                 pass
+            # POS parameters:
             # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots
             elif section_type in wxr.config.POS_SUBTITLES:
                 process_pos_block(

diff --git a/src/wiktextract/extractor/share.py b/src/wiktextract/extractor/share.py
@@ -80,3 +80,13 @@ def create_transcode_url(filename: str, transcode_suffix: str) -> str:
         "https://upload.wikimedia.org/wikipedia/commons/transcoded/"
         + f"{md5[0]}/{md5[:2]}/{filename}/{filename}.{transcode_suffix}"
     )
+
+
+def split_tag_text(text: str) -> List[str]:
+    """
+    Find tags enclosed in parentheses and remove the parentheses
+    """
+    return [
+        tag.strip("()").strip()
+        for tag in re.split(r"(?<=\))\s+(?=\()", text.strip())
+    ]
diff --git a/tests/test_fr_linkage.py b/tests/test_fr_linkage.py
@@ -23,9 +23,9 @@ def test_tags(self):
         self.wxr.wtp.add_page("Modèle:Canada", 10, body="(Canada)")
         self.wxr.wtp.add_page("Modèle:Louisiane", 10, body="(Louisiane)")
         root = self.wxr.wtp.parse(
-            "==== {{S|synonymes}} ====\n* [[bon matin]] {{Canada|nocat=1}} {{Louisiane|nocat=1}}"
+            "* [[bon matin]] {{Canada|nocat=1}} {{Louisiane|nocat=1}}"
         )
-        extract_linkage(self.wxr, page_data, root.children[0], "synonyms")
+        extract_linkage(self.wxr, page_data, root, "synonyms")
         self.assertEqual(
             page_data,
             [
@@ -41,9 +41,9 @@ def test_zh_synonyms(self):
         page_data = [defaultdict(list)]
         self.wxr.wtp.start_page("你好")
         root = self.wxr.wtp.parse(
-            "==== {{S|synonymes}} ====\n* {{zh-lien|你们好|nǐmen hǎo|你們好}} — Bonjour (au pluriel)."
+            "* {{zh-lien|你们好|nǐmen hǎo|你們好}} — Bonjour (au pluriel)."
         )
-        extract_linkage(self.wxr, page_data, root.children[0], "synonyms")
+        extract_linkage(self.wxr, page_data, root, "synonyms")
         self.assertEqual(
             page_data,
             [
@@ -67,9 +67,9 @@ def test_template_as_partial_tag(self):
         self.wxr.wtp.add_page("Modèle:Canada", 10, body="(Canada)")
         self.wxr.wtp.add_page("Modèle:L", 10, body="Atikamekw")
         root = self.wxr.wtp.parse(
-            "==== {{S|synonymes}} ====\n* {{lien|kwei|fr}} {{Canada|nocat=1}} (mot {{L|atj}})"
+            "* {{lien|kwei|fr}} {{Canada|nocat=1}} (mot {{L|atj}})"
         )
-        extract_linkage(self.wxr, page_data, root.children[0], "synonyms")
+        extract_linkage(self.wxr, page_data, root, "synonyms")
         self.assertEqual(
             page_data,
             [
@@ -85,9 +85,9 @@ def test_list_item_has_two_words(self):
         page_data = [defaultdict(list)]
         self.wxr.wtp.start_page("masse")
         root = self.wxr.wtp.parse(
-            "==== {{S|dérivés}} ====\n* [[être à la masse]], [[mettre à la masse]]"
+            "* [[être à la masse]], [[mettre à la masse]]"
         )
-        extract_linkage(self.wxr, page_data, root.children[0], "derived")
+        extract_linkage(self.wxr, page_data, root, "derived")
         self.assertEqual(
             page_data,
             [
@@ -99,3 +99,30 @@ def test_list_item_has_two_words(self):
                 }
             ],
         )
+
+    def test_sub_list(self):
+        page_data = [defaultdict(list)]
+        self.wxr.wtp.start_page("lézard ocellé")
+        root = self.wxr.wtp.parse(
+            """* [[saurien]]s (Sauria)
+** [[lacertidé]]s (Lacertidae) (famille des lézards typiques)
+"""
+        )
+        extract_linkage(self.wxr, page_data, root, "hypernyms")
+        self.assertEqual(
+            page_data,
+            [
+                {
+                    "hypernyms": [
+                        {"tags": ["Sauria"], "word": "sauriens"},
+                        {
+                            "tags": [
+                                "Lacertidae",
+                                "famille des lézards typiques",
+                            ],
+                            "word": "lacertidés",
+                        },
+                    ]
+                }
+            ],
+        )