Skip to content

Commit

Permalink
Extract "Onglets conjugaison" template and transclude pages
Browse files Browse the repository at this point in the history
Some IPA links and the final "Impératif" table can't be parsed because
the parser can't handle `<nowiki />` in these cases:

- `[[ignore|<span>\\<nowiki /> sə <nowiki/>ipa\\</span>]]`
- `<nowiki />` tag in table cell
  • Loading branch information
xxyzz committed Jan 9, 2024
1 parent a279eb5 commit 13c2181
Show file tree
Hide file tree
Showing 8 changed files with 132 additions and 123 deletions.
40 changes: 34 additions & 6 deletions src/wiktextract/extractor/fr/conjugation.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Optional

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import HTMLNode, TemplateNode
from wiktextract.page import clean_node
Expand All @@ -6,7 +8,12 @@
from .models import Form, WordEntry


def extract_conjugation(wxr: WiktextractContext, entry: WordEntry) -> None:
def extract_conjugation(
wxr: WiktextractContext,
entry: WordEntry,
word: str = "",
select_template: str = "1",
) -> None:
"""
Find and extract conjugation page.
Expand All @@ -15,16 +22,21 @@ def extract_conjugation(wxr: WiktextractContext, entry: WordEntry) -> None:
https://fr.wiktionary.org/wiki/Aide:Conjugaisons
"""
conj_ns = wxr.wtp.NAMESPACE_DATA["Conjugaison"]
conj_page_title = (
f"{conj_ns['name']}:{entry.lang.lower()}/{entry.word}"
)
if len(word) == 0:
word = entry.word
conj_page_title = f"{conj_ns['name']}:{entry.lang.lower()}/{word}"
conj_page = wxr.wtp.get_page_body(conj_page_title, conj_ns["id"])
if conj_page is None:
return
conj_root = wxr.wtp.parse(conj_page)
for conj_template in conj_root.find_child(NodeKind.TEMPLATE):
if conj_template.template_name.startswith("fr-conj-"):
process_fr_conj_template(wxr, entry, conj_template)
elif conj_template.template_name == "Onglets conjugaison":
process_onglets_template(wxr, entry, conj_template, select_template)
elif conj_template.template_name.startswith(":Conjugaison:"):
word = conj_template.template_name.rsplit("/", 1)[-1]
extract_conjugation(wxr, entry, word, "2")


def process_fr_conj_template(
Expand Down Expand Up @@ -154,9 +166,25 @@ def process_fr_conj_wiki_table(
cell_text = clean_node(wxr, None, cell)
if cell_index < 2:
form.form += cell_text
if cell_index == 0:
if cell_index == 0 and len(cell_text) > 0:
form.form += " "
else:
form.ipas.append(cell_text)

entry.forms.append(form)
if len(form.form) > 0 and form.form != "—":
entry.forms.append(form)


def process_onglets_template(
wxr: WiktextractContext,
entry: WordEntry,
template_node: TemplateNode,
select: str,
) -> None:
# https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
# this template expands to two tabs of tables
selected_template = template_node.template_parameters.get(
f"contenu{select}"
)
if selected_template is not None:
process_fr_conj_template(wxr, entry, selected_template)
59 changes: 59 additions & 0 deletions tests/test_fr_conj.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,62 @@ def test_fr_conj_1(self):
},
],
)

def test_onglets_conjugaison(self):
    """Pronominal verb page transcluding the plain verb's conjugation page.

    https://fr.wiktionary.org/wiki/Conjugaison:français/s’abattre

    "Conjugaison:français/s’abattre" transcludes
    "Conjugaison:français/abattre" with ``sél=2``; the extractor must then
    follow the transclusion and pick the second tab (``contenu2``, the
    pronominal table) of the "Onglets conjugaison" template.
    """
    self.wxr.wtp.start_page("s’abattre")
    # Base verb page: two tabs, active ("contenu1") and pronominal
    # ("contenu2") conjugations.
    self.wxr.wtp.add_page(
        "Conjugaison:français/abattre",
        116,
        """{{Onglets conjugaison
| onglet1 =Conjugaison active
| contenu1 ={{fr-conj-3-attre|ab|a.b|'=oui}}
| onglet2 =Conjugaison pronominale
| contenu2 ={{fr-conj-3-attre|ab|a.b|'=oui|réfl=1}}
| sél ={{{sél|1}}}
}}""",
    )
    # Pronominal page is a bare transclusion of the base verb's page.
    self.wxr.wtp.add_page(
        "Conjugaison:français/s’abattre",
        116,
        "{{:Conjugaison:français/abattre|sél=2}}",
    )
    # Minimal expansion of the conjugation table template: one header row
    # and one "Infinitif" row with present and past forms plus IPA cells.
    self.wxr.wtp.add_page(
        "Modèle:fr-conj-3-attre",
        10,
        """<h3> Modes impersonnels </h3>
<div>
{|
|-[[mode|Mode]]
!colspan=\"3\"|[[présent|Présent]]
!colspan=\"3\"|[[passé|Passé]]
|-
|'''[[infinitif|Infinitif]]'''
|s’
|[[abattre]]
|<span>\\s‿a.batʁ\\</span>
|s’être
|[[abattu]]
|<span>\\s‿ɛtʁ‿a.ba.ty\\</span>
|}
</div>""",
    )
    entry = WordEntry(lang_code="fr", lang="Français", word="s’abattre")
    extract_conjugation(self.wxr, entry)
    # The two form cells of each column are joined ("s’" + "abattre"
    # without an extra space after the elided "s’"), and each form keeps
    # its IPA cell plus the header-derived tags.
    self.assertEqual(
        [f.model_dump(exclude_defaults=True) for f in entry.forms],
        [
            {
                "form": "s’abattre",
                "ipas": ["\\s‿a.batʁ\\"],
                "source": "Conjugaison page",
                "tags": ["Modes impersonnels", "Infinitif", "Présent"],
            },
            {
                "form": "s’être abattu",
                "ipas": ["\\s‿ɛtʁ‿a.ba.ty\\"],
                "source": "Conjugaison page",
                "tags": ["Modes impersonnels", "Infinitif", "Passé"],
            },
        ],
    )
20 changes: 5 additions & 15 deletions tests/test_fr_form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,7 @@ def test_ipa(self):
self.wxr.wtp.start_page("bonjour")
self.wxr.wtp.add_page("Modèle:pron", 10, "\\bɔ̃.ʒuʁ\\")
root = self.wxr.wtp.parse("'''bonjour''' {{pron|bɔ̃.ʒuʁ|fr}}")
page_data = [
WordEntry(word="bonjour", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="bonjour", lang_code="fr", lang="Français")]
extract_form_line(self.wxr, page_data, root.children)
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].sounds],
Expand All @@ -37,9 +35,7 @@ def test_gender(self):
self.wxr.wtp.start_page("bonjour")
self.wxr.wtp.add_page("Modèle:m", 10, "masculin")
root = self.wxr.wtp.parse("'''bonjour''' {{m}}")
page_data = [
WordEntry(word="bonjour", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="bonjour", lang_code="fr", lang="Français")]
extract_form_line(self.wxr, page_data, root.children)
self.assertEqual(page_data[-1].tags, ["masculin"])

Expand All @@ -49,9 +45,7 @@ def test_zh_mot(self):
self.wxr.wtp.add_page("Modèle:lang", 10, body="mǎ")
self.wxr.wtp.add_page("Modèle:pron", 10, body="\\ma̠˨˩˦\\")
root = self.wxr.wtp.parse("{{zh-mot|马|mǎ}}")
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
process_zh_mot_template(self.wxr, root.children[0], page_data)
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].sounds],
Expand Down Expand Up @@ -98,9 +92,7 @@ def test_template_in_pron_argument(self):
root = self.wxr.wtp.parse(
"'''minéral argileux''' {{pron|mi.ne.ʁa.l{{liaison|fr}}aʁ.ʒi.lø|fr}}"
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_form_line(self.wxr, page_data, root.children)
self.assertEqual(
page_data[-1].sounds[0].model_dump(exclude_defaults=True),
Expand All @@ -117,9 +109,7 @@ def test_equiv_pour_template(self, mock_node_to_wikitext):
root = self.wxr.wtp.parse(
"{{équiv-pour|un homme|auteur|2egenre=une personne non-binaire|2egenre1=autaire|2egenre2=auteurice|2egenre3=auteur·ice|lang=fr}}"
)
page_data = [
WordEntry(word="autrice", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="autrice", lang_code="fr", lang="Français")]
extract_form_line(self.wxr, page_data, root.children)
self.assertEqual(
page_data[-1].model_dump(exclude_defaults=True),
Expand Down
36 changes: 9 additions & 27 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,7 @@ def tearDown(self) -> None:
def test_theme_templates(self, mock_get_page):
self.wxr.wtp.start_page("")
root = self.wxr.wtp.parse("# {{sportifs|fr}} gloss.\n#* example")
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand All @@ -50,9 +48,7 @@ def test_example_template(self):
root = self.wxr.wtp.parse(
"# gloss.\n#* {{exemple|text|translation|roman|source=source}}"
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand Down Expand Up @@ -80,9 +76,7 @@ def test_example_source_template(self, mock_node_to_html):
root = self.wxr.wtp.parse(
"# gloss.\n#* example {{source|source_title}}"
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand Down Expand Up @@ -151,9 +145,7 @@ def test_variante_de(self):
root = self.wxr.wtp.parse(
"# {{désuet|en}} {{sports|en}} {{indénombrable|en}} {{variante de|basketball|en}}."
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand All @@ -171,9 +163,7 @@ def test_italic_tag(self):
root = self.wxr.wtp.parse(
"# (''localement'') [[bassin#Nom_commun|Bassin]], [[lavoir#Nom_commun|lavoir]]."
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand All @@ -186,9 +176,7 @@ def test_not_italic_tag(self):
root = self.wxr.wtp.parse(
"# [[oiseau|Oiseau]] aquatique de taille moyenne du genre ''[[Rhynchops]]''."
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand All @@ -206,9 +194,7 @@ def test_preserve_space_between_tags(self):
# the space between italic node and the link node should be preserved
self.wxr.wtp.start_page("becs-en-ciseaux")
root = self.wxr.wtp.parse("# ''Pluriel de'' [[bec-en-ciseaux]].")
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand All @@ -225,9 +211,7 @@ def test_template_is_not_tag(self, mock_get_page):
root = self.wxr.wtp.parse(
"# {{lien|autrice|fr|dif=Autrice}}, [[celle]] qui est à l’[[origine]] de [[quelque chose]]."
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand All @@ -250,9 +234,7 @@ def test_nest_gloss(self):
##* nest example
"""
)
page_data = [
WordEntry(word="test", lang_code="fr", lang="Français")
]
page_data = [WordEntry(word="test", lang_code="fr", lang="Français")]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
Expand Down
Loading

0 comments on commit 13c2181

Please sign in to comment.