From c8cff9c29f4659aec4dd5dc427a42ebabfe50161 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Wed, 27 Sep 2023 15:22:08 +0800
Subject: [PATCH 1/3] Extract "ja-mot" form line template

---
 wiktextract/extractor/fr/form_line.py | 32 ++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/wiktextract/extractor/fr/form_line.py b/wiktextract/extractor/fr/form_line.py
index 80644cb8..eb20c30b 100644
--- a/wiktextract/extractor/fr/form_line.py
+++ b/wiktextract/extractor/fr/form_line.py
@@ -35,6 +35,8 @@ def extract_form_line(
                 process_equiv_pour_template(wxr, node, page_data)
             elif node.template_name.startswith("zh-mot"):
                 process_zh_mot_template(wxr, node, page_data)
+            elif node.template_name == "ja-mot":
+                process_ja_mot_template(wxr, node, page_data)
             else:
                 tag = clean_node(wxr, page_data[-1], node)
                 if (
@@ -75,7 +77,7 @@ def process_zh_mot_template(
     node: TemplateNode,
     page_data: List[Dict],
 ) -> None:
-    # zh-mot, zh-mot-s, zh-mot-t
+    # Chinese form line template: zh-mot, zh-mot-s, zh-mot-t
     # https://fr.wiktionary.org/wiki/Modèle:zh-mot
     node = wxr.wtp.parse(
         wxr.wtp.node_to_wikitext(node),
@@ -83,14 +85,38 @@ def process_zh_mot_template(
         additional_expand={node.template_name},
     )
     for template_node in node.find_child(NodeKind.TEMPLATE):
-        if template_node.template_name == "lang":
+        if template_node.template_name.lower() == "lang":
             page_data[-1]["sounds"].append(
                 {
                     "zh-pron": clean_node(wxr, None, template_node),
                     "tags": ["Pinyin"],
                 }
             )
-        elif template_node.template_name == "pron":
+        elif template_node.template_name in ("pron", "prononciation"):
             page_data[-1]["sounds"].append(
                 {"ipa": clean_node(wxr, None, template_node)}
             )
+
+def process_ja_mot_template(
+    wxr: WiktextractContext,
+    template_node: TemplateNode,
+    page_data: List[Dict],
+) -> None:
+    # Japanese form line template: https://fr.wiktionary.org/wiki/Modèle:ja-mot
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(template_node),
+        expand_all=True
+    )
+    existing_forms = {existing_form.get("form") for existing_form in page_data[-1].get("forms", [])}
+    for index, node in expanded_node.find_html("span", with_index=True):
+        # the first span tag is the word, the second is Hepburn romanization
+        if index == 1:
+            form_text = clean_node(wxr, None, node)
+            if form_text not in existing_forms:
+                # avoid adding duplicated form data extracted from
+                # inflection table before the form line
+                page_data[-1]["forms"].append({
+                    "form": roman_form,
+                    "tags": ["romanization"]
+                })
+            break

From 53bf50258ba082a941278f466fa1572aee3dd8f2 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Wed, 27 Sep 2023 16:40:32 +0800
Subject: [PATCH 2/3] =?UTF-8?q?Extract=20inflection=20table=20expanded=20f?=
 =?UTF-8?q?rom=20template=20"lt-d=C3=A9cl-as"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This template creates some empty table cells, which break the code that
counts the table columns.
---
 tests/test_fr_inflection.py            | 35 +++++++++++++++----
 wiktextract/extractor/fr/form_line.py  | 16 +++++----
 wiktextract/extractor/fr/inflection.py | 47 +++++++++++++++-----------
 wiktextract/extractor/fr/page.py       |  2 +-
 4 files changed, 66 insertions(+), 34 deletions(-)

diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py
index ed90a852..d68b328d 100644
--- a/tests/test_fr_inflection.py
+++ b/tests/test_fr_inflection.py
@@ -40,7 +40,7 @@ def test_fr_reg(self, mock_node_to_wikitext):
         page_data = [defaultdict(list, {"word": "productrice"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("productrice")
-        extract_inflection(self.wxr, page_data, node, "fr-rég")
+        extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
             page_data[-1].get("forms"),
             [{"form": "productrices", "tags": ["Pluriel"]}],
@@ -68,7 +68,7 @@ def test_fr_accord_al(self, mock_node_to_wikitext):
         page_data = [defaultdict(list, {"word": "animal", "lang_code": "fr"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("animal")
-        extract_inflection(self.wxr, page_data, node, "fr-accord-al")
+        extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
             page_data[-1].get("forms"),
             [
@@ -104,7 +104,7 @@ def test_multiple_lines_ipa(self, mock_node_to_wikitext):
         page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("ration")
-        extract_inflection(self.wxr, page_data, node, "en-nom-rég")
+        extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
             page_data[-1].get("forms"),
             [
@@ -131,7 +131,7 @@ def test_single_line_multiple_ipa(self, mock_node_to_wikitext):
         page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("ration")
-        extract_inflection(self.wxr, page_data, node, "en-conj-rég")
+        extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
             page_data[-1].get("forms"),
             [
@@ -158,7 +158,7 @@ def test_invalid_ipa(self, mock_node_to_wikitext):
         page_data = [defaultdict(list, {"lang_code": "en", "word": "animal"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("animal")
-        extract_inflection(self.wxr, page_data, node, "ast-accord-mf")
+        extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
             page_data[-1].get("forms"),
             [{"tags": ["Pluriel"], "form": "animales"}],
@@ -180,8 +180,31 @@ def test_no_column_headers(self, mock_node_to_wikitext):
         page_data = [defaultdict(list, {"lang_code": "zh", "word": "一万"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("一万")
-        extract_inflection(self.wxr, page_data, node, "zh-formes")
+        extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
             page_data[-1].get("forms"),
             [{"tags": ["Traditionnel"], "form": "一萬"}],
         )
+
+    @patch(
+        "wikitextprocessor.Wtp.node_to_wikitext",
+        return_value="""{| class="flextable"
+!Cas
+! Singulier
+! Pluriel
+|-
+! Nominatif
+|| <bdi lang="lt" xml:lang="lt" class="lang-lt">[[abadas#lt|abadas]]</bdi>
+|| '''<span lang="lt" xml:lang="lt" class="lang-lt"><bdi>abadai</bdi></span>'''
+|}""",
+    )
+    def test_lt_décl_as(self, mock_node_to_wikitext):
+        # empty table cells should be ignored
+        page_data = [defaultdict(list, {"lang_code": "lt", "word": "abadai"})]
+        node = TemplateNode(0)
+        self.wxr.wtp.start_page("abadai")
+        extract_inflection(self.wxr, page_data, node)
+        self.assertEqual(
+            page_data[-1].get("forms"),
+            [{"tags": ["Singulier", "Nominatif"], "form": "abadas"}],
+        )
diff --git a/wiktextract/extractor/fr/form_line.py b/wiktextract/extractor/fr/form_line.py
index eb20c30b..21278c35 100644
--- a/wiktextract/extractor/fr/form_line.py
+++ b/wiktextract/extractor/fr/form_line.py
@@ -97,6 +97,7 @@ def process_zh_mot_template(
                 {"ipa": clean_node(wxr, None, template_node)}
             )
 
+
 def process_ja_mot_template(
     wxr: WiktextractContext,
     template_node: TemplateNode,
@@ -104,10 +105,12 @@ def process_ja_mot_template(
 ) -> None:
     # Japanese form line template: https://fr.wiktionary.org/wiki/Modèle:ja-mot
     expanded_node = wxr.wtp.parse(
-        wxr.wtp.node_to_wikitext(template_node),
-        expand_all=True
+        wxr.wtp.node_to_wikitext(template_node), expand_all=True
     )
-    existing_forms = {existing_form.get("form") for existing_form in page_data[-1].get("forms", [])}
+    existing_forms = {
+        existing_form.get("form")
+        for existing_form in page_data[-1].get("forms", [])
+    }
     for index, node in expanded_node.find_html("span", with_index=True):
         # the first span tag is the word, the second is Hepburn romanization
         if index == 1:
@@ -115,8 +118,7 @@ def process_ja_mot_template(
             if form_text not in existing_forms:
                 # avoid adding duplicated form data extracted from
                 # inflection table before the form line
-                page_data[-1]["forms"].append({
-                    "form": roman_form,
-                    "tags": ["romanization"]
-                })
+                page_data[-1]["forms"].append(
+                    {"form": form_text, "tags": ["romanization"]}
+                )
             break
diff --git a/wiktextract/extractor/fr/inflection.py b/wiktextract/extractor/fr/inflection.py
index a978118b..8709b39e 100644
--- a/wiktextract/extractor/fr/inflection.py
+++ b/wiktextract/extractor/fr/inflection.py
@@ -2,6 +2,7 @@
 from typing import Dict, List
 
 from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import TemplateNode
 
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
@@ -12,12 +13,11 @@
 def extract_inflection(
     wxr: WiktextractContext,
     page_data: List[Dict],
-    node: WikiNode,
-    template_name: str,
+    template_node: TemplateNode,
 ) -> None:
     # inflection templates
     # https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
-    process_inflection_table(wxr, page_data, node)
+    process_inflection_table(wxr, page_data, template_node)
 
 
 IGNORE_TABLE_HEADERS = {
@@ -43,15 +43,27 @@ def process_inflection_table(
         return
     table_node = table_nodes[0]
     column_headers = []
+    first_row_has_data_cell = False
     for row_num, table_row in enumerate(
         table_node.find_child(NodeKind.TABLE_ROW)
     ):
-        table_row_nodes = list(table_row.filter_empty_str_child())
-        first_row_has_data_cell = False
+        # filter empty table cells
+        table_row_nodes = [
+            row_node_child
+            for row_node_child in table_row.children
+            if isinstance(row_node_child, WikiNode)
+            and (
+                row_node_child.kind == NodeKind.TABLE_HEADER_CELL
+                or (
+                    row_node_child.kind == NodeKind.TABLE_CELL
+                    and len(row_node_child.children) > 0
+                )
+            )
+            and row_node_child.attrs.get("style") != "display:none"
+        ]
         if row_num == 0:
-            first_row_has_data_cell = not any(
-                isinstance(cell, WikiNode)
-                and cell.kind == NodeKind.TABLE_CELL
+            first_row_has_data_cell = any(
+                isinstance(cell, WikiNode) and cell.kind == NodeKind.TABLE_CELL
                 for cell in table_row_nodes
             )
 
@@ -59,24 +71,19 @@ def process_inflection_table(
             # data row has one more column then header: "fr-accord-al" template
             column_headers.insert(0, "")
 
-        row_header = ""
+        row_headers = []
         for column_num, table_cell in enumerate(table_row_nodes):
             form_data = defaultdict(list)
             if isinstance(table_cell, WikiNode):
                 if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                     table_header_text = clean_node(wxr, None, table_cell)
-                    if row_num == 0 and first_row_has_data_cell:
+                    if row_num == 0 and not first_row_has_data_cell:
                         # if cells of the first row are not all header cells
                         # then the header cells are row headers but not column
                         # headers
                         column_headers.append(table_header_text)
-                    elif (
-                        column_num == 0
-                        and table_header_text not in IGNORE_TABLE_HEADERS
-                    ):
-                        row_header = table_header_text
                     elif table_header_text not in IGNORE_TABLE_HEADERS:
-                        form_data["tags"].append(table_header_text)
+                        row_headers.append(table_header_text)
                 elif table_cell.kind == NodeKind.TABLE_CELL:
                     table_cell_lines = clean_node(wxr, None, table_cell)
                     for table_cell_line in table_cell_lines.splitlines():
@@ -94,7 +101,7 @@ def process_inflection_table(
                     ):
                         form_data["tags"].append(column_headers[column_num])
 
-            if len(row_header) > 0:
-                form_data["tags"].append(row_header)
-            if "form" in form_data:
-                page_data[-1]["forms"].append(form_data)
+                    if len(row_headers) > 0:
+                        form_data["tags"].extend(row_headers)
+                    if "form" in form_data:
+                        page_data[-1]["forms"].append(form_data)
diff --git a/wiktextract/extractor/fr/page.py b/wiktextract/extractor/fr/page.py
index a11558c1..d968e3f2 100644
--- a/wiktextract/extractor/fr/page.py
+++ b/wiktextract/extractor/fr/page.py
@@ -124,7 +124,7 @@ def process_pos_block(
                     # skip form line templates
                     continue
                 elif template_name.startswith(f"{lang_code}-"):
-                    extract_inflection(wxr, page_data, child, template_name)
+                    extract_inflection(wxr, page_data, child)
             elif child.kind == NodeKind.BOLD:
                 form_line_start = index + 1
             elif child.kind == NodeKind.LIST:

From 489751567791db342212b4699d1d6452c31af4f6 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Wed, 27 Sep 2023 18:08:38 +0800
Subject: [PATCH 3/3] Extract inflection table template "fr-accord-s"

This template creates a table cell with the "invisible" class, which should
be ignored.
---
 tests/test_fr_inflection.py            | 48 +++++++++++++++++++++++++-
 wiktextract/extractor/fr/inflection.py |  4 ++-
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py
index d68b328d..bbb9dadb 100644
--- a/tests/test_fr_inflection.py
+++ b/tests/test_fr_inflection.py
@@ -64,7 +64,6 @@ def test_fr_reg(self, mock_node_to_wikitext):
     )
     def test_fr_accord_al(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/animal#Adjectif
-        self.maxDiff = None
         page_data = [defaultdict(list, {"word": "animal", "lang_code": "fr"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("animal")
@@ -101,6 +100,7 @@ def test_fr_accord_al(self, mock_node_to_wikitext):
     )
     def test_multiple_lines_ipa(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/ration#Nom_commun_2
+        # template "en-nom-rég"
         page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("ration")
@@ -128,6 +128,7 @@ def test_multiple_lines_ipa(self, mock_node_to_wikitext):
     )
     def test_single_line_multiple_ipa(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/ration#Verbe
+        # template "en-conj-rég"
         page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("ration")
@@ -155,6 +156,7 @@ def test_single_line_multiple_ipa(self, mock_node_to_wikitext):
     )
     def test_invalid_ipa(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/animal#Nom_commun_3
+        # template "ast-accord-mf"
         page_data = [defaultdict(list, {"lang_code": "en", "word": "animal"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("animal")
@@ -177,6 +179,7 @@ def test_invalid_ipa(self, mock_node_to_wikitext):
     )
     def test_no_column_headers(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/一万#Nom_commun
+        # template "zh-formes"
         page_data = [defaultdict(list, {"lang_code": "zh", "word": "一万"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("一万")
@@ -208,3 +211,46 @@ def test_lt_décl_as(self, mock_node_to_wikitext):
             page_data[-1].get("forms"),
             [{"tags": ["Singulier", "Nominatif"], "form": "abadas"}],
         )
+
+    @patch(
+        "wikitextprocessor.Wtp.node_to_wikitext",
+        return_value="""{|class="flextable flextable-fr-mfsp"
+
+|-
+| class="invisible" |
+! scope="col" | Singulier
+! scope="col" | Pluriel
+|- class="flextable-fr-m"
+! scope="row" | Masculin
+|colspan="2"| [[aastais]]<br
+/>[[Annexe:Prononciation/français|<span>\\a.a.stɛ\\</span>]]
+
+|- class="flextable-fr-f"
+! scope="row" | Féminin
+| [[aastaise]]<br
+/>[[Annexe:Prononciation/français|<span>\\a.a.stɛz\\</span>]]
+| [[aastaises]]<br
+/>[[Annexe:Prononciation/français|<span>\\a.a.stɛz\\</span>]]
+|}""",
+    )
+    def test_fr_accord_s(self, mock_node_to_wikitext):
+        # https://fr.wiktionary.org/wiki/aastais
+        page_data = [defaultdict(list, {"lang_code": "fr", "word": "aastais"})]
+        node = TemplateNode(0)
+        self.wxr.wtp.start_page("aastais")
+        extract_inflection(self.wxr, page_data, node)
+        self.assertEqual(
+            page_data[-1].get("forms"),
+            [
+                {
+                    "tags": ["Singulier", "Féminin"],
+                    "form": "aastaise",
+                    "ipa": "\\a.a.stɛz\\",
+                },
+                {
+                    "tags": ["Pluriel", "Féminin"],
+                    "form": "aastaises",
+                    "ipa": "\\a.a.stɛz\\",
+                },
+            ],
+        )
diff --git a/wiktextract/extractor/fr/inflection.py b/wiktextract/extractor/fr/inflection.py
index 8709b39e..8e180949 100644
--- a/wiktextract/extractor/fr/inflection.py
+++ b/wiktextract/extractor/fr/inflection.py
@@ -63,7 +63,9 @@ def process_inflection_table(
         ]
         if row_num == 0:
             first_row_has_data_cell = any(
-                isinstance(cell, WikiNode) and cell.kind == NodeKind.TABLE_CELL
+                isinstance(cell, WikiNode)
+                and cell.kind == NodeKind.TABLE_CELL
+                and "invisible" not in cell.attrs.get("class", "")
                 for cell in table_row_nodes
             )