tatuylonen · xxyzz · Sep 27, 2023 · Sep 27, 2023 · Sep 27, 2023 · Sep 27, 2023
diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py
@@ -40,7 +40,7 @@ def test_fr_reg(self, mock_node_to_wikitext):
         page_data = [defaultdict(list, {"word": "productrice"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("productrice")
-        extract_inflection(self.wxr, page_data, node, "fr-rég")
+        extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
             page_data[-1].get("forms"),
             [{"form": "productrices", "tags": ["Pluriel"]}],
@@ -64,11 +64,10 @@ def test_fr_reg(self, mock_node_to_wikitext):
     )
     def test_fr_accord_al(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/animal#Adjectif
-        self.maxDiff = None
         page_data = [defaultdict(list, {"word": "animal", "lang_code": "fr"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("animal")
-        extract_inflection(self.wxr, page_data, node, "fr-accord-al")
+        extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
             page_data[-1].get("forms"),
             [
@@ -101,10 +100,11 @@ def test_fr_accord_al(self, mock_node_to_wikitext):
     )
     def test_multiple_lines_ipa(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/ration#Nom_commun_2
+        # template "en-nom-rég"
         page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("ration")
-        extract_inflection(self.wxr, page_data, node, "en-nom-rég")
+        extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
             page_data[-1].get("forms"),
             [
@@ -128,10 +128,11 @@ def test_multiple_lines_ipa(self, mock_node_to_wikitext):
     )
     def test_single_line_multiple_ipa(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/ration#Verbe
+        # template "en-conj-rég"
         page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("ration")
-        extract_inflection(self.wxr, page_data, node, "en-conj-rég")
+        extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
             page_data[-1].get("forms"),
             [
@@ -155,10 +156,11 @@ def test_single_line_multiple_ipa(self, mock_node_to_wikitext):
     )
     def test_invalid_ipa(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/animal#Nom_commun_3
+        # template "ast-accord-mf"
         page_data = [defaultdict(list, {"lang_code": "en", "word": "animal"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("animal")
-        extract_inflection(self.wxr, page_data, node, "ast-accord-mf")
+        extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
             page_data[-1].get("forms"),
             [{"tags": ["Pluriel"], "form": "animales"}],
@@ -177,11 +179,78 @@ def test_invalid_ipa(self, mock_node_to_wikitext):
     )
     def test_no_column_headers(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/一万#Nom_commun
+        # template "zh-formes"
         page_data = [defaultdict(list, {"lang_code": "zh", "word": "一万"})]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("一万")
-        extract_inflection(self.wxr, page_data, node, "zh-formes")
+        extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
             page_data[-1].get("forms"),
             [{"tags": ["Traditionnel"], "form": "一萬"}],
         )
+
+    @patch(
+        "wikitextprocessor.Wtp.node_to_wikitext",
+        return_value="""{| class="flextable"
+!Cas
+! Singulier
+! Pluriel
+|-
+! Nominatif
+|| <bdi lang="lt" xml:lang="lt" class="lang-lt">[[abadas#lt|abadas]]</bdi>
+|| '''<span lang="lt" xml:lang="lt" class="lang-lt"><bdi>abadai</bdi></span>'''
+|}""",
+    )
+    def test_lt_décl_as(self, mock_node_to_wikitext):
+        # empty table cells should be ignored; only "abadas" is expected
+        page_data = [defaultdict(list, {"lang_code": "lt", "word": "abadai"})]
+        node = TemplateNode(0)
+        self.wxr.wtp.start_page("abadai")
+        extract_inflection(self.wxr, page_data, node)
+        self.assertEqual(
+            page_data[-1].get("forms"),
+            [{"tags": ["Singulier", "Nominatif"], "form": "abadas"}],
+        )
+
+    @patch(
+        "wikitextprocessor.Wtp.node_to_wikitext",
+        return_value="""{|class="flextable flextable-fr-mfsp"
+
+|-
+| class="invisible" |
+! scope="col" | Singulier
+! scope="col" | Pluriel
+|- class="flextable-fr-m"
+! scope="row" | Masculin
+|colspan="2"| [[aastais]]<br
+/>[[Annexe:Prononciation/français|<span>\\a.a.stɛ\\</span>]]
+
+|- class="flextable-fr-f"
+! scope="row" | Féminin
+| [[aastaise]]<br
+/>[[Annexe:Prononciation/français|<span>\\a.a.stɛz\\</span>]]
+| [[aastaises]]<br
+/>[[Annexe:Prononciation/français|<span>\\a.a.stɛz\\</span>]]
+|}""",
+    )
+    def test_fr_accord_s(self, mock_node_to_wikitext):
+        # https://fr.wiktionary.org/wiki/aastais
+        page_data = [defaultdict(list, {"lang_code": "fr", "word": "aastais"})]
+        node = TemplateNode(0)
+        self.wxr.wtp.start_page("aastais")
+        extract_inflection(self.wxr, page_data, node)
+        self.assertEqual(
+            page_data[-1].get("forms"),
+            [
+                {
+                    "tags": ["Singulier", "Féminin"],
+                    "form": "aastaise",
+                    "ipa": "\\a.a.stɛz\\",
+                },
+                {
+                    "tags": ["Pluriel", "Féminin"],
+                    "form": "aastaises",
+                    "ipa": "\\a.a.stɛz\\",
+                },
+            ],
+        )
diff --git a/wiktextract/extractor/fr/form_line.py b/wiktextract/extractor/fr/form_line.py
@@ -35,6 +35,8 @@ def extract_form_line(
                 process_equiv_pour_template(wxr, node, page_data)
             elif node.template_name.startswith("zh-mot"):
                 process_zh_mot_template(wxr, node, page_data)
+            elif node.template_name == "ja-mot":
+                process_ja_mot_template(wxr, node, page_data)
             else:
                 tag = clean_node(wxr, page_data[-1], node)
                 if (
@@ -75,22 +77,48 @@ def process_zh_mot_template(
     node: TemplateNode,
     page_data: List[Dict],
 ) -> None:
-    # zh-mot, zh-mot-s, zh-mot-t
+    # Chinese form line template: zh-mot, zh-mot-s, zh-mot-t
     # https://fr.wiktionary.org/wiki/Modèle:zh-mot
     node = wxr.wtp.parse(
         wxr.wtp.node_to_wikitext(node),
         pre_expand=True,
         additional_expand={node.template_name},
     )
     for template_node in node.find_child(NodeKind.TEMPLATE):
-        if template_node.template_name == "lang":
+        if template_node.template_name.lower() == "lang":
             page_data[-1]["sounds"].append(
                 {
                     "zh-pron": clean_node(wxr, None, template_node),
                     "tags": ["Pinyin"],
                 }
             )
-        elif template_node.template_name == "pron":
+        elif template_node.template_name in ("pron", "prononciation"):
             page_data[-1]["sounds"].append(
                 {"ipa": clean_node(wxr, None, template_node)}
             )
+
+
+def process_ja_mot_template(
+    wxr: WiktextractContext,
+    template_node: TemplateNode,
+    page_data: List[Dict],
+) -> None:
+    """Append the romanization from a "ja-mot" form line to the last entry."""
+    # Japanese form line template: https://fr.wiktionary.org/wiki/Modèle:ja-mot
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(template_node), expand_all=True
+    )
+    existing_forms = {
+        existing_form.get("form")
+        for existing_form in page_data[-1].get("forms", [])
+    }
+    for index, node in expanded_node.find_html("span", with_index=True):
+        # the first span tag is the word, the second is Hepburn romanization
+        if index == 1:
+            roman_form = clean_node(wxr, None, node)
+            if roman_form not in existing_forms:
+                # skip forms already extracted from the inflection table
+                page_data[-1]["forms"].append(
+                    {"form": roman_form, "tags": ["romanization"]}
+                )
+            break
diff --git a/wiktextract/extractor/fr/inflection.py b/wiktextract/extractor/fr/inflection.py
@@ -2,6 +2,7 @@
 from typing import Dict, List
 
 from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import TemplateNode
 
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
@@ -12,12 +13,11 @@
 def extract_inflection(
     wxr: WiktextractContext,
     page_data: List[Dict],
-    node: WikiNode,
-    template_name: str,
+    template_node: TemplateNode,
 ) -> None:
     # inflection templates
     # https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
-    process_inflection_table(wxr, page_data, node)
+    process_inflection_table(wxr, page_data, template_node)
 
 
 IGNORE_TABLE_HEADERS = {
@@ -43,40 +43,49 @@ def process_inflection_table(
         return
     table_node = table_nodes[0]
     column_headers = []
+    first_row_has_data_cell = False
     for row_num, table_row in enumerate(
         table_node.find_child(NodeKind.TABLE_ROW)
     ):
-        table_row_nodes = list(table_row.filter_empty_str_child())
-        first_row_has_data_cell = False
+        # filter empty table cells
+        table_row_nodes = [
+            row_node_child
+            for row_node_child in table_row.children
+            if isinstance(row_node_child, WikiNode)
+            and (
+                row_node_child.kind == NodeKind.TABLE_HEADER_CELL
+                or (
+                    row_node_child.kind == NodeKind.TABLE_CELL
+                    and len(row_node_child.children) > 0
+                )
+            )
+            and row_node_child.attrs.get("style") != "display:none"
+        ]
         if row_num == 0:
-            first_row_has_data_cell = not any(
+            first_row_has_data_cell = any(
                 isinstance(cell, WikiNode)
                 and cell.kind == NodeKind.TABLE_CELL
+                and "invisible" not in cell.attrs.get("class", "")
                 for cell in table_row_nodes
             )
 
         if row_num != 0 and len(table_row_nodes) == len(column_headers) + 1:
             # data row has one more column then header: "fr-accord-al" template
             column_headers.insert(0, "")
 
-        row_header = ""
+        row_headers = []
         for column_num, table_cell in enumerate(table_row_nodes):
             form_data = defaultdict(list)
             if isinstance(table_cell, WikiNode):
                 if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                     table_header_text = clean_node(wxr, None, table_cell)
-                    if row_num == 0 and first_row_has_data_cell:
+                    if row_num == 0 and not first_row_has_data_cell:
                         # if cells of the first row are not all header cells
                         # then the header cells are row headers but not column
                         # headers
                         column_headers.append(table_header_text)
-                    elif (
-                        column_num == 0
-                        and table_header_text not in IGNORE_TABLE_HEADERS
-                    ):
-                        row_header = table_header_text
                     elif table_header_text not in IGNORE_TABLE_HEADERS:
-                        form_data["tags"].append(table_header_text)
+                        row_headers.append(table_header_text)
                 elif table_cell.kind == NodeKind.TABLE_CELL:
                     table_cell_lines = clean_node(wxr, None, table_cell)
                     for table_cell_line in table_cell_lines.splitlines():
@@ -94,7 +103,7 @@ def process_inflection_table(
                     ):
                         form_data["tags"].append(column_headers[column_num])
 
-            if len(row_header) > 0:
-                form_data["tags"].append(row_header)
-            if "form" in form_data:
-                page_data[-1]["forms"].append(form_data)
+                    if len(row_headers) > 0:
+                        form_data["tags"].extend(row_headers)
+                    if "form" in form_data:
+                        page_data[-1]["forms"].append(form_data)
diff --git a/wiktextract/extractor/fr/page.py b/wiktextract/extractor/fr/page.py
@@ -124,7 +124,7 @@ def process_pos_block(
                     # skip form line templates
                     continue
                 elif template_name.startswith(f"{lang_code}-"):
-                    extract_inflection(wxr, page_data, child, template_name)
+                    extract_inflection(wxr, page_data, child)
             elif child.kind == NodeKind.BOLD:
                 form_line_start = index + 1
             elif child.kind == NodeKind.LIST: