diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py
index fd8175d3..abb1042f 100644
--- a/tests/test_fr_inflection.py
+++ b/tests/test_fr_inflection.py
@@ -302,3 +302,49 @@ def test_fr_accord_personne(self, mock_node_to_wikitext):
},
],
)
+
+ @patch(  # the table wikitext below replaces Wtp.node_to_wikitext output
+ "wikitextprocessor.Wtp.node_to_wikitext",
+ return_value="""{| class="flextable"
+! masculin
+! colspan=2 | Singulier
+! colspan=2 | Pluriel
+|-
+! cas || non articulé || articulé || non articulé || articulé
+|-
+! Nominatif
Accusatif
+| [[fenil#ro-nom|fenil]]
+| [[fenilul#ro-nom|fenilul]]
+| [[fenili#ro-nom|fenili]]
+| [[fenilii#ro-nom|fenilii]]
+|-
+! Vocatif
+| colspan=2| [[fenilule#ro-nom|fenilule]]
+| colspan=2| [[fenililor#ro-nom|fenililor]]
+|}""",
+ )
+ def test_ro_nom_tab(self, mock_node_to_wikitext):
+ # https://fr.wiktionary.org/wiki/fenil#Nom_commun_4
+ page_data = [defaultdict(list, {"word": "fenil"})]
+ node = TemplateNode(0)  # node_to_wikitext is mocked
+ self.wxr.wtp.start_page("fenil")
+ extract_inflection(self.wxr, page_data, node)
+ self.assertEqual(
+ page_data[-1].get("forms"),
+ [  # "fenil" itself is not expected among the forms
+ {
+ "form": "fenilul",
+ "tags": ["Singulier", "articulé", "Nominatif Accusatif"],
+ },
+ {
+ "form": "fenili",
+ "tags": ["Pluriel", "non articulé", "Nominatif Accusatif"],
+ },
+ {
+ "form": "fenilii",
+ "tags": ["Pluriel", "articulé", "Nominatif Accusatif"],
+ },
+ {"form": "fenilule", "tags": ["Singulier", "Vocatif"]},
+ {"form": "fenililor", "tags": ["Pluriel", "Vocatif"]},
+ ],
+ )
diff --git a/wiktextract/extractor/fr/inflection.py b/wiktextract/extractor/fr/inflection.py
index 26c9b66f..c9ee14e3 100644
--- a/wiktextract/extractor/fr/inflection.py
+++ b/wiktextract/extractor/fr/inflection.py
@@ -1,4 +1,5 @@
from collections import defaultdict, deque
+from dataclasses import dataclass
from typing import Dict, List
from wikitextprocessor import NodeKind, WikiNode
@@ -20,16 +21,26 @@ def extract_inflection(
process_inflection_table(wxr, page_data, template_node)
-IGNORE_TABLE_HEADERS = {
- "Terme", # https://fr.wiktionary.org/wiki/Modèle:de-adj
- "Forme", # br-flex-adj
- "Temps", # en-conj-rég,
- "Cas", # lt_décl_as
-}
-IGNORE_TABLE_CELL = {
- "Déclinaisons", # de-adj
- "—", # https://fr.wiktionary.org/wiki/Modèle:vls-nom
-}
+IGNORE_TABLE_HEADERS = frozenset(  # lower-case entries; lookups use .lower()
+ {
+ "terme", # https://fr.wiktionary.org/wiki/Modèle:de-adj
+ "forme", # br-flex-adj
+ "temps", # en-conj-rég
+ "cas", # lt_décl_as, ro-nom-tab (lower-case "cas")
+ }
+)
+IGNORE_TABLE_CELL = frozenset(  # exact cell texts that are not stored as forms
+ {
+ "Déclinaisons", # de-adj
+ "—", # https://fr.wiktionary.org/wiki/Modèle:vls-nom
+ }
+)
+
+@dataclass
+class ColspanHeader:  # a column header cell that carries a colspan attribute
+ text: str  # cleaned header text, newlines replaced by spaces
+ index: int  # column_cell_index at which the header starts
+ span: int  # colspan value; covers columns index .. index + span - 1
def process_inflection_table(
@@ -46,7 +57,7 @@ def process_inflection_table(
table_node = table_nodes[0]
column_headers = []
rowspan_headers = deque()
- first_row_has_data_cell = False
+ colspan_headers = []
for row_num, table_row in enumerate(
table_node.find_child(NodeKind.TABLE_ROW)
):
@@ -64,13 +75,12 @@ def process_inflection_table(
)
and row_node_child.attrs.get("style") != "display:none"
]
- if row_num == 0:
- first_row_has_data_cell = any(
- isinstance(cell, WikiNode)
- and cell.kind == NodeKind.TABLE_CELL
- and "invisible" not in cell.attrs.get("class", "")
- for cell in table_row_nodes
- )
+ current_row_has_data_cell = any(
+ isinstance(cell, WikiNode)
+ and cell.kind == NodeKind.TABLE_CELL
+ and "invisible" not in cell.attrs.get("class", "")
+ for cell in table_row_nodes
+ )
row_headers = []
for index, (rowspan_text, rowspan_count) in enumerate(
rowspan_headers.copy()
@@ -86,14 +96,34 @@ def process_inflection_table(
form_data = defaultdict(list)
if isinstance(table_cell, WikiNode):
if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
- table_header_text = clean_node(wxr, None, table_cell)
- if table_header_text in IGNORE_TABLE_HEADERS:
+ if any(
+ table_cell.find_html(
+ "span",
+ attr_name="class",
+ attr_value="ligne-de-forme",
+ )
+ ):
+ # ignore gender header in template "ro-nom-tab"
+ continue
+ table_header_text = clean_node(
+ wxr, None, table_cell
+ ).replace("\n", " ")
+ if table_header_text.lower() in IGNORE_TABLE_HEADERS:
continue
- elif row_num == 0 and not first_row_has_data_cell:
- # if cells of the first row are not all header cells
- # then the header cells are row headers but not column
- # headers
- column_headers.append(table_header_text)
+ if not current_row_has_data_cell:
+ # if all cells of the row are header cells
+ # then the header cells are column headers
+ if "colspan" in table_cell.attrs:
+ colspan_headers.append(
+ ColspanHeader(
+ table_header_text,
+ column_cell_index,
+ int(table_cell.attrs.get("colspan")),
+ )
+ )
+ else:
+ column_headers.append(table_header_text)
+ column_cell_index += int(table_cell.attrs.get("colspan", 1))
elif row_num > 0:
row_headers.append(table_header_text)
if "rowspan" in table_cell.attrs:
@@ -113,9 +143,17 @@ def process_inflection_table(
and table_cell_line not in IGNORE_TABLE_CELL
):
form_data["form"] = table_cell_line
+ for colspan_header in colspan_headers:
+ if (
+ column_cell_index >= colspan_header.index
+ and column_cell_index
+ < colspan_header.index + colspan_header.span
+ ):
+ form_data["tags"].append(colspan_header.text)
if (
- len(column_headers) > column_cell_index
- and column_headers[column_cell_index]
+ "colspan" not in table_cell.attrs
+ and len(column_headers) > column_cell_index
+ and column_headers[column_cell_index].lower()
not in IGNORE_TABLE_HEADERS
):
form_data["tags"].append(
@@ -126,4 +164,5 @@ def process_inflection_table(
form_data["tags"].extend(row_headers)
if "form" in form_data:
page_data[-1]["forms"].append(form_data)
- column_cell_index += 1
+
+ column_cell_index += int(table_cell.attrs.get("colspan", 1))