From 9144898c2b96e81549aaa594ea39e702152280c2 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 10 Oct 2023 18:06:52 +0800 Subject: [PATCH] Extract inflection table that has column with `colspan` attr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Example page: https://fr.wiktionary.org/wiki/fenil#Nom_commun_4 Template: https://fr.wiktionary.org/wiki/Modèle:ro-nom-tab --- tests/test_fr_inflection.py | 46 +++++++++++++ wiktextract/extractor/fr/inflection.py | 95 ++++++++++++++++++-------- 2 files changed, 113 insertions(+), 28 deletions(-) diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py index fd8175d3..abb1042f 100644 --- a/tests/test_fr_inflection.py +++ b/tests/test_fr_inflection.py @@ -302,3 +302,49 @@ def test_fr_accord_personne(self, mock_node_to_wikitext): }, ], ) + + @patch( + "wikitextprocessor.Wtp.node_to_wikitext", + return_value="""{| class="flextable" +! masculin +! colspan=2 | Singulier +! colspan=2 | Pluriel +|- +! cas || non articulé || articulé || non articulé || articulé +|- +! Nominatif
Accusatif +| [[fenil#ro-nom|fenil]] +| [[fenilul#ro-nom|fenilul]] +| [[fenili#ro-nom|fenili]] +| [[fenilii#ro-nom|fenilii]] +|- +! Vocatif +| colspan=2| [[fenilule#ro-nom|fenilule]] +| colspan=2| [[fenililor#ro-nom|fenililor]] +|}""", + ) + def test_ro_nom_tab(self, mock_node_to_wikitext): + # https://fr.wiktionary.org/wiki/fenil#Nom_commun_4 + page_data = [defaultdict(list, {"word": "fenil"})] + node = TemplateNode(0) + self.wxr.wtp.start_page("fenil") + extract_inflection(self.wxr, page_data, node) + self.assertEqual( + page_data[-1].get("forms"), + [ + { + "form": "fenilul", + "tags": ["Singulier", "articulé", "Nominatif Accusatif"], + }, + { + "form": "fenili", + "tags": ["Pluriel", "non articulé", "Nominatif Accusatif"], + }, + { + "form": "fenilii", + "tags": ["Pluriel", "articulé", "Nominatif Accusatif"], + }, + {"form": "fenilule", "tags": ["Singulier", "Vocatif"]}, + {"form": "fenililor", "tags": ["Pluriel", "Vocatif"]}, + ], + ) diff --git a/wiktextract/extractor/fr/inflection.py b/wiktextract/extractor/fr/inflection.py index 26c9b66f..c9ee14e3 100644 --- a/wiktextract/extractor/fr/inflection.py +++ b/wiktextract/extractor/fr/inflection.py @@ -1,4 +1,5 @@ from collections import defaultdict, deque +from dataclasses import dataclass from typing import Dict, List from wikitextprocessor import NodeKind, WikiNode @@ -20,16 +21,26 @@ def extract_inflection( process_inflection_table(wxr, page_data, template_node) -IGNORE_TABLE_HEADERS = { - "Terme", # https://fr.wiktionary.org/wiki/Modèle:de-adj - "Forme", # br-flex-adj - "Temps", # en-conj-rég, - "Cas", # lt_décl_as -} -IGNORE_TABLE_CELL = { - "Déclinaisons", # de-adj - "—", # https://fr.wiktionary.org/wiki/Modèle:vls-nom -} +IGNORE_TABLE_HEADERS = frozenset( + { + "terme", # https://fr.wiktionary.org/wiki/Modèle:de-adj + "forme", # br-flex-adj + "temps", # en-conj-rég, + "cas", # lt_décl_as, ro-nom-tab(lower case) + } +) +IGNORE_TABLE_CELL = frozenset( + { + "Déclinaisons", # de-adj + "—", # https://fr.wiktionary.org/wiki/Modèle:vls-nom + } +) + +@dataclass +class ColspanHeader: + text: str + index: int + span: int def process_inflection_table( @@ -46,7 +57,7 @@ def process_inflection_table( table_node = table_nodes[0] column_headers = [] rowspan_headers = deque() - first_row_has_data_cell = False + colspan_headers = [] for row_num, table_row in enumerate( table_node.find_child(NodeKind.TABLE_ROW) ): @@ -64,13 +75,12 @@ def process_inflection_table( ) and row_node_child.attrs.get("style") != "display:none" ] - if row_num == 0: - first_row_has_data_cell = any( - isinstance(cell, WikiNode) - and cell.kind == NodeKind.TABLE_CELL - and "invisible" not in cell.attrs.get("class", "") - for cell in table_row_nodes - ) + current_row_has_data_cell = any( + isinstance(cell, WikiNode) + and cell.kind == NodeKind.TABLE_CELL + and "invisible" not in cell.attrs.get("class", "") + for cell in table_row_nodes + ) row_headers = [] for index, (rowspan_text, rowspan_count) in enumerate( rowspan_headers.copy() @@ -86,14 +96,34 @@ def process_inflection_table( form_data = defaultdict(list) if isinstance(table_cell, WikiNode): if table_cell.kind == NodeKind.TABLE_HEADER_CELL: - table_header_text = clean_node(wxr, None, table_cell) - if table_header_text in IGNORE_TABLE_HEADERS: + if any( + table_cell.find_html( + "span", + attr_name="class", + attr_value="ligne-de-forme", + ) + ): + # ignore gender header in template "ro-nom-tab" + continue + table_header_text = clean_node( + wxr, None, table_cell + ).replace("\n", " ") + if table_header_text.lower() in IGNORE_TABLE_HEADERS: continue - elif row_num == 0 and not first_row_has_data_cell: - # if cells of the first row are not all header cells - # then the header cells are row headers but not column - # headers - column_headers.append(table_header_text) + if not current_row_has_data_cell: + # if all cells of the row are header cells + # then the header cells are column headers + if "colspan" in table_cell.attrs: + colspan_headers.append( + ColspanHeader( + table_header_text, + column_cell_index, + int(table_cell.attrs.get("colspan")), + ) + ) + else: + column_headers.append(table_header_text) + column_cell_index += int(table_cell.attrs.get("colspan", 1)) elif row_num > 0: row_headers.append(table_header_text) if "rowspan" in table_cell.attrs: @@ -113,9 +143,17 @@ def process_inflection_table( and table_cell_line not in IGNORE_TABLE_CELL ): form_data["form"] = table_cell_line + for colspan_header in colspan_headers: + if ( + column_cell_index >= colspan_header.index + and column_cell_index + < colspan_header.index + colspan_header.span + ): + form_data["tags"].append(colspan_header.text) if ( - len(column_headers) > column_cell_index - and column_headers[column_cell_index] + "colspan" not in table_cell.attrs + and len(column_headers) > column_cell_index + and column_headers[column_cell_index].lower() not in IGNORE_TABLE_HEADERS ): form_data["tags"].append( @@ -126,4 +164,5 @@ def process_inflection_table( form_data["tags"].extend(row_headers) if "form" in form_data: page_data[-1]["forms"].append(form_data) - column_cell_index += 1 + + column_cell_index += int(table_cell.attrs.get("colspan", 1))