Skip to content

Commit

Permalink
Merge pull request #357 from xxyzz/fr
Browse files Browse the repository at this point in the history
Extract inflection table that has column with `colspan` attr
  • Loading branch information
xxyzz authored Oct 10, 2023
2 parents b4a54f7 + 9144898 commit 8057bec
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 28 deletions.
46 changes: 46 additions & 0 deletions tests/test_fr_inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,3 +302,49 @@ def test_fr_accord_personne(self, mock_node_to_wikitext):
},
],
)

@patch(
    "wikitextprocessor.Wtp.node_to_wikitext",
    return_value="""{| class="flextable"
! <span class="ligne-de-forme" ><i>masculin</i></span>
! colspan=2 | Singulier
! colspan=2 | Pluriel
|-
! cas || non articulé || articulé || non articulé || articulé
|-
! Nominatif<br />Accusatif
| <bdi lang="ro" xml:lang="ro" class="lang-ro">[[fenil#ro-nom|fenil]]</bdi>
| <bdi lang="ro" xml:lang="ro" class="lang-ro">[[fenilul#ro-nom|fenilul]]</bdi>
| <bdi lang="ro" xml:lang="ro" class="lang-ro">[[fenili#ro-nom|fenili]]</bdi>
| <bdi lang="ro" xml:lang="ro" class="lang-ro">[[fenilii#ro-nom|fenilii]]</bdi>
|-
! Vocatif
| colspan=2| <bdi lang="ro" xml:lang="ro" class="lang-ro">[[fenilule#ro-nom|fenilule]]</bdi>
| colspan=2| <bdi lang="ro" xml:lang="ro" class="lang-ro">[[fenililor#ro-nom|fenililor]]</bdi>
|}""",
)
def test_ro_nom_tab(self, mock_node_to_wikitext):
    """Inflection table whose header and data cells use ``colspan``.

    The mocked wikitext is a table with two header rows: a first row
    holding a gender cell plus two ``colspan=2`` headers ("Singulier",
    "Pluriel"), and a second row of per-column headers starting with
    "cas".  The expected forms show that:

    * neither the gender header ("masculin") nor "cas" appears in any
      tag list;
    * every form is tagged with the spanning header above it, then the
      per-column header (if any), then the row header;
    * data cells that themselves carry ``colspan=2`` (the "Vocatif" row)
      get only the spanning header and the row header;
    * the ``<br />`` inside the row header ends up as a single space.

    No entry is expected for the cell "fenil", which is identical to the
    page word.
    """
    # https://fr.wiktionary.org/wiki/fenil#Nom_commun_4
    page_data = [defaultdict(list, {"word": "fenil"})]
    node = TemplateNode(0)
    self.wxr.wtp.start_page("fenil")
    extract_inflection(self.wxr, page_data, node)
    self.assertEqual(
        page_data[-1].get("forms"),
        [
            {
                "form": "fenilul",
                "tags": ["Singulier", "articulé", "Nominatif Accusatif"],
            },
            {
                "form": "fenili",
                "tags": ["Pluriel", "non articulé", "Nominatif Accusatif"],
            },
            {
                "form": "fenilii",
                "tags": ["Pluriel", "articulé", "Nominatif Accusatif"],
            },
            {"form": "fenilule", "tags": ["Singulier", "Vocatif"]},
            {"form": "fenililor", "tags": ["Pluriel", "Vocatif"]},
        ],
    )
95 changes: 67 additions & 28 deletions wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import defaultdict, deque
from dataclasses import dataclass
from typing import Dict, List

from wikitextprocessor import NodeKind, WikiNode
Expand All @@ -20,16 +21,26 @@ def extract_inflection(
process_inflection_table(wxr, page_data, template_node)


IGNORE_TABLE_HEADERS = {
"Terme", # https://fr.wiktionary.org/wiki/Modèle:de-adj
"Forme", # br-flex-adj
"Temps", # en-conj-rég,
"Cas", # lt_décl_as
}
IGNORE_TABLE_CELL = {
"Déclinaisons", # de-adj
"—", # https://fr.wiktionary.org/wiki/Modèle:vls-nom
}
# Header cell texts that are never turned into tags.  Entries are stored
# in lower case because header text is lower-cased before the lookup.
IGNORE_TABLE_HEADERS = frozenset(
    (
        "terme",  # https://fr.wiktionary.org/wiki/Modèle:de-adj
        "forme",  # br-flex-adj
        "temps",  # en-conj-rég
        "cas",  # lt_décl_as, ro-nom-tab (lower case)
    )
)
# Data cell texts that are never recorded as a form.
IGNORE_TABLE_CELL = frozenset(
    (
        "Déclinaisons",  # de-adj
        "—",  # https://fr.wiktionary.org/wiki/Modèle:vls-nom
    )
)

@dataclass
class ColspanHeader:
    """A column header cell that covers several columns via ``colspan``.

    Instances are created for header cells carrying a ``colspan``
    attribute in rows made up only of header cells.  A data cell whose
    column index ``i`` satisfies ``index <= i < index + span`` receives
    ``text`` as one of its tags.
    """

    # Cleaned text of the header cell; used verbatim as a tag.
    text: str
    # Column index at which the header cell starts.
    index: int
    # Number of columns covered, i.e. the integer value of ``colspan``.
    span: int


def process_inflection_table(
Expand All @@ -46,7 +57,7 @@ def process_inflection_table(
table_node = table_nodes[0]
column_headers = []
rowspan_headers = deque()
first_row_has_data_cell = False
colspan_headers = []
for row_num, table_row in enumerate(
table_node.find_child(NodeKind.TABLE_ROW)
):
Expand All @@ -64,13 +75,12 @@ def process_inflection_table(
)
and row_node_child.attrs.get("style") != "display:none"
]
if row_num == 0:
first_row_has_data_cell = any(
isinstance(cell, WikiNode)
and cell.kind == NodeKind.TABLE_CELL
and "invisible" not in cell.attrs.get("class", "")
for cell in table_row_nodes
)
current_row_has_data_cell = any(
isinstance(cell, WikiNode)
and cell.kind == NodeKind.TABLE_CELL
and "invisible" not in cell.attrs.get("class", "")
for cell in table_row_nodes
)
row_headers = []
for index, (rowspan_text, rowspan_count) in enumerate(
rowspan_headers.copy()
Expand All @@ -86,14 +96,34 @@ def process_inflection_table(
form_data = defaultdict(list)
if isinstance(table_cell, WikiNode):
if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
table_header_text = clean_node(wxr, None, table_cell)
if table_header_text in IGNORE_TABLE_HEADERS:
if any(
table_cell.find_html(
"span",
attr_name="class",
attr_value="ligne-de-forme",
)
):
# ignore gender header in template "ro-nom-tab"
continue
table_header_text = clean_node(
wxr, None, table_cell
).replace("\n", " ")
if table_header_text.lower() in IGNORE_TABLE_HEADERS:
continue
elif row_num == 0 and not first_row_has_data_cell:
# if cells of the first row are not all header cells
# then the header cells are row headers but not column
# headers
column_headers.append(table_header_text)
if not current_row_has_data_cell:
# if all cells of the row are header cells
# then the header cells are column headers
if "colspan" in table_cell.attrs:
colspan_headers.append(
ColspanHeader(
table_header_text,
column_cell_index,
int(table_cell.attrs.get("colspan")),
)
)
else:
column_headers.append(table_header_text)
column_cell_index += int(table_cell.attrs.get("colspan", 1))
elif row_num > 0:
row_headers.append(table_header_text)
if "rowspan" in table_cell.attrs:
Expand All @@ -113,9 +143,17 @@ def process_inflection_table(
and table_cell_line not in IGNORE_TABLE_CELL
):
form_data["form"] = table_cell_line
for colspan_header in colspan_headers:
if (
column_cell_index >= colspan_header.index
and column_cell_index
< colspan_header.index + colspan_header.span
):
form_data["tags"].append(colspan_header.text)
if (
len(column_headers) > column_cell_index
and column_headers[column_cell_index]
"colspan" not in table_cell.attrs
and len(column_headers) > column_cell_index
and column_headers[column_cell_index].lower()
not in IGNORE_TABLE_HEADERS
):
form_data["tags"].append(
Expand All @@ -126,4 +164,5 @@ def process_inflection_table(
form_data["tags"].extend(row_headers)
if "form" in form_data:
page_data[-1]["forms"].append(form_data)
column_cell_index += 1

column_cell_index += int(table_cell.attrs.get("colspan", 1))

0 comments on commit 8057bec

Please sign in to comment.