Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract inflection table that has column with colspan attr #357

Merged
merged 1 commit into from
Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions tests/test_fr_inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,3 +302,49 @@ def test_fr_accord_personne(self, mock_node_to_wikitext):
},
],
)

# Regression test for an inflection table whose header cells and data cells
# use the `colspan` attribute (French Wiktionary page "fenil", Romanian noun).
# `Wtp.node_to_wikitext` is patched so that it returns the fixed table
# wikitext below, which is what the extractor then parses.
@patch(
"wikitextprocessor.Wtp.node_to_wikitext",
return_value="""{| class="flextable"
! <span class="ligne-de-forme" ><i>masculin</i></span>
! colspan=2 | Singulier
! colspan=2 | Pluriel
|-
! cas || non articulé || articulé || non articulé || articulé
|-
! Nominatif<br />Accusatif
| <bdi lang="ro" xml:lang="ro" class="lang-ro">[[fenil#ro-nom|fenil]]</bdi>
| <bdi lang="ro" xml:lang="ro" class="lang-ro">[[fenilul#ro-nom|fenilul]]</bdi>
| <bdi lang="ro" xml:lang="ro" class="lang-ro">[[fenili#ro-nom|fenili]]</bdi>
| <bdi lang="ro" xml:lang="ro" class="lang-ro">[[fenilii#ro-nom|fenilii]]</bdi>
|-
! Vocatif
| colspan=2| <bdi lang="ro" xml:lang="ro" class="lang-ro">[[fenilule#ro-nom|fenilule]]</bdi>
| colspan=2| <bdi lang="ro" xml:lang="ro" class="lang-ro">[[fenililor#ro-nom|fenililor]]</bdi>
|}""",
)
def test_ro_nom_tab(self, mock_node_to_wikitext):
# https://fr.wiktionary.org/wiki/fenil#Nom_commun_4
page_data = [defaultdict(list, {"word": "fenil"})]
# Placeholder node; presumably its content is irrelevant because
# node_to_wikitext is patched above -- TODO confirm.
node = TemplateNode(0)
self.wxr.wtp.start_page("fenil")
extract_inflection(self.wxr, page_data, node)
# Expected tag order per form: spanning (colspan) header, then the
# second-row column header, then the row header.
# NOTE(review): the first data cell "fenil" is not expected in the
# output -- presumably dropped because it equals the page word; confirm
# against process_inflection_table.
self.assertEqual(
page_data[-1].get("forms"),
[
{
"form": "fenilul",
"tags": ["Singulier", "articulé", "Nominatif Accusatif"],
},
{
"form": "fenili",
"tags": ["Pluriel", "non articulé", "Nominatif Accusatif"],
},
{
"form": "fenilii",
"tags": ["Pluriel", "articulé", "Nominatif Accusatif"],
},
# The vocative cells themselves span two columns, so they only get
# the spanning header and the row header, not a second-row header.
{"form": "fenilule", "tags": ["Singulier", "Vocatif"]},
{"form": "fenililor", "tags": ["Pluriel", "Vocatif"]},
],
)
95 changes: 67 additions & 28 deletions wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import defaultdict, deque
from dataclasses import dataclass
from typing import Dict, List

from wikitextprocessor import NodeKind, WikiNode
Expand All @@ -20,16 +21,26 @@ def extract_inflection(
process_inflection_table(wxr, page_data, template_node)


IGNORE_TABLE_HEADERS = {
"Terme", # https://fr.wiktionary.org/wiki/Modèle:de-adj
"Forme", # br-flex-adj
"Temps", # en-conj-rég,
"Cas", # lt_décl_as
}
IGNORE_TABLE_CELL = {
"Déclinaisons", # de-adj
"—", # https://fr.wiktionary.org/wiki/Modèle:vls-nom
}
# Header-cell texts that are skipped and never used as form tags.
# Entries are lower case because header text is lower-cased before the
# membership test. The trailing comments name the template each entry is for.
IGNORE_TABLE_HEADERS = frozenset(
{
"terme", # https://fr.wiktionary.org/wiki/Modèle:de-adj
"forme", # br-flex-adj
"temps", # en-conj-rég
"cas", # lt_décl_as, ro-nom-tab (lower case)
}
)
# Data-cell texts that are not recorded as a form. Unlike the header set,
# these are compared as-is (no lower-casing).
IGNORE_TABLE_CELL = frozenset(
{
"Déclinaisons", # de-adj
"—", # https://fr.wiktionary.org/wiki/Modèle:vls-nom
}
)

# Column header cell that carries a `colspan` attribute. A data cell whose
# column index falls in [index, index + span) is tagged with `text`.
@dataclass
class ColspanHeader:
text: str  # cleaned header text, appended to the tags of covered cells
index: int  # column index at which the header cell starts
span: int  # number of columns covered (integer value of `colspan`)


def process_inflection_table(
Expand All @@ -46,7 +57,7 @@ def process_inflection_table(
table_node = table_nodes[0]
column_headers = []
rowspan_headers = deque()
first_row_has_data_cell = False
colspan_headers = []
for row_num, table_row in enumerate(
table_node.find_child(NodeKind.TABLE_ROW)
):
Expand All @@ -64,13 +75,12 @@ def process_inflection_table(
)
and row_node_child.attrs.get("style") != "display:none"
]
if row_num == 0:
first_row_has_data_cell = any(
isinstance(cell, WikiNode)
and cell.kind == NodeKind.TABLE_CELL
and "invisible" not in cell.attrs.get("class", "")
for cell in table_row_nodes
)
current_row_has_data_cell = any(
isinstance(cell, WikiNode)
and cell.kind == NodeKind.TABLE_CELL
and "invisible" not in cell.attrs.get("class", "")
for cell in table_row_nodes
)
row_headers = []
for index, (rowspan_text, rowspan_count) in enumerate(
rowspan_headers.copy()
Expand All @@ -86,14 +96,34 @@ def process_inflection_table(
form_data = defaultdict(list)
if isinstance(table_cell, WikiNode):
if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
table_header_text = clean_node(wxr, None, table_cell)
if table_header_text in IGNORE_TABLE_HEADERS:
if any(
table_cell.find_html(
"span",
attr_name="class",
attr_value="ligne-de-forme",
)
):
# ignore gender header in template "ro-nom-tab"
continue
table_header_text = clean_node(
wxr, None, table_cell
).replace("\n", " ")
if table_header_text.lower() in IGNORE_TABLE_HEADERS:
continue
elif row_num == 0 and not first_row_has_data_cell:
# if cells of the first row are not all header cells
# then the header cells are row headers but not column
# headers
column_headers.append(table_header_text)
if not current_row_has_data_cell:
# if all cells of the row are header cells
# then the header cells are column headers
if "colspan" in table_cell.attrs:
colspan_headers.append(
ColspanHeader(
table_header_text,
column_cell_index,
int(table_cell.attrs.get("colspan")),
)
)
else:
column_headers.append(table_header_text)
column_cell_index += int(table_cell.attrs.get("colspan", 1))
elif row_num > 0:
row_headers.append(table_header_text)
if "rowspan" in table_cell.attrs:
Expand All @@ -113,9 +143,17 @@ def process_inflection_table(
and table_cell_line not in IGNORE_TABLE_CELL
):
form_data["form"] = table_cell_line
for colspan_header in colspan_headers:
if (
column_cell_index >= colspan_header.index
and column_cell_index
< colspan_header.index + colspan_header.span
):
form_data["tags"].append(colspan_header.text)
if (
len(column_headers) > column_cell_index
and column_headers[column_cell_index]
"colspan" not in table_cell.attrs
and len(column_headers) > column_cell_index
and column_headers[column_cell_index].lower()
not in IGNORE_TABLE_HEADERS
):
form_data["tags"].append(
Expand All @@ -126,4 +164,5 @@ def process_inflection_table(
form_data["tags"].extend(row_headers)
if "form" in form_data:
page_data[-1]["forms"].append(form_data)
column_cell_index += 1

column_cell_index += int(table_cell.attrs.get("colspan", 1))