Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes for some French Wiktionary inflection table templates #349

Merged
merged 3 commits into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 76 additions & 7 deletions tests/test_fr_inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def test_fr_reg(self, mock_node_to_wikitext):
page_data = [defaultdict(list, {"word": "productrice"})]
node = TemplateNode(0)
self.wxr.wtp.start_page("productrice")
extract_inflection(self.wxr, page_data, node, "fr-rég")
extract_inflection(self.wxr, page_data, node)
self.assertEqual(
page_data[-1].get("forms"),
[{"form": "productrices", "tags": ["Pluriel"]}],
Expand All @@ -64,11 +64,10 @@ def test_fr_reg(self, mock_node_to_wikitext):
)
def test_fr_accord_al(self, mock_node_to_wikitext):
# https://fr.wiktionary.org/wiki/animal#Adjectif
self.maxDiff = None
page_data = [defaultdict(list, {"word": "animal", "lang_code": "fr"})]
node = TemplateNode(0)
self.wxr.wtp.start_page("animal")
extract_inflection(self.wxr, page_data, node, "fr-accord-al")
extract_inflection(self.wxr, page_data, node)
self.assertEqual(
page_data[-1].get("forms"),
[
Expand Down Expand Up @@ -101,10 +100,11 @@ def test_fr_accord_al(self, mock_node_to_wikitext):
)
def test_multiple_lines_ipa(self, mock_node_to_wikitext):
# https://fr.wiktionary.org/wiki/ration#Nom_commun_2
# template "en-nom-rég"
page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})]
node = TemplateNode(0)
self.wxr.wtp.start_page("ration")
extract_inflection(self.wxr, page_data, node, "en-nom-rég")
extract_inflection(self.wxr, page_data, node)
self.assertEqual(
page_data[-1].get("forms"),
[
Expand All @@ -128,10 +128,11 @@ def test_multiple_lines_ipa(self, mock_node_to_wikitext):
)
def test_single_line_multiple_ipa(self, mock_node_to_wikitext):
# https://fr.wiktionary.org/wiki/ration#Verbe
# template "en-conj-rég"
page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})]
node = TemplateNode(0)
self.wxr.wtp.start_page("ration")
extract_inflection(self.wxr, page_data, node, "en-conj-rég")
extract_inflection(self.wxr, page_data, node)
self.assertEqual(
page_data[-1].get("forms"),
[
Expand All @@ -155,10 +156,11 @@ def test_single_line_multiple_ipa(self, mock_node_to_wikitext):
)
def test_invalid_ipa(self, mock_node_to_wikitext):
# https://fr.wiktionary.org/wiki/animal#Nom_commun_3
# template "ast-accord-mf"
page_data = [defaultdict(list, {"lang_code": "en", "word": "animal"})]
node = TemplateNode(0)
self.wxr.wtp.start_page("animal")
extract_inflection(self.wxr, page_data, node, "ast-accord-mf")
extract_inflection(self.wxr, page_data, node)
self.assertEqual(
page_data[-1].get("forms"),
[{"tags": ["Pluriel"], "form": "animales"}],
Expand All @@ -177,11 +179,78 @@ def test_invalid_ipa(self, mock_node_to_wikitext):
)
def test_no_column_headers(self, mock_node_to_wikitext):
# https://fr.wiktionary.org/wiki/一万#Nom_commun
# template "zh-formes"
page_data = [defaultdict(list, {"lang_code": "zh", "word": "一万"})]
node = TemplateNode(0)
self.wxr.wtp.start_page("一万")
extract_inflection(self.wxr, page_data, node, "zh-formes")
extract_inflection(self.wxr, page_data, node)
self.assertEqual(
page_data[-1].get("forms"),
[{"tags": ["Traditionnel"], "form": "一萬"}],
)

@patch(
    "wikitextprocessor.Wtp.node_to_wikitext",
    return_value="""{| class="flextable"
!Cas
! Singulier
! Pluriel
|-
! Nominatif
|| <bdi lang="lt" xml:lang="lt" class="lang-lt">[[abadas#lt|abadas]]</bdi>
|| '''<span lang="lt" xml:lang="lt" class="lang-lt"><bdi>abadai</bdi></span>'''
|}""",
)
def test_lt_décl_as(self, mock_node_to_wikitext):
    # Empty table cells in the mocked wikitext must be ignored by the
    # extractor; only the "abadas" form is expected in the result.
    word = "abadai"
    entries = [defaultdict(list, {"lang_code": "lt", "word": word})]
    template = TemplateNode(0)
    self.wxr.wtp.start_page(word)
    extract_inflection(self.wxr, entries, template)
    expected_forms = [{"tags": ["Singulier", "Nominatif"], "form": "abadas"}]
    self.assertEqual(entries[-1].get("forms"), expected_forms)

@patch(
    "wikitextprocessor.Wtp.node_to_wikitext",
    return_value="""{|class="flextable flextable-fr-mfsp"

|-
| class="invisible" |
! scope="col" | Singulier
! scope="col" | Pluriel
|- class="flextable-fr-m"
! scope="row" | Masculin
|colspan="2"| [[aastais]]<br
/>[[Annexe:Prononciation/français|<span>\\a.a.stɛ\\</span>]]

|- class="flextable-fr-f"
! scope="row" | Féminin
| [[aastaise]]<br
/>[[Annexe:Prononciation/français|<span>\\a.a.stɛz\\</span>]]
| [[aastaises]]<br
/>[[Annexe:Prononciation/français|<span>\\a.a.stɛz\\</span>]]
|}""",
)
def test_fr_accord_s(self, mock_node_to_wikitext):
    # Page under test: https://fr.wiktionary.org/wiki/aastais
    # The mocked table starts with an "invisible" corner cell followed by
    # column headers; only the two feminine forms are expected as output.
    word = "aastais"
    entries = [defaultdict(list, {"lang_code": "fr", "word": word})]
    template = TemplateNode(0)
    self.wxr.wtp.start_page(word)
    extract_inflection(self.wxr, entries, template)
    feminine_ipa = "\\a.a.stɛz\\"
    expected_forms = [
        {
            "tags": ["Singulier", "Féminin"],
            "form": "aastaise",
            "ipa": feminine_ipa,
        },
        {
            "tags": ["Pluriel", "Féminin"],
            "form": "aastaises",
            "ipa": feminine_ipa,
        },
    ]
    self.assertEqual(entries[-1].get("forms"), expected_forms)
34 changes: 31 additions & 3 deletions wiktextract/extractor/fr/form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ def extract_form_line(
process_equiv_pour_template(wxr, node, page_data)
elif node.template_name.startswith("zh-mot"):
process_zh_mot_template(wxr, node, page_data)
elif node.template_name == "ja-mot":
process_ja_mot_template(wxr, node, page_data)
else:
tag = clean_node(wxr, page_data[-1], node)
if (
Expand Down Expand Up @@ -75,22 +77,48 @@ def process_zh_mot_template(
node: TemplateNode,
page_data: List[Dict],
) -> None:
# zh-mot, zh-mot-s, zh-mot-t
# Chinese form line template: zh-mot, zh-mot-s, zh-mot-t
# https://fr.wiktionary.org/wiki/Modèle:zh-mot
node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(node),
pre_expand=True,
additional_expand={node.template_name},
)
for template_node in node.find_child(NodeKind.TEMPLATE):
if template_node.template_name == "lang":
if template_node.template_name.lower() == "lang":
page_data[-1]["sounds"].append(
{
"zh-pron": clean_node(wxr, None, template_node),
"tags": ["Pinyin"],
}
)
elif template_node.template_name == "pron":
elif template_node.template_name in ("pron", "prononciation"):
page_data[-1]["sounds"].append(
{"ipa": clean_node(wxr, None, template_node)}
)


def process_ja_mot_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: List[Dict],
) -> None:
    """Add the romanization found in a "ja-mot" form line template.

    Japanese form line template: https://fr.wiktionary.org/wiki/Modèle:ja-mot

    The template is converted back to wikitext, fully expanded and parsed.
    The text of the second ``span`` tag of the expansion is appended to
    ``page_data[-1]["forms"]`` with the "romanization" tag, unless a form
    with the same text is already present in that list.

    Args:
        wxr: Extraction context providing the wikitext processor.
        template_node: The "ja-mot" template node of the form line.
        page_data: Word entries of the page; only the last one is updated.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    existing_forms = {
        existing_form.get("form")
        for existing_form in page_data[-1].get("forms", [])
    }
    for index, node in expanded_node.find_html("span", with_index=True):
        # the first span tag is the word, the second is Hepburn romanization
        if index == 1:
            # One name is used for both the duplicate check and the stored
            # form; the previous code assigned to a different variable than
            # the one it appended, which raised NameError.
            roman_form = clean_node(wxr, None, node)
            if roman_form not in existing_forms:
                # avoid adding duplicated form data extracted from
                # inflection table before the form line
                page_data[-1]["forms"].append(
                    {"form": roman_form, "tags": ["romanization"]}
                )
            break
45 changes: 27 additions & 18 deletions wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Dict, List

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand All @@ -12,12 +13,11 @@
def extract_inflection(
wxr: WiktextractContext,
page_data: List[Dict],
node: WikiNode,
template_name: str,
template_node: TemplateNode,
) -> None:
# inflection templates
# https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
process_inflection_table(wxr, page_data, node)
process_inflection_table(wxr, page_data, template_node)


IGNORE_TABLE_HEADERS = {
Expand All @@ -43,40 +43,49 @@ def process_inflection_table(
return
table_node = table_nodes[0]
column_headers = []
first_row_has_data_cell = False
for row_num, table_row in enumerate(
table_node.find_child(NodeKind.TABLE_ROW)
):
table_row_nodes = list(table_row.filter_empty_str_child())
first_row_has_data_cell = False
# filter empty table cells
table_row_nodes = [
row_node_child
for row_node_child in table_row.children
if isinstance(row_node_child, WikiNode)
and (
row_node_child.kind == NodeKind.TABLE_HEADER_CELL
or (
row_node_child.kind == NodeKind.TABLE_CELL
and len(row_node_child.children) > 0
)
)
and row_node_child.attrs.get("style") != "display:none"
]
if row_num == 0:
first_row_has_data_cell = not any(
first_row_has_data_cell = any(
isinstance(cell, WikiNode)
and cell.kind == NodeKind.TABLE_CELL
and "invisible" not in cell.attrs.get("class", "")
for cell in table_row_nodes
)

if row_num != 0 and len(table_row_nodes) == len(column_headers) + 1:
# data row has one more column than header: "fr-accord-al" template
column_headers.insert(0, "")

row_header = ""
row_headers = []
for column_num, table_cell in enumerate(table_row_nodes):
form_data = defaultdict(list)
if isinstance(table_cell, WikiNode):
if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
table_header_text = clean_node(wxr, None, table_cell)
if row_num == 0 and first_row_has_data_cell:
if row_num == 0 and not first_row_has_data_cell:
# if cells of the first row are not all header cells
# then the header cells are row headers but not column
# headers
column_headers.append(table_header_text)
elif (
column_num == 0
and table_header_text not in IGNORE_TABLE_HEADERS
):
row_header = table_header_text
elif table_header_text not in IGNORE_TABLE_HEADERS:
form_data["tags"].append(table_header_text)
row_headers.append(table_header_text)
elif table_cell.kind == NodeKind.TABLE_CELL:
table_cell_lines = clean_node(wxr, None, table_cell)
for table_cell_line in table_cell_lines.splitlines():
Expand All @@ -94,7 +103,7 @@ def process_inflection_table(
):
form_data["tags"].append(column_headers[column_num])

if len(row_header) > 0:
form_data["tags"].append(row_header)
if "form" in form_data:
page_data[-1]["forms"].append(form_data)
if len(row_headers) > 0:
form_data["tags"].extend(row_headers)
if "form" in form_data:
page_data[-1]["forms"].append(form_data)
2 changes: 1 addition & 1 deletion wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def process_pos_block(
# skip form line templates
continue
elif template_name.startswith(f"{lang_code}-"):
extract_inflection(wxr, page_data, child, template_name)
extract_inflection(wxr, page_data, child)
elif child.kind == NodeKind.BOLD:
form_line_start = index + 1
elif child.kind == NodeKind.LIST:
Expand Down