Skip to content

Commit

Permalink
Merge pull request #530 from xxyzz/fr
Browse files Browse the repository at this point in the history
Translate some fr edition form table tags
  • Loading branch information
xxyzz authored Mar 5, 2024
2 parents 2fe6a4d + 4ddbd5b commit 725e728
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 14 deletions.
25 changes: 16 additions & 9 deletions src/wiktextract/extractor/fr/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def extract_inflection(
"nombre", # ca-accord-mixte2
"nature", # de-adj
"genre", # es-accord-oa
"conjugaison présent indicatif", # avk-tab-conjug
"mode", # eo-conj
}
)
IGNORE_TABLE_HEADER_PREFIXES = (
Expand All @@ -50,7 +52,7 @@ def extract_inflection(
}
)
IGNORE_TABLE_CELL_PREFIXES = (
"voir conjugaison ", # en-conj
"voir conjugaison ", # en-conj, avk-conj
)


Expand Down Expand Up @@ -82,10 +84,10 @@ def table_data_cell_is_header(
def process_inflection_table(
wxr: WiktextractContext,
page_data: list[WordEntry],
node: WikiNode,
table_template: TemplateNode,
) -> None:
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(node), expand_all=True
wxr.wtp.node_to_wikitext(table_template), expand_all=True
)
table_nodes = list(expanded_node.find_child(NodeKind.TABLE))
if len(table_nodes) == 0:
Expand Down Expand Up @@ -118,6 +120,8 @@ def process_inflection_table(
and not table_data_cell_is_header(wxr, cell, page_data[-1].word)
for cell in table_row_nodes
)
if not current_row_has_data_cell:
column_headers.clear()
row_headers = []
new_rowspan_headers = []
for rowspan_text, rowspan_count in rowspan_headers:
Expand Down Expand Up @@ -178,7 +182,8 @@ def process_inflection_table(
table_cell.attrs.get("colspan", 1)
)
else:
row_headers.append(table_header_text)
if table_header_text not in row_headers:
row_headers.append(table_header_text)
if "rowspan" in table_cell.attrs:
rowspan_headers.append(
(
Expand All @@ -194,14 +199,14 @@ def process_inflection_table(
elif (
table_cell_line != page_data[-1].word
and table_cell_line not in IGNORE_TABLE_CELL
and not table_cell_line.startswith(
and not table_cell_line.lower().startswith(
IGNORE_TABLE_CELL_PREFIXES
)
):
if form_data.form == "":
form_data.form = table_cell_line
else:
form_data.form += " " + table_cell_line
form_data.form += "\n" + table_cell_line
for colspan_header in colspan_headers:
if (
column_cell_index >= colspan_header.index
Expand All @@ -222,10 +227,12 @@ def process_inflection_table(
if len(row_headers) > 0:
form_data.raw_tags.extend(row_headers)
if form_data.form != "":
for form in form_data.form.split(" ou "):
for form in form_data.form.splitlines():
new_form_data = form_data.model_copy(deep=True)
new_form_data.form = form
translate_raw_tags(new_form_data)
new_form_data.form = form.removeprefix("ou ")
translate_raw_tags(
new_form_data, table_template.template_name
)
page_data[-1].forms.append(new_form_data)

colspan_text = table_cell.attrs.get("colspan", "1")
Expand Down
52 changes: 47 additions & 5 deletions src/wiktextract/extractor/fr/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
"subjonctif": "subjunctive",
"conditionnel": "conditional",
"impératif": "imperative",
"volitif": "volitive",
}

VERB_FORM_TAGS: dict[str, Union[str, list[str]]] = {
Expand Down Expand Up @@ -62,6 +63,7 @@
"présent": "present",
"passé": "past",
"passé simple": "past",
"futur": "future",
"futur simple": "future",
# https://en.wikipedia.org/wiki/Passé_composé
"passé composé": "past multiword-construction",
Expand All @@ -75,11 +77,19 @@
}

# https://en.wikipedia.org/wiki/Grammatical_person
PERSON_TAGS: dict[str, str] = {
PERSON_TAGS: dict[str, Union[str, list[str]]] = {
"1ᵉ personne": "first-person",
"1ʳᵉ personne": "first-person",
"2ᵉ personne": "second-person",
"3ᵉ personne": "third-person",
# Modèle:avk-conj
"1ʳᵉ du sing.": ["first-person", "singular"],
"2ᵉ du sing.": ["second-person", "singular"],
"3ᵉ du sing.": ["third-person", "singular"],
"1ʳᵉ du plur.": ["first-person", "plural"],
"2ᵉ du plur.": ["second-person", "plural"],
"3ᵉ du plur.": ["third-person", "plural"],
"4ᵉ du plur.": ["fourth-person", "plural"],
}

SEMANTICS_TAGS: dict[str, str] = {
Expand Down Expand Up @@ -143,9 +153,21 @@
"rare": "rare",
"plus rare": "rare",
"familier": "colloquial",
"par extension": "broadly",
}

GRAMMATICAL_TAGS: dict[str, str] = {
# https://en.wikipedia.org/wiki/Voice_(grammar)
# Maps French voice-related table headers to the English tag(s) they stand
# for. Keys are written in lower case because the lookup in
# translate_raw_tags() is done on the lower-cased raw tag. A list value
# contributes several tags for one header.
VOICE_TAGS: dict[str, Union[str, list[str]]] = {
# https://fr.wiktionary.org/wiki/Modèle:eo-conj
"participe actif": ["participle", "active"],
"participe passif": ["participle", "passive"],
"adverbe actif": ["adverb", "active"],
"adverbe passif": ["adverb", "passive"],
# NOTE(review): "subsuntive" is presumably a misspelling of "substantive"
# (French "substantif"). The eo-conj test in tests/test_fr_inflection.py
# asserts this exact spelling, so the two must be changed together.
"substantif actif": ["subsuntive", "active"],
"substantif passif": ["subsuntive", "passive"],
}

GRAMMATICAL_TAGS: dict[str, Union[str, list[str]]] = {
**GENDER_TAGS,
**NUMBER_TAGS,
**MOOD_TAGS,
Expand All @@ -160,19 +182,39 @@
**JA_TAGS,
**OTHER_GRAMMATICAL_TAGS,
**SENSE_TAGS,
**VOICE_TAGS,
}


def translate_raw_tags(data: WordEntry) -> WordEntry:
def translate_raw_tags(
data: WordEntry,
table_template_name: str = "",
tag_dict: dict[str, str] = GRAMMATICAL_TAGS,
) -> WordEntry:
raw_tags = []
for raw_tag in data.raw_tags:
if raw_tag.lower() in GRAMMATICAL_TAGS:
tr_tag = GRAMMATICAL_TAGS[raw_tag.lower()]
if raw_tag.lower() in tag_dict:
tr_tag = tag_dict[raw_tag.lower()]
if isinstance(tr_tag, str):
data.tags.append(tr_tag)
elif isinstance(tr_tag, list):
data.tags.extend(tr_tag)
else:
raw_tags.append(raw_tag)
data.raw_tags = raw_tags
if table_template_name != "":
return convert_table_headers(data, table_template_name)
return data


def convert_table_headers(data: WordEntry, template_name: str) -> WordEntry:
    """Translate raw tags that only have a meaning inside one table template.

    For the ``avk-tab-conjug`` template the bare header texts "1" to "4"
    are handed to ``translate_raw_tags`` with a mapping to grammatical
    person tags. For every other template name ``data`` is returned as
    received.
    """
    if template_name != "avk-tab-conjug":
        return data

    # https://fr.wiktionary.org/wiki/Modèle:avk-tab-conjug
    person_by_header = {
        "1": "first-person",
        "2": "second-person",
        "3": "third-person",
        "4": "fourth-person",
    }
    return translate_raw_tags(data, tag_dict=person_by_header)
88 changes: 88 additions & 0 deletions tests/test_fr_inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,3 +580,91 @@ def test_br_nom(self):
},
],
)

def test_avk_tab_conjug(self):
    """Check form extraction from a stubbed ``avk-tab-conjug`` table.

    The template page is registered with a small Kotava conjugation
    table whose row headers are the bare number "1". The extracted forms
    are expected to carry the column header's number tag plus
    "first-person" for that numeric row header.

    The method name starts with ``test`` so that the default unittest
    loader collects it; without that prefix the assertions below would
    never be executed.
    """
    page_data = [WordEntry(word="aalar", lang_code="avk", lang="Kotava")]
    self.wxr.wtp.start_page("aalar")
    root = self.wxr.wtp.parse("{{avk-tab-conjug|aalá|aala}}")
    # Stub the template page (second argument 10 is passed as in the
    # sibling tests) so the table above can be expanded during extraction.
    self.wxr.wtp.add_page(
        "Modèle:avk-tab-conjug",
        10,
        """{| class="flextable"
|-
| class="titre" colspan="4" align="center" | '''Conjugaison Présent Indicatif'''
|-
! Personne
! Singulier
! Personne
! Pluriel
|-
! 1
| [[aalá]]
! 1
| [[aalat|aala'''t''']]
|}""",
    )
    extract_inflection(self.wxr, page_data, root.children[0])
    self.assertEqual(
        [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
        [
            {
                "form": "aalá",
                "tags": ["singular", "first-person"],
            },
            {
                "form": "aalat",
                "tags": ["plural", "first-person"],
            },
        ],
    )

def test_eo_conj(self):
    """Check form extraction from a stubbed Esperanto ``eo-conj`` table.

    The stub table has two header rows ("Temps ..." and "Mode ..."), a
    cell holding two forms separated by ``<br>``, and a row with two data
    cells. Each expected form carries the tags of its column and row
    headers.
    """
    eo_conj_table = """{| class="flextable"
|-
! Temps
! Passé
! Présent
! Futur
|-
!Substantif<br />actif
| [[abdikinto#eo|abdikinto(j,n)]]<br>[[abdikintino#eo|abdikintino(j,n)]]
|-
! Mode
! Conditionnel
! Volitif
! Infinitif
|-
! Présent
| [[abdikus#eo|abdikus]] || [[abdiku#eo|abdiku]]
|}"""
    expected_forms = [
        {
            "form": "abdikinto(j,n)",
            "tags": ["past", "subsuntive", "active"],
        },
        {
            "form": "abdikintino(j,n)",
            "tags": ["past", "subsuntive", "active"],
        },
        {
            "form": "abdikus",
            "tags": ["conditional", "present"],
        },
        {
            "form": "abdiku",
            "tags": ["volitive", "present"],
        },
    ]

    entries = [WordEntry(word="abdikanta", lang_code="eo", lang="Espéranto")]
    self.wxr.wtp.start_page("abdikanta")
    root = self.wxr.wtp.parse("{{eo-conj|abdik|adp=1|sub=mf|subp=}}")
    self.wxr.wtp.add_page("Modèle:eo-conj", 10, eo_conj_table)
    extract_inflection(self.wxr, entries, root.children[0])

    extracted_forms = [
        form.model_dump(exclude_defaults=True) for form in entries[-1].forms
    ]
    self.assertEqual(extracted_forms, expected_forms)

0 comments on commit 725e728

Please sign in to comment.