Review notes (free text before the first "diff --git" header; patch tools skip it)
* Line breaks and indentation were rebuilt from a whitespace-collapsed copy of
  this patch, using the hunk line counts to place the breaks.
  NOTE(review): indentation of context lines is reconstructed - confirm it
  against the target files before applying.
* NOTE(review): the in-cell line breaks of the eo-conj test table are assumed
  to be "<br>" tags - confirm against the original test.
* Fixed: test method "tes_avk_tab_conjug" was never collected (missing "t").
* Fixed: tag "subsuntive" -> "substantive" (tags.py and the test expectation).
* Fixed: "tag_dict" annotation now matches the type of GRAMMATICAL_TAGS.
* Added docstrings to translate_raw_tags() and convert_table_headers(); the
  hunk headers of the last two tags.py hunks were adjusted accordingly.

diff --git a/src/wiktextract/extractor/fr/inflection.py b/src/wiktextract/extractor/fr/inflection.py
index 49da53852..36d01fc7b 100644
--- a/src/wiktextract/extractor/fr/inflection.py
+++ b/src/wiktextract/extractor/fr/inflection.py
@@ -36,6 +36,8 @@ def extract_inflection(
         "nombre",  # ca-accord-mixte2
         "nature",  # de-adj
         "genre",  # es-accord-oa
+        "conjugaison présent indicatif",  # avk-tab-conjug
+        "mode",  # eo-conj
     }
 )
 IGNORE_TABLE_HEADER_PREFIXES = (
@@ -50,7 +52,7 @@ def extract_inflection(
     }
 )
 IGNORE_TABLE_CELL_PREFIXES = (
-    "voir conjugaison ",  # en-conj
+    "voir conjugaison ",  # en-conj, avk-conj
 )
 
 
@@ -82,10 +84,10 @@ def table_data_cell_is_header(
 def process_inflection_table(
     wxr: WiktextractContext,
     page_data: list[WordEntry],
-    node: WikiNode,
+    table_template: TemplateNode,
 ) -> None:
     expanded_node = wxr.wtp.parse(
-        wxr.wtp.node_to_wikitext(node), expand_all=True
+        wxr.wtp.node_to_wikitext(table_template), expand_all=True
     )
     table_nodes = list(expanded_node.find_child(NodeKind.TABLE))
     if len(table_nodes) == 0:
@@ -118,6 +120,8 @@ def process_inflection_table(
             and not table_data_cell_is_header(wxr, cell, page_data[-1].word)
             for cell in table_row_nodes
         )
+        if not current_row_has_data_cell:
+            column_headers.clear()
         row_headers = []
         new_rowspan_headers = []
         for rowspan_text, rowspan_count in rowspan_headers:
@@ -178,7 +182,8 @@ def process_inflection_table(
                             table_cell.attrs.get("colspan", 1)
                         )
                     else:
-                        row_headers.append(table_header_text)
+                        if table_header_text not in row_headers:
+                            row_headers.append(table_header_text)
                         if "rowspan" in table_cell.attrs:
                             rowspan_headers.append(
                                 (
@@ -194,14 +199,14 @@ def process_inflection_table(
                         elif (
                             table_cell_line != page_data[-1].word
                             and table_cell_line not in IGNORE_TABLE_CELL
-                            and not table_cell_line.startswith(
+                            and not table_cell_line.lower().startswith(
                                 IGNORE_TABLE_CELL_PREFIXES
                             )
                         ):
                             if form_data.form == "":
                                 form_data.form = table_cell_line
                             else:
-                                form_data.form += " " + table_cell_line
+                                form_data.form += "\n" + table_cell_line
                     for colspan_header in colspan_headers:
                         if (
                             column_cell_index >= colspan_header.index
@@ -222,10 +227,12 @@ def process_inflection_table(
                     if len(row_headers) > 0:
                         form_data.raw_tags.extend(row_headers)
                     if form_data.form != "":
-                        for form in form_data.form.split(" ou "):
+                        for form in form_data.form.splitlines():
                             new_form_data = form_data.model_copy(deep=True)
-                            new_form_data.form = form
-                            translate_raw_tags(new_form_data)
+                            new_form_data.form = form.removeprefix("ou ")
+                            translate_raw_tags(
+                                new_form_data, table_template.template_name
+                            )
                             page_data[-1].forms.append(new_form_data)
 
                     colspan_text = table_cell.attrs.get("colspan", "1")
diff --git a/src/wiktextract/extractor/fr/tags.py b/src/wiktextract/extractor/fr/tags.py
index b38f91f88..12d77e74b 100644
--- a/src/wiktextract/extractor/fr/tags.py
+++ b/src/wiktextract/extractor/fr/tags.py
@@ -34,6 +34,7 @@
     "subjonctif": "subjunctive",
     "conditionnel": "conditional",
     "impératif": "imperative",
+    "volitif": "volitive",
 }
 
 VERB_FORM_TAGS: dict[str, Union[str, list[str]]] = {
@@ -62,6 +63,7 @@
     "présent": "present",
     "passé": "past",
     "passé simple": "past",
+    "futur": "future",
     "futur simple": "future",
     # https://en.wikipedia.org/wiki/Passé_composé
     "passé composé": "past multiword-construction",
@@ -75,11 +77,19 @@
 }
 
 # https://en.wikipedia.org/wiki/Grammatical_person
-PERSON_TAGS: dict[str, str] = {
+PERSON_TAGS: dict[str, Union[str, list[str]]] = {
     "1ᵉ personne": "first-person",
     "1ʳᵉ personne": "first-person",
     "2ᵉ personne": "second-person",
     "3ᵉ personne": "third-person",
+    # Modèle:avk-conj
+    "1ʳᵉ du sing.": ["first-person", "singular"],
+    "2ᵉ du sing.": ["second-person", "singular"],
+    "3ᵉ du sing.": ["third-person", "singular"],
+    "1ʳᵉ du plur.": ["first-person", "plural"],
+    "2ᵉ du plur.": ["second-person", "plural"],
+    "3ᵉ du plur.": ["third-person", "plural"],
+    "4ᵉ du plur.": ["fourth-person", "plural"],
 }
 
 SEMANTICS_TAGS: dict[str, str] = {
@@ -143,9 +153,21 @@
     "rare": "rare",
     "plus rare": "rare",
     "familier": "colloquial",
+    "par extension": "broadly",
 }
 
-GRAMMATICAL_TAGS: dict[str, str] = {
+# https://en.wikipedia.org/wiki/Voice_(grammar)
+VOICE_TAGS: dict[str, Union[str, list[str]]] = {
+    # https://fr.wiktionary.org/wiki/Modèle:eo-conj
+    "participe actif": ["participle", "active"],
+    "participe passif": ["participle", "passive"],
+    "adverbe actif": ["adverb", "active"],
+    "adverbe passif": ["adverb", "passive"],
+    "substantif actif": ["substantive", "active"],
+    "substantif passif": ["substantive", "passive"],
+}
+
+GRAMMATICAL_TAGS: dict[str, Union[str, list[str]]] = {
     **GENDER_TAGS,
     **NUMBER_TAGS,
     **MOOD_TAGS,
@@ -160,14 +182,26 @@
     **JA_TAGS,
     **OTHER_GRAMMATICAL_TAGS,
     **SENSE_TAGS,
+    **VOICE_TAGS,
 }
 
 
-def translate_raw_tags(data: WordEntry) -> WordEntry:
+def translate_raw_tags(
+    data: WordEntry,
+    table_template_name: str = "",
+    tag_dict: dict[str, Union[str, list[str]]] = GRAMMATICAL_TAGS,
+) -> WordEntry:
+    """Translate the French raw tags of `data` into English tags.
+
+    Raw tags whose lower-cased text is a key of `tag_dict` are translated
+    into `data.tags`; the other raw tags are kept in `data.raw_tags`. If
+    `table_template_name` is not empty, the result is also passed to
+    `convert_table_headers()` to handle template-specific headers.
+    """
     raw_tags = []
     for raw_tag in data.raw_tags:
-        if raw_tag.lower() in GRAMMATICAL_TAGS:
-            tr_tag = GRAMMATICAL_TAGS[raw_tag.lower()]
+        if raw_tag.lower() in tag_dict:
+            tr_tag = tag_dict[raw_tag.lower()]
             if isinstance(tr_tag, str):
                 data.tags.append(tr_tag)
             elif isinstance(tr_tag, list):
@@ -175,4 +209,20 @@ def translate_raw_tags(data: WordEntry) -> WordEntry:
         else:
             raw_tags.append(raw_tag)
     data.raw_tags = raw_tags
+    if table_template_name != "":
+        return convert_table_headers(data, table_template_name)
+    return data
+
+
+def convert_table_headers(data: WordEntry, template_name: str) -> WordEntry:
+    """Translate raw tags that are only meaningful in `template_name`."""
+    if template_name == "avk-tab-conjug":
+        # https://fr.wiktionary.org/wiki/Modèle:avk-tab-conjug
+        tags = {
+            "1": "first-person",
+            "2": "second-person",
+            "3": "third-person",
+            "4": "fourth-person",
+        }
+        return translate_raw_tags(data, tag_dict=tags)
     return data
diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py
index aab356629..c6990726c 100644
--- a/tests/test_fr_inflection.py
+++ b/tests/test_fr_inflection.py
@@ -580,3 +580,91 @@ def test_br_nom(self):
                 },
             ],
         )
+
+    def test_avk_tab_conjug(self):
+        page_data = [WordEntry(word="aalar", lang_code="avk", lang="Kotava")]
+        self.wxr.wtp.start_page("aalar")
+        root = self.wxr.wtp.parse("{{avk-tab-conjug|aalá|aala}}")
+        self.wxr.wtp.add_page(
+            "Modèle:avk-tab-conjug",
+            10,
+            """{| class="flextable"
+|-
+| class="titre" colspan="4" align="center" | '''Conjugaison Présent Indicatif'''
+|-
+! Personne
+! Singulier
+! Personne
+! Pluriel
+|-
+! 1
+| [[aalá]]
+! 1
+| [[aalat|aala'''t''']]
+|}""",
+        )
+        extract_inflection(self.wxr, page_data, root.children[0])
+        self.assertEqual(
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
+            [
+                {
+                    "form": "aalá",
+                    "tags": ["singular", "first-person"],
+                },
+                {
+                    "form": "aalat",
+                    "tags": ["plural", "first-person"],
+                },
+            ],
+        )
+
+    def test_eo_conj(self):
+        page_data = [
+            WordEntry(word="abdikanta", lang_code="eo", lang="Espéranto")
+        ]
+        self.wxr.wtp.start_page("abdikanta")
+        root = self.wxr.wtp.parse("{{eo-conj|abdik|adp=1|sub=mf|subp=}}")
+        self.wxr.wtp.add_page(
+            "Modèle:eo-conj",
+            10,
+            """{| class="flextable"
+|-
+! Temps
+! Passé
+! Présent
+! Futur
+|-
+!Substantif<br>actif
+| [[abdikinto#eo|abdikinto(j,n)]]<br>[[abdikintino#eo|abdikintino(j,n)]]
+|-
+! Mode
+! Conditionnel
+! Volitif
+! Infinitif
+|-
+! Présent
+| [[abdikus#eo|abdikus]] || [[abdiku#eo|abdiku]]
+|}""",
+        )
+        extract_inflection(self.wxr, page_data, root.children[0])
+        self.assertEqual(
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
+            [
+                {
+                    "form": "abdikinto(j,n)",
+                    "tags": ["past", "substantive", "active"],
+                },
+                {
+                    "form": "abdikintino(j,n)",
+                    "tags": ["past", "substantive", "active"],
+                },
+                {
+                    "form": "abdikus",
+                    "tags": ["conditional", "present"],
+                },
+                {
+                    "form": "abdiku",
+                    "tags": ["volitive", "present"],
+                },
+            ],
+        )