Merge pull request #530 from xxyzz/fr

Translate some fr edition form table tags
tatuylonen · Mar 5, 2024 · 725e728 · 725e728
2 parents 2fe6a4d + 4ddbd5b
commit 725e728
Show file tree

Hide file tree

Showing 3 changed files with 151 additions and 14 deletions.
diff --git a/src/wiktextract/extractor/fr/inflection.py b/src/wiktextract/extractor/fr/inflection.py
@@ -36,6 +36,8 @@ def extract_inflection(
         "nombre",  # ca-accord-mixte2
         "nature",  # de-adj
         "genre",  # es-accord-oa
+        "conjugaison présent indicatif",  # avk-tab-conjug
+        "mode",  # eo-conj
     }
 )
 IGNORE_TABLE_HEADER_PREFIXES = (
@@ -50,7 +52,7 @@ def extract_inflection(
     }
 )
 IGNORE_TABLE_CELL_PREFIXES = (
-    "voir conjugaison ",  # en-conj
+    "voir conjugaison ",  # en-conj, avk-conj
 )
 
 
@@ -82,10 +84,10 @@ def table_data_cell_is_header(
 def process_inflection_table(
     wxr: WiktextractContext,
     page_data: list[WordEntry],
-    node: WikiNode,
+    table_template: TemplateNode,
 ) -> None:
     expanded_node = wxr.wtp.parse(
-        wxr.wtp.node_to_wikitext(node), expand_all=True
+        wxr.wtp.node_to_wikitext(table_template), expand_all=True
     )
     table_nodes = list(expanded_node.find_child(NodeKind.TABLE))
     if len(table_nodes) == 0:
@@ -118,6 +120,8 @@ def process_inflection_table(
             and not table_data_cell_is_header(wxr, cell, page_data[-1].word)
             for cell in table_row_nodes
         )
+        if not current_row_has_data_cell:
+            column_headers.clear()
         row_headers = []
         new_rowspan_headers = []
         for rowspan_text, rowspan_count in rowspan_headers:
@@ -178,7 +182,8 @@ def process_inflection_table(
                             table_cell.attrs.get("colspan", 1)
                         )
                     else:
-                        row_headers.append(table_header_text)
+                        if table_header_text not in row_headers:
+                            row_headers.append(table_header_text)
                         if "rowspan" in table_cell.attrs:
                             rowspan_headers.append(
                                 (
@@ -194,14 +199,14 @@ def process_inflection_table(
                         elif (
                             table_cell_line != page_data[-1].word
                             and table_cell_line not in IGNORE_TABLE_CELL
-                            and not table_cell_line.startswith(
+                            and not table_cell_line.lower().startswith(
                                 IGNORE_TABLE_CELL_PREFIXES
                             )
                         ):
                             if form_data.form == "":
                                 form_data.form = table_cell_line
                             else:
-                                form_data.form += " " + table_cell_line
+                                form_data.form += "\n" + table_cell_line
                     for colspan_header in colspan_headers:
                         if (
                             column_cell_index >= colspan_header.index
@@ -222,10 +227,12 @@ def process_inflection_table(
                     if len(row_headers) > 0:
                         form_data.raw_tags.extend(row_headers)
                     if form_data.form != "":
-                        for form in form_data.form.split(" ou "):
+                        for form in form_data.form.splitlines():
                             new_form_data = form_data.model_copy(deep=True)
-                            new_form_data.form = form
-                            translate_raw_tags(new_form_data)
+                            new_form_data.form = form.removeprefix("ou ")
+                            translate_raw_tags(
+                                new_form_data, table_template.template_name
+                            )
                             page_data[-1].forms.append(new_form_data)
 
                     colspan_text = table_cell.attrs.get("colspan", "1")

diff --git a/src/wiktextract/extractor/fr/tags.py b/src/wiktextract/extractor/fr/tags.py
@@ -34,6 +34,7 @@
     "subjonctif": "subjunctive",
     "conditionnel": "conditional",
     "impératif": "imperative",
+    "volitif": "volitive",
 }
 
 VERB_FORM_TAGS: dict[str, Union[str, list[str]]] = {
@@ -62,6 +63,7 @@
     "présent": "present",
     "passé": "past",
     "passé simple": "past",
+    "futur": "future",
     "futur simple": "future",
     # https://en.wikipedia.org/wiki/Passé_composé
     "passé composé": "past multiword-construction",
@@ -75,11 +77,19 @@
 }
 
 # https://en.wikipedia.org/wiki/Grammatical_person
-PERSON_TAGS: dict[str, str] = {
+PERSON_TAGS: dict[str, Union[str, list[str]]] = {
     "1ᵉ personne": "first-person",
     "1ʳᵉ personne": "first-person",
     "2ᵉ personne": "second-person",
     "3ᵉ personne": "third-person",
+    # Modèle:avk-conj
+    "1ʳᵉ du sing.": ["first-person", "singular"],
+    "2ᵉ du sing.": ["second-person", "singular"],
+    "3ᵉ du sing.": ["third-person", "singular"],
+    "1ʳᵉ du plur.": ["first-person", "plural"],
+    "2ᵉ du plur.": ["second-person", "plural"],
+    "3ᵉ du plur.": ["third-person", "plural"],
+    "4ᵉ du plur.": ["fourth-person", "plural"],
 }
 
 SEMANTICS_TAGS: dict[str, str] = {
@@ -143,9 +153,21 @@
     "rare": "rare",
     "plus rare": "rare",
     "familier": "colloquial",
+    "par extension": "broadly",
 }
 
-GRAMMATICAL_TAGS: dict[str, str] = {
+# https://en.wikipedia.org/wiki/Voice_(grammar)
+VOICE_TAGS: dict[str, Union[str, list[str]]] = {
+    # https://fr.wiktionary.org/wiki/Modèle:eo-conj
+    "participe actif": ["participle", "active"],
+    "participe passif": ["participle", "passive"],
+    "adverbe actif": ["adverb", "active"],
+    "adverbe passif": ["adverb", "passive"],
+    "substantif actif": ["subsuntive", "active"],
+    "substantif passif": ["subsuntive", "passive"],
+}
+
+GRAMMATICAL_TAGS: dict[str, Union[str, list[str]]] = {
     **GENDER_TAGS,
     **NUMBER_TAGS,
     **MOOD_TAGS,
@@ -160,19 +182,39 @@
     **JA_TAGS,
     **OTHER_GRAMMATICAL_TAGS,
     **SENSE_TAGS,
+    **VOICE_TAGS,
 }
 
 
-def translate_raw_tags(data: WordEntry) -> WordEntry:
+def translate_raw_tags(
+    data: WordEntry,
+    table_template_name: str = "",
+    tag_dict: dict[str, Union[str, list[str]]] = GRAMMATICAL_TAGS,
+) -> WordEntry:
     raw_tags = []
     for raw_tag in data.raw_tags:
-        if raw_tag.lower() in GRAMMATICAL_TAGS:
-            tr_tag = GRAMMATICAL_TAGS[raw_tag.lower()]
+        if raw_tag.lower() in tag_dict:
+            tr_tag = tag_dict[raw_tag.lower()]
             if isinstance(tr_tag, str):
                 data.tags.append(tr_tag)
             elif isinstance(tr_tag, list):
                 data.tags.extend(tr_tag)
         else:
             raw_tags.append(raw_tag)
     data.raw_tags = raw_tags
+    if table_template_name != "":
+        return convert_table_headers(data, table_template_name)
+    return data
+
+
+def convert_table_headers(data: WordEntry, template_name: str) -> WordEntry:
+    if template_name == "avk-tab-conjug":
+        # https://fr.wiktionary.org/wiki/Modèle:avk-tab-conjug
+        tags = {
+            "1": "first-person",
+            "2": "second-person",
+            "3": "third-person",
+            "4": "fourth-person",
+        }
+        return translate_raw_tags(data, tag_dict=tags)
     return data
diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py
@@ -580,3 +580,91 @@ def test_br_nom(self):
                 },
             ],
         )
+
+    def test_avk_tab_conjug(self):
+        page_data = [WordEntry(word="aalar", lang_code="avk", lang="Kotava")]
+        self.wxr.wtp.start_page("aalar")
+        root = self.wxr.wtp.parse("{{avk-tab-conjug|aalá|aala}}")
+        self.wxr.wtp.add_page(
+            "Modèle:avk-tab-conjug",
+            10,
+            """{| class="flextable"
+|-
+| class="titre" colspan="4" align="center" | '''Conjugaison Présent Indicatif'''
+|-
+! Personne
+! Singulier
+! Personne
+! Pluriel
+|-
+! 1
+| [[aalá]]
+! 1
+| [[aalat|aala'''t''']]
+|}""",
+        )
+        extract_inflection(self.wxr, page_data, root.children[0])
+        self.assertEqual(
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
+            [
+                {
+                    "form": "aalá",
+                    "tags": ["singular", "first-person"],
+                },
+                {
+                    "form": "aalat",
+                    "tags": ["plural", "first-person"],
+                },
+            ],
+        )
+
+    def test_eo_conj(self):
+        page_data = [
+            WordEntry(word="abdikanta", lang_code="eo", lang="Espéranto")
+        ]
+        self.wxr.wtp.start_page("abdikanta")
+        root = self.wxr.wtp.parse("{{eo-conj|abdik|adp=1|sub=mf|subp=}}")
+        self.wxr.wtp.add_page(
+            "Modèle:eo-conj",
+            10,
+            """{| class="flextable"
+|-
+! Temps
+! Passé
+! Présent
+! Futur
+|-
+!Substantif<br />actif
+| [[abdikinto#eo|abdikinto(j,n)]]<br>[[abdikintino#eo|abdikintino(j,n)]]
+|-
+! Mode
+! Conditionnel
+! Volitif
+! Infinitif
+|-
+! Présent
+| [[abdikus#eo|abdikus]] || [[abdiku#eo|abdiku]]
+|}""",
+        )
+        extract_inflection(self.wxr, page_data, root.children[0])
+        self.assertEqual(
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
+            [
+                {
+                    "form": "abdikinto(j,n)",
+                    "tags": ["past", "subsuntive", "active"],
+                },
+                {
+                    "form": "abdikintino(j,n)",
+                    "tags": ["past", "subsuntive", "active"],
+                },
+                {
+                    "form": "abdikus",
+                    "tags": ["conditional", "present"],
+                },
+                {
+                    "form": "abdiku",
+                    "tags": ["volitive", "present"],
+                },
+            ],
+        )