From c994da3fd4e58c94083cd65c244a3841bc2f7c26 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 29 Dec 2023 14:46:48 +0800 Subject: [PATCH] Extract "en-adj" inflection tables in French Wiktionary These templates use data-cell wikitext ("|") rather than header-cell wikitext ("!") for the header column --- src/wiktextract/extractor/fr/inflection.py | 39 ++- tests/test_fr_inflection.py | 340 ++++++++++++--------- 2 files changed, 238 insertions(+), 141 deletions(-) diff --git a/src/wiktextract/extractor/fr/inflection.py b/src/wiktextract/extractor/fr/inflection.py index f21d81f75..a9a1220ca 100644 --- a/src/wiktextract/extractor/fr/inflection.py +++ b/src/wiktextract/extractor/fr/inflection.py @@ -16,7 +16,10 @@ def extract_inflection( ) -> None: # inflection templates # https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français - process_inflection_table(wxr, page_data, template_node) + if template_node.template_name.startswith("en-adj"): + process_en_adj_table(wxr, page_data, template_node) + else: + process_inflection_table(wxr, page_data, template_node) IGNORE_TABLE_HEADERS = frozenset( @@ -192,3 +195,37 @@ def insert_ipa(form: Form, ipa_text: str) -> None: if len(ipa_data) == 0: return form.ipas.extend(ipa_data) + + +def process_en_adj_table( + wxr: WiktextractContext, + page_data: list[WordEntry], + template_node: WikiNode, +) -> None: + # https://fr.wiktionary.org/wiki/Modèle:en-adj + # and other en-adj* templates + # these templates use normal table cell for column table header + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(template_node), expand_all=True + ) + table_nodes = list(expanded_node.find_child(NodeKind.TABLE)) + if len(table_nodes) == 0: + return + table_node = table_nodes[0] + for row_num, table_row in enumerate( + table_node.find_child(NodeKind.TABLE_ROW) + ): + if row_num == 0: + # skip header + continue + if len(table_row.children) > 1: + form_data = Form() + form_data.tags.append(clean_node(wxr, None, table_row.children[0])) + form_text = clean_node(wxr, None, 
table_row.children[1]) + for form_line in form_text.splitlines(): + if is_ipa_text(form_line): + insert_ipa(form_data, form_line) + else: + form_data.form = form_line + if form_data.form != page_data[-1].word: + page_data[-1].forms.append(form_data) diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py index bd505a676..e49d1a682 100644 --- a/tests/test_fr_inflection.py +++ b/tests/test_fr_inflection.py @@ -1,8 +1,6 @@ from unittest import TestCase -from unittest.mock import patch from wikitextprocessor import Wtp -from wikitextprocessor.parser import TemplateNode from wiktextract.config import WiktionaryConfig from wiktextract.extractor.fr.inflection import extract_inflection from wiktextract.extractor.fr.models import WordEntry @@ -18,9 +16,14 @@ def setUp(self) -> None: def tearDown(self) -> None: self.wxr.wtp.close_db_conn() - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value=""" + def test_fr_reg(self): + page_data = [ + WordEntry(word="productrice", lang_code="fr", lang_name="Français") + ] + self.wxr.wtp.add_page( + "Modèle:fr-rég", + 10, + """ {| ! Singulier !! 
Pluriel |- @@ -28,24 +31,25 @@ def tearDown(self) -> None: | [[productrices#fr|productrices]] |- |[[Annexe:Prononciation/français|\\pʁɔ.dyk.tʁis\\]] -|} - """, - ) - def test_fr_reg(self, mock_node_to_wikitext): - page_data = [ - WordEntry(word="productrice", lang_code="fr", lang_name="Français") - ] - node = TemplateNode(0) +|}""", + ) self.wxr.wtp.start_page("productrice") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{fr-rég|pʁɔ.dyk.tʁis}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [{"form": "productrices", "tags": ["Pluriel"]}], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{|class="flextable flextable-fr-mfsp" + def test_fr_accord_al(self): + # https://fr.wiktionary.org/wiki/animal#Adjectif + page_data = [ + WordEntry(word="animal", lang_code="fr", lang_name="Français") + ] + self.wxr.wtp.add_page( + "Modèle:fr-accord-al", + 10, + """{|class="flextable flextable-fr-mfsp" |- !scope="col"| Singulier !scope="col"| Pluriel @@ -58,15 +62,10 @@ def test_fr_reg(self, mock_node_to_wikitext): |[[animale]]
[[Annexe:Prononciation/français|\\a.ni.mal\\]] |[[animales]]
[[Annexe:Prononciation/français|\\a.ni.mal\\]] |}""", - ) - def test_fr_accord_al(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/animal#Adjectif - page_data = [ - WordEntry(word="animal", lang_code="fr", lang_name="Français") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("animal") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{fr-accord-al|a.ni.m}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -88,24 +87,25 @@ def test_fr_accord_al(self, mock_node_to_wikitext): ], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{| class='flextable flextable-en' -! Singulier !! Pluriel -|- -| '''ration'''
[[Annexe:Prononciation/anglais|\\ˈɹæʃ.ən\\]]
ou [[Annexe:Prononciation/anglais|\\ˈɹeɪʃ.ən\\]] -| [[rations#en-flex-nom|rations]]
[[Annexe:Prononciation/anglais|\\ˈɹæʃ.ənz\\]]
ou [[Annexe:Prononciation/anglais|\\ˈɹeɪʃ.ənz\\]] -|}""", - ) - def test_multiple_lines_ipa(self, mock_node_to_wikitext): + def test_multiple_lines_ipa(self): # https://fr.wiktionary.org/wiki/ration#Nom_commun_2 # template "en-nom-rég" page_data = [ WordEntry(word="ration", lang_code="en", lang_name="Anglais") ] - node = TemplateNode(0) + self.wxr.wtp.add_page( + "Modèle:en-nom-rég", + 10, + """{| class='flextable flextable-en' +! Singulier !! Pluriel +|- +| '''ration'''
[[Annexe:Prononciation/anglais|\\ˈɹæʃ.ən\\]]
ou [[Annexe:Prononciation/anglais|\\ˈɹeɪʃ.ən\\]] +| [[rations#en-flex-nom|rations]]
[[Annexe:Prononciation/anglais|\\ˈɹæʃ.ənz\\]]
ou [[Annexe:Prononciation/anglais|\\ˈɹeɪʃ.ənz\\]] +|}""", + ) self.wxr.wtp.start_page("ration") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{en-nom-rég|ˈɹæʃ.ən|ˈɹeɪʃ.ən}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -117,25 +117,28 @@ def test_multiple_lines_ipa(self, mock_node_to_wikitext): ], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{|class='flextable' + def test_single_line_multiple_ipa(self): + # https://fr.wiktionary.org/wiki/ration#Verbe + # template "en-conj-rég" + page_data = [ + WordEntry(word="ration", lang_code="en", lang_name="Anglais") + ] + self.wxr.wtp.add_page( + "Modèle:en-conj-rég", + 10, + """{|class='flextable' ! Temps ! Forme |- ! Infinitif | to '''ration'''
[[Annexe:Prononciation/anglais|\\ˈɹæʃ.ən\\]] ou [[Annexe:Prononciation/anglais|\\ˈɹeɪʃ.ən\\]] |}""", - ) - def test_single_line_multiple_ipa(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/ration#Verbe - # template "en-conj-rég" - page_data = [ - WordEntry(word="ration", lang_code="en", lang_name="Anglais") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("ration") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse( + "{{en-conj-rég|inf.pron=ˈɹæʃ.ən|inf.pron2=ˈɹeɪʃ.ən}}" + ) + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -147,33 +150,42 @@ def test_single_line_multiple_ipa(self, mock_node_to_wikitext): ], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{| + def test_invalid_ipa(self): + # https://fr.wiktionary.org/wiki/animal#Nom_commun_3 + page_data = [ + WordEntry(word="animal", lang_code="en", lang_name="Français") + ] + self.wxr.wtp.add_page( + "Modèle:ast-accord-mf", + 10, + """{| ! '''Singulier''' ! '''Pluriel''' |- | [[animal]]
\\[//fr.wiktionary.org/w/index.php?title=ration&action=edit Prononciation ?]\\
| [[animales]]
\\[//fr.wiktionary.org/w/index.php?title=ration&action=edit Prononciation ?]\\
|}""", - ) - def test_invalid_ipa(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/animal#Nom_commun_3 - # template "ast-accord-mf" - page_data = [ - WordEntry(word="animal", lang_code="en", lang_name="Français") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("animal") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse( + "{{ast-accord-mf|s=animal|ps=|p=animales|pp=}}" + ) + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [{"tags": ["Pluriel"], "form": "animales"}], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{| class="flextable" + def test_no_column_headers(self): + # https://fr.wiktionary.org/wiki/一万#Nom_commun + # template "zh-formes" + page_data = [ + WordEntry(word="一万", lang_code="zh", lang_name="Chinois") + ] + self.wxr.wtp.add_page( + "Modèle:zh-formes", + 10, + """{| class="flextable" |- ! Simplifié | [[一万#zh|一万]] @@ -181,24 +193,24 @@ def test_invalid_ipa(self, mock_node_to_wikitext): ! 
Traditionnel | [[一萬#zh|一萬]] |}""", - ) - def test_no_column_headers(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/一万#Nom_commun - # template "zh-formes" - page_data = [ - WordEntry(word="一万", lang_code="zh", lang_name="Chinois") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("一万") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{zh-formes|一万|一萬}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [{"tags": ["Traditionnel"], "form": "一萬"}], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{| class="flextable" + def test_lt_décl_as(self): + # empty table cells should be ignored + page_data = [ + WordEntry(word="abadai", lang_code="lt", lang_name="Lituanien") + ] + self.wxr.wtp.add_page( + "Modèle:lt-décl-as", + 10, + """{| class="flextable" !Cas ! Singulier ! Pluriel @@ -207,23 +219,23 @@ def test_no_column_headers(self, mock_node_to_wikitext): || [[abadas#lt|abadas]] || '''abadai''' |}""", - ) - def test_lt_décl_as(self, mock_node_to_wikitext): - # empty table cells should be ignored - page_data = [ - WordEntry(word="abadai", lang_code="lt", lang_name="Lituanien") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("abadai") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{lt-décl-as|abad}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [{"tags": ["Singulier", "Nominatif"], "form": "abadas"}], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{|class="flextable flextable-fr-mfsp" + def test_fr_accord_s(self): + page_data = [ + WordEntry(word="aastais", lang_code="fr", lang_name="Français") + ] + self.wxr.wtp.add_page( + "Modèle:fr-accord-s", + 10, + """{|class="flextable flextable-fr-mfsp" |- | class="invisible" | @@ -241,14 
+253,10 @@ def test_lt_décl_as(self, mock_node_to_wikitext): | [[aastaises]]
[[Annexe:Prononciation/français|\\a.a.stɛz\\]] |}""", - ) - def test_fr_accord_s(self, mock_node_to_wikitext): - page_data = [ - WordEntry(word="aastais", lang_code="fr", lang_name="Français") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("aastais") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{fr-accord-s|a.a.stɛ|ms=aastais}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -265,9 +273,17 @@ def test_fr_accord_s(self, mock_node_to_wikitext): ], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{| class="flextable" + def test_fr_accord_personne(self): + # https://fr.wiktionary.org/wiki/enculé_de_ta_race + page_data = [ + WordEntry( + word="enculé de ta race", lang_code="fr", lang_name="Français" + ) + ] + self.wxr.wtp.add_page( + "Modèle:fr-accord-personne", + 10, + """{| class="flextable" | colspan="2" | ! Singulier !! Pluriel |- @@ -280,17 +296,12 @@ def test_fr_accord_s(self, mock_node_to_wikitext): | [[enculée de ma race]]
[[Annexe:Prononciation/français|\\ɑ̃.ky.ˌle.də.ma.ˈʁas\\]] | [[enculées de notre race]]
[[Annexe:Prononciation/français|\\ɑ̃.ky.ˌle.də.ma.ˈʁas\\]] |}""", - ) - def test_fr_accord_personne(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/enculé_de_ta_race - page_data = [ - WordEntry( - word="enculé de ta race", lang_code="fr", lang_name="Français" - ) - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("enculé de ta race") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse( + "{{fr-accord-personne|1ms = enculé de ma race}}" + ) + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -317,9 +328,15 @@ def test_fr_accord_personne(self, mock_node_to_wikitext): ], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{| class="flextable" + def test_ro_nom_tab(self): + # https://fr.wiktionary.org/wiki/fenil#Nom_commun_4 + page_data = [ + WordEntry(word="fenil", lang_code="fr", lang_name="Français") + ] + self.wxr.wtp.add_page( + "Modèle:ro-nom-tab", + 10, + """{| class="flextable" ! masculin ! colspan=2 | Singulier ! 
colspan=2 | Pluriel @@ -336,15 +353,17 @@ def test_fr_accord_personne(self, mock_node_to_wikitext): | colspan=2| [[fenilule#ro-nom|fenilule]] | colspan=2| [[fenililor#ro-nom|fenililor]] |}""", - ) - def test_ro_nom_tab(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/fenil#Nom_commun_4 - page_data = [ - WordEntry(word="fenil", lang_code="fr", lang_name="Français") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("fenil") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse( + """{{ro-nom-tab|gen=masculin +|ns=fenil |np=fenili +|as=fenilul |ap=fenilii +|ds=fenilului |dp=fenililor +|vs=fenilule |vp=fenililor +}}""", + ) + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -365,9 +384,15 @@ def test_ro_nom_tab(self, mock_node_to_wikitext): ], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{| class="flextable flextable-sv" + def test_sv_nom_c_ar(self): + # https://fr.wiktionary.org/wiki/robot#Nom_commun_7 + page_data = [ + WordEntry(word="robot", lang_code="fr", lang_name="Français") + ] + self.wxr.wtp.add_page( + "Modèle:sv-nom-c-ar", + 10, + """{| class="flextable flextable-sv" ! class="invisible" | |- ! 
Commun @@ -382,15 +407,10 @@ def test_ro_nom_tab(self, mock_node_to_wikitext): | class="plur-indef" |[[robotar#sv|robotar]] | class="plur-def" |[[robotarna#sv|robotarna]] |}""", - ) - def test_sv_nom_c_ar(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/robot#Nom_commun_7 - page_data = [ - WordEntry(word="robot", lang_code="fr", lang_name="Français") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("robot") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{sv-nom-c-ar}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -400,9 +420,15 @@ def test_sv_nom_c_ar(self, mock_node_to_wikitext): ], ) - @patch( - "wikitextprocessor.Wtp.node_to_wikitext", - return_value="""{|class="flextable" + def test_cs_decl_nom_ma_dur(self): + # https://fr.wiktionary.org/wiki/robot#Nom_commun_1_2 + page_data = [ + WordEntry(word="robot", lang_code="fr", lang_name="Français") + ] + self.wxr.wtp.add_page( + "Modèle:cs-décl-nom-ma-dur", + 10, + """{|class="flextable" |- !scope="col"| Cas !scope="col"| Singulier @@ -412,15 +438,10 @@ def test_sv_nom_c_ar(self, mock_node_to_wikitext): | [[robot#cs-nom|robot''' ''']] | [[roboti#cs-flex-nom|robot'''i ''']]
''ou'' [[robotové#cs-flex-nom|robot'''ové ''']] |}""", - ) - def test_cs_decl_nom_ma_dur(self, mock_node_to_wikitext): - # https://fr.wiktionary.org/wiki/robot#Nom_commun_1_2 - page_data = [ - WordEntry(word="robot", lang_code="fr", lang_name="Français") - ] - node = TemplateNode(0) + ) self.wxr.wtp.start_page("robot") - extract_inflection(self.wxr, page_data, node) + root = self.wxr.wtp.parse("{{cs-décl-nom-ma-dur|rad=robot}}") + extract_inflection(self.wxr, page_data, root.children[0]) self.assertEqual( [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], [ @@ -428,3 +449,42 @@ def test_cs_decl_nom_ma_dur(self, mock_node_to_wikitext): {"form": "robotové", "tags": ["Pluriel", "Nominatif"]}, ], ) + + def test_en_adj(self): + # https://fr.wiktionary.org/wiki/new + page_data = [WordEntry(word="new", lang_code="en", lang_name="Anglais")] + self.wxr.wtp.start_page("new") + root = self.wxr.wtp.parse("{{en-adj-er|pron=ˈnu|pronGB=ˈnjuː}}") + self.wxr.wtp.add_page( + "Modèle:en-adj-er", + 10, + """{| class="flextable" +! Nature +! Forme +|- +| class="titre" | Positif +| '''new'''
[[Annexe:Prononciation/anglais|\\ˈnu\\]] ou [[Annexe:Prononciation/anglais|\\ˈnjuː\\]] +|- +| class="titre" | Comparatif +| [[newer#en|newer]]
[[Annexe:Prononciation/anglais|\\ˈnu.ɚ\\]] ou [[Annexe:Prononciation/anglais|\\ˈnjuː.ə\\]] +|- +| class="titre" | Superlatif +| [[newest#en|newest]]
[[Annexe:Prononciation/anglais|\\ˈnu.ɪst\\]] ou [[Annexe:Prononciation/anglais|\\ˈnjuː.ɪst\\]] +|}""", + ) + extract_inflection(self.wxr, page_data, root.children[0]) + self.assertEqual( + [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms], + [ + { + "form": "newer", + "tags": ["Comparatif"], + "ipas": ["\\ˈnu.ɚ\\", "\\ˈnjuː.ə\\"], + }, + { + "form": "newest", + "tags": ["Superlatif"], + "ipas": ["\\ˈnu.ɪst\\", "\\ˈnjuː.ɪst\\"], + }, + ], + )