diff --git a/src/wiktextract/data/fr/linkage_subtitles.json b/src/wiktextract/data/fr/linkage_subtitles.json
index 91ac0b5b..56719f3e 100644
--- a/src/wiktextract/data/fr/linkage_subtitles.json
+++ b/src/wiktextract/data/fr/linkage_subtitles.json
@@ -1,6 +1,7 @@
 {
   "abrév": "abbreviation",
   "abréviations": "abbreviation",
+  "antonymes": "antonyms",
   "app": "related",
   "apparentés": "related",
   "apr": "related",
diff --git a/src/wiktextract/extractor/fr/gloss.py b/src/wiktextract/extractor/fr/gloss.py
index fda560af..7d1ca01e 100644
--- a/src/wiktextract/extractor/fr/gloss.py
+++ b/src/wiktextract/extractor/fr/gloss.py
@@ -70,6 +70,8 @@ def extract_gloss(
         find_alt_of_form(
             wxr, gloss_only_nodes[:note_index], page_data[-1].pos, gloss_data
         )
+        if "form-of" in page_data[-1].tags:
+            find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
         gloss_text = clean_node(
             wxr, gloss_data, gloss_only_nodes[:note_index]
         ).strip(" ()")
@@ -192,3 +194,16 @@ def find_alt_of_form(
         alt_of = clean_node(wxr, None, link)
         if len(alt_of) > 0:
             gloss_data.alt_of.append(AltForm(word=alt_of))
+
+
+def find_form_of_word(
+    wxr: WiktextractContext,
+    gloss_nodes: list[Union[str, WikiNode]],
+    gloss_data: Sense,
+) -> None:
+    form_of = ""
+    for node in gloss_nodes:
+        if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
+            form_of = clean_node(wxr, None, node)
+    if len(form_of) > 0:
+        gloss_data.form_of.append(AltForm(word=form_of))
diff --git a/src/wiktextract/extractor/fr/inflection.py b/src/wiktextract/extractor/fr/inflection.py
index a9a1220c..830d1b8a 100644
--- a/src/wiktextract/extractor/fr/inflection.py
+++ b/src/wiktextract/extractor/fr/inflection.py
@@ -31,6 +31,9 @@ def extract_inflection(
         "commun",  # sv-nom-c-ar
     }
 )
+IGNORE_TABLE_HEADER_PREFIXES = (
+    "voir la conjugaison du verbe",  # Modèle:fr-verbe-flexion
+)
 IGNORE_TABLE_CELL = frozenset(
     {
         "Déclinaisons",  # de-adj
@@ -108,7 +111,13 @@ def process_inflection_table(
                 table_header_text = clean_node(
                     wxr, None, table_cell
                 ).replace("\n", " ")
-                if table_header_text.lower() in IGNORE_TABLE_HEADERS:
+                if (
+                    table_header_text.lower() in IGNORE_TABLE_HEADERS
+                    or table_header_text.lower().startswith(
+                        IGNORE_TABLE_HEADER_PREFIXES
+                    )
+                    or len(table_header_text.strip()) == 0
+                ):
                     continue
                 if not current_row_has_data_cell:
                     # if all cells of the row are header cells
diff --git a/src/wiktextract/extractor/fr/models.py b/src/wiktextract/extractor/fr/models.py
index 837d81ad..15550eaa 100644
--- a/src/wiktextract/extractor/fr/models.py
+++ b/src/wiktextract/extractor/fr/models.py
@@ -87,6 +87,7 @@ class Sense(FrenchBaseModel):
     examples: list[Example] = []
     note: str = ""
     alt_of: list[AltForm] = []
+    form_of: list[AltForm] = []
 
 
 class WordEntry(FrenchBaseModel):
@@ -104,6 +105,7 @@ class WordEntry(FrenchBaseModel):
     forms: list[Form] = Field([], description="Inflection forms list")
     sounds: list[Sound] = []
     translations: list[Translation] = []
+    antonyms: list[Linkage] = []
     synonyms: list[Linkage] = []
     hyponyms: list[Linkage] = []
     hypernyms: list[Linkage] = []
diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py
index f835a09d..3d6a95ad 100644
--- a/src/wiktextract/extractor/fr/page.py
+++ b/src/wiktextract/extractor/fr/page.py
@@ -2,7 +2,7 @@ from typing import Any, Optional
 
 from wikitextprocessor import NodeKind, WikiNode
-from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode
+from wikitextprocessor.parser import LEVEL_KIND_FLAGS
 
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
@@ -109,7 +109,7 @@ def process_pos_block(
     wxr: WiktextractContext,
     page_data: list[WordEntry],
     base_data: WordEntry,
-    pos_title_node: TemplateNode,
+    pos_title_node: WikiNode,
     pos_argument: str,
     pos_title: str,
 ):
@@ -120,6 +120,12 @@ def process_pos_block(
     page_data[-1].pos = pos_type
     page_data[-1].pos_title = pos_title
     page_data[-1].tags.extend(pos_data.get("tags", []))
+    for level_node_template in pos_title_node.find_content(NodeKind.TEMPLATE):
+        if (
+            level_node_template.template_name == "S"
+            and level_node_template.template_parameters.get(3) == "flexion"
+        ):
+            page_data[-1].tags.append("form-of")
     child_nodes = list(pos_title_node.filter_empty_str_child())
     form_line_start = 0  # Ligne de forme
     gloss_start = len(child_nodes)
diff --git a/src/wiktextract/extractor/fr/translation.py b/src/wiktextract/extractor/fr/translation.py
index 2fc67d1c..54388a54 100644
--- a/src/wiktextract/extractor/fr/translation.py
+++ b/src/wiktextract/extractor/fr/translation.py
@@ -64,7 +64,9 @@ def process_italic_node(
         and previous_node.template_name.startswith("trad")
         and len(page_data[-1].translations) > 0
     ):
-        page_data[-1].translations[-1].tags.append(tag.strip("()"))
+        tag = tag.strip("()")
+        if len(tag) > 0:
+            page_data[-1].translations[-1].tags.append(tag)
 
 
 def process_translation_templates(
@@ -143,4 +145,5 @@ def process_translation_templates(
         page_data[-1].translations.append(translation_data)
     elif len(page_data[-1].translations) > 0:
         tag = clean_node(wxr, None, template_node).strip("()")
-        page_data[-1].translations[-1].tags.append(tag)
+        if len(tag) > 0:
+            page_data[-1].translations[-1].tags.append(tag)
diff --git a/src/wiktextract/extractor/share.py b/src/wiktextract/extractor/share.py
index aea4f915..3c22d6dd 100644
--- a/src/wiktextract/extractor/share.py
+++ b/src/wiktextract/extractor/share.py
@@ -29,7 +29,9 @@ def capture_text_in_parentheses(text: str) -> tuple[list[str], str]:
         if len(not_captured) > 0:
             rest_parts.append(not_captured)
         last_group_end = m.end()
-        capture_text_list.append(m.group()[1:-1])
+        text = m.group()[1:-1].strip()
+        if len(text) > 0:
+            capture_text_list.append(text)
 
     rest_text = " ".join(rest_parts) if len(rest_parts) > 0 else text
     return capture_text_list, rest_text
diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
index 514f846b..7ad8e998 100644
--- a/tests/test_fr_gloss.py
+++ b/tests/test_fr_gloss.py
@@ -5,14 +5,20 @@
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.gloss import extract_gloss
 from wiktextract.extractor.fr.models import WordEntry
-from wiktextract.extractor.fr.page import process_pos_block
+from wiktextract.extractor.fr.page import parse_page, process_pos_block
 from wiktextract.wxr_context import WiktextractContext
 
 
 class TestFrGloss(TestCase):
+    maxDiff = None
+
     def setUp(self) -> None:
         self.wxr = WiktextractContext(
-            Wtp(lang_code="fr"), WiktionaryConfig(dump_file_lang_code="fr")
+            Wtp(lang_code="fr"),
+            WiktionaryConfig(
+                dump_file_lang_code="fr",
+                capture_language_codes=None,
+            ),
         )
 
     def tearDown(self) -> None:
@@ -392,3 +398,42 @@ def test_variante_de_dif(self):
                 }
             ],
         )
+
+    def test_form_of(self):
+        self.wxr.wtp.start_page("dièse")
+        self.wxr.wtp.add_page("Modèle:langue", 10, "Français")
+        self.wxr.wtp.add_page("Modèle:S", 10, "Forme de verbe")
+        self.assertEqual(
+            parse_page(
+                self.wxr,
+                "dièse",
+                """== {{langue|fr}} ==
+=== {{S|verbe|fr|flexion}} ===
+# ''Première personne du singulier de l’indicatif présent du verbe'' [[diéser]].
+# ''Troisième personne du singulier de l’indicatif présent du verbe'' [[diéser]].""",
+            ),
+            [
+                {
+                    "lang": "Français",
+                    "lang_code": "fr",
+                    "pos": "verb",
+                    "pos_title": "Forme de verbe",
+                    "senses": [
+                        {
+                            "form_of": [{"word": "diéser"}],
+                            "glosses": [
+                                "Première personne du singulier de l’indicatif présent du verbe diéser."
+                            ],
+                        },
+                        {
+                            "form_of": [{"word": "diéser"}],
+                            "glosses": [
+                                "Troisième personne du singulier de l’indicatif présent du verbe diéser."
+                            ],
+                        },
+                    ],
+                    "tags": ["form-of"],
+                    "word": "dièse"
+                }
+            ],
+        )
diff --git a/tests/test_fr_linkage.py b/tests/test_fr_linkage.py
index 6a45e980..3104ac70 100644
--- a/tests/test_fr_linkage.py
+++ b/tests/test_fr_linkage.py
@@ -230,3 +230,18 @@ def test_italic_sense_node(self):
                 {"word": "more", "sense": "selon les adjectifs"},
             ],
         )
+
+    def test_no_linkage_empty_tag(self):
+        page_data = [WordEntry(word="gambo", lang_code="eo", lang="Espéranto")]
+        self.wxr.wtp.start_page("gambo")
+        root = self.wxr.wtp.parse("* [[korpo]] ( ''[[corps]]'' )")
+        extract_linkage(self.wxr, page_data, root, "holonymes")
+        self.assertEqual(
+            [
+                d.model_dump(exclude_defaults=True)
+                for d in page_data[-1].holonyms
+            ],
+            [
+                {"word": "korpo", "sense": "corps"},
+            ],
+        )
diff --git a/tests/test_fr_page.py b/tests/test_fr_page.py
index 70578182..c3a2d574 100644
--- a/tests/test_fr_page.py
+++ b/tests/test_fr_page.py
@@ -11,13 +11,16 @@
 
 
 class TestFrPage(TestCase):
+    maxDiff = None
+
     def setUp(self):
-        self.maxDiff = None
-        conf1 = WiktionaryConfig(
-            dump_file_lang_code="fr",
-            capture_language_codes=None,
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="fr"),
+            WiktionaryConfig(
+                dump_file_lang_code="fr",
+                capture_language_codes=None,
+            ),
         )
-        self.wxr = WiktextractContext(Wtp(lang_code="fr"), conf1)
 
     def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()
diff --git a/tests/test_fr_pronunciation.py b/tests/test_fr_pronunciation.py
index c67138aa..714db121 100644
--- a/tests/test_fr_pronunciation.py
+++ b/tests/test_fr_pronunciation.py
@@ -111,7 +111,7 @@ def test_no_ipa(self):
         self.wxr.wtp.add_page(
             "Modèle:écouter",
             10,
-            'Suède : écouter « mars [[//fr.wiktionary.org Prononciation ?]] » [[File:LL-Q9027 (swe)-Moonhouse-mars.wav]]',
+            "Suède : écouter « mars [[//fr.wiktionary.org Prononciation ?]] » [[File:LL-Q9027 (swe)-Moonhouse-mars.wav]]",
         )
         extract_pronunciation(
             self.wxr,