diff --git a/src/wiktextract/data/fr/linkage_subtitles.json b/src/wiktextract/data/fr/linkage_subtitles.json
index 91ac0b5b..56719f3e 100644
--- a/src/wiktextract/data/fr/linkage_subtitles.json
+++ b/src/wiktextract/data/fr/linkage_subtitles.json
@@ -1,6 +1,7 @@
{
"abrév": "abbreviation",
"abréviations": "abbreviation",
+ "antonymes": "antonyms",
"app": "related",
"apparentés": "related",
"apr": "related",
diff --git a/src/wiktextract/extractor/fr/gloss.py b/src/wiktextract/extractor/fr/gloss.py
index fda560af..7d1ca01e 100644
--- a/src/wiktextract/extractor/fr/gloss.py
+++ b/src/wiktextract/extractor/fr/gloss.py
@@ -70,6 +70,8 @@ def extract_gloss(
find_alt_of_form(
wxr, gloss_only_nodes[:note_index], page_data[-1].pos, gloss_data
)
+ if "form-of" in page_data[-1].tags:
+ find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
gloss_text = clean_node(
wxr, gloss_data, gloss_only_nodes[:note_index]
).strip(" ()")
@@ -192,3 +194,16 @@ def find_alt_of_form(
alt_of = clean_node(wxr, None, link)
if len(alt_of) > 0:
gloss_data.alt_of.append(AltForm(word=alt_of))
+
+
+def find_form_of_word(
+ wxr: WiktextractContext,
+ gloss_nodes: list[Union[str, WikiNode]],
+ gloss_data: Sense,
+) -> None:
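+    # The lemma being inflected appears as a wiki link in the gloss, e.g.
+    # "[[diéser]]"; the last link found is used as the form_of target.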
+ form_of = ""
+ for node in gloss_nodes:
+ if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
+ form_of = clean_node(wxr, None, node)
+ if len(form_of) > 0:
+ gloss_data.form_of.append(AltForm(word=form_of))
diff --git a/src/wiktextract/extractor/fr/inflection.py b/src/wiktextract/extractor/fr/inflection.py
index a9a1220c..830d1b8a 100644
--- a/src/wiktextract/extractor/fr/inflection.py
+++ b/src/wiktextract/extractor/fr/inflection.py
@@ -31,6 +31,9 @@ def extract_inflection(
"commun", # sv-nom-c-ar
}
)
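+# Header texts that only link to the verb's full conjugation page
+# (Modèle:fr-verbe-flexion) carry no inflection data and are skipped.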
+IGNORE_TABLE_HEADER_PREFIXES = (
+ "voir la conjugaison du verbe", # Modèle:fr-verbe-flexion
+)
IGNORE_TABLE_CELL = frozenset(
{
"Déclinaisons", # de-adj
@@ -108,7 +111,13 @@ def process_inflection_table(
table_header_text = clean_node(
wxr, None, table_cell
).replace("\n", " ")
- if table_header_text.lower() in IGNORE_TABLE_HEADERS:
+ if (
+ table_header_text.lower() in IGNORE_TABLE_HEADERS
+ or table_header_text.lower().startswith(
+ IGNORE_TABLE_HEADER_PREFIXES
+ )
+ or len(table_header_text.strip()) == 0
+ ):
continue
if not current_row_has_data_cell:
# if all cells of the row are header cells
diff --git a/src/wiktextract/extractor/fr/models.py b/src/wiktextract/extractor/fr/models.py
index 837d81ad..15550eaa 100644
--- a/src/wiktextract/extractor/fr/models.py
+++ b/src/wiktextract/extractor/fr/models.py
@@ -87,6 +87,7 @@ class Sense(FrenchBaseModel):
examples: list[Example] = []
note: str = ""
alt_of: list[AltForm] = []
+ form_of: list[AltForm] = []
class WordEntry(FrenchBaseModel):
@@ -104,6 +105,7 @@ class WordEntry(FrenchBaseModel):
forms: list[Form] = Field([], description="Inflection forms list")
sounds: list[Sound] = []
translations: list[Translation] = []
+ antonyms: list[Linkage] = []
synonyms: list[Linkage] = []
hyponyms: list[Linkage] = []
hypernyms: list[Linkage] = []
diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py
index f835a09d..3d6a95ad 100644
--- a/src/wiktextract/extractor/fr/page.py
+++ b/src/wiktextract/extractor/fr/page.py
@@ -2,7 +2,7 @@
from typing import Any, Optional
from wikitextprocessor import NodeKind, WikiNode
-from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode
+from wikitextprocessor.parser import LEVEL_KIND_FLAGS
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
@@ -109,7 +109,7 @@ def process_pos_block(
wxr: WiktextractContext,
page_data: list[WordEntry],
base_data: WordEntry,
- pos_title_node: TemplateNode,
+ pos_title_node: WikiNode,
pos_argument: str,
pos_title: str,
):
@@ -120,6 +120,12 @@ def process_pos_block(
page_data[-1].pos = pos_type
page_data[-1].pos_title = pos_title
page_data[-1].tags.extend(pos_data.get("tags", []))
+ for level_node_template in pos_title_node.find_content(NodeKind.TEMPLATE):
+ if (
+ level_node_template.template_name == "S"
+ and level_node_template.template_parameters.get(3) == "flexion"
+ ):
+ page_data[-1].tags.append("form-of")
child_nodes = list(pos_title_node.filter_empty_str_child())
form_line_start = 0 # Ligne de forme
gloss_start = len(child_nodes)
diff --git a/src/wiktextract/extractor/fr/translation.py b/src/wiktextract/extractor/fr/translation.py
index 2fc67d1c..54388a54 100644
--- a/src/wiktextract/extractor/fr/translation.py
+++ b/src/wiktextract/extractor/fr/translation.py
@@ -64,7 +64,9 @@ def process_italic_node(
and previous_node.template_name.startswith("trad")
and len(page_data[-1].translations) > 0
):
- page_data[-1].translations[-1].tags.append(tag.strip("()"))
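+        # Stripping the surrounding parentheses may leave an empty string;
+        # don't record empty tags.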
+ tag = tag.strip("()")
+ if len(tag) > 0:
+ page_data[-1].translations[-1].tags.append(tag)
def process_translation_templates(
@@ -143,4 +145,5 @@ def process_translation_templates(
page_data[-1].translations.append(translation_data)
elif len(page_data[-1].translations) > 0:
tag = clean_node(wxr, None, template_node).strip("()")
- page_data[-1].translations[-1].tags.append(tag)
+ if len(tag) > 0:
+ page_data[-1].translations[-1].tags.append(tag)
diff --git a/src/wiktextract/extractor/share.py b/src/wiktextract/extractor/share.py
index aea4f915..3c22d6dd 100644
--- a/src/wiktextract/extractor/share.py
+++ b/src/wiktextract/extractor/share.py
@@ -29,7 +29,9 @@ def capture_text_in_parentheses(text: str) -> tuple[list[str], str]:
if len(not_captured) > 0:
rest_parts.append(not_captured)
last_group_end = m.end()
- capture_text_list.append(m.group()[1:-1])
+        # Use a new name so the "text" parameter, still needed as the
+        # fallback value of rest_text below, is not shadowed.
+        captured = m.group()[1:-1].strip()
+        if len(captured) > 0:
+            capture_text_list.append(captured)
rest_text = " ".join(rest_parts) if len(rest_parts) > 0 else text
return capture_text_list, rest_text
diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
index 514f846b..7ad8e998 100644
--- a/tests/test_fr_gloss.py
+++ b/tests/test_fr_gloss.py
@@ -5,14 +5,20 @@
from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.fr.gloss import extract_gloss
from wiktextract.extractor.fr.models import WordEntry
-from wiktextract.extractor.fr.page import process_pos_block
+from wiktextract.extractor.fr.page import parse_page, process_pos_block
from wiktextract.wxr_context import WiktextractContext
class TestFrGloss(TestCase):
+ maxDiff = None
+
def setUp(self) -> None:
self.wxr = WiktextractContext(
- Wtp(lang_code="fr"), WiktionaryConfig(dump_file_lang_code="fr")
+ Wtp(lang_code="fr"),
+ WiktionaryConfig(
+ dump_file_lang_code="fr",
+ capture_language_codes=None,
+ ),
)
def tearDown(self) -> None:
@@ -392,3 +398,42 @@ def test_variante_de_dif(self):
}
],
)
+
+ def test_form_of(self):
+ self.wxr.wtp.start_page("dièse")
+ self.wxr.wtp.add_page("Modèle:langue", 10, "Français")
+ self.wxr.wtp.add_page("Modèle:S", 10, "Forme de verbe")
+ self.assertEqual(
+ parse_page(
+ self.wxr,
+ "dièse",
+ """== {{langue|fr}} ==
+=== {{S|verbe|fr|flexion}} ===
+# ''Première personne du singulier de l’indicatif présent du verbe'' [[diéser]].
+# ''Troisième personne du singulier de l’indicatif présent du verbe'' [[diéser]].""",
+ ),
+ [
+ {
+ "lang": "Français",
+ "lang_code": "fr",
+ "pos": "verb",
+ "pos_title": "Forme de verbe",
+ "senses": [
+ {
+ "form_of": [{"word": "diéser"}],
+ "glosses": [
+ "Première personne du singulier de l’indicatif présent du verbe diéser."
+ ],
+ },
+ {
+ "form_of": [{"word": "diéser"}],
+ "glosses": [
+ "Troisième personne du singulier de l’indicatif présent du verbe diéser."
+ ],
+ },
+ ],
+ "tags": ["form-of"],
+ "word": "dièse"
+ }
+ ],
+ )
diff --git a/tests/test_fr_linkage.py b/tests/test_fr_linkage.py
index 6a45e980..3104ac70 100644
--- a/tests/test_fr_linkage.py
+++ b/tests/test_fr_linkage.py
@@ -230,3 +230,18 @@ def test_italic_sense_node(self):
{"word": "more", "sense": "selon les adjectifs"},
],
)
+
+ def test_no_linkage_empty_tag(self):
+ page_data = [WordEntry(word="gambo", lang_code="eo", lang="Espéranto")]
+ self.wxr.wtp.start_page("gambo")
+ root = self.wxr.wtp.parse("* [[korpo]] ( ''[[corps]]'' )")
+ extract_linkage(self.wxr, page_data, root, "holonymes")
+ self.assertEqual(
+ [
+ d.model_dump(exclude_defaults=True)
+ for d in page_data[-1].holonyms
+ ],
+ [
+ {"word": "korpo", "sense": "corps"},
+ ],
+ )
diff --git a/tests/test_fr_page.py b/tests/test_fr_page.py
index 70578182..c3a2d574 100644
--- a/tests/test_fr_page.py
+++ b/tests/test_fr_page.py
@@ -11,13 +11,16 @@
class TestFrPage(TestCase):
+ maxDiff = None
+
def setUp(self):
- self.maxDiff = None
- conf1 = WiktionaryConfig(
- dump_file_lang_code="fr",
- capture_language_codes=None,
+ self.wxr = WiktextractContext(
+ Wtp(lang_code="fr"),
+ WiktionaryConfig(
+ dump_file_lang_code="fr",
+ capture_language_codes=None,
+ ),
)
- self.wxr = WiktextractContext(Wtp(lang_code="fr"), conf1)
def tearDown(self) -> None:
self.wxr.wtp.close_db_conn()
diff --git a/tests/test_fr_pronunciation.py b/tests/test_fr_pronunciation.py
index c67138aa..714db121 100644
--- a/tests/test_fr_pronunciation.py
+++ b/tests/test_fr_pronunciation.py
@@ -111,7 +111,7 @@ def test_no_ipa(self):
self.wxr.wtp.add_page(
"Modèle:écouter",
10,
- 'Suède : écouter « mars [[//fr.wiktionary.org Prononciation ?]] » [[File:LL-Q9027 (swe)-Moonhouse-mars.wav]]',
+ "Suède : écouter « mars [[//fr.wiktionary.org Prononciation ?]] » [[File:LL-Q9027 (swe)-Moonhouse-mars.wav]]",
)
extract_pronunciation(
self.wxr,