From b0d9346e2b43671e303429788dc15c5734a30e12 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 26 Dec 2023 11:48:05 +0800 Subject: [PATCH 1/2] Change gloss tag finding strategy of French extractor The previous code assumed that tag nodes come before the gloss text and discarded any text before the last tag node. But gloss text can appear between tag nodes. --- src/wiktextract/extractor/fr/gloss.py | 20 +++++++------------- tests/test_fr_gloss.py | 26 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/wiktextract/extractor/fr/gloss.py b/src/wiktextract/extractor/fr/gloss.py index 8b69d5b9..cbaacce8 100644 --- a/src/wiktextract/extractor/fr/gloss.py +++ b/src/wiktextract/extractor/fr/gloss.py @@ -21,9 +21,9 @@ def extract_gloss( ) ) gloss_data = Sense() - gloss_start = 0 # process modifier, theme tempaltes before gloss text # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens + tag_indexes = set() for index, gloss_node in enumerate(gloss_nodes): if isinstance(gloss_node, TemplateNode): categories_data = defaultdict(list) @@ -31,7 +31,6 @@ def extract_gloss( if expanded_text.startswith("(") and expanded_text.endswith( ")" ): - gloss_start = index + 1 tag = expanded_text.strip("() \n") if len(tag) > 0: gloss_data.tags.append(tag) @@ -39,29 +38,24 @@ def extract_gloss( gloss_data.categories.extend( categories_data["categories"] ) - - gloss_only_nodes = [] - tag_indexes = set() - for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start): + tag_indexes.add(index) # if an italic node is between parentheses then it's a tag, also # don't add the parenthese strings to `gloss_only_nodes` - if ( - isinstance(node, WikiNode) - and node.kind == NodeKind.ITALIC - and index > gloss_start + elif ( + isinstance(gloss_node, WikiNode) + and gloss_node.kind == NodeKind.ITALIC and isinstance(gloss_nodes[index - 1], str) and gloss_nodes[index - 1].strip() == "(" and index + 1 < len(gloss_nodes) and 
isinstance(gloss_nodes[index + 1], str) and gloss_nodes[index + 1].strip() == ")" ): - gloss_data.tags.append(clean_node(wxr, None, node)) + gloss_data.tags.append(clean_node(wxr, None, gloss_node)) tag_indexes |= {index - 1, index, index + 1} - continue gloss_only_nodes = [ node - for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start) + for index, node in enumerate(gloss_nodes) if index not in tag_indexes ] gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes) diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py index 573f20f0..bf27f5d6 100644 --- a/tests/test_fr_gloss.py +++ b/tests/test_fr_gloss.py @@ -280,3 +280,29 @@ def test_nest_gloss(self): }, ], ) + + def test_sandwich_tag(self): + # https://fr.wiktionary.org/wiki/autrice#Nom_commun_4 + self.wxr.wtp.start_page("autrice") + self.wxr.wtp.add_page("Modèle:lexique", 10, "''(Littérature)''") + self.wxr.wtp.add_page("Modèle:rare", 10, "''(Rare)''") + self.wxr.wtp.add_page("Modèle:lien", 10, "Autrice") + self.wxr.wtp.add_page("Modèle:absolument", 10, "''(Absolument)''") + root = self.wxr.wtp.parse( + "# {{lexique|littérature|nl}} {{rare|nl}} {{lien|autrice|fr|dif=Autrice}}, femme qui a créé une œuvre littéraire. {{absolument}} [[écrivaine|Écrivaine]]." + ) + page_data = [ + WordEntry(word="autrice", lang_code="nl", lang_name="Néerlandais") + ] + extract_gloss(self.wxr, page_data, root.children[0]) + self.assertEqual( + [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses], + [ + { + "glosses": [ + "Autrice, femme qui a créé une œuvre littéraire. Écrivaine." + ], + "tags": ["Littérature", "Rare", "Absolument"] + } + ], + ) From cc9796b73e30b6ef8205a555415cb8446b8fcfb0 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 26 Dec 2023 13:03:35 +0800 Subject: [PATCH 2/2] Don't expand "trad*" templates that are missing a required parameter French Wiktionary's translation template requires two unnamed parameters: the language code and the translation term. 
But some pages, like "crise", have only one unnamed argument. Fix the exception caused by `None` being passed to `clean_node()`. --- src/wiktextract/extractor/fr/translation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/wiktextract/extractor/fr/translation.py b/src/wiktextract/extractor/fr/translation.py index 484b8fa0..086cfa11 100644 --- a/src/wiktextract/extractor/fr/translation.py +++ b/src/wiktextract/extractor/fr/translation.py @@ -85,6 +85,8 @@ def process_translation_templates( ) elif template_node.template_name.startswith("trad"): # Translation term: https://fr.wiktionary.org/wiki/Modèle:trad + if 2 not in template_node.template_parameters: # required parameter + return translation_term = clean_node( wxr, None,