Skip to content

Commit

Permalink
Merge pull request #436 from xxyzz/fr
Browse files Browse the repository at this point in the history
Change find gloss tags strategy of French extractor
  • Loading branch information
xxyzz authored Dec 26, 2023
2 parents f6b8340 + cc9796b commit 87921d1
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 13 deletions.
20 changes: 7 additions & 13 deletions src/wiktextract/extractor/fr/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,47 +21,41 @@ def extract_gloss(
)
)
gloss_data = Sense()
gloss_start = 0
# process modifier, theme templates before gloss text
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
tag_indexes = set()
for index, gloss_node in enumerate(gloss_nodes):
if isinstance(gloss_node, TemplateNode):
categories_data = defaultdict(list)
expanded_text = clean_node(wxr, categories_data, gloss_node)
if expanded_text.startswith("(") and expanded_text.endswith(
")"
):
gloss_start = index + 1
tag = expanded_text.strip("() \n")
if len(tag) > 0:
gloss_data.tags.append(tag)
if "categories" in categories_data:
gloss_data.categories.extend(
categories_data["categories"]
)

gloss_only_nodes = []
tag_indexes = set()
for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start):
tag_indexes.add(index)
# if an italic node is between parentheses then it's a tag, also
# don't add the parenthesis strings to `gloss_only_nodes`
if (
isinstance(node, WikiNode)
and node.kind == NodeKind.ITALIC
and index > gloss_start
elif (
isinstance(gloss_node, WikiNode)
and gloss_node.kind == NodeKind.ITALIC
and isinstance(gloss_nodes[index - 1], str)
and gloss_nodes[index - 1].strip() == "("
and index + 1 < len(gloss_nodes)
and isinstance(gloss_nodes[index + 1], str)
and gloss_nodes[index + 1].strip() == ")"
):
gloss_data.tags.append(clean_node(wxr, None, node))
gloss_data.tags.append(clean_node(wxr, None, gloss_node))
tag_indexes |= {index - 1, index, index + 1}
continue

gloss_only_nodes = [
node
for index, node in enumerate(gloss_nodes[gloss_start:], gloss_start)
for index, node in enumerate(gloss_nodes)
if index not in tag_indexes
]
gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes)
Expand Down
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/fr/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ def process_translation_templates(
)
elif template_node.template_name.startswith("trad"):
# Translation term: https://fr.wiktionary.org/wiki/Modèle:trad
if 2 not in template_node.template_parameters: # required parameter
return
translation_term = clean_node(
wxr,
None,
Expand Down
26 changes: 26 additions & 0 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,3 +280,29 @@ def test_nest_gloss(self):
},
],
)

def test_sandwich_tag(self):
    """Check that tag templates around and inside a gloss all become tags.

    The gloss line starts with two templates that expand to italic text in
    parentheses, continues with ordinary gloss text, and then has a third
    such template sandwiched between two parts of the gloss. The expected
    sense keeps the plain text as a single gloss and lists all three
    template expansions (without parentheses) in ``tags``.
    """
    # https://fr.wiktionary.org/wiki/autrice#Nom_commun_4
    wtp = self.wxr.wtp
    wtp.start_page("autrice")

    # Stub template pages, registered in the same order as they are used.
    template_bodies = (
        ("Modèle:lexique", "''(Littérature)''"),
        ("Modèle:rare", "''(Rare)''"),
        ("Modèle:lien", "Autrice"),
        ("Modèle:absolument", "''(Absolument)''"),
    )
    for page_title, page_body in template_bodies:
        wtp.add_page(page_title, 10, page_body)

    wikitext = "# {{lexique|littérature|nl}} {{rare|nl}} {{lien|autrice|fr|dif=Autrice}}, femme qui a créé une œuvre littéraire. {{absolument}} [[écrivaine|Écrivaine]]."
    parsed_root = wtp.parse(wikitext)

    entries = [
        WordEntry(word="autrice", lang_code="nl", lang_name="Néerlandais")
    ]
    # Only the first child of the parsed root is handed to the extractor.
    extract_gloss(self.wxr, entries, parsed_root.children[0])

    expected_senses = [
        {
            "glosses": [
                "Autrice, femme qui a créé une œuvre littéraire. Écrivaine."
            ],
            "tags": ["Littérature", "Rare", "Absolument"],
        }
    ]
    dumped_senses = [
        sense.model_dump(exclude_defaults=True)
        for sense in entries[-1].senses
    ]
    self.assertEqual(dumped_senses, expected_senses)

0 comments on commit 87921d1

Please sign in to comment.