Skip to content

Commit

Permalink
Merge pull request #345 from xxyzz/fr
Browse files (browse the repository at this point in the history)
Extract italic nodes as gloss tags for French Wiktionary
  • Loading branch information
xxyzz authored Sep 22, 2023
2 parents 9fe490a + 1937e42 commit c8f7d45
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 3 deletions.
19 changes: 19 additions & 0 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,22 @@ def test_variante_de(self):
}
],
)

def test_italic_tag(self):
    """Check that a parenthesised italic node is emitted as a sense tag.

    The wikitext is modelled on the French Wiktionary entry "lenn": the
    italic word must land in "tags" and must not appear in the gloss text,
    and the parentheses surrounding it must be dropped as well.
    """
    # https://fr.wiktionary.org/wiki/lenn
    self.wxr.wtp.start_page("lenn")
    wikitext = (
        "# (''localement'') [[bassin#Nom_commun|Bassin]], [[lavoir#Nom_commun|lavoir]]."
    )
    first_child = self.wxr.wtp.parse(wikitext).children[0]
    pages = [defaultdict(list)]
    extract_gloss(self.wxr, pages, first_child)
    expected_sense = {"glosses": ["Bassin, lavoir."], "tags": ["localement"]}
    self.assertEqual(pages, [{"senses": [expected_sense]}])
16 changes: 13 additions & 3 deletions wiktextract/extractor/fr/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,22 @@ def extract_gloss(
break
else:
gloss_start = index + 1
for mod_template in gloss_nodes[:gloss_start]:
for tag_node in gloss_nodes[:gloss_start]:
gloss_data["tags"].append(
clean_node(wxr, gloss_data, mod_template).strip("()")
clean_node(wxr, gloss_data, tag_node).strip("()")
)

gloss_text = clean_node(wxr, gloss_data, gloss_nodes[gloss_start:])
gloss_only_nodes = []
# extract italic tags
for node in gloss_nodes[gloss_start:]:
if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
gloss_data["tags"].append(clean_node(wxr, None, node))
continue
elif isinstance(node, str) and node.strip() in ["(", ")"]:
# remove parentheses around italic node
continue
gloss_only_nodes.append(node)
gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes)
gloss_data["glosses"] = [gloss_text]
extract_examples(wxr, gloss_data, list_item_node)
page_data[-1]["senses"].append(gloss_data)
Expand Down

0 comments on commit c8f7d45

Please sign in to comment.