Skip to content

Commit

Permalink
Combine separated synonyms tag string and template
Browse files: browse the repository at this point in the history
Also ignore empty-string tags.
  • Loading branch information
xxyzz committed Sep 26, 2023
1 parent be3fd6f commit 15b754f
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 6 deletions.
25 changes: 23 additions & 2 deletions tests/test_fr_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def tearDown(self) -> None:

def test_tags(self):
page_data = [defaultdict(list)]
self.wxr.wtp.start_page("")
self.wxr.wtp.start_page("bonjour")
self.wxr.wtp.add_page("Modèle:Canada", 10, body="(Canada)")
self.wxr.wtp.add_page("Modèle:Louisiane", 10, body="(Louisiane)")
root = self.wxr.wtp.parse(
Expand All @@ -43,7 +43,7 @@ def test_tags(self):

def test_zh_synonyms(self):
page_data = [defaultdict(list)]
self.wxr.wtp.start_page("")
self.wxr.wtp.start_page("你好")
root = self.wxr.wtp.parse(
"==== {{S|synonymes}} ====\n* {{zh-lien|你们好|nǐmen hǎo|你們好}} — Bonjour (au pluriel)."
)
Expand All @@ -63,3 +63,24 @@ def test_zh_synonyms(self):
}
],
)

def test_template_as_partial_tag(self):
    """Check that a tag split between plain text and a template is merged.

    The synonym list item contains a ``lien`` link template, a ``Canada``
    tag template, and the parenthesised text ``(mot {{L|atj}})`` whose
    closing part comes from the ``L`` template. The expected result is a
    single synonym ``kwei`` carrying the tags ``Canada`` and
    ``mot Atikamekw``.
    """
    wtp = self.wxr.wtp
    wtp.start_page("bonjour")

    # Stub template pages so that expansion yields fixed, known text.
    stub_templates = (
        ("Modèle:lien", "kwei"),
        ("Modèle:Canada", "(Canada)"),
        ("Modèle:L", "Atikamekw"),
    )
    for title, expansion in stub_templates:
        wtp.add_page(title, 10, body=expansion)

    wikitext = (
        "==== {{S|synonymes}} ====\n"
        "* {{lien|kwei|fr}} {{Canada|nocat=1}} (mot {{L|atj}})"
    )
    root = wtp.parse(wikitext)

    page_data = [defaultdict(list)]
    extract_linkage(self.wxr, page_data, root.children[0], "synonyms")

    expected_synonym = {"word": "kwei", "tags": ["Canada", "mot Atikamekw"]}
    expected = [{"synonyms": [expected_synonym]}]
    self.assertEqual(page_data, expected)
28 changes: 24 additions & 4 deletions wiktextract/extractor/fr/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def extract_linkage(
) -> None:
for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
linkage_data = defaultdict(list)
pending_tag = ""
for index, child_node in enumerate(
list_item_node.filter_empty_str_child()
):
Expand All @@ -28,10 +29,29 @@ def extract_linkage(
else:
linkage_data["word"] = clean_node(wxr, None, child_node)
else:
tag = clean_node(wxr, page_data[-1], child_node).strip("()")
tag = (
child_node
if isinstance(child_node, str)
else clean_node(wxr, page_data[-1], child_node)
)
if tag.strip().startswith("(") and not tag.strip().endswith(
")"
):
pending_tag = tag
continue
elif not tag.strip().startswith("(") and tag.strip().endswith(
")"
):
tag = pending_tag + tag
pending_tag = ""
elif len(pending_tag) > 0:
pending_tag += tag
continue

tag = tag.strip("() \n")
if tag.startswith("— "):
linkage_data["translation"] = tag.removeprefix("— ")
else:
elif len(tag) > 0:
linkage_data["tags"].append(tag)

page_data[-1][linkage_type].append(linkage_data)
Expand All @@ -53,8 +73,8 @@ def process_lien_template(
node: TemplateNode,
linkage_data: Dict[str, Union[str, List[str]]],
) -> None:
# https://fr.wiktionary.org/wiki/Modèle:lien
if "dif" in node.template_parameters:
# link word template: https://fr.wiktionary.org/wiki/Modèle:lien
if "dif" in node.template_parameters: # displayed word
word = clean_node(wxr, None, node.template_parameters.get("dif"))
else:
word = clean_node(wxr, None, node.template_parameters.get(1))
Expand Down

0 comments on commit 15b754f

Please sign in to comment.