Skip to content

Commit

Permalink
Merge pull request #366 from xxyzz/fr
Browse files Browse the repository at this point in the history
Exclude sublist child node from linkage list item node
  • Loading branch information
xxyzz authored Oct 18, 2023
2 parents b0038bd + 1ed25c3 commit b1cb0dd
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 26 deletions.
38 changes: 20 additions & 18 deletions src/wiktextract/extractor/fr/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from ..share import split_tag_text


def extract_linkage(
wxr: WiktextractContext,
Expand All @@ -17,44 +19,44 @@ def extract_linkage(
for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
linkage_data = defaultdict(list)
pending_tag = ""
for index, child_node in enumerate(
list_item_node.filter_empty_str_child()
for index, child_node in enumerate( # remove nested lists
list_item_node.invert_find_child(NodeKind.LIST)
):
if index == 0 or "word" not in linkage_data:
if isinstance(child_node, TemplateNode):
process_linkage_template(wxr, child_node, linkage_data)
else:
linkage_data["word"] = clean_node(wxr, None, child_node)
else:
tag = (
tag_text = (
child_node
if isinstance(child_node, str)
else clean_node(wxr, page_data[-1], child_node)
)
if tag.strip().startswith("(") and not tag.strip().endswith(
")"
):
pending_tag = tag
if tag_text.strip().startswith(
"("
) and not tag_text.strip().endswith(")"):
pending_tag = tag_text
continue
elif not tag.strip().startswith("(") and tag.strip().endswith(
")"
):
tag = pending_tag + tag
elif not tag_text.strip().startswith(
"("
) and tag_text.strip().endswith(")"):
tag_text = pending_tag + tag_text
pending_tag = ""
elif tag.strip() == ",":
elif tag_text.strip() == ",":
# list item has more than one word
page_data[-1][linkage_type].append(linkage_data)
linkage_data = defaultdict(list)
continue
elif len(pending_tag) > 0:
pending_tag += tag
pending_tag += tag_text
continue

tag = tag.strip("() \n")
if tag.startswith("— "):
linkage_data["translation"] = tag.removeprefix("— ")
elif len(tag) > 0:
linkage_data["tags"].append(tag)
for tag in split_tag_text(tag_text):
if tag.startswith("— "):
linkage_data["translation"] = tag.removeprefix("— ")
elif len(tag) > 0:
linkage_data["tags"].append(tag)

page_data[-1][linkage_type].append(linkage_data)

Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def parse_section(
wxr.wtp.start_subsection(subtitle)
if section_type in wxr.config.OTHER_SUBTITLES["ignored_sections"]:
pass
# POS parameters:
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots
elif section_type in wxr.config.POS_SUBTITLES:
process_pos_block(
Expand Down
10 changes: 10 additions & 0 deletions src/wiktextract/extractor/share.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,13 @@ def create_transcode_url(filename: str, transcode_suffix: str) -> str:
"https://upload.wikimedia.org/wikipedia/commons/transcoded/"
+ f"{md5[0]}/{md5[:2]}/{filename}/{filename}.{transcode_suffix}"
)


def _is_wrapped_in_parentheses(text: str) -> bool:
    """
    Return True if `text` starts with "(" and that same parenthesis is
    closed by the final character of `text`.
    """
    if not (text.startswith("(") and text.endswith(")")):
        return False
    depth = 0
    for index, char in enumerate(text):
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
            if depth == 0:
                # The leading "(" is closed here; it wraps the whole text
                # only if this is also the last character.
                return index == len(text) - 1
    # Unbalanced: the leading "(" is never closed.
    return False


def split_tag_text(text: str) -> List[str]:
    """
    Find tags enclosed in parentheses and remove the enclosing parentheses.

    The text is split wherever a closing parenthesis is followed by
    whitespace and an opening parenthesis, so "(a) (b)" yields two tags.
    Each piece is stripped of surrounding whitespace, and a pair of
    parentheses is removed only when it wraps the whole piece. Pieces
    that merely contain or end with a parenthesised part, such as
    "mot (rare)", are returned unchanged instead of losing a single,
    unmatched parenthesis.

    An empty or whitespace-only `text` returns a list with one empty
    string.
    """
    tags = []
    for tag in re.split(r"(?<=\))\s+(?=\()", text.strip()):
        tag = tag.strip()
        while _is_wrapped_in_parentheses(tag):
            tag = tag[1:-1].strip()
        tags.append(tag)
    return tags
43 changes: 35 additions & 8 deletions tests/test_fr_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ def test_tags(self):
self.wxr.wtp.add_page("Modèle:Canada", 10, body="(Canada)")
self.wxr.wtp.add_page("Modèle:Louisiane", 10, body="(Louisiane)")
root = self.wxr.wtp.parse(
"==== {{S|synonymes}} ====\n* [[bon matin]] {{Canada|nocat=1}} {{Louisiane|nocat=1}}"
"* [[bon matin]] {{Canada|nocat=1}} {{Louisiane|nocat=1}}"
)
extract_linkage(self.wxr, page_data, root.children[0], "synonyms")
extract_linkage(self.wxr, page_data, root, "synonyms")
self.assertEqual(
page_data,
[
Expand All @@ -41,9 +41,9 @@ def test_zh_synonyms(self):
page_data = [defaultdict(list)]
self.wxr.wtp.start_page("你好")
root = self.wxr.wtp.parse(
"==== {{S|synonymes}} ====\n* {{zh-lien|你们好|nǐmen hǎo|你們好}} — Bonjour (au pluriel)."
"* {{zh-lien|你们好|nǐmen hǎo|你們好}} — Bonjour (au pluriel)."
)
extract_linkage(self.wxr, page_data, root.children[0], "synonyms")
extract_linkage(self.wxr, page_data, root, "synonyms")
self.assertEqual(
page_data,
[
Expand All @@ -67,9 +67,9 @@ def test_template_as_partial_tag(self):
self.wxr.wtp.add_page("Modèle:Canada", 10, body="(Canada)")
self.wxr.wtp.add_page("Modèle:L", 10, body="Atikamekw")
root = self.wxr.wtp.parse(
"==== {{S|synonymes}} ====\n* {{lien|kwei|fr}} {{Canada|nocat=1}} (mot {{L|atj}})"
"* {{lien|kwei|fr}} {{Canada|nocat=1}} (mot {{L|atj}})"
)
extract_linkage(self.wxr, page_data, root.children[0], "synonyms")
extract_linkage(self.wxr, page_data, root, "synonyms")
self.assertEqual(
page_data,
[
Expand All @@ -85,9 +85,9 @@ def test_list_item_has_two_words(self):
page_data = [defaultdict(list)]
self.wxr.wtp.start_page("masse")
root = self.wxr.wtp.parse(
"==== {{S|dérivés}} ====\n* [[être à la masse]], [[mettre à la masse]]"
"* [[être à la masse]], [[mettre à la masse]]"
)
extract_linkage(self.wxr, page_data, root.children[0], "derived")
extract_linkage(self.wxr, page_data, root, "derived")
self.assertEqual(
page_data,
[
Expand All @@ -99,3 +99,30 @@ def test_list_item_has_two_words(self):
}
],
)

def test_sub_list(self):
    """
    A list item with a nested sub-list produces one linkage entry per
    list item, and the parent's entry does not pick up the sub-list text.
    """
    page_data = [defaultdict(list)]
    self.wxr.wtp.start_page("lézard ocellé")
    wikitext = (
        "* [[saurien]]s (Sauria)\n"
        "** [[lacertidé]]s (Lacertidae) (famille des lézards typiques)\n"
    )
    root = self.wxr.wtp.parse(wikitext)
    extract_linkage(self.wxr, page_data, root, "hypernyms")

    parent_entry = {"tags": ["Sauria"], "word": "sauriens"}
    nested_entry = {
        "tags": ["Lacertidae", "famille des lézards typiques"],
        "word": "lacertidés",
    }
    self.assertEqual(
        page_data,
        [{"hypernyms": [parent_entry, nested_entry]}],
    )

0 comments on commit b1cb0dd

Please sign in to comment.