Skip to content

Commit

Permalink
Merge pull request #441 from xxyzz/fr
Browse files: browse the repository at this point in the history
Extract sense text and sense index data from description lists (`;`)
  • Loading branch information
xxyzz authored Dec 28, 2023
2 parents a570899 + 54c733e commit 2bdf0f7
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 1 deletion.
14 changes: 14 additions & 0 deletions src/wiktextract/extractor/fr/linkage.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wiktextract.page import clean_node
Expand Down Expand Up @@ -71,6 +73,18 @@ def process_linkage_list(
if sense_index_text.isdigit():
sense_index = int(sense_index_text)
continue
# sense could also be in ";" description list
if (
template_or_list_node.kind == NodeKind.LIST_ITEM
and template_or_list_node.sarg == ";"
):
sense_text = clean_node(wxr, None, template_or_list_node.children)
index_pattern = r"\s*\((?:sens\s*)?(\d+)\)$"
m = re.search(index_pattern, sense_text)
if m is not None:
sense_text = re.sub(index_pattern, "", sense_text)
sense_index = int(m.group(1))
continue

linkage_data = Linkage()
if len(sense_text) > 0:
Expand Down
12 changes: 11 additions & 1 deletion tests/test_fr_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,18 @@ def test_sub_list(self):
)

def test_sense(self):
# https://fr.wiktionary.org/wiki/autrice
# https://fr.wiktionary.org/wiki/embouteillage
page_data = [
WordEntry(word="test", lang_code="fr", lang_name="Français")
WordEntry(word="autrice", lang_code="fr", lang_name="Français")
]
self.wxr.wtp.start_page("autrice")
root = self.wxr.wtp.parse(
"""{{(|Celle qui est à l’origine de quelque chose|1}}
* [[artisane]]
; Mise en bouteille (sens 1)
* [[bouchonnerie]]
"""
)
extract_linkage(self.wxr, page_data, root, "synonymes")
Expand All @@ -138,6 +143,11 @@ def test_sense(self):
"sense": "Celle qui est à l’origine de quelque chose",
"sense_index": 1,
},
{
"word": "bouchonnerie",
"sense": "Mise en bouteille",
"sense_index": 1,
},
],
)

Expand Down

0 comments on commit 2bdf0f7

Please sign in to comment.