Skip to content

Commit

Permalink
Merge pull request #378 from xxyzz/fr
Browse files Browse the repository at this point in the history
Update French extractor's linkage and form line code
  • Loading branch information
xxyzz authored Oct 24, 2023
2 parents b2ab827 + 694649d commit 435627b
Show file tree
Hide file tree
Showing 8 changed files with 142 additions and 76 deletions.
8 changes: 8 additions & 0 deletions json_schema/fr.json
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,14 @@
"translation": {
"description": "French translation",
"type": "string"
},
"sense": {
"description": "Definition of the word",
"type": "string"
},
"sense_index": {
"description": "Number of the definition, starting from 1",
"type": "integer"
}
}
}
Expand Down
31 changes: 17 additions & 14 deletions src/wiktextract/extractor/fr/form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from typing import Dict, List, Union

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wikitextprocessor.parser import TemplateNode, HTMLNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down Expand Up @@ -56,19 +56,22 @@ def process_equiv_pour_template(
wxr: WiktextractContext, node: TemplateNode, page_data: List[Dict]
) -> None:
# equivalent form: https://fr.wiktionary.org/wiki/Modèle:équiv-pour
form_type = node.template_parameters.get(1)
for template_arg_index in range(2, 8):
form = clean_node(
wxr, None, node.template_parameters.get(template_arg_index, "")
)
if len(form) > 0:
page_data[-1]["forms"].append(
{
"form": form,
"tags": [f"pour {form_type}"],
"source": "form line template 'équiv-pour'",
}
)
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(node), expand_all=True
)
form_tag = ""
for child in expanded_node.find_child(NodeKind.ITALIC | NodeKind.HTML):
if child.kind == NodeKind.ITALIC:
form_tag = clean_node(wxr, None, child).strip("() ")
elif isinstance(child, HTMLNode) and child.tag == "bdi":
form_data = {
"form": clean_node(wxr, None, child),
"source": "form line template 'équiv-pour'",
}
if len(form_tag) > 0:
form_data["tags"] = [form_tag]
if len(form_data["form"]) > 0:
page_data[-1]["forms"].append(form_data)


def process_zh_mot_template(
Expand Down
30 changes: 27 additions & 3 deletions src/wiktextract/extractor/fr/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,34 @@ def extract_linkage(
level_node: WikiNode,
linkage_type: str,
) -> None:
for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
sense_text = ""
sense_index = 0
for template_or_list_node in level_node.find_child_recursively(
NodeKind.LIST_ITEM | NodeKind.TEMPLATE
):
# "(" opens a list table; its first argument is the sense text and its second the sense number: https://fr.wiktionary.org/wiki/Modèle:(
if (
isinstance(template_or_list_node, TemplateNode)
and template_or_list_node.template_name == "("
):
sense_text = clean_node(
wxr, None, template_or_list_node.template_parameters.get(1, "")
)
sense_index_text = template_or_list_node.template_parameters.get(
2, "0"
)
if sense_index_text.isdigit():
sense_index = int(sense_index_text)
continue

linkage_data = defaultdict(list)
if len(sense_text) > 0:
linkage_data["sense"] = sense_text
if sense_index != 0:
linkage_data["sense_index"] = sense_index
pending_tag = ""
for index, child_node in enumerate( # remove nested lists
list_item_node.invert_find_child(NodeKind.LIST)
template_or_list_node.invert_find_child(NodeKind.LIST)
):
if index == 0 or "word" not in linkage_data:
if isinstance(child_node, TemplateNode):
Expand Down Expand Up @@ -57,7 +80,8 @@ def extract_linkage(
elif len(tag) > 0:
linkage_data["tags"].append(tag)

page_data[-1][linkage_type].append(linkage_data)
if "word" in linkage_data:
page_data[-1][linkage_type].append(linkage_data)


def process_linkage_template(
Expand Down
80 changes: 48 additions & 32 deletions tests/test_fr_form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,38 +42,6 @@ def test_gender(self, mock_clean_node):
extract_form_line(self.wxr, page_data, root.children)
self.assertEqual(page_data, [{"tags": ["masculin"]}])

def test_equiv_pour(self):
self.wxr.wtp.start_page("")
root = self.wxr.wtp.parse(
"{{équiv-pour|une femme|autrice|auteure|auteuse|lang=fr}}"
)
page_data = [defaultdict(list)]
extract_form_line(self.wxr, page_data, root.children)
self.assertEqual(
page_data,
[
{
"forms": [
{
"form": "autrice",
"tags": ["pour une femme"],
"source": "form line template 'équiv-pour'",
},
{
"form": "auteure",
"tags": ["pour une femme"],
"source": "form line template 'équiv-pour'",
},
{
"form": "auteuse",
"tags": ["pour une femme"],
"source": "form line template 'équiv-pour'",
},
]
}
],
)

def test_zh_mot(self):
self.wxr.wtp.start_page("")
self.wxr.wtp.add_page("Modèle:zh-mot", 10, body="{{lang}} {{pron}}")
Expand Down Expand Up @@ -134,3 +102,51 @@ def test_template_in_pron_argument(self):
page_data,
[{"sounds": [{"ipa": "mi.ne.ʁa.l‿aʁ.ʒi.lø"}]}],
)

# Checks 'équiv-pour' form-line handling: every form in the template's
# rendered output should become a "forms" entry tagged with the italic
# label that precedes it.
# node_to_wikitext is patched so the test itself supplies the rendered
# wikitext (italic labels followed by <bdi>-wrapped links); no template page
# is added to the test context.
@patch(
"wikitextprocessor.Wtp.node_to_wikitext",
return_value="''(pour un homme, on dit'' : <bdi lang=\"fr\" xml:lang=\"fr\" class=\"lang-fr\">[[auteur#fr|auteur]]</bdi> ; ''pour une personne non-binaire, on peut dire'' : <bdi lang=\"fr\" xml:lang=\"fr\" class=\"lang-fr\">[[autaire#fr|autaire]]</bdi>, <bdi lang=\"fr\" xml:lang=\"fr\" class=\"lang-fr\">[[auteurice#fr|auteurice]]</bdi>, <bdi lang=\"fr\" xml:lang=\"fr\" class=\"lang-fr\">[[auteur·ice#fr|auteur·ice]]</bdi>'')''"
)
def test_equiv_pour_template(self, mock_node_to_wikitext):
# None removes unittest's limit on the length of assertion diffs.
self.maxDiff = None
self.wxr.wtp.start_page("autrice")
root = self.wxr.wtp.parse(
"{{équiv-pour|un homme|auteur|2egenre=une personne non-binaire|2egenre1=autaire|2egenre2=auteurice|2egenre3=auteur·ice|lang=fr}}"
)
page_data = [defaultdict(list)]
extract_form_line(self.wxr, page_data, root.children)
# "auteur" follows the first italic label in the patched text; the other
# three forms follow the second label and therefore share its tag.
self.assertEqual(
page_data,
[
{
"forms": [
{
"form": "auteur",
"tags": ["pour un homme, on dit"],
"source": "form line template 'équiv-pour'",
},
{
"form": "autaire",
"tags": [
"pour une personne non-binaire, on peut dire"
],
"source": "form line template 'équiv-pour'",
},
{
"form": "auteurice",
"tags": [
"pour une personne non-binaire, on peut dire"
],
"source": "form line template 'équiv-pour'",
},
{
"form": "auteur·ice",
"tags": [
"pour une personne non-binaire, on peut dire"
],
"source": "form line template 'équiv-pour'",
},
]
}
],
)
18 changes: 7 additions & 11 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,11 @@ def test_not_italic_tag(self):
[
{
"senses": [
{"glosses": ["Oiseau aquatique de taille moyenne du genre Rhynchops."]}
{
"glosses": [
"Oiseau aquatique de taille moyenne du genre Rhynchops."
]
}
]
}
],
Expand All @@ -220,18 +224,10 @@ def test_preserve_space_between_tags(self):
# https://fr.wiktionary.org/wiki/becs-en-ciseaux
# the space between italic node and the link node should be preserved
self.wxr.wtp.start_page("becs-en-ciseaux")
root = self.wxr.wtp.parse(
"# ''Pluriel de'' [[bec-en-ciseaux]]."
)
root = self.wxr.wtp.parse("# ''Pluriel de'' [[bec-en-ciseaux]].")
page_data = [defaultdict(list)]
extract_gloss(self.wxr, page_data, root.children[0])
self.assertEqual(
page_data,
[
{
"senses": [
{"glosses": ["Pluriel de bec-en-ciseaux."]}
]
}
],
[{"senses": [{"glosses": ["Pluriel de bec-en-ciseaux."]}]}],
)
24 changes: 24 additions & 0 deletions tests/test_fr_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,27 @@ def test_sub_list(self):
}
],
)

# Checks that the sense text and sense number passed to the "(" list-table
# template are attached to the linkage entry parsed from the list item that
# follows it.
def test_sense(self):
page_data = [defaultdict(list)]
self.wxr.wtp.start_page("autrice")
# Input: a "(" template carrying the sense (arg 1) and its number (arg 2),
# followed by a single list item linking to the synonym.
root = self.wxr.wtp.parse(
"""{{(|Celle qui est à l’origine de quelque chose|1}}
* [[artisane]]
"""
)
extract_linkage(self.wxr, page_data, root, "synonyms")
# The one expected synonym carries the word plus both sense fields.
self.assertEqual(
page_data,
[
{
"synonyms": [
{
"word": "artisane",
"sense": "Celle qui est à l’origine de quelque chose",
"sense_index": 1,
},
]
}
],
)
22 changes: 10 additions & 12 deletions tests/test_fr_note.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,17 @@ def tearDown(self) -> None:
def test_list_notes(self):
# list created from template "note-féminisation"
# https://fr.wiktionary.org/wiki/autrice
self.wxr.wtp.add_page("Modèle:note-féminisation", 10, "* list 1\n* list 2")
self.wxr.wtp.add_page(
"Modèle:note-féminisation", 10, "* list 1\n* list 2"
)
self.wxr.wtp.start_page("autrice")
nodes = self.wxr.wtp.parse("""==== {{S|notes}} ====
nodes = self.wxr.wtp.parse(
"""==== {{S|notes}} ====
paragrapy 1
{{note-féminisation}}""")
{{note-féminisation}}"""
)
page_data = [defaultdict(list)]
extract_note(self.wxr, page_data, nodes.children[0])
self.assertEqual(page_data, [
{
"notes": [
"paragrapy 1",
"list 1",
"list 2"
]
}
])
self.assertEqual(
page_data, [{"notes": ["paragrapy 1", "list 1", "list 2"]}]
)
5 changes: 1 addition & 4 deletions tests/test_inflection_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

class InflTests(unittest.TestCase):
def setUp(self):
self.maxDiff = 100000
self.maxDiff = None
self.wxr = WiktextractContext(Wtp(), WiktionaryConfig())
self.wxr.wtp.start_page("testpage")
self.wxr.wtp.start_section("English")
Expand Down Expand Up @@ -203,7 +203,6 @@ def test_English_verb1(self):
"form": "wanderest",
"source": "Conjugation",
"tags": [
"archaic",
"present",
"second-person",
"singular"
Expand All @@ -222,7 +221,6 @@ def test_English_verb1(self):
"form": "wanderedst",
"source": "Conjugation",
"tags": [
"archaic",
"past",
"second-person",
"singular"
Expand All @@ -241,7 +239,6 @@ def test_English_verb1(self):
"form": "wandereth",
"source": "Conjugation",
"tags": [
"archaic",
"present",
"singular",
"third-person"
Expand Down

0 comments on commit 435627b

Please sign in to comment.