Skip to content

Commit

Permalink
Merge pull request #440 from xxyzz/fr
Browse files: browse the repository at this point in the history
Improve French extractor's translation section code
  • Loading branch information
xxyzz authored Dec 28, 2023
2 parents 8556c0b + 98667b1 commit a570899
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 35 deletions.
3 changes: 3 additions & 0 deletions src/wiktextract/extractor/fr/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ class Translation(FrenchBaseModel):
lang_name: str = Field("", description="Translation language name")
word: str = Field("", description="Translation term")
sense: str = Field("", description="Translation gloss")
sense_index: int = Field(
0, ge=0, description="Number of the definition, start from 1"
)
tags: list[str] = []
roman: str = ""
traditional_writing: str = Field(
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def parse_section(
wxr.config.capture_translations
and section_type in wxr.config.OTHER_SUBTITLES["translations"]
):
extract_translation(wxr, page_data, level_node)
extract_translation(wxr, page_data, base_data, level_node)
elif (
wxr.config.capture_inflections
and section_type
Expand Down
23 changes: 16 additions & 7 deletions src/wiktextract/extractor/fr/translation.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
from typing import Optional

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .models import Translation, WordEntry


def extract_translation(
wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
wxr: WiktextractContext,
page_data: list[WordEntry],
base_data: WordEntry,
level_node: WikiNode,
) -> None:
base_translation_data = Translation()
for level_node_child in level_node.filter_empty_str_child():
Expand Down Expand Up @@ -38,6 +41,10 @@ def extract_translation(
wxr, child_node, previous_node, page_data
)
previous_node = child_node
elif level_node_child.kind in LEVEL_KIND_FLAGS:
from .page import parse_section

parse_section(wxr, page_data, base_data, level_node_child)


def process_italic_node(
Expand Down Expand Up @@ -70,11 +77,13 @@ def process_translation_templates(
return
elif template_node.template_name == "trad-début":
# translation box start: https://fr.wiktionary.org/wiki/Modèle:trad-début
sense_parameter = template_node.template_parameters.get(1)
if sense_parameter is not None:
sense_text = clean_node(wxr, None, sense_parameter)
if len(sense_text) > 0:
base_translation_data.sense = sense_text
sense_parameter = template_node.template_parameters.get(1, "")
sense_text = clean_node(wxr, None, sense_parameter)
base_translation_data.sense = sense_text
base_translation_data.sense_index = int(
template_node.template_parameters.get(2, "0")
)

elif template_node.template_name == "T":
# Translation language: https://fr.wiktionary.org/wiki/Modèle:T
base_translation_data.lang_code = template_node.template_parameters.get(
Expand Down
2 changes: 1 addition & 1 deletion tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ def test_sandwich_tag(self):
"glosses": [
"Autrice, femme qui a créé une œuvre littéraire. Écrivaine."
],
"tags": ["Littérature", "Rare", "Absolument"]
"tags": ["Littérature", "Rare", "Absolument"],
}
],
)
4 changes: 3 additions & 1 deletion tests/test_fr_inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,9 @@ def test_invalid_ipa(self, mock_node_to_wikitext):
def test_no_column_headers(self, mock_node_to_wikitext):
# https://fr.wiktionary.org/wiki/一万#Nom_commun
# template "zh-formes"
page_data = [WordEntry(word="一万", lang_code="zh", lang_name="Chinois")]
page_data = [
WordEntry(word="一万", lang_code="zh", lang_name="Chinois")
]
node = TemplateNode(0)
self.wxr.wtp.start_page("一万")
extract_inflection(self.wxr, page_data, node)
Expand Down
73 changes: 48 additions & 25 deletions tests/test_fr_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@ def test_italic_tag(self):
root = self.wxr.wtp.parse(
"=== Traductions ===\n* {{trad-début|Formule pour saluer}}\n* {{T|sq}} : {{trad+|sq|mirëdita}}, {{trad-|sq|mirë mëngjes}} ''(le matin)''"
)
page_data = [
WordEntry(word="bonjour", lang_code="fr", lang_name="Français")
]
extract_translation(self.wxr, page_data, root.children[0])
base_data = WordEntry(
word="bonjour", lang_code="fr", lang_name="Français"
)
page_data = [base_data.model_copy(deep=True)]
extract_translation(self.wxr, page_data, base_data, root.children[0])
self.assertEqual(
page_data[-1].model_dump(exclude_defaults=True),
{
Expand Down Expand Up @@ -59,10 +60,11 @@ def test_template_tag(self):
root = self.wxr.wtp.parse(
"=== Traductions ===\n* {{T|ar}} : {{trad+|ar|مرحبا|dif=مرحبًا|tr={{transliterator|ar|مرحبا}}}} {{informel|nocat=1}}"
)
page_data = [
WordEntry(word="bonjour", lang_code="fr", lang_name="Français")
]
extract_translation(self.wxr, page_data, root.children[0])
base_data = WordEntry(
word="bonjour", lang_code="fr", lang_name="Français"
)
page_data = [base_data.model_copy(deep=True)]
extract_translation(self.wxr, page_data, base_data, root.children[0])
self.assertEqual(
page_data[-1].model_dump(exclude_defaults=True),
{
Expand All @@ -87,10 +89,11 @@ def test_traditional_writing(self):
root = self.wxr.wtp.parse(
"=== Traductions ===\n* {{T|mn}} : {{trad+|mn|сайн байна уу|tr=sain baina uu|tradi=ᠰᠠᠶᠢᠨ ᠪᠠᠶᠢᠨ᠎ᠠ ᠤᠤ}}"
)
page_data = [
WordEntry(word="bonjour", lang_code="fr", lang_name="Français")
]
extract_translation(self.wxr, page_data, root.children[0])
base_data = WordEntry(
word="bonjour", lang_code="fr", lang_name="Français"
)
page_data = [base_data.model_copy(deep=True)]
extract_translation(self.wxr, page_data, base_data, root.children[0])
self.assertEqual(
page_data[-1].model_dump(exclude_defaults=True),
{
Expand All @@ -117,10 +120,11 @@ def test_trad_template_gender_parameter(self):
root = self.wxr.wtp.parse(
"=== Traductions ===\n* {{T|de}} : {{trad|de|Kambium|n}}"
)
page_data = [
WordEntry(word="cambium", lang_code="fr", lang_name="Français")
]
extract_translation(self.wxr, page_data, root.children[0])
base_data = WordEntry(
word="cambium", lang_code="fr", lang_name="Français"
)
page_data = [base_data.model_copy(deep=True)]
extract_translation(self.wxr, page_data, base_data, root.children[0])
self.assertEqual(
page_data[-1].model_dump(exclude_defaults=True),
{
Expand All @@ -140,18 +144,31 @@ def test_trad_template_gender_parameter(self):

def test_template_sense_parameter(self):
self.wxr.wtp.start_page("masse")
self.wxr.wtp.add_page("Modèle:info lex", 10, body="(Finance)")
self.wxr.wtp.add_page("Modèle:T", 10, body="Croate")
self.wxr.wtp.add_page("Modèle:trad+", 10, body="masa")
self.wxr.wtp.add_page("Modèle:S", 10, "{{{1}}}")
self.wxr.wtp.add_page("Modèle:info lex", 10, "(Finance)")
self.wxr.wtp.add_page(
"Modèle:T",
10,
"""{{#switch: {{{1}}}
| hr = Croate
| af = Afrikaans
}}""",
)
self.wxr.wtp.add_page("Modèle:trad+", 10, "masa")
root = self.wxr.wtp.parse(
"""=== Traductions ===
"""==== {{S|traductions}} ====
{{trad-début|{{info lex|finance}}|12}}
* {{T|hr}} : {{trad+|hr|masa}}"""
* {{T|hr}} : {{trad+|hr|masa}}
{{trad-fin}}
===== {{S|traductions à trier}} =====
* {{T|af|trier}} : {{trad+|af|massa}}"""
)
base_data = WordEntry(
word="masse", lang_code="fr", lang_name="Français"
)
page_data = [
WordEntry(word="masse", lang_code="fr", lang_name="Français")
]
extract_translation(self.wxr, page_data, root.children[0])
page_data = [base_data.model_copy(deep=True)]
extract_translation(self.wxr, page_data, base_data, root.children[0])
self.assertEqual(
page_data[-1].model_dump(exclude_defaults=True),
{
Expand All @@ -164,6 +181,12 @@ def test_template_sense_parameter(self):
"lang_name": "Croate",
"word": "masa",
"sense": "(Finance)",
"sense_index": 12,
},
{
"lang_code": "af",
"lang_name": "Afrikaans",
"word": "massa",
},
],
},
Expand Down

0 comments on commit a570899

Please sign in to comment.