Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't use append_base_data() in French extractor #406

Merged
merged 2 commits (branch names omitted in this capture)
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 12 additions & 18 deletions src/wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wiktextract.datautils import append_base_data
from wiktextract.page import LEVEL_KINDS, clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down Expand Up @@ -45,7 +44,11 @@ def parse_section(
# https://fr.wiktionary.org/wiki/Modèle:S
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections
section_type = level_node_template.template_parameters.get(1)
subtitle = clean_node(wxr, page_data[-1], level_node.largs)
subtitle = clean_node(
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node.largs,
)
wxr.wtp.start_subsection(subtitle)
if section_type in wxr.config.OTHER_SUBTITLES["ignored_sections"]:
pass
Expand All @@ -69,7 +72,7 @@ def parse_section(
wxr.config.capture_pronunciation
and section_type in wxr.config.OTHER_SUBTITLES["pronunciation"]
):
extract_pronunciation(wxr, page_data, level_node)
extract_pronunciation(wxr, page_data, level_node, base_data)
elif (
wxr.config.capture_linkages
and section_type in wxr.config.LINKAGE_SUBTITLES
Expand Down Expand Up @@ -104,7 +107,8 @@ def process_pos_block(
pos_title: str,
):
pos_type = wxr.config.POS_SUBTITLES[pos_argument]["pos"]
append_base_data(page_data, "pos", pos_type, base_data)
page_data.append(copy.deepcopy(base_data))
page_data[-1]["pos"] = pos_type
page_data[-1]["pos_title"] = pos_title
child_nodes = list(pos_title_node.filter_empty_str_child())
form_line_start = 0 # Ligne de forme
Expand Down Expand Up @@ -163,27 +167,17 @@ def parse_page(
# https://fr.wiktionary.org/wiki/Modèle:langue
# https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
if subtitle_template.template_name == "langue":
categories_and_links = defaultdict(list)
base_data = defaultdict(list, {"word": wxr.wtp.title})
lang_code = subtitle_template.template_parameters.get(1)
if (
wxr.config.capture_language_codes is not None
and lang_code not in wxr.config.capture_language_codes
):
continue
lang_name = clean_node(
wxr, categories_and_links, subtitle_template
)
lang_name = clean_node(wxr, base_data, subtitle_template)
wxr.wtp.start_section(lang_name)
base_data = defaultdict(
list,
{
"lang_name": lang_name,
"lang_code": lang_code,
"word": wxr.wtp.title,
},
)
base_data.update(categories_and_links)
page_data.append(copy.deepcopy(base_data))
base_data["lang_name"] = lang_name
base_data["lang_code"] = lang_code
etymology_data: Optional[EtymologyData] = None
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
new_etymology_data = parse_section(
Expand Down
16 changes: 11 additions & 5 deletions src/wiktextract/extractor/fr/pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,25 @@


def extract_pronunciation(
wxr: WiktextractContext, page_data: List[Dict], level_node: WikiNode
wxr: WiktextractContext,
page_data: List[Dict],
level_node: WikiNode,
base_data: Dict[str, str],
) -> None:
sound_data = []
lang_code = base_data.get("lang_code")
for list_node in level_node.find_child(NodeKind.LIST):
for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
sound_data.extend(
process_pron_list_item(
wxr, list_item_node, page_data, defaultdict(list)
wxr, list_item_node, defaultdict(list), lang_code
)
)

if len(sound_data) == 0:
return
if len(page_data) == 0:
page_data.append(deepcopy(base_data))

if level_node.kind == NodeKind.LEVEL3:
# Add extracted sound data to all sense dictionaries that have the same
Expand Down Expand Up @@ -53,10 +59,10 @@ def extract_pronunciation(
def process_pron_list_item(
wxr: WiktextractContext,
list_item_node: WikiNode,
page_data: List[Dict],
sound_data: Dict[str, Union[str, List[str]]],
lang_code: str,
) -> List[Dict[str, Union[str, List[str]]]]:
pron_key = "zh-pron" if page_data[-1].get("lang_code") == "zh" else "ipa"
pron_key = "zh-pron" if lang_code == "zh" else "ipa"

for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
if template_node.template_name in PRON_TEMPLATES:
Expand All @@ -81,7 +87,7 @@ def process_pron_list_item(
):
new_sound_data = deepcopy(sound_data)
process_pron_list_item(
wxr, nest_list_item, page_data, new_sound_data
wxr, nest_list_item, new_sound_data, lang_code
)
if pron_key in new_sound_data:
returned_data.append(new_sound_data)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from wiktextract.wxr_context import WiktextractContext


class TestFormLine(unittest.TestCase):
class TestFrGloss(unittest.TestCase):
def setUp(self) -> None:
self.wxr = WiktextractContext(
Wtp(lang_code="fr"), WiktionaryConfig(dump_file_lang_code="fr")
Expand Down Expand Up @@ -115,7 +115,7 @@ def test_zh_exemple_template(self):
root = self.wxr.wtp.parse(
"=== {{S|nom|zh}} ===\n# Cheval.\n{{zh-exemple|这匹'''马'''很大。|Ce cheval est grand.|Zhè pǐ '''mǎ''' hěn dà.<br/>⠌⠢⠆ ⠏⠊⠄ ⠍⠔⠄ ⠓⠴⠄ ⠙⠔⠆⠐⠆}}"
)
page_data = [defaultdict(list)]
page_data = []
process_pos_block(
self.wxr,
page_data,
Expand Down
17 changes: 12 additions & 5 deletions tests/test_fr_pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def test_pron_list(self):
root = self.wxr.wtp.parse(
"=== Prononciation ===\n* {{pron|bɔ̃.ʒuʁ|fr}}\n** {{écouter|France (Paris)|bõ.ʒuːʁ|audio=Fr-bonjour.ogg|lang=fr}}"
)
extract_pronunciation(self.wxr, page_data, root.children[0])
extract_pronunciation(self.wxr, page_data, root.children[0], {})
self.assertEqual(
page_data,
[
Expand Down Expand Up @@ -60,13 +60,18 @@ def test_pron_list(self):
)

def test_str_pron(self):
page_data = [defaultdict(list, {"lang_code": "zh"})]
page_data = []
self.wxr.wtp.add_page("Modèle:Yale-zh", 10, body="Yale")
self.wxr.wtp.start_page("")
root = self.wxr.wtp.parse(
"=== {{S|prononciation}} ===\n* '''cantonais''' {{pron||yue}}\n** {{Yale-zh}} : nei⁵hou²"
)
extract_pronunciation(self.wxr, page_data, root.children[0])
extract_pronunciation(
self.wxr,
page_data,
root.children[0],
defaultdict(list, {"lang_code": "zh"}),
)
self.assertEqual(
page_data[0].get("sounds"),
[{"tags": ["cantonais", "Yale"], "zh-pron": "nei⁵hou²"}],
Expand All @@ -78,14 +83,16 @@ def test_no_ipa(self):
files.
Test wikitext from https://fr.wiktionary.org/wiki/mars
"""
page_data = [defaultdict(list)]
page_data = []
self.wxr.wtp.start_page("")
root = self.wxr.wtp.parse(
"""=== {{S|prononciation}} ===
{{ébauche-pron|sv}}
* {{écouter|lang=sv|Suède||audio=LL-Q9027 (swe)-Moonhouse-mars.wav}}"""
)
extract_pronunciation(self.wxr, page_data, root.children[0])
extract_pronunciation(
self.wxr, page_data, root.children[0], defaultdict(list)
)
self.assertEqual(
page_data[0].get("sounds"),
[
Expand Down