Skip to content

Commit

Permalink
Merge pull request #439 from xxyzz/fr
Browse files Browse the repository at this point in the history
Extract "Paronymes" section under the "Prononciation" section
  • Loading branch information
xxyzz authored Dec 27, 2023
2 parents 82fc4d5 + c39c4ed commit 8556c0b
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 6 deletions.
19 changes: 13 additions & 6 deletions src/wiktextract/extractor/fr/pronunciation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode
from wiktextract.extractor.share import create_audio_url_dict
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand All @@ -15,11 +15,18 @@ def extract_pronunciation(
) -> None:
sound_data = []
lang_code = base_data.lang_code
for list_node in level_node.find_child(NodeKind.LIST):
for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
sound_data.extend(
process_pron_list_item(wxr, list_item_node, Sound(), lang_code)
)
for node in level_node.find_child(NodeKind.LIST | LEVEL_KIND_FLAGS):
if node.kind == NodeKind.LIST:
for list_item_node in node.find_child(NodeKind.LIST_ITEM):
sound_data.extend(
process_pron_list_item(
wxr, list_item_node, Sound(), lang_code
)
)
else:
from .page import parse_section

parse_section(wxr, page_data, base_data, node)

if len(sound_data) == 0:
return
Expand Down
31 changes: 31 additions & 0 deletions tests/test_fr_pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,34 @@ def test_no_ipa(self):
"mp3_url": "https://upload.wikimedia.org/wikipedia/commons/transcoded/3/3f/LL-Q9027_(swe)-Moonhouse-mars.wav/LL-Q9027_(swe)-Moonhouse-mars.wav.mp3",
},
)

def test_paronymes_subsection(self):
    """Extract a "paronymes" subsection nested under "prononciation".

    The parsed wikitext holds a level-3 pronunciation section with one
    ``pron`` list item, followed by a level-4 "paronymes" subsection
    listing two linked words. The first entry appended to the page data
    is expected to carry both the paronyms and the IPA sound.
    """
    # Section layout taken from https://fr.wiktionary.org/wiki/wagonnet
    collected = []
    wtp = self.wxr.wtp
    # Stub of the "pron" template: wraps its first argument in backslashes.
    wtp.add_page("Modèle:pron", 10, body="\\{{{1|}}}\\")
    wtp.start_page("wagonnet")
    wikitext = """=== {{S|prononciation}} ===
* {{pron|va.ɡɔ.nɛ|fr}}
==== {{S|paronymes}} ====
* [[wagonnée]]
* [[wagonnier]]
"""
    pron_section = wtp.parse(wikitext).children[0]
    base_entry = WordEntry(
        word="wagonnet", lang_code="fr", lang_name="Français"
    )
    extract_pronunciation(self.wxr, collected, pron_section, base_entry)

    expected = {
        "word": "wagonnet",
        "lang_code": "fr",
        "lang_name": "Français",
        "paronyms": [{"word": "wagonnée"}, {"word": "wagonnier"}],
        "sounds": [{"ipa": "\\va.ɡɔ.nɛ\\"}],
    }
    self.assertEqual(
        collected[0].model_dump(exclude_defaults=True), expected
    )

0 comments on commit 8556c0b

Please sign in to comment.