-
Notifications
You must be signed in to change notification settings - Fork 88
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #857 from xxyzz/ko
[ko] extract some sound templates
- Loading branch information
Showing
6 changed files
with
201 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
from wikitextprocessor import NodeKind, TemplateNode | ||
|
||
from ...page import clean_node | ||
from ...wxr_context import WiktextractContext | ||
from ..share import set_sound_file_url_fields | ||
from .models import Sound, WordEntry | ||
|
||
SOUND_TEMPLATES = frozenset(["발음 듣기", "IPA", "ko-IPA"]) | ||
|
||
|
||
def extract_sound_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Route a pronunciation template to its dedicated extractor.

    Only the template names "발음 듣기", "IPA" and "ko-IPA" are handled;
    a node with any other template name is ignored.
    """
    handlers = {
        "발음 듣기": extract_listen_pronunciation_template,
        "IPA": extract_ipa_template,
        "ko-IPA": extract_ko_ipa_template,
    }
    handler = handlers.get(node.template_name)
    if handler is not None:
        handler(wxr, word_entry, node)
|
||
|
||
def extract_listen_pronunciation_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Extract audio files from a "발음 듣기" template.

    Positional arguments 1 to 8 are read as (file, label) pairs. Each
    non-empty odd argument creates a new ``Sound`` that is filled in by
    ``set_sound_file_url_fields`` and appended to ``word_entry.sounds``.
    Each non-empty even argument is stored as a raw tag of the sound
    created from the odd argument of the same pair.

    Reading stops at the first missing positional argument.
    """
    # https://ko.wiktionary.org/wiki/틀:발음_듣기
    # Sound created by the odd argument of the current pair. Tracking it
    # locally keeps a label from being attached to a sound that belongs to
    # another pair or to a previously processed template.
    current_sound = None
    for key in range(1, 9):
        if key not in node.template_parameters:
            break
        value = clean_node(wxr, None, node.template_parameters[key])
        if key % 2 == 1:
            if value == "":
                # Empty file name: the label that follows has no owner.
                current_sound = None
                continue
            current_sound = Sound()
            set_sound_file_url_fields(wxr, value, current_sound)
            word_entry.sounds.append(current_sound)
        elif value != "" and current_sound is not None:
            current_sound.raw_tags.append(value)
|
||
|
||
def extract_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Extract IPA transcriptions from an "IPA" template.

    Positional arguments 1 to 4 are read as (transcription, label) pairs.
    Each non-empty odd argument becomes a new ``Sound`` with its ``ipa``
    field set and is appended to ``word_entry.sounds``. Each non-empty even
    argument is stored as a raw tag of the sound created from the odd
    argument of the same pair.

    Reading stops at the first missing positional argument.
    """
    # https://ko.wiktionary.org/wiki/틀:IPA
    # Sound created by the odd argument of the current pair. Tracking it
    # locally keeps a label from being attached to a sound that belongs to
    # another pair or to a previously processed template.
    current_sound = None
    for key in range(1, 5):
        if key not in node.template_parameters:
            break
        value = clean_node(wxr, None, node.template_parameters[key])
        if key % 2 == 1:
            if value == "":
                # Empty transcription: the label that follows has no owner.
                current_sound = None
                continue
            current_sound = Sound(ipa=value)
            word_entry.sounds.append(current_sound)
        elif value != "" and current_sound is not None:
            current_sound.raw_tags.append(value)
|
||
|
||
def extract_ko_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Extract pronunciation data from an expanded "ko-IPA" template.

    The template is converted back to wikitext, re-parsed with all
    templates expanded, and the resulting HTML is scanned in three passes:

    * every ``<li>`` of every ``<ul>`` may yield a sound carrying an IPA
      and/or hangul form, tagged with the text of its first ``<i>`` child;
    * every ``<tr>`` of every ``<table>`` may yield a sound carrying a
      romanization, tagged with the first ``<span>`` of each ``<th>``;
    * every LINK child of the expanded root is handed to ``clean_node``
      together with ``word_entry``.
    """
    # https://ko.wiktionary.org/wiki/틀:ko-IPA
    wikitext = wxr.wtp.node_to_wikitext(node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    # Pass 1: pronunciation list items.
    for list_tag in root.find_html("ul"):
        for item_tag in list_tag.find_html("li"):
            item_sound = Sound()
            # Only the first <i> element is used as the label.
            label_tag = next(iter(item_tag.find_html("i")), None)
            if label_tag is not None:
                item_sound.raw_tags.append(clean_node(wxr, None, label_tag))
            for span in item_tag.find_html("span"):
                css_class = span.attrs.get("class", "")
                if css_class == "IPA":
                    item_sound.ipa = clean_node(wxr, None, span)
                elif css_class == "Kore":
                    item_sound.hangul = clean_node(wxr, None, span)
            # Skip list items that produced neither a hangul nor an IPA form.
            if not (item_sound.hangul == "" and item_sound.ipa == ""):
                word_entry.sounds.append(item_sound)

    # Pass 2: romanization table rows.
    for table_tag in root.find_html("table"):
        for row_tag in table_tag.find_html("tr"):
            row_sound = Sound()
            for header_tag in row_tag.find_html("th"):
                # Only the first <span> of each header cell is used.
                name_tag = next(iter(header_tag.find_html("span")), None)
                if name_tag is not None:
                    row_sound.raw_tags.append(clean_node(wxr, None, name_tag))
            # Only the first <td class="IPA"> cell is used.
            roman_tag = next(
                iter(
                    row_tag.find_html(
                        "td", attr_name="class", attr_value="IPA"
                    )
                ),
                None,
            )
            if roman_tag is not None:
                row_sound.roman = clean_node(wxr, None, roman_tag)
            # Rows without a romanization cell (e.g. title rows) are dropped.
            if row_sound.roman != "":
                word_entry.sounds.append(row_sound)

    # Pass 3: top-level links, cleaned with word_entry as the target
    # (presumably so category links are recorded on the entry; confirm
    # against clean_node).
    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
from unittest import TestCase | ||
|
||
from wikitextprocessor import Wtp | ||
|
||
from wiktextract.config import WiktionaryConfig | ||
from wiktextract.extractor.ko.page import parse_page | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class TestKoSound(TestCase):
    """Check sound extraction from Korean Wiktionary pages."""

    maxDiff = None

    def setUp(self) -> None:
        """Create a fresh extraction context for the Korean edition."""
        wtp = Wtp(lang_code="ko")
        config = WiktionaryConfig(
            dump_file_lang_code="ko",
            capture_language_codes=None,
        )
        self.wxr = WiktextractContext(wtp, config)

    def tearDown(self) -> None:
        """Close the database connection opened by the context."""
        self.wxr.wtp.close_db_conn()

    def test_common_sound_templates(self):
        """The "발음 듣기" and "IPA" templates yield audio and IPA sounds."""
        page_text = """== 영어 ==
{{발음 듣기|en-uk-answer.ogg|영국|en-us-answer.ogg|미국}}
{{IPA|ˈɑːn.sə(ɹ)|영|ˈæn.sɚ|미}}
==== 타동사 ====
# [[대답하다]], [[대꾸하다]]."""
        page_data = parse_page(self.wxr, "answer", page_text)
        self.assertEqual(
            page_data[0]["sounds"][0]["audio"], "en-uk-answer.ogg"
        )
        self.assertEqual(page_data[0]["sounds"][0]["raw_tags"], ["영국"])
        self.assertEqual(
            page_data[0]["sounds"][1]["audio"], "en-us-answer.ogg"
        )
        self.assertEqual(page_data[0]["sounds"][1]["raw_tags"], ["미국"])
        expected_ipa_sounds = [
            {"ipa": "ˈɑːn.sə(ɹ)", "raw_tags": ["영"]},
            {"ipa": "ˈæn.sɚ", "raw_tags": ["미"]},
        ]
        self.assertEqual(page_data[0]["sounds"][2:], expected_ipa_sounds)
        self.assertEqual(
            page_data[0]["senses"][0]["glosses"], ["대답하다, 대꾸하다."]
        )

    def test_ko_ipa_template(self):
        """The expanded "ko-IPA" template yields IPA, hangul and roman data."""
        template_body = """<ul><li>(<i>[[w:대한민국 표준어|표준어]]/[[w:경기 방언|서울]]</i>) [[w:국제 음성 기호|IPA]]<sup>([[위키낱말사전:국제 음성 기호|표기]])</sup>: <span class="IPA">[ka̠]</span></li><li class="ko-pron__ph">발음: <span class="Kore" lang="ko">[<span>가</span>]</span></li></ul><table><tr><th colspan="2">로마자 표기 목록</th></tr><tr><th>[[부록:로마자 표기법/국어|국어의 로마자 표기]]<br/><span>Revised Romanization</span></th><td class="IPA">ga</td></tr></table>[[분류:한국어 IPA 발음이 포함된 낱말]]"""
        self.wxr.wtp.add_page("틀:ko-IPA", 10, template_body)
        page_text = """== 한국어 ==
{{ko-IPA}}
=== 명사 ===
==== 명사 1 ====
# 어떤"""
        page_data = parse_page(self.wxr, "가", page_text)
        expected_sounds = [
            {"ipa": "[ka̠]", "raw_tags": ["표준어/서울"]},
            {"hangul": "[가]"},
            {"roman": "ga", "raw_tags": ["Revised Romanization"]},
        ]
        self.assertEqual(page_data[0]["sounds"], expected_sounds)
        self.assertEqual(
            page_data[0]["categories"], ["한국어 IPA 발음이 포함된 낱말"]
        )