Skip to content

Commit

Permalink
Merge pull request #857 from xxyzz/ko
Browse files Browse the repository at this point in the history
[ko] extract some sound templates
  • Loading branch information
xxyzz authored Oct 8, 2024
2 parents 3500e78 + 761dffe commit 0c43779
Show file tree
Hide file tree
Showing 6 changed files with 201 additions and 2 deletions.
16 changes: 16 additions & 0 deletions src/wiktextract/extractor/ko/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,21 @@ class Sense(KoreanBaseModel):
examples: list[Example] = []


class Sound(KoreanBaseModel):
    # One pronunciation record attached to a word entry. Every field has an
    # empty default, so an instance only carries the data that was actually
    # found for it.
    ipa: str = Field(default="", description="International Phonetic Alphabet")
    audio: str = Field(default="", description="Audio file name")
    # URLs of the audio file in individual formats. Presumably filled in by
    # the shared set_sound_file_url_fields() helper — confirm there.
    wav_url: str = ""
    oga_url: str = ""
    ogg_url: str = ""
    mp3_url: str = ""
    opus_url: str = ""
    flac_url: str = ""
    tags: list[str] = []
    # Labels copied as-is from the wiki markup (template arguments or the
    # text of HTML tags in the expanded template).
    raw_tags: list[str] = []
    # Text of the "Kore"-class span in an expanded ko-IPA template.
    hangul: str = ""
    # Text of an "IPA"-class table cell in the ko-IPA romanization table.
    roman: str = ""


class WordEntry(KoreanBaseModel):
model_config = ConfigDict(title="Korean Wiktionary")
word: str = Field(description="Word string", min_length=1)
Expand All @@ -42,3 +57,4 @@ class WordEntry(KoreanBaseModel):
tags: list[str] = []
raw_tags: list[str] = []
etymology_text: str = ""
sounds: list[Sound] = []
9 changes: 7 additions & 2 deletions src/wiktextract/extractor/ko/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template

PANEL_TEMPLATES = set()
PANEL_PREFIXES = set()
Expand Down Expand Up @@ -65,8 +66,12 @@ def parse_language_section(
pos="unknown",
)
extract_section_categories(wxr, page_data, base_data, level2_node)
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
parse_section(wxr, page_data, base_data, level3_node)
for t_node in level2_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name in SOUND_TEMPLATES:
extract_sound_template(wxr, base_data, t_node)

for next_level in level2_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)

# no POS section
if len(page_data) == pre_data_len:
Expand Down
5 changes: 5 additions & 0 deletions src/wiktextract/extractor/ko/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .example import extract_example_list_item
from .models import Sense, WordEntry
from .section_titles import POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template


def extract_pos_section(
Expand All @@ -23,6 +24,10 @@ def extract_pos_section(
page_data[-1].pos = pos_data["pos"]
page_data[-1].tags.extend(pos_data.get("tags", []))

for t_node in level_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name in SOUND_TEMPLATES:
extract_sound_template(wxr, page_data[-1], t_node)

for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
if list_node.sarg.endswith("#"):
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/ko/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@
"접미사": {"pos": "suffix", "tags": ["morpheme"]},
"접두사": {"pos": "prefix", "tags": ["morpheme"]},
"의미": {"pos": "unknown"},
"타동사": {"pos": "verb", "tags": ["transitive"]},
}
95 changes: 95 additions & 0 deletions src/wiktextract/extractor/ko/sound.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from wikitextprocessor import NodeKind, TemplateNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..share import set_sound_file_url_fields
from .models import Sound, WordEntry

# Names of the pronunciation templates that extract_sound_template() knows
# how to handle; callers test membership in this set before calling it.
SOUND_TEMPLATES = frozenset(["발음 듣기", "IPA", "ko-IPA"])


def extract_sound_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Route a pronunciation template to the extractor written for it.

    The extractor is chosen by ``node.template_name``; a template whose name
    has no extractor is silently ignored. The chosen extractor receives the
    same three arguments and appends its results to ``word_entry.sounds``.
    """
    extractors = {
        "발음 듣기": extract_listen_pronunciation_template,
        "IPA": extract_ipa_template,
        "ko-IPA": extract_ko_ipa_template,
    }
    extractor = extractors.get(node.template_name)
    if extractor is not None:
        extractor(wxr, word_entry, node)


def extract_listen_pronunciation_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Extract audio files from a "발음 듣기" (listen to pronunciation) template.

    Positional arguments are read as (file name, label) pairs: arguments
    1, 3, 5 and 7 name audio files and the argument right after each one is
    that file's label. Every non-empty file name produces one ``Sound``
    appended to ``word_entry.sounds``; a non-empty label is stored in that
    sound's ``raw_tags``.

    Reading stops at the first positional argument that is missing. A pair
    whose file name is empty is skipped together with its label, so a label
    is never attached to a sound that belongs to another pair or to a
    previously processed template.
    """
    # https://ko.wiktionary.org/wiki/틀:발음_듣기
    params = node.template_parameters
    for file_key in range(1, 9, 2):
        if file_key not in params:
            break
        label_key = file_key + 1
        has_label = label_key in params
        file_name = clean_node(wxr, None, params[file_key])
        if file_name != "":
            sound = Sound()
            # Shared helper (defined elsewhere) that fills the file-related
            # fields of the sound from the file name.
            set_sound_file_url_fields(wxr, file_name, sound)
            if has_label:
                label = clean_node(wxr, None, params[label_key])
                if label != "":
                    sound.raw_tags.append(label)
            word_entry.sounds.append(sound)
        if not has_label:
            # Keep the "stop at the first missing argument" rule.
            break


def extract_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Extract IPA transcriptions from an "IPA" template.

    Positional arguments are read as (IPA, label) pairs: arguments 1 and 3
    hold transcriptions and the argument right after each one is its label.
    Every non-empty transcription produces one ``Sound`` appended to
    ``word_entry.sounds``; a non-empty label is stored in that sound's
    ``raw_tags``.

    Reading stops at the first positional argument that is missing. A pair
    whose transcription is empty is skipped together with its label, so a
    label is never attached to a sound that belongs to another pair or to a
    previously processed template.
    """
    # https://ko.wiktionary.org/wiki/틀:IPA
    params = node.template_parameters
    for ipa_key in range(1, 5, 2):
        if ipa_key not in params:
            break
        label_key = ipa_key + 1
        has_label = label_key in params
        ipa = clean_node(wxr, None, params[ipa_key])
        if ipa != "":
            sound = Sound(ipa=ipa)
            if has_label:
                label = clean_node(wxr, None, params[label_key])
                if label != "":
                    sound.raw_tags.append(label)
            word_entry.sounds.append(sound)
        if not has_label:
            # Keep the "stop at the first missing argument" rule.
            break


def extract_ko_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Extract pronunciation data from the expanded output of "ko-IPA".

    The template is expanded and its HTML is walked: list items give IPA
    and hangul values, table rows give romanizations. Each result is
    appended to ``word_entry.sounds`` as a separate ``Sound``.
    """
    # https://ko.wiktionary.org/wiki/틀:ko-IPA
    # Re-parse the template's own wikitext with full expansion so the HTML
    # it generates can be searched below.
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # <ul>/<li>: one Sound per list item.
    for ul_tag in expanded_node.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            sound = Sound()
            # Only the first <i> tag is used as the raw tag of this item.
            for i_tag in li_tag.find_html("i"):
                sound.raw_tags.append(clean_node(wxr, None, i_tag))
                break
            for span_tag in li_tag.find_html("span"):
                # Exact comparison: a span only matches when its class
                # attribute is exactly "IPA" or "Kore".
                span_class = span_tag.attrs.get("class", "")
                if span_class == "IPA":
                    sound.ipa = clean_node(wxr, None, span_tag)
                elif span_class == "Kore":
                    sound.hangul = clean_node(wxr, None, span_tag)
            # Items that produced neither value are dropped.
            if sound.hangul != "" or sound.ipa != "":
                word_entry.sounds.append(sound)

    # <table>/<tr>: one Sound per row that has a romanization cell.
    for table in expanded_node.find_html("table"):
        for tr_tag in table.find_html("tr"):
            sound = Sound()
            for th_tag in tr_tag.find_html("th"):
                for span_tag in th_tag.find_html("span"):
                    sound.raw_tags.append(clean_node(wxr, None, span_tag))
                    # NOTE(review): assumed to end the <span> loop (first
                    # span only); the nesting level of this break should be
                    # confirmed against the repository.
                    break
            # The first <td class="IPA"> cell holds the romanized form.
            for td_tag in tr_tag.find_html(
                "td", attr_name="class", attr_value="IPA"
            ):
                sound.roman = clean_node(wxr, None, td_tag)
                break
            # Rows without a romanization (e.g. header rows) are dropped.
            if sound.roman != "":
                word_entry.sounds.append(sound)

    # Top-level links are cleaned with word_entry passed in (not None as
    # above), presumably so that category links emitted by the template are
    # recorded on the entry — confirm in clean_node.
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)
77 changes: 77 additions & 0 deletions tests/test_ko_sound.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.ko.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestKoSound(TestCase):
    # Checks that pronunciation templates on Korean Wiktionary pages end up
    # in the "sounds" list of the parsed page data.
    maxDiff = None

    def setUp(self) -> None:
        # Fresh context for every test, configured for the Korean edition.
        wtp = Wtp(lang_code="ko")
        config = WiktionaryConfig(
            dump_file_lang_code="ko",
            capture_language_codes=None,
        )
        self.wxr = WiktextractContext(wtp, config)

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def test_common_sound_templates(self):
        # Audio and IPA templates placed directly under the language header.
        wikitext = """== 영어 ==
{{발음 듣기|en-uk-answer.ogg|영국|en-us-answer.ogg|미국}}
{{IPA|ˈɑːn.sə(ɹ)|영|ˈæn.sɚ|미}}
==== 타동사 ====
# [[대답하다]], [[대꾸하다]]."""
        data = parse_page(self.wxr, "answer", wikitext)
        entry = data[0]
        sounds = entry["sounds"]

        # The two audio files come first, each with its own label.
        self.assertEqual(sounds[0]["audio"], "en-uk-answer.ogg")
        self.assertEqual(sounds[0]["raw_tags"], ["영국"])
        self.assertEqual(sounds[1]["audio"], "en-us-answer.ogg")
        self.assertEqual(sounds[1]["raw_tags"], ["미국"])

        # The IPA transcriptions follow.
        expected_ipa_sounds = [
            {"ipa": "ˈɑːn.sə(ɹ)", "raw_tags": ["영"]},
            {"ipa": "ˈæn.sɚ", "raw_tags": ["미"]},
        ]
        self.assertEqual(sounds[2:], expected_ipa_sounds)

        expected_glosses = ["대답하다, 대꾸하다."]
        self.assertEqual(entry["senses"][0]["glosses"], expected_glosses)

    def test_ko_ipa_template(self):
        # Register a canned expansion of the template so the extractor has
        # HTML to walk.
        template_body = """<ul><li>(<i>[[w:대한민국 표준어|표준어]]/[[w:경기 방언|서울]]</i>) [[w:국제 음성 기호|IPA]]<sup>([[위키낱말사전:국제 음성 기호|표기]])</sup>: <span class="IPA">[ka̠]</span></li><li class="ko-pron__ph">발음: <span class="Kore" lang="ko">[<span>가</span>]</span></li></ul><table><tr><th colspan="2">로마자 표기 목록</th></tr><tr><th>[[부록:로마자 표기법/국어|국어의 로마자 표기]]<br/><span>Revised Romanization</span></th><td class="IPA">ga</td></tr></table>[[분류:한국어 IPA 발음이 포함된 낱말]]"""
        self.wxr.wtp.add_page("틀:ko-IPA", 10, template_body)

        wikitext = """== 한국어 ==
{{ko-IPA}}
=== 명사 ===
==== 명사 1 ====
# 어떤"""
        data = parse_page(self.wxr, "가", wikitext)
        entry = data[0]

        expected_sounds = [
            {"ipa": "[ka̠]", "raw_tags": ["표준어/서울"]},
            {"hangul": "[가]"},
            {"roman": "ga", "raw_tags": ["Revised Romanization"]},
        ]
        self.assertEqual(entry["sounds"], expected_sounds)

        expected_categories = ["한국어 IPA 발음이 포함된 낱말"]
        self.assertEqual(entry["categories"], expected_categories)

0 comments on commit 0c43779

Please sign in to comment.