Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ko] improve extract gloss list and example list code #896

Merged
Merged 3 commits on Nov 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 31 additions & 4 deletions src/wiktextract/extractor/ko/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .models import Example, Sense
from ..share import set_sound_file_url_fields
from .models import Example, Sense, Sound


def extract_example_list_item(
Expand All @@ -14,6 +15,8 @@ def extract_example_list_item(
parent_example: Example | None = None,
) -> None:
example = Example() if parent_example is None else parent_example
e_text_nodes = []
e_tr_nodes = []
after_lang_template = False
for node in list_item.children:
if isinstance(node, TemplateNode) and node.template_name == "lang":
Expand All @@ -33,11 +36,31 @@ def extract_example_list_item(
extract_ux_template(wxr, sense, example, node)
break
elif after_lang_template:
example.translation += clean_node(wxr, None, node)
e_tr_nodes.append(node)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
break
elif (
isinstance(node, WikiNode)
and node.kind == NodeKind.LINK
and len(node.largs) > 0
and len(node.largs[0]) > 0
and isinstance(node.largs[0][0], str)
and node.largs[0][0].startswith("File:")
):
sound = Sound()
sound_file = node.largs[0][0].removeprefix("File:").strip()
set_sound_file_url_fields(wxr, sound_file, sound)
if sound.audio != "":
example.sounds.append(sound)
else:
example.text += clean_node(wxr, None, node)
e_text_nodes.append(node)

e_text = clean_node(wxr, sense, e_text_nodes)
if e_text != "":
example.text = e_text
e_tr = clean_node(wxr, sense, e_tr_nodes)
if e_tr != "":
example.translation = e_tr

if len(example.text) > 0:
if lang_code == "zh" and "/" in example.text:
Expand All @@ -56,7 +79,11 @@ def extract_example_list_item(
for nested_list in list_item.find_child(NodeKind.LIST):
for nested_list_item in nested_list.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(
wxr, sense, nested_list_item, lang_code, example
wxr,
sense,
nested_list_item,
lang_code,
example if example.text == "" else Example(),
)


Expand Down
37 changes: 21 additions & 16 deletions src/wiktextract/extractor/ko/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,22 @@ class KoreanBaseModel(BaseModel):
)


class Sound(KoreanBaseModel):
ipa: str = Field(default="", description="International Phonetic Alphabet")
audio: str = Field(default="", description="Audio file name")
wav_url: str = ""
oga_url: str = ""
ogg_url: str = ""
mp3_url: str = ""
opus_url: str = ""
flac_url: str = ""
tags: list[str] = []
raw_tags: list[str] = []
hangul: str = ""
roman: str = ""
other: str = ""


class Example(KoreanBaseModel):
text: str = ""
translation: str = ""
Expand All @@ -21,6 +37,7 @@ class Example(KoreanBaseModel):
tags: list[str] = []
literal_meaning: str = ""
note: str = ""
sounds: list[Sound] = []


class AltForm(KoreanBaseModel):
Expand All @@ -36,22 +53,7 @@ class Sense(KoreanBaseModel):
examples: list[Example] = []
note: str = ""
form_of: list[AltForm] = []


class Sound(KoreanBaseModel):
ipa: str = Field(default="", description="International Phonetic Alphabet")
audio: str = Field(default="", description="Audio file name")
wav_url: str = ""
oga_url: str = ""
ogg_url: str = ""
mp3_url: str = ""
opus_url: str = ""
flac_url: str = ""
tags: list[str] = []
raw_tags: list[str] = []
hangul: str = ""
roman: str = ""
other: str = ""
pattern: str = Field(default="", description="Sentence structure, 문형")


class Linkage(KoreanBaseModel):
Expand Down Expand Up @@ -100,3 +102,6 @@ class WordEntry(KoreanBaseModel):
etymology_texts: list[str] = []
note: str = ""
forms: list[Form] = []
pattern: str = Field(
default="", description="Sentence structure, 문형", exclude=True
)
11 changes: 10 additions & 1 deletion src/wiktextract/extractor/ko/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@ def extract_pos_section(
for list_item in node.find_child(NodeKind.LIST_ITEM):
if node.sarg.startswith("#") and node.sarg.endswith("#"):
extract_gloss_list_item(
wxr, page_data[-1], list_item, Sense()
wxr,
page_data[-1],
list_item,
Sense(pattern=page_data[-1].pattern),
)
else:
extract_unorderd_list_item(wxr, page_data[-1], list_item)
Expand Down Expand Up @@ -174,6 +177,12 @@ def extract_unorderd_list_item(
):
extract_linkage_list_item(wxr, word_entry, list_item, "")
break
elif isinstance(node, str) and "문형:" in node:
word_entry.pattern = node[node.index(":") + 1 :].strip()
word_entry.pattern += clean_node(
wxr, None, list_item.children[index + 1 :]
)
break
else:
if len(word_entry.senses) > 0:
extract_example_list_item(
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/ko/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
"연어": {"pos": "phrase", "tags": ["idiomatic"]},
"동사 활용형": {"pos": "verb", "tags": ["form-of"]},
"재귀동사": {"pos": "verb", "tags": ["reflexive"]},
"보조형용사": {"pos": "adj", "tags": ["auxiliary"]},
}

LINKAGE_SECTIONS = {
Expand Down
42 changes: 42 additions & 0 deletions tests/test_ko_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,45 @@ def test_jibong_yuseol_template(self):
"ref": "1614년, 이수광, 《지봉유설》, 〈2권 外國 條〉",
},
)

def test_sound_file(self):
data = parse_page(
self.wxr,
"사람",
"""== 중국어 ==
=== 명사 ===
==== 명사 1 ====
# 어떤 지역이나 시기에 태어나거나 살고 있거나 살았던 자.
:* 한국 '''사람''' [[File:Ko-한국 사람.oga]]""",
)
self.assertEqual(
data[0]["senses"][0]["examples"][0]["text"], "한국 사람"
)
self.assertEqual(
data[0]["senses"][0]["examples"][0]["sounds"][0]["audio"],
"Ko-한국 사람.oga",
)

def test_wrong_nested_list(self):
data = parse_page(
self.wxr,
"들다",
"""== 중국어 ==
=== 명사 ===
==== 명사 1 ====
# 한 곳에서 다른 어디로 또는 밖에서 속이나 안으로 향해 가거나, 오거나 또는 어디에 자리하다.
: 안으로 드시지요.
:* 물이 어디에 '''들어''' 있어요? [[File:물이 어디에 들어 있어요?.ogg]]""",
)
self.assertEqual(
data[0]["senses"][0]["examples"][0]["text"], "안으로 드시지요."
)
self.assertTrue("sounds" not in data[0]["senses"][0]["examples"][0])
self.assertEqual(
data[0]["senses"][0]["examples"][1]["text"],
"물이 어디에 들어 있어요?",
)
self.assertEqual(
data[0]["senses"][0]["examples"][1]["sounds"][0]["audio"],
"물이 어디에 들어 있어요?.ogg",
)
26 changes: 26 additions & 0 deletions tests/test_ko_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,3 +203,29 @@ def test_nested_gloss_lists(self):
},
],
)

def test_pattern_list(self):
data = parse_page(
self.wxr,
"대하다",
"""== 한국어 ==
=== 동사 ===
==== 동사 2 ====
*문형: […을] [(…과) …을]
# 마주 향하여 있다.
*문형: […에/에게 -게] […을 …으로] […을 -게]
# 어떤 태도로 상대하다.""",
)
self.assertEqual(
data[0]["senses"],
[
{
"glosses": ["마주 향하여 있다."],
"pattern": "[…을] [(…과) …을]",
},
{
"glosses": ["어떤 태도로 상대하다."],
"pattern": "[…에/에게 -게] […을 …으로] […을 -게]",
},
],
)