Merge pull request #892 from xxyzz/ko
[ko] improve etymology and pos section code
xxyzz authored Oct 29, 2024
2 parents 658a856 + f3548cb commit fcb2cf5
Showing 9 changed files with 219 additions and 26 deletions.
17 changes: 12 additions & 5 deletions src/wiktextract/extractor/ko/etymology.py
@@ -8,14 +8,21 @@
def extract_etymology_section(
wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
text = clean_node(wxr, None, list_item.children)
if len(text) > 0:
word_entry.etymology_texts.append(text)
if len(word_entry.etymology_texts) > 0:
word_entry.etymology_texts.clear()
word_entry.categories.clear()

for list_node in level_node.find_child(NodeKind.LIST):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
text = clean_node(wxr, word_entry, list_item.children)
if len(text) > 0:
word_entry.etymology_texts.append(text)

if len(word_entry.etymology_texts) == 0: # no list
text = clean_node(
wxr, None, list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
wxr,
word_entry,
list(level_node.invert_find_child(LEVEL_KIND_FLAGS)),
)
if len(text) > 0:
word_entry.etymology_texts.append(text)
8 changes: 8 additions & 0 deletions src/wiktextract/extractor/ko/models.py
@@ -73,6 +73,12 @@ class Translation(KoreanBaseModel):
sense: str = ""


class Form(KoreanBaseModel):
form: str = ""
tags: list[str] = []
raw_tags: list[str] = []


class WordEntry(KoreanBaseModel):
model_config = ConfigDict(title="Korean Wiktionary")
word: str = Field(description="Word string", min_length=1)
@@ -92,3 +92,5 @@ class WordEntry(KoreanBaseModel):
antonyms: list[Linkage] = []
translations: list[Translation] = []
etymology_texts: list[str] = []
note: str = ""
forms: list[Form] = []
6 changes: 5 additions & 1 deletion src/wiktextract/extractor/ko/page.py
@@ -67,7 +67,11 @@ def parse_section(
)
elif title_text == "어원":
extract_etymology_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr,
page_data[-1]
if len(page_data) > 0 and len(page_data[-1].etymology_texts) == 0
else base_data,
level_node,
)
elif title_text in ["참고 문헌", "독음", "자원"]:
pass # ignore
93 changes: 85 additions & 8 deletions src/wiktextract/extractor/ko/pos.py
@@ -1,6 +1,12 @@
import re

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
from wikitextprocessor import (
HTMLNode,
LevelNode,
NodeKind,
TemplateNode,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
@@ -10,7 +16,7 @@
extract_linkage_list_item,
extract_linkage_template,
)
from .models import AltForm, Sense, WordEntry
from .models import AltForm, Form, Sense, WordEntry
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import SOUND_TEMPLATES, extract_sound_template
from .tags import translate_raw_tags
@@ -53,6 +59,8 @@ def extract_pos_section(
if len(page_data[-1].senses) > 0
else "",
)
elif node.template_name in HEADER_TEMPLATES:
extract_header_template(wxr, page_data[-1], node)
elif node.kind == NodeKind.LIST:
for list_item in node.find_child(NodeKind.LIST_ITEM):
if node.sarg.startswith("#"):
@@ -131,14 +139,20 @@ def extract_unorderd_list_item(
break
elif (
isinstance(node, str)
and ("참고:" in node or "참조:" in node)
and len(word_entry.senses) > 0
and re.search(r"(?:참고|참조|활용):", node) is not None
):
sense = word_entry.senses[-1]
sense.note = node[node.index(":") + 1 :].strip()
sense.note += clean_node(
wxr, sense, list_item.children[index + 1 :]
note_str = node[node.index(":") + 1 :].strip()
note_str += clean_node(
wxr,
word_entry.senses[-1]
if len(word_entry.senses) > 0
else word_entry,
list_item.children[index + 1 :],
)
if len(word_entry.senses) > 0:
word_entry.senses[-1].note = note_str
else:
word_entry.note = note_str
break
elif (
isinstance(node, str)
@@ -163,3 +177,66 @@ def extract_form_of_template(
word = clean_node(wxr, None, t_node.template_parameters.get(word_arg, ""))
if len(word) > 0:
sense.form_of.append(AltForm(word=word))


HEADER_TEMPLATES = frozenset(
[
"ko-verb",
"한국어 동사",
"ko-noun",
"한국어 명사",
"ko-proper noun",
"한국어 고유명사",
]
)


def extract_header_template(
wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
if t_node.template_name in ["ko-verb", "한국어 동사"]:
extract_ko_verb_template(wxr, word_entry, t_node)
elif t_node.template_name in [
"ko-noun",
"한국어 명사",
"ko-proper noun",
"한국어 고유명사",
]:
extract_ko_noun_template(wxr, word_entry, t_node)


def extract_ko_verb_template(
wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
# https://ko.wiktionary.org/wiki/틀:한국어_동사
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
clean_node(wxr, word_entry, expanded_node)
for top_span_tag in expanded_node.find_html(
"span", attr_name="class", attr_value="headword-line"
):
raw_tag = ""
for node in top_span_tag.children:
if isinstance(node, str):
if "(" in node:
raw_tag = node[node.rindex("(") + 1 :].strip(", ")
else:
raw_tag = node.strip(", ")
elif isinstance(node, HTMLNode) and node.tag == "b":
form = Form(form=clean_node(wxr, None, node))
if raw_tag != "":
form.raw_tags.append(raw_tag)
if form.form != "":
translate_raw_tags(form)
word_entry.forms.append(form)


def extract_ko_noun_template(
wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
# https://ko.wiktionary.org/wiki/틀:한국어_명사
# https://ko.wiktionary.org/wiki/틀:한국어_고유명사
hanja = clean_node(wxr, None, t_node.template_parameters.get("한자", ""))
if hanja != "":
word_entry.forms.append(Form(form=hanja, tags=["hanja"]))
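A rough standalone sketch (not part of this commit) of the pairing performed in extract_ko_verb_template above: plain text between the bold nodes supplies a raw tag (the part after the last "(", or between commas), and each bold form picks up the most recent raw tag. The (text, is_bold) token list below is a hypothetical stand-in for top_span_tag.children.

# Hypothetical stand-in for the expanded headword-line children.
tokens = [
    ("없다 (eopda) (부정사형 ", False),
    ("없어", True),
    (", 연결어미형 ", False),
    ("없으니", True),
    (")", False),
]

forms = []
raw_tag = ""
for text, is_bold in tokens:
    if not is_bold:
        # Text after the last "(" (or between commas) names the following form.
        raw_tag = (text[text.rindex("(") + 1 :] if "(" in text else text).strip(", ")
    elif text != "":
        form = {"form": text}
        if raw_tag != "":
            form["raw_tags"] = [raw_tag]
        forms.append(form)

# forms == [{"form": "없어", "raw_tags": ["부정사형"]},
#           {"form": "없으니", "raw_tags": ["연결어미형"]}]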
6 changes: 6 additions & 0 deletions src/wiktextract/extractor/ko/sound.py
@@ -4,6 +4,7 @@
from ...wxr_context import WiktextractContext
from ..share import set_sound_file_url_fields
from .models import Sound, WordEntry
from .tags import translate_raw_tags

SOUND_TEMPLATES = frozenset(["발음 듣기", "IPA", "ko-IPA", "ja-pron"])

@@ -44,6 +45,7 @@ def extract_listen_pronunciation_template(
word_entry.sounds.append(sound)
elif len(word_entry.sounds) > 0:
word_entry.sounds[-1].raw_tags.append(value)
translate_raw_tags(word_entry.sounds[-1])


def extract_ipa_template(
@@ -61,6 +63,7 @@ def extract_ipa_template(
word_entry.sounds.append(sound)
elif len(word_entry.sounds) > 0:
word_entry.sounds[-1].raw_tags.append(value)
translate_raw_tags(word_entry.sounds[-1])


def extract_ko_ipa_template(
@@ -83,6 +86,7 @@
elif span_class == "Kore":
sound.hangul = clean_node(wxr, None, span_tag)
if sound.hangul != "" or sound.ipa != "":
translate_raw_tags(sound)
word_entry.sounds.append(sound)

for table in expanded_node.find_html("table"):
@@ -98,6 +102,7 @@
sound.roman = clean_node(wxr, None, td_tag)
break
if sound.roman != "":
translate_raw_tags(sound)
word_entry.sounds.append(sound)

for link_node in expanded_node.find_child(NodeKind.LINK):
@@ -127,5 +132,6 @@ def extract_ja_pron_template(
elif span_class == "IPA":
sound.ipa = clean_node(wxr, None, span_tag)
if sound.ipa != "" or sound.roman != "":
translate_raw_tags(sound)
word_entry.sounds.append(sound)
clean_node(wxr, word_entry, expanded_node)
35 changes: 33 additions & 2 deletions src/wiktextract/extractor/ko/tags.py
@@ -16,7 +16,34 @@
"타동사": "transitive",
}

TAGS = {**GLOSS_TAGS}
SOUND_TAGS = {
# 틀:ko-IPA
"Revised Romanization": ["revised", "romanization"],
"Revised Romanization (translit.)": [
"revised",
"romanization",
"transliteration",
],
"McCune-Reischauer": "McCune-Reischauer",
"Yale Romanization": ["Yale", "romanization"],
# 틀:ja-pron
"도쿄": "Tokyo",
# 틀:발음 듣기, 틀:IPA
"영국": "UK",
"미국": "US",
"영": "UK",
"미": "US",
}

HEADER_TAGS = {
# 틀:한국어_동사
"부정사형": "infinitive",
"연결어미형": "sequential",
"명사형": "noun",
"사동사": "causative",
}

TAGS = {**GLOSS_TAGS, **SOUND_TAGS, **HEADER_TAGS}

TOPICS = {
"금융": "finance",
@@ -47,7 +74,11 @@ def translate_raw_tags(data: WordEntry) -> None:
raw_tags = []
for raw_tag in data.raw_tags:
if raw_tag in TAGS:
data.tags.append(TAGS[raw_tag])
tr_tag = TAGS[raw_tag]
if isinstance(tr_tag, str):
data.tags.append(tr_tag)
elif isinstance(tr_tag, list):
data.tags.extend(tr_tag)
elif raw_tag in TOPICS:
data.topics.append(TOPICS[raw_tag])
else:
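For reference, a minimal standalone sketch (not part of this commit) of the str-or-list lookup that translate_raw_tags now performs; TAGS here is a small stand-in for the merged dictionary above.

# Small stand-in for the merged TAGS dictionary above.
TAGS = {
    "도쿄": "Tokyo",  # one raw tag -> one tag
    "Revised Romanization": ["revised", "romanization"],  # one raw tag -> several tags
}


def translate(raw_tags: list[str]) -> tuple[list[str], list[str]]:
    tags: list[str] = []
    leftover: list[str] = []
    for raw_tag in raw_tags:
        tr_tag = TAGS.get(raw_tag)
        if isinstance(tr_tag, str):
            tags.append(tr_tag)
        elif isinstance(tr_tag, list):
            tags.extend(tr_tag)
        else:
            leftover.append(raw_tag)  # untranslated values stay as raw_tags
    return tags, leftover


# translate(["Revised Romanization", "표준어/서울"])
# -> (["revised", "romanization"], ["표준어/서울"])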
24 changes: 24 additions & 0 deletions tests/test_ko_etymology.py
@@ -54,3 +54,27 @@ def test_no_list(self):
self.assertEqual(
data[0]["etymology_texts"], ["아기의 이 모음 역행 동화"]
)

def test_not_include_subsection_lists(self):
data = parse_page(
self.wxr,
"병신",
"""== 한국어 ==
=== 어원 1 ===
* 욕설로 사용되는 용례는 1950년대부터 확인됨.
==== 명사 ====
# 다쳐서 몸이 온전하지 못하거나 혹은 태어나면서부터 기형의 몸을 가진 사람.
=== 어원 2 ===
* <span class="etyl">[[w:한문|한문]][[Category:한국어 terms borrowed from 한문|없다]]</span> <i class="Hant mention" lang="xzh">[[丙申|丙申]]</i>.
==== 명사 ====
# 육십 간지 가운데 하나.""",
)
self.assertEqual(
data[0]["etymology_texts"],
["욕설로 사용되는 용례는 1950년대부터 확인됨."],
)
self.assertEqual(data[1]["etymology_texts"], ["한문 丙申."])
self.assertEqual(
data[1]["categories"], ["한국어 terms borrowed from 한문"]
)
40 changes: 40 additions & 0 deletions tests/test_ko_gloss.py
@@ -136,3 +136,43 @@ def test_label_template(self):
"glosses": ["열매가 맺히다"],
},
)

def test_note_list_above_gloss_list(self):
data = parse_page(
self.wxr,
"놓치다",
"""== 한국어 ==
=== 명사 ===
*활용: 놓치어(놓쳐), 놓치니
# 손에 잡거나 쥐고 있던 것을 잘못하여 놓아 버리다.
# 일을 하기에 적절한 때나 기회를 그냥 보내다.""",
)
self.assertEqual(data[0]["note"], "놓치어(놓쳐), 놓치니")

def test_ko_verb(self):
self.wxr.wtp.add_page(
"틀:ko-verb",
10,
"""<span class="headword-line"><strong class="Kore headword" lang="ko">없다</strong> (<span lang="ko-Latn" class="headword-tr tr Latn" dir="ltr">eopda</span>) (부정사형 <b class="None" lang="ko">[[없어#한국어|없어]]</b>[[Category:한국어 비표준 문자가 포함된 낱말 (링크)|없다]], 연결어미형 <b class="None" lang="ko">[[없으니#한국어|없으니]]</b>, 명사형 <b class="None" lang="ko">[[없음#한국어|없음]]</b>, 사동사 <b class="None" lang="ko">[[없애다#한국어|없애다]]</b>)</span>[[Category:한국어 동사|없다]]""",
)
data = parse_page(
self.wxr,
"없다",
"""== 한국어 ==
=== 형용사 ===
{{ko-verb|nm=없음|cv=없애다}}
# 대상이 실제로 존재하지 않는 상태이다.""",
)
self.assertEqual(
data[0]["forms"],
[
{"form": "없어", "tags": ["infinitive"]},
{"form": "없으니", "tags": ["sequential"]},
{"form": "없음", "tags": ["noun"]},
{"form": "없애다", "tags": ["causative"]},
],
)
self.assertEqual(
data[0]["categories"],
["한국어 비표준 문자가 포함된 낱말 (링크)", "한국어 동사"],
)
16 changes: 6 additions & 10 deletions tests/test_ko_sound.py
@@ -34,14 +34,14 @@ def test_common_sound_templates(self):
# [[대답하다]], [[대꾸하다]].""",
)
self.assertEqual(data[0]["sounds"][0]["audio"], "en-uk-answer.ogg")
self.assertEqual(data[0]["sounds"][0]["raw_tags"], ["영국"])
self.assertEqual(data[0]["sounds"][0]["tags"], ["UK"])
self.assertEqual(data[0]["sounds"][1]["audio"], "en-us-answer.ogg")
self.assertEqual(data[0]["sounds"][1]["raw_tags"], ["미국"])
self.assertEqual(data[0]["sounds"][1]["tags"], ["US"])
self.assertEqual(
data[0]["sounds"][2:],
[
{"ipa": "ˈɑːn.sə(ɹ)", "raw_tags": [""]},
{"ipa": "ˈæn.sɚ", "raw_tags": [""]},
{"ipa": "ˈɑːn.sə(ɹ)", "tags": ["UK"]},
{"ipa": "ˈæn.sɚ", "tags": ["US"]},
],
)
self.assertEqual(
@@ -69,7 +69,7 @@ def test_ko_ipa_template(self):
[
{"ipa": "[ka̠]", "raw_tags": ["표준어/서울"]},
{"hangul": "[가]"},
{"roman": "ga", "raw_tags": ["Revised Romanization"]},
{"roman": "ga", "tags": ["revised", "romanization"]},
],
)
self.assertEqual(
@@ -94,11 +94,7 @@ def test_ja_pron(self):
self.assertEqual(
data[0]["sounds"],
[
{
"roman": "[tóꜜòzàì]",
"other": "とーざい",
"raw_tags": ["도쿄"],
},
{"roman": "[tóꜜòzàì]", "other": "とーざい", "tags": ["Tokyo"]},
{"ipa": "[to̞ːza̠i]"},
],
)