Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ko] improve gloss and linkage section code #895

Merged
merged 2 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions src/wiktextract/extractor/ko/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,31 @@ def extract_proverb_section(
break
if linkage.word != "":
word_entry.proverbs.append(linkage)
else:
for t_node in list_item.find_child(NodeKind.TEMPLATE):
if t_node.template_name in ["l", "연결"]:
extract_l_template(wxr, word_entry, t_node, "proverbs")


def extract_l_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Append the term linked by an ``l``/``연결`` template to *word_entry*.

    The word is read from positional argument 3, falling back to
    positional argument 2.  An argument that is present but cleans to an
    empty string is treated as missing, so ``{{l|ko|word|}}`` still yields
    ``word`` instead of being dropped.  The sense is read the same way from
    the ``t`` argument, falling back to positional argument 4.

    Nothing is appended when no non-empty word is found.

    :param wxr: extraction context passed through to ``clean_node``.
    :param word_entry: entry whose ``linkage_type`` list receives the result.
    :param t_node: the template node to read arguments from.
    :param linkage_type: name of the list attribute on *word_entry*
        (for example ``"proverbs"``).
    """
    # https://ko.wiktionary.org/wiki/틀:연결
    # https://en.wiktionary.org/wiki/Template:link
    params = t_node.template_parameters

    word = ""
    for word_arg in (3, 2):
        if word_arg in params:
            word = clean_node(wxr, None, params[word_arg])
            if word != "":
                break
    if word == "":
        return

    linkage = Linkage(word=word)
    for sense_arg in ("t", 4):
        if sense_arg in params:
            sense = clean_node(wxr, None, params[sense_arg])
            # Skip empty values so an empty ``t=`` does not hide argument 4.
            if sense != "":
                linkage.sense = sense
                break
    getattr(word_entry, linkage_type).append(linkage)
25 changes: 19 additions & 6 deletions src/wiktextract/extractor/ko/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ def extract_pos_section(
extract_header_template(wxr, page_data[-1], node)
elif node.kind == NodeKind.LIST:
for list_item in node.find_child(NodeKind.LIST_ITEM):
if node.sarg.startswith("#"):
extract_gloss_list_item(wxr, page_data[-1], list_item)
if node.sarg.startswith("#") and node.sarg.endswith("#"):
extract_gloss_list_item(
wxr, page_data[-1], list_item, Sense()
)
else:
extract_unorderd_list_item(wxr, page_data[-1], list_item)

Expand All @@ -73,19 +75,30 @@ def extract_pos_section(


def extract_gloss_list_item(
wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
wxr: WiktextractContext,
word_entry: WordEntry,
list_item: WikiNode,
parent_sense: Sense,
) -> None:
gloss_nodes = []
sense = Sense()
sense = parent_sense.model_copy(deep=True)
for node in list_item.children:
if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
gloss_text = clean_node(wxr, sense, gloss_nodes)
if len(gloss_text) > 0:
sense.glosses.append(gloss_text)
translate_raw_tags(sense)
word_entry.senses.append(sense)
gloss_nodes.clear()
for nested_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_unorderd_list_item(wxr, word_entry, nested_list_item)
if node.sarg.startswith("#") and node.sarg.endswith("#"):
extract_gloss_list_item(
wxr, word_entry, nested_list_item, sense
)
else:
extract_unorderd_list_item(
wxr, word_entry, nested_list_item
)
continue
elif isinstance(node, TemplateNode) and node.template_name.endswith(
" of"
Expand Down Expand Up @@ -127,7 +140,7 @@ def extract_unorderd_list_item(
if re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text):
new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
new_list_item.children = list_item.children[index + 1 :]
extract_gloss_list_item(wxr, word_entry, new_list_item)
extract_gloss_list_item(wxr, word_entry, new_list_item, Sense())
break
elif isinstance(node, str) and "어원:" in node:
etymology_nodes = []
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/ko/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,5 @@
"같이 보기": "related",
"복합어": "derived",
"관련 단어": "related",
"동의어": "synonyms",
}
5 changes: 5 additions & 0 deletions src/wiktextract/extractor/ko/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@
"자동사": "intransitive",
"직역": "literally",
"타동사": "transitive",
"드물게": "rare",
"원래의 의미": "naturally",
"문학적": "literary",
"해학적": "humorous",
"완곡적": "euphemistic",
}

SOUND_TAGS = {
Expand Down
27 changes: 27 additions & 0 deletions tests/test_ko_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,30 @@ def test_ko_verb(self):
data[0]["categories"],
["한국어 비표준 문자가 포함된 낱말 (링크)", "한국어 동사"],
)

def test_nested_gloss_lists(self):
    """A ``##`` item under a ``#`` item becomes its own sense.

    The nested sense's glosses start with the parent gloss, followed by
    the nested item's own gloss text with wiki links reduced to plain text.
    """
    wikitext = """== 한국어 ==
=== 어원 1 ===
==== 명사 ====
# 하는 짓이나 생각이 변변치 못한 사람을 낮잡아 이르는 말.
## 남에게 [[당하다|당하거나]] [[헌신하다|헌신하기만]] 하는 대상을 동정하거나, 혹은 그런 사람이 자신의 [[처지]]를 [[하소연하다|하소연할]] 때 사용하는 표현."""
    parent_gloss = "하는 짓이나 생각이 변변치 못한 사람을 낮잡아 이르는 말."
    nested_gloss = "남에게 당하거나 헌신하기만 하는 대상을 동정하거나, 혹은 그런 사람이 자신의 처지를 하소연할 때 사용하는 표현."
    expected_senses = [
        {"glosses": [parent_gloss]},
        {"glosses": [parent_gloss, nested_gloss]},
    ]

    parsed = parse_page(self.wxr, "병신", wikitext)

    self.assertEqual(parsed[0]["senses"], expected_senses)
20 changes: 20 additions & 0 deletions tests/test_ko_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,23 @@ def test_zh_pinyin(self):
{"word": "土产", "roman": "tǔchǎn", "sense": "흙, 땅"},
],
)

def test_l_template(self):
    """An ``l`` template in a 관용구 list item is extracted as a proverb.

    The second positional argument supplies the word and ``t=`` supplies
    the sense.
    """
    wikitext = """== 중국어 ==
=== 명사 ===
# 다쳐서
==== 관용구 ====
* {{l|ko|병신도 제 재미에 산다|t=사람은 각자 자기 잘 난 맛에 산다라는 뜻}}"""
    expected_proverbs = [
        {
            "word": "병신도 제 재미에 산다",
            "sense": "사람은 각자 자기 잘 난 맛에 산다라는 뜻",
        }
    ]

    parsed = parse_page(self.wxr, "병신", wikitext)

    self.assertEqual(parsed[0]["proverbs"], expected_proverbs)