Skip to content

Commit

Permalink
Merge pull request #895 from xxyzz/ko
Browse files Browse the repository at this point in the history
[ko] improve gloss and linkage section code
  • Loading branch information
xxyzz authored Oct 30, 2024
2 parents 598a4ee + d9ef41b commit d49d402
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 6 deletions.
28 changes: 28 additions & 0 deletions src/wiktextract/extractor/ko/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,31 @@ def extract_proverb_section(
break
if linkage.word != "":
word_entry.proverbs.append(linkage)
else:
for t_node in list_item.find_child(NodeKind.TEMPLATE):
if t_node.template_name in ["l", "연결"]:
extract_l_template(wxr, word_entry, t_node, "proverbs")


def extract_l_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Append the term named by an ``l``/``연결`` template to ``word_entry``.

    The term is read from positional argument 3, falling back to positional
    argument 2 when argument 3 is absent or cleans to an empty string. If
    neither yields a non-empty term, nothing is appended. The sense is taken
    from the first of ``t`` and positional argument 4 that is present.

    Args:
        wxr: Context handed through to ``clean_node``.
        word_entry: Entry whose ``linkage_type`` list receives the linkage.
        t_node: Template node whose parameters are read.
        linkage_type: Name of the list attribute on ``word_entry`` to append
            to, e.g. ``"proverbs"``.
    """
    # https://ko.wiktionary.org/wiki/틀:연결
    # https://en.wiktionary.org/wiki/Template:link
    params = t_node.template_parameters

    word = ""
    for word_arg in (3, 2):
        if word_arg not in params:
            continue
        word = clean_node(wxr, None, params[word_arg])
        if word != "":
            break
        # Argument 3 can be present but empty (presumably written as
        # `{{l|ko|word||gloss}}` - confirm against the parser); keep looking
        # instead of giving up so argument 2 is still used.
    if word == "":
        return

    linkage = Linkage(word=word)
    for sense_arg in ("t", 4):
        if sense_arg in params:
            linkage.sense = clean_node(wxr, None, params[sense_arg])
            break
    getattr(word_entry, linkage_type).append(linkage)
25 changes: 19 additions & 6 deletions src/wiktextract/extractor/ko/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ def extract_pos_section(
extract_header_template(wxr, page_data[-1], node)
elif node.kind == NodeKind.LIST:
for list_item in node.find_child(NodeKind.LIST_ITEM):
if node.sarg.startswith("#"):
extract_gloss_list_item(wxr, page_data[-1], list_item)
if node.sarg.startswith("#") and node.sarg.endswith("#"):
extract_gloss_list_item(
wxr, page_data[-1], list_item, Sense()
)
else:
extract_unorderd_list_item(wxr, page_data[-1], list_item)

Expand All @@ -73,19 +75,30 @@ def extract_pos_section(


def extract_gloss_list_item(
wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
wxr: WiktextractContext,
word_entry: WordEntry,
list_item: WikiNode,
parent_sense: Sense,
) -> None:
gloss_nodes = []
sense = Sense()
sense = parent_sense.model_copy(deep=True)
for node in list_item.children:
if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
gloss_text = clean_node(wxr, sense, gloss_nodes)
if len(gloss_text) > 0:
sense.glosses.append(gloss_text)
translate_raw_tags(sense)
word_entry.senses.append(sense)
gloss_nodes.clear()
for nested_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_unorderd_list_item(wxr, word_entry, nested_list_item)
if node.sarg.startswith("#") and node.sarg.endswith("#"):
extract_gloss_list_item(
wxr, word_entry, nested_list_item, sense
)
else:
extract_unorderd_list_item(
wxr, word_entry, nested_list_item
)
continue
elif isinstance(node, TemplateNode) and node.template_name.endswith(
" of"
Expand Down Expand Up @@ -127,7 +140,7 @@ def extract_unorderd_list_item(
if re.fullmatch(r"\d+(?:-\d+)?\.?", bold_text):
new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
new_list_item.children = list_item.children[index + 1 :]
extract_gloss_list_item(wxr, word_entry, new_list_item)
extract_gloss_list_item(wxr, word_entry, new_list_item, Sense())
break
elif isinstance(node, str) and "어원:" in node:
etymology_nodes = []
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/ko/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,5 @@
"같이 보기": "related",
"복합어": "derived",
"관련 단어": "related",
"동의어": "synonyms",
}
5 changes: 5 additions & 0 deletions src/wiktextract/extractor/ko/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@
"자동사": "intransitive",
"직역": "literally",
"타동사": "transitive",
"드물게": "rare",
"원래의 의미": "naturally",
"문학적": "literary",
"해학적": "humorous",
"완곡적": "euphemistic",
}

SOUND_TAGS = {
Expand Down
27 changes: 27 additions & 0 deletions tests/test_ko_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,30 @@ def test_ko_verb(self):
data[0]["categories"],
["한국어 비표준 문자가 포함된 낱말 (링크)", "한국어 동사"],
)

def test_nested_gloss_lists(self):
    """A ``##`` gloss nested under a ``#`` gloss also carries the parent gloss."""
    wikitext = """== 한국어 ==
=== 어원 1 ===
==== 명사 ====
# 하는 짓이나 생각이 변변치 못한 사람을 낮잡아 이르는 말.
## 남에게 [[당하다|당하거나]] [[헌신하다|헌신하기만]] 하는 대상을 동정하거나, 혹은 그런 사람이 자신의 [[처지]]를 [[하소연하다|하소연할]] 때 사용하는 표현."""
    parent_gloss = "하는 짓이나 생각이 변변치 못한 사람을 낮잡아 이르는 말."
    child_gloss = "남에게 당하거나 헌신하기만 하는 대상을 동정하거나, 혹은 그런 사람이 자신의 처지를 하소연할 때 사용하는 표현."
    # The parent sense is emitted on its own, then again as the prefix of
    # the nested sense's gloss list.
    expected_senses = [
        {"glosses": [parent_gloss]},
        {"glosses": [parent_gloss, child_gloss]},
    ]

    page_data = parse_page(self.wxr, "병신", wikitext)

    self.assertEqual(page_data[0]["senses"], expected_senses)
20 changes: 20 additions & 0 deletions tests/test_ko_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,23 @@ def test_zh_pinyin(self):
{"word": "土产", "roman": "tǔchǎn", "sense": "흙, 땅"},
],
)

def test_l_template(self):
    """An ``l`` template in a 관용구 section yields a proverb with word and sense."""
    wikitext = """== 중국어 ==
=== 명사 ===
# 다쳐서
==== 관용구 ====
* {{l|ko|병신도 제 재미에 산다|t=사람은 각자 자기 잘 난 맛에 산다라는 뜻}}"""
    expected_proverbs = [
        {
            "word": "병신도 제 재미에 산다",
            "sense": "사람은 각자 자기 잘 난 맛에 산다라는 뜻",
        }
    ]

    page_data = parse_page(self.wxr, "병신", wikitext)

    self.assertEqual(page_data[0]["proverbs"], expected_proverbs)

0 comments on commit d49d402

Please sign in to comment.