From 3d243900a27f947a9430491079d30fbdaf2712dc Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 1 Nov 2024 14:10:35 +0800 Subject: [PATCH 1/3] [ko] extract sentence structure list to "pattern" field --- src/wiktextract/extractor/ko/models.py | 4 ++++ src/wiktextract/extractor/ko/pos.py | 11 ++++++++++- tests/test_ko_gloss.py | 26 ++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/ko/models.py b/src/wiktextract/extractor/ko/models.py index 39843e53..d3fec4ef 100644 --- a/src/wiktextract/extractor/ko/models.py +++ b/src/wiktextract/extractor/ko/models.py @@ -36,6 +36,7 @@ class Sense(KoreanBaseModel): examples: list[Example] = [] note: str = "" form_of: list[AltForm] = [] + pattern: str = Field(default="", description="Sentence structure, 문형") class Sound(KoreanBaseModel): @@ -100,3 +101,6 @@ class WordEntry(KoreanBaseModel): etymology_texts: list[str] = [] note: str = "" forms: list[Form] = [] + pattern: str = Field( + default="", description="Sentence structure, 문형", exclude=True + ) diff --git a/src/wiktextract/extractor/ko/pos.py b/src/wiktextract/extractor/ko/pos.py index dd899e1f..53e674df 100644 --- a/src/wiktextract/extractor/ko/pos.py +++ b/src/wiktextract/extractor/ko/pos.py @@ -65,7 +65,10 @@ def extract_pos_section( for list_item in node.find_child(NodeKind.LIST_ITEM): if node.sarg.startswith("#") and node.sarg.endswith("#"): extract_gloss_list_item( - wxr, page_data[-1], list_item, Sense() + wxr, + page_data[-1], + list_item, + Sense(pattern=page_data[-1].pattern), ) else: extract_unorderd_list_item(wxr, page_data[-1], list_item) @@ -174,6 +177,12 @@ def extract_unorderd_list_item( ): extract_linkage_list_item(wxr, word_entry, list_item, "") break + elif isinstance(node, str) and "문형:" in node: + word_entry.pattern = node[node.index(":") + 1 :].strip() + word_entry.pattern += clean_node( + wxr, None, list_item.children[index + 1 :] + ) + break else: if len(word_entry.senses) > 0: extract_example_list_item( diff --git a/tests/test_ko_gloss.py b/tests/test_ko_gloss.py index 399ca9ae..f01f1cd5 100644 --- a/tests/test_ko_gloss.py +++ b/tests/test_ko_gloss.py @@ -203,3 +203,29 @@ def test_nested_gloss_lists(self): }, ], ) + + def test_pattern_list(self): + data = parse_page( + self.wxr, + "대하다", + """== 한국어 == +=== 동사 === +==== 동사 2 ==== +*문형: […을] [(…과) …을] +# 마주 향하여 있다. +*문형: […에/에게 -게] […을 …으로] […을 -게] +# 어떤 태도로 상대하다.""", + ) + self.assertEqual( + data[0]["senses"], + [ + { + "glosses": ["마주 향하여 있다."], + "pattern": "[…을] [(…과) …을]", + }, + { + "glosses": ["어떤 태도로 상대하다."], + "pattern": "[…에/에게 -게] […을 …으로] […을 -게]", + }, + ], + ) From 6982a6910e4be976eb65c6675a9d061e82fcb416 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 1 Nov 2024 14:38:47 +0800 Subject: [PATCH 2/3] [ko] extract sound file in example lists also restore empty spaces in example texts --- src/wiktextract/extractor/ko/example.py | 27 +++++++++++++++++--- src/wiktextract/extractor/ko/models.py | 33 +++++++++++++------------ tests/test_ko_example.py | 18 ++++++++++++++ 3 files changed, 59 insertions(+), 19 deletions(-) diff --git a/src/wiktextract/extractor/ko/example.py b/src/wiktextract/extractor/ko/example.py index 2d537125..39b8176a 100644 --- a/src/wiktextract/extractor/ko/example.py +++ b/src/wiktextract/extractor/ko/example.py @@ -3,7 +3,8 @@ from ...page import clean_node from ...wxr_context import WiktextractContext from ..ruby import extract_ruby -from .models import Example, Sense +from ..share import set_sound_file_url_fields +from .models import Example, Sense, Sound def extract_example_list_item( @@ -14,6 +15,8 @@ def extract_example_list_item( parent_example: Example | None = None, ) -> None: example = Example() if parent_example is None else parent_example + e_text_nodes = [] + e_tr_nodes = [] after_lang_template = False for node in list_item.children: if isinstance(node, TemplateNode) and node.template_name == "lang": @@ -33,11 +36,29 @@ def extract_example_list_item( extract_ux_template(wxr, sense, example, node) break elif after_lang_template: - example.translation += clean_node(wxr, None, node) + e_tr_nodes.append(node) elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: break + elif ( + isinstance(node, WikiNode) + and node.kind == NodeKind.LINK + and len(node.largs) > 0 + and len(node.largs[0]) > 0 + and isinstance(node.largs[0][0], str) + and node.largs[0][0].startswith("File:") + ): + sound = Sound() + sound_file = node.largs[0][0].removeprefix("File:").strip() + set_sound_file_url_fields(wxr, sound_file, sound) + if sound.audio != "": + example.sounds.append(sound) else: - example.text += clean_node(wxr, None, node) + e_text_nodes.append(node) + + if example.text == "": + example.text = clean_node(wxr, sense, e_text_nodes) + if example.translation == "": + example.translation = clean_node(wxr, sense, e_tr_nodes) if len(example.text) > 0: if lang_code == "zh" and "/" in example.text: diff --git a/src/wiktextract/extractor/ko/models.py b/src/wiktextract/extractor/ko/models.py index d3fec4ef..a3dc775f 100644 --- a/src/wiktextract/extractor/ko/models.py +++ b/src/wiktextract/extractor/ko/models.py @@ -10,6 +10,22 @@ class KoreanBaseModel(BaseModel): ) +class Sound(KoreanBaseModel): + ipa: str = Field(default="", description="International Phonetic Alphabet") + audio: str = Field(default="", description="Audio file name") + wav_url: str = "" + oga_url: str = "" + ogg_url: str = "" + mp3_url: str = "" + opus_url: str = "" + flac_url: str = "" + tags: list[str] = [] + raw_tags: list[str] = [] + hangul: str = "" + roman: str = "" + other: str = "" + + class Example(KoreanBaseModel): text: str = "" translation: str = "" @@ -21,6 +37,7 @@ class Example(KoreanBaseModel): tags: list[str] = [] literal_meaning: str = "" note: str = "" + sounds: list[Sound] = [] class AltForm(KoreanBaseModel): @@ -39,22 +56,6 @@ class Sense(KoreanBaseModel): pattern: str = Field(default="", description="Sentence structure, 문형") -class Sound(KoreanBaseModel): - ipa: str = Field(default="", description="International Phonetic Alphabet") - audio: str = Field(default="", description="Audio file name") - wav_url: str = "" - oga_url: str = "" - ogg_url: str = "" - mp3_url: str = "" - opus_url: str = "" - flac_url: str = "" - tags: list[str] = [] - raw_tags: list[str] = [] - hangul: str = "" - roman: str = "" - other: str = "" - - class Linkage(KoreanBaseModel): word: str sense: str = "" diff --git a/tests/test_ko_example.py b/tests/test_ko_example.py index fc91daa4..238d5f8d 100644 --- a/tests/test_ko_example.py +++ b/tests/test_ko_example.py @@ -177,3 +177,21 @@ def test_jibong_yuseol_template(self): "ref": "1614년, 이수광, 《지봉유설》, 〈2권 外國 條〉", }, ) + + def test_sound_file(self): + data = parse_page( + self.wxr, + "사람", + """== 중국어 == +=== 명사 === +==== 명사 1 ==== +# 어떤 지역이나 시기에 태어나거나 살고 있거나 살았던 자. +:* 한국 '''사람''' [[File:Ko-한국 사람.oga]]""", + ) + self.assertEqual( + data[0]["senses"][0]["examples"][0]["text"], "한국 사람" + ) + self.assertEqual( + data[0]["senses"][0]["examples"][0]["sounds"][0]["audio"], + "Ko-한국 사람.oga", + ) From 6bfa818f35eedd51d49cdb522f33ad2f28ebda70 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 1 Nov 2024 15:10:04 +0800 Subject: [PATCH 3/3] [ko] don't overwrite example data added from parent list --- src/wiktextract/extractor/ko/example.py | 16 +++++++++---- .../extractor/ko/section_titles.py | 1 + tests/test_ko_example.py | 24 +++++++++++++++++++ 3 files changed, 36 insertions(+), 5 deletions(-) diff --git a/src/wiktextract/extractor/ko/example.py b/src/wiktextract/extractor/ko/example.py index 39b8176a..5f7932b9 100644 --- a/src/wiktextract/extractor/ko/example.py +++ b/src/wiktextract/extractor/ko/example.py @@ -55,10 +55,12 @@ def extract_example_list_item( else: e_text_nodes.append(node) - if example.text == "": - example.text = clean_node(wxr, sense, e_text_nodes) - if example.translation == "": - example.translation = clean_node(wxr, sense, e_tr_nodes) + e_text = clean_node(wxr, sense, e_text_nodes) + if e_text != "": + example.text = e_text + e_tr = clean_node(wxr, sense, e_tr_nodes) + if e_tr != "": + example.translation = e_tr if len(example.text) > 0: if lang_code == "zh" and "/" in example.text: @@ -77,7 +79,11 @@ def extract_example_list_item( for nested_list in list_item.find_child(NodeKind.LIST): for nested_list_item in nested_list.find_child(NodeKind.LIST_ITEM): extract_example_list_item( - wxr, sense, nested_list_item, lang_code, example + wxr, + sense, + nested_list_item, + lang_code, + example if example.text == "" else Example(), ) diff --git a/src/wiktextract/extractor/ko/section_titles.py b/src/wiktextract/extractor/ko/section_titles.py index da8ac0a0..7fce8dce 100644 --- a/src/wiktextract/extractor/ko/section_titles.py +++ b/src/wiktextract/extractor/ko/section_titles.py @@ -32,6 +32,7 @@ "연어": {"pos": "phrase", "tags": ["idiomatic"]}, "동사 활용형": {"pos": "verb", "tags": ["form-of"]}, "재귀동사": {"pos": "verb", "tags": ["reflexive"]}, + "보조형용사": {"pos": "adj", "tags": ["auxiliary"]}, } LINKAGE_SECTIONS = { diff --git a/tests/test_ko_example.py b/tests/test_ko_example.py index 238d5f8d..6635176a 100644 --- a/tests/test_ko_example.py +++ b/tests/test_ko_example.py @@ -195,3 +195,27 @@ def test_sound_file(self): data[0]["senses"][0]["examples"][0]["sounds"][0]["audio"], "Ko-한국 사람.oga", ) + + def test_wrong_nested_list(self): + data = parse_page( + self.wxr, + "들다", + """== 중국어 == +=== 명사 === +==== 명사 1 ==== +# 한 곳에서 다른 어디로 또는 밖에서 속이나 안으로 향해 가거나, 오거나 또는 어디에 자리하다. +: 안으로 드시지요. +:* 물이 어디에 '''들어''' 있어요? [[File:물이 어디에 들어 있어요?.ogg]]""", + ) + self.assertEqual( + data[0]["senses"][0]["examples"][0]["text"], "안으로 드시지요." + ) + self.assertTrue("sounds" not in data[0]["senses"][0]["examples"][0]) + self.assertEqual( + data[0]["senses"][0]["examples"][1]["text"], + "물이 어디에 들어 있어요?", + ) + self.assertEqual( + data[0]["senses"][0]["examples"][1]["sounds"][0]["audio"], + "물이 어디에 들어 있어요?.ogg", + )