From 23a9a709b83601b3ca3e87e38ebe54655df99be1 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 30 Jan 2024 12:10:32 +0800 Subject: [PATCH] Extract gloss text from low quality zh edition page Some pages don't have POS title and don't use gloss list. --- src/wiktextract/extractor/zh/page.py | 7 ++++ tests/test_zh_gloss.py | 51 ++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/src/wiktextract/extractor/zh/page.py b/src/wiktextract/extractor/zh/page.py index 61f35eb3d..4868c350e 100644 --- a/src/wiktextract/extractor/zh/page.py +++ b/src/wiktextract/extractor/zh/page.py @@ -148,6 +148,13 @@ def process_pos_block( parse_section(wxr, page_data, base_data, child) else: parse_section(wxr, page_data, base_data, child) + if len(page_data[-1].senses) == 0: + # low quality pages don't put gloss in list + gloss_text = clean_node( + wxr, page_data[-1], list(node.invert_find_child(LEVEL_KIND_FLAGS)) + ) + if len(gloss_text) > 0: + page_data[-1].senses.append(Sense(glosses=[gloss_text])) def extract_etymology( diff --git a/tests/test_zh_gloss.py b/tests/test_zh_gloss.py index d2af44788..3157640cf 100644 --- a/tests/test_zh_gloss.py +++ b/tests/test_zh_gloss.py @@ -13,7 +13,7 @@ from wiktextract.wxr_context import WiktextractContext -class TestExample(TestCase): +class TestGloss(TestCase): def setUp(self) -> None: self.wxr = WiktextractContext( Wtp(lang_code="zh"), @@ -140,19 +140,42 @@ def test_soft_redirect_ja_see(self): ) def test_gloss_text_only_page(self): - self.assertEqual( - parse_page( - self.wxr, + # title, page wikitext, results + test_cases = [ + [ "paraphrase", - """== 英语 == -释义;意译""", - ), + "== 英语 ==\n释义;意译", + [ + { + "lang": "英语", + "lang_code": "en", + "senses": [{"glosses": ["释义;意译"]}], + "word": "paraphrase", + } + ], + ], [ - { - "lang": "英语", - "lang_code": "en", - "senses": [{"glosses": ["释义;意译"]}], - "word": "paraphrase", - } + "鐵面無私", + "==漢語==\n===釋義===\n形容[[公正]]严明,绝不因[[徇私]]或畏权而讲情面。", + [ + { + "lang": "漢語", + "lang_code": "zh", + "senses": [ + { + "glosses": [ + "形容公正严明,绝不因徇私或畏权而讲情面。" + ] + } + ], + "word": "鐵面無私", + } + ], ], - ) + ] + for title, wikitext, results in test_cases: + with self.subTest(title=title, wikitext=wikitext, results=results): + self.assertEqual( + parse_page(self.wxr, title, wikitext), + results, + )