Skip to content

Commit

Permalink
Extract gloss text from low-quality zh edition pages
Browse files Browse the repository at this point in the history
Some pages have no POS title and do not put their glosses in a list.
  • Loading branch information
xxyzz committed Jan 30, 2024
1 parent ef6ac38 commit 23a9a70
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 14 deletions.
7 changes: 7 additions & 0 deletions src/wiktextract/extractor/zh/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,13 @@ def process_pos_block(
parse_section(wxr, page_data, base_data, child)
else:
parse_section(wxr, page_data, base_data, child)
if len(page_data[-1].senses) == 0:
# low quality pages don't put gloss in list
gloss_text = clean_node(
wxr, page_data[-1], list(node.invert_find_child(LEVEL_KIND_FLAGS))
)
if len(gloss_text) > 0:
page_data[-1].senses.append(Sense(glosses=[gloss_text]))


def extract_etymology(
Expand Down
51 changes: 37 additions & 14 deletions tests/test_zh_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from wiktextract.wxr_context import WiktextractContext


class TestExample(TestCase):
class TestGloss(TestCase):
def setUp(self) -> None:
self.wxr = WiktextractContext(
Wtp(lang_code="zh"),
Expand Down Expand Up @@ -140,19 +140,42 @@ def test_soft_redirect_ja_see(self):
)

def test_gloss_text_only_page(self):
self.assertEqual(
parse_page(
self.wxr,
# title, page wikitext, results
test_cases = [
[
"paraphrase",
"""== 英语 ==
释义;意译""",
),
"== 英语 ==\n释义;意译",
[
{
"lang": "英语",
"lang_code": "en",
"senses": [{"glosses": ["释义;意译"]}],
"word": "paraphrase",
}
],
],
[
{
"lang": "英语",
"lang_code": "en",
"senses": [{"glosses": ["释义;意译"]}],
"word": "paraphrase",
}
"鐵面無私",
"==漢語==\n===釋義===\n形容[[公正]]严明,绝不因[[徇私]]或畏权而讲情面。",
[
{
"lang": "漢語",
"lang_code": "zh",
"senses": [
{
"glosses": [
"形容公正严明,绝不因徇私或畏权而讲情面。"
]
}
],
"word": "鐵面無私",
}
],
],
)
]
for title, wikitext, results in test_cases:
with self.subTest(title=title, wikitext=wikitext, results=results):
self.assertEqual(
parse_page(self.wxr, title, wikitext),
results,
)

0 comments on commit 23a9a70

Please sign in to comment.