From ef6ac385d44f7ae0817536da70ca77f8716fdc71 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 30 Jan 2024 10:54:00 +0800 Subject: [PATCH] Extract span node's title attribute from expanded qualifier template --- src/wiktextract/extractor/zh/translation.py | 16 +++++++++++++--- tests/test_zh_translation.py | 21 ++++++++++++++++++++- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py index 319dee3c0..7c603966f 100644 --- a/src/wiktextract/extractor/zh/translation.py +++ b/src/wiktextract/extractor/zh/translation.py @@ -121,9 +121,19 @@ def process_translation_list_item( continue else: # qualifier template - tag = clean_node(wxr, None, child) - if len(tag) > 0: - tr_data.tags.append(tag.strip("()")) + expanded_template = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(child), expand_all=True + ) + find_title = False + for span_node in expanded_template.find_html("span"): + tag = span_node.attrs.get("title", "") + if len(tag) > 0: + tr_data.tags.append(tag.strip()) + find_title = True + if not find_title: + tag = clean_node(wxr, None, child) + if len(tag) > 0: + tr_data.tags.append(tag.strip("()")) elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK: if len(tr_data.word) > 0: page_data[-1].translations.append(tr_data.model_copy(deep=True)) diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py index 40d6561bd..c381be439 100644 --- a/tests/test_zh_translation.py +++ b/tests/test_zh_translation.py @@ -180,8 +180,21 @@ def test_language_name_template(self): def test_l_template(self): self.wxr.wtp.start_page("茄子") + self.wxr.wtp.add_page("Template:cs", 10, "捷克语") + self.wxr.wtp.add_page( + "Template:l", + 10, + """{{{2}}} +{{#if:{{{g|}}}|m}}""", + ) + self.wxr.wtp.add_page( + "Template:口", 10, '〉' + ) page_data = [WordEntry(word="茄子", lang_code="zh", lang="漢語")] - node = self.wxr.wtp.parse("* 南非語: {{l|af|eiervrug}}") + node = self.wxr.wtp.parse( + """* 南非語: {{l|af|eiervrug}} +* {{cs}}: {{l|cs|patližán|g=m}} {{口}}""" + ) extract_translation(self.wxr, page_data, node) self.assertEqual( [ @@ -194,5 +207,11 @@ def test_l_template(self): "lang": "南非語", "word": "eiervrug", }, + { + "lang_code": "cs", + "lang": "捷克语", + "word": "patližán", + "tags": ["陽性名詞", "口语词汇"], + }, ], )