diff --git a/json_schema/zh.json b/json_schema/zh.json index 429915d8..745762f5 100644 --- a/json_schema/zh.json +++ b/json_schema/zh.json @@ -147,6 +147,12 @@ "items": { "type": "string" } + }, + "descendants": { + "type": "array", + "items": { + "$ref": "#/$defs/descendant" + } } }, "$defs": { @@ -315,6 +321,46 @@ "enum": ["zh-Hant", "zh-Hans"] } } + }, + "descendant": { + "type": "object", + "properties": { + "lang_code": { + "description": "ISO 639-1 code", + "type": "string" + }, + "lang_name": { + "type": "string" + }, + "word": { + "type": "string" + }, + "roman": { + "type": "string" + }, + "tags": { + "type": "array", + "items": { + "type": "string" + } + }, + "descendants": { + "type": "array", + "items": { + "$ref": "#/$defs/descendant" + } + }, + "ruby": { + "description": "Japanese Kanji and furigana", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "string" + } + } + } + } } } } diff --git a/tests/test_zh_descendant.py b/tests/test_zh_descendant.py new file mode 100644 index 00000000..34b0ac79 --- /dev/null +++ b/tests/test_zh_descendant.py @@ -0,0 +1,117 @@ +from collections import defaultdict +from unittest import TestCase +from unittest.mock import Mock + +from wikitextprocessor import Wtp + +from wiktextract.extractor.zh.descendant import extract_descendants +from wiktextract.thesaurus import close_thesaurus_db +from wiktextract.wxr_context import WiktextractContext + + +class TestDescendant(TestCase): + def setUp(self): + self.wxr = WiktextractContext(Wtp(lang_code="zh"), Mock()) + + def tearDown(self): + self.wxr.wtp.close_db_conn() + close_thesaurus_db( + self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn + ) + + def test_ruby(self): + # https://zh.wiktionary.org/wiki/你好 + self.wxr.wtp.start_page("你好") + self.wxr.wtp.add_page( + "Template:desc", + 10, + ' 日語:', + ) + self.wxr.wtp.add_page( + "Template:ja-r", + 10, + '[[你好#日語|-{你好(ニイハオ)}-]] (nīhao)', + ) + root = self.wxr.wtp.parse("* {{desc|bor=1|ja|-}} 
{{ja-r|你好|ニイハオ}}") + page_data = defaultdict(list) + extract_descendants(self.wxr, root, page_data) + self.assertEqual( + page_data.get("descendants"), + [ + { + "lang_code": "ja", + "lang_name": "日語", + "roman": "nīhao", + "ruby": [("你好", "ニイハオ")], + "word": "你好", + } + ], + ) + + def test_roman_only_list(self): + self.wxr.wtp.start_page("你好") + self.wxr.wtp.add_page( + "Template:desc", + 10, + ' 壯語:[[mwngz ndei#壯語|-{mwngz ndei}-]] (仿譯)', + ) + root = self.wxr.wtp.parse("* {{desc|za|mwngz ndei|cal=1}}") + page_data = defaultdict(list) + extract_descendants(self.wxr, root, page_data) + self.assertEqual( + page_data.get("descendants"), + [ + { + "lang_code": "za", + "lang_name": "壯語", + "tags": ["仿譯"], + "word": "mwngz ndei", + } + ], + ) + + def test_nested_list(self): + # https://zh.wiktionary.org/wiki/オタク + self.wxr.wtp.start_page("オタク") + self.wxr.wtp.add_page( + "Template:desc", + 10, + ' 官話:', + ) + self.wxr.wtp.add_page( + "Template:zh-l", + 10, + '{{{1}}} ({{{1}}}', + ) + root = self.wxr.wtp.parse( + """*: {{desc|cmn|-}} {{zh-l|御宅族}} +*:* {{desc|cmn|-|der=1}} {{zh-l|宅男}} +*:* {{desc|cmn|-|der=1}} {{zh-l|宅女}}""" + ) + page_data = defaultdict(list) + extract_descendants(self.wxr, root, page_data) + self.assertEqual( + page_data.get("descendants"), + [ + { + "descendants": [ + { + "lang_code": "cmn", + "lang_name": "官話", + "roman": "宅男", + "word": "宅男", + }, + { + "lang_code": "cmn", + "lang_name": "官話", + "roman": "宅女", + "word": "宅女", + }, + ], + "lang_code": "cmn", + "lang_name": "官話", + "roman": "御宅族", + "word": "御宅族", + } + ], + ) diff --git a/wiktextract/data/zh/linkage_subtitles.json b/wiktextract/data/zh/linkage_subtitles.json index 20048177..369e2221 100644 --- a/wiktextract/data/zh/linkage_subtitles.json +++ b/wiktextract/data/zh/linkage_subtitles.json @@ -84,7 +84,6 @@ "派生詞": "derived", "派生詞彙": "derived", "派生詞語": "derived", - "派生語彙": "derived", "派生词": "derived", "派生词汇": "derived", "派生词组": "derived", @@ -133,4 +132,4 @@ "部分詞": "meronyms", "關聯詞": 
"related", "關聯詞彙": "related" -} \ No newline at end of file +} diff --git a/wiktextract/data/zh/other_subtitles.json b/wiktextract/data/zh/other_subtitles.json index 2c963e9e..fb924e47 100644 --- a/wiktextract/data/zh/other_subtitles.json +++ b/wiktextract/data/zh/other_subtitles.json @@ -74,5 +74,8 @@ "translations": [ "翻譯", "翻译" + ], + "descendants": [ + "派生語彙" ] -} \ No newline at end of file +} diff --git a/wiktextract/extractor/ruby.py b/wiktextract/extractor/ruby.py index e453a2c3..528c6ac3 100644 --- a/wiktextract/extractor/ruby.py +++ b/wiktextract/extractor/ruby.py @@ -1,6 +1,7 @@ from typing import List, Optional, Tuple, Union from wikitextprocessor import NodeKind, WikiNode +from wikitextprocessor.parser import HTMLNode, LevelNode, TemplateNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -58,7 +59,6 @@ def extract_ruby( # Otherwise content is WikiNode, and we must recurse into it. kind = contents.kind new_node = WikiNode(kind, contents.loc) - new_contents.append(new_node) if kind in { NodeKind.LEVEL2, NodeKind.LEVEL3, @@ -68,6 +68,8 @@ def extract_ruby( NodeKind.LINK, }: # Process args and children + if kind != NodeKind.LINK: + new_node = LevelNode(new_node.loc) new_args = [] for arg in contents.largs: e1, c1 = extract_ruby(wxr, arg) @@ -108,6 +110,8 @@ def extract_ruby( NodeKind.URL, }: # Process only args + if kind == NodeKind.TEMPLATE: + new_node = TemplateNode(new_node.loc) new_args = [] for arg in contents.largs: e1, c1 = extract_ruby(wxr, arg) @@ -116,6 +120,7 @@ def extract_ruby( new_node.largs = new_args elif kind == NodeKind.HTML: # Keep attrs and args as-is, process children + new_node = HTMLNode(new_node.loc) new_node.attrs = contents.attrs new_node.sarg = contents.sarg e1, c1 = extract_ruby(wxr, contents.children) @@ -123,4 +128,5 @@ def extract_ruby( new_node.children = c1 else: raise RuntimeError(f"extract_ruby: unhandled kind {kind}") + new_contents.append(new_node) return extracted, 
new_contents diff --git a/wiktextract/extractor/zh/descendant.py b/wiktextract/extractor/zh/descendant.py new file mode 100644 index 00000000..8bdeccc9 --- /dev/null +++ b/wiktextract/extractor/zh/descendant.py @@ -0,0 +1,97 @@ +from collections import defaultdict +from typing import Dict + +from wikitextprocessor import NodeKind, WikiNode + +from wiktextract.page import clean_node +from wiktextract.wxr_context import WiktextractContext + +from ..ruby import extract_ruby + +DESCENDANT_TEMPLATES = frozenset(["desc", "descendant"]) + + +def extract_descendants( + wxr: WiktextractContext, + level_node: WikiNode, + parent_data: Dict, +) -> None: + for list_node in level_node.find_child(NodeKind.LIST): + for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): + extract_descendant_list_item(wxr, list_item_node, parent_data) + + +def extract_descendant_list_item( + wxr: WiktextractContext, + list_item_node: WikiNode, + parent_data: Dict, +) -> None: + lang_code = "" + lang_name = "" + descendant_data = defaultdict(list) + for template_node in list_item_node.find_child(NodeKind.TEMPLATE): + expanded_template = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(template_node), expand_all=True + ) + if template_node.template_name.lower() in DESCENDANT_TEMPLATES: + lang_code = template_node.template_parameters.get(1) + descendant_data["lang_code"] = lang_code + ruby_data, nodes_without_ruby = extract_ruby( + wxr, expanded_template.children + ) + if len(ruby_data) > 0: + descendant_data["ruby"] = ruby_data + for child_index, child_node in enumerate(nodes_without_ruby): + if isinstance(child_node, str) and child_node.endswith(":"): + lang_name = child_node.strip(" :") + descendant_data["lang_name"] = lang_name + elif ( + isinstance(child_node, WikiNode) + and child_node.kind == NodeKind.HTML + ): + if child_node.tag == "span": + class_names = child_node.attrs.get("class", "") + if ( + "Latn" in class_names or "tr" in class_names + ) and "word" in descendant_data: + # 
template:ja-r + descendant_data["roman"] = clean_node( + wxr, None, child_node + ) + elif "lang" in child_node.attrs: + if "word" in descendant_data: + parent_data["descendants"].append(descendant_data) + descendant_data = defaultdict( + list, + { + "lang_code": lang_code, + "lang_name": lang_name, + }, + ) + if len(ruby_data) > 0: + descendant_data["ruby"] = ruby_data + descendant_data["word"] = clean_node( + wxr, None, child_node + ) + if "qualifier-content" in class_names: + descendant_data["tags"].append( + clean_node(wxr, None, child_node) + ) + elif child_node.tag == "i": + # template:zh-l + for span_tag in child_node.find_html( + "span", attr_name="class", attr_value="Latn" + ): + descendant_data["roman"] = clean_node( + wxr, None, span_tag + ) + + if "word" in descendant_data: + parent_data["descendants"].append(descendant_data) + + if list_item_node.contain_node(NodeKind.LIST): + extract_descendants( + wxr, + list_item_node, + descendant_data if "word" in descendant_data else parent_data, + ) diff --git a/wiktextract/extractor/zh/linkage.py b/wiktextract/extractor/zh/linkage.py index ca98cd04..d85d2e48 100644 --- a/wiktextract/extractor/zh/linkage.py +++ b/wiktextract/extractor/zh/linkage.py @@ -13,6 +13,7 @@ split_chinese_variants, strip_nodes, ) +from .descendant import DESCENDANT_TEMPLATES, extract_descendant_list_item def extract_linkages( @@ -34,6 +35,7 @@ def extract_linkages( append_to = find_similar_gloss(page_data, sense) elif isinstance(node, WikiNode): if node.kind == NodeKind.LIST_ITEM: + is_descendant = False not_term_indexes = set() filtered_children = list(node.filter_empty_str_child()) linkage_data = defaultdict(list) @@ -57,6 +59,14 @@ def extract_linkages( linkage_data["tags"].append( clean_node(wxr, None, item_child).strip("()") ) + elif template_name.lower() in DESCENDANT_TEMPLATES: + extract_descendant_list_item( + wxr, node, page_data[-1] + ) + is_descendant = True + break + if is_descendant: + continue # sense template before entry 
and they are inside the same # list item terms = clean_node( diff --git a/wiktextract/extractor/zh/page.py b/wiktextract/extractor/zh/page.py index 3d726d0e..f8567099 100644 --- a/wiktextract/extractor/zh/page.py +++ b/wiktextract/extractor/zh/page.py @@ -10,6 +10,7 @@ from wiktextract.page import LEVEL_KINDS, clean_node from wiktextract.wxr_context import WiktextractContext +from .descendant import extract_descendants from .gloss import extract_gloss from .headword_line import extract_headword_line from .inflection import extract_inflections @@ -19,76 +20,12 @@ # Templates that are used to form panels on pages and that # should be ignored in various positions -PANEL_TEMPLATES = { - "CJKV", - "French personal pronouns", - "French possessive adjectives", - "French possessive pronouns", - "Han etym", - "Japanese demonstratives", - "Latn-script", - "Webster 1913", - "attention", - "attn", - "character info", - "character info/new", - "character info/var", - "delete", - "dial syn", - "dialect synonyms", - "examples", - "hu-corr", - "hu-suff-pron", - "interwiktionary", - "ja-kanjitab", - "ko-hanja-search", - "maintenance box", - "maintenance line", - "merge", - "morse links", - "move", - "multiple images", - "picdic", - "picdicimg", - "picdiclabel", - "punctuation", - "reconstructed", - "request box", - "rfap", - "rfc", - "rfc-header", - "rfc-level", - "rfc-sense", - "rfd", - "rfdate", - "rfdatek", - "rfdef", - "rfe", - "rfe/dowork", - "rfgender", - "rfi", - "rfinfl", - "rfp", - "rfquotek", - "rfscript", - "rftranslit", - "selfref", - "stroke order", - "t-needed", - "unblock", - "unsupportedpage", - "wrongtitle", - "zh-forms", - "zh-hanzi-box", -} +PANEL_TEMPLATES = {} # Template name prefixes used for language-specific panel templates (i.e., # templates that create side boxes or notice boxes or that should generally # be ignored). 
-PANEL_PREFIXES = { - "list:compass points/", - "list:Gregorian calendar months/", -} +PANEL_PREFIXES = {} # Additional templates to be expanded in the pre-expand phase ADDITIONAL_EXPAND_TEMPLATES = { @@ -174,6 +111,11 @@ def parse_section( and subtitle in wxr.config.OTHER_SUBTITLES["inflection_sections"] ): extract_inflections(wxr, page_data, node) + elif ( + wxr.config.capture_descendants + and subtitle in wxr.config.OTHER_SUBTITLES["descendants"] + ): + extract_descendants(wxr, node, page_data[-1]) else: wxr.wtp.debug( f"Unhandled subtitle: {subtitle}",