diff --git a/json_schema/zh.json b/json_schema/zh.json
index 429915d8..745762f5 100644
--- a/json_schema/zh.json
+++ b/json_schema/zh.json
@@ -147,6 +147,12 @@
"items": {
"type": "string"
}
+ },
+ "descendants": {
+ "type": "array",
+ "items": {
+ "$ref": "#/$defs/descendant"
+ }
}
},
"$defs": {
@@ -315,6 +321,46 @@
"enum": ["zh-Hant", "zh-Hans"]
}
}
+ },
+ "descendant": {
+ "type": "object",
+ "properties": {
+ "lang_code": {
+ "description": "ISO 639-1 code",
+ "type": "string"
+ },
+ "lang_name": {
+ "type": "string"
+ },
+ "word": {
+ "type": "string"
+ },
+ "roman": {
+ "type": "string"
+ },
+ "tags": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "descendants": {
+ "type": "array",
+ "items": {
+ "$ref": "#/$defs/descendant"
+ }
+ },
+ "ruby": {
+ "description": "Japanese Kanji and furigana",
+ "type": "array",
+ "items": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ }
+ }
+ }
}
}
}
diff --git a/tests/test_zh_descendant.py b/tests/test_zh_descendant.py
new file mode 100644
index 00000000..34b0ac79
--- /dev/null
+++ b/tests/test_zh_descendant.py
@@ -0,0 +1,117 @@
+from collections import defaultdict
+from unittest import TestCase
+from unittest.mock import Mock
+
+from wikitextprocessor import Wtp
+
+from wiktextract.extractor.zh.descendant import extract_descendants
+from wiktextract.thesaurus import close_thesaurus_db
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestDescendant(TestCase):
+ def setUp(self):
+ self.wxr = WiktextractContext(Wtp(lang_code="zh"), Mock())
+
+ def tearDown(self):
+ self.wxr.wtp.close_db_conn()
+ close_thesaurus_db(
+ self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
+ )
+
+ def test_ruby(self):
+ # https://zh.wiktionary.org/wiki/你好
+ self.wxr.wtp.start_page("你好")
+ self.wxr.wtp.add_page(
+ "Template:desc",
+ 10,
+ '→ 日語:',
+ )
+ self.wxr.wtp.add_page(
+ "Template:ja-r",
+ 10,
+ '[[你好#日語|-{你好}-]] (nīhao)',
+ )
+ root = self.wxr.wtp.parse("* {{desc|bor=1|ja|-}} {{ja-r|你好|ニイハオ}}")
+ page_data = defaultdict(list)
+ extract_descendants(self.wxr, root, page_data)
+ self.assertEqual(
+ page_data.get("descendants"),
+ [
+ {
+ "lang_code": "ja",
+ "lang_name": "日語",
+ "roman": "nīhao",
+ "ruby": [("你好", "ニイハオ")],
+ "word": "你好",
+ }
+ ],
+ )
+
+ def test_roman_only_list(self):
+ self.wxr.wtp.start_page("你好")
+ self.wxr.wtp.add_page(
+ "Template:desc",
+ 10,
+ '→ 壯語:[[mwngz ndei#壯語|-{mwngz ndei}-]] (仿譯)',
+ )
+ root = self.wxr.wtp.parse("* {{desc|za|mwngz ndei|cal=1}}")
+ page_data = defaultdict(list)
+ extract_descendants(self.wxr, root, page_data)
+ self.assertEqual(
+ page_data.get("descendants"),
+ [
+ {
+ "lang_code": "za",
+ "lang_name": "壯語",
+ "tags": ["仿譯"],
+ "word": "mwngz ndei",
+ }
+ ],
+ )
+
+ def test_nested_list(self):
+ # https://zh.wiktionary.org/wiki/オタク
+ self.wxr.wtp.start_page("オタク")
+ self.wxr.wtp.add_page(
+ "Template:desc",
+ 10,
+ '⇒ 官話:',
+ )
+ self.wxr.wtp.add_page(
+ "Template:zh-l",
+ 10,
+ '{{{1}}} ({{{1}}}',
+ )
+ root = self.wxr.wtp.parse(
+ """*: {{desc|cmn|-}} {{zh-l|御宅族}}
+*:* {{desc|cmn|-|der=1}} {{zh-l|宅男}}
+*:* {{desc|cmn|-|der=1}} {{zh-l|宅女}}"""
+ )
+ page_data = defaultdict(list)
+ extract_descendants(self.wxr, root, page_data)
+ self.assertEqual(
+ page_data.get("descendants"),
+ [
+ {
+ "descendants": [
+ {
+ "lang_code": "cmn",
+ "lang_name": "官話",
+ "roman": "宅男",
+ "word": "宅男",
+ },
+ {
+ "lang_code": "cmn",
+ "lang_name": "官話",
+ "roman": "宅女",
+ "word": "宅女",
+ },
+ ],
+ "lang_code": "cmn",
+ "lang_name": "官話",
+ "roman": "御宅族",
+ "word": "御宅族",
+ }
+ ],
+ )
diff --git a/wiktextract/data/zh/linkage_subtitles.json b/wiktextract/data/zh/linkage_subtitles.json
index 20048177..369e2221 100644
--- a/wiktextract/data/zh/linkage_subtitles.json
+++ b/wiktextract/data/zh/linkage_subtitles.json
@@ -84,7 +84,6 @@
"派生詞": "derived",
"派生詞彙": "derived",
"派生詞語": "derived",
- "派生語彙": "derived",
"派生词": "derived",
"派生词汇": "derived",
"派生词组": "derived",
@@ -133,4 +132,4 @@
"部分詞": "meronyms",
"關聯詞": "related",
"關聯詞彙": "related"
-}
\ No newline at end of file
+}
diff --git a/wiktextract/data/zh/other_subtitles.json b/wiktextract/data/zh/other_subtitles.json
index 2c963e9e..fb924e47 100644
--- a/wiktextract/data/zh/other_subtitles.json
+++ b/wiktextract/data/zh/other_subtitles.json
@@ -74,5 +74,8 @@
"translations": [
"翻譯",
"翻译"
+ ],
+ "descendants": [
+ "派生語彙"
]
-}
\ No newline at end of file
+}
diff --git a/wiktextract/extractor/ruby.py b/wiktextract/extractor/ruby.py
index e453a2c3..528c6ac3 100644
--- a/wiktextract/extractor/ruby.py
+++ b/wiktextract/extractor/ruby.py
@@ -1,6 +1,7 @@
from typing import List, Optional, Tuple, Union
from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import HTMLNode, LevelNode, TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
@@ -58,7 +59,6 @@ def extract_ruby(
# Otherwise content is WikiNode, and we must recurse into it.
kind = contents.kind
new_node = WikiNode(kind, contents.loc)
- new_contents.append(new_node)
if kind in {
NodeKind.LEVEL2,
NodeKind.LEVEL3,
@@ -68,6 +68,8 @@ def extract_ruby(
NodeKind.LINK,
}:
# Process args and children
+ if kind != NodeKind.LINK:
+ new_node = LevelNode(new_node.loc)
new_args = []
for arg in contents.largs:
e1, c1 = extract_ruby(wxr, arg)
@@ -108,6 +110,8 @@ def extract_ruby(
NodeKind.URL,
}:
# Process only args
+ if kind == NodeKind.TEMPLATE:
+ new_node = TemplateNode(new_node.loc)
new_args = []
for arg in contents.largs:
e1, c1 = extract_ruby(wxr, arg)
@@ -116,6 +120,7 @@ def extract_ruby(
new_node.largs = new_args
elif kind == NodeKind.HTML:
# Keep attrs and args as-is, process children
+ new_node = HTMLNode(new_node.loc)
new_node.attrs = contents.attrs
new_node.sarg = contents.sarg
e1, c1 = extract_ruby(wxr, contents.children)
@@ -123,4 +128,5 @@ def extract_ruby(
new_node.children = c1
else:
raise RuntimeError(f"extract_ruby: unhandled kind {kind}")
+ new_contents.append(new_node)
return extracted, new_contents
diff --git a/wiktextract/extractor/zh/descendant.py b/wiktextract/extractor/zh/descendant.py
new file mode 100644
index 00000000..8bdeccc9
--- /dev/null
+++ b/wiktextract/extractor/zh/descendant.py
@@ -0,0 +1,97 @@
+from collections import defaultdict
+from typing import Dict
+
+from wikitextprocessor import NodeKind, WikiNode
+
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
+
+from ..ruby import extract_ruby
+
+DESCENDANT_TEMPLATES = frozenset(["desc", "descendant"])
+
+
+def extract_descendants(
+ wxr: WiktextractContext,
+ level_node: WikiNode,
+ parent_data: Dict,
+) -> None:
+ for list_node in level_node.find_child(NodeKind.LIST):
+ for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
+ extract_descendant_list_item(wxr, list_item_node, parent_data)
+
+
+def extract_descendant_list_item(
+ wxr: WiktextractContext,
+ list_item_node: WikiNode,
+ parent_data: Dict,
+) -> None:
+ lang_code = ""
+ lang_name = ""
+ descendant_data = defaultdict(list)
+ for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
+ expanded_template = wxr.wtp.parse(
+ wxr.wtp.node_to_wikitext(template_node), expand_all=True
+ )
+ if template_node.template_name.lower() in DESCENDANT_TEMPLATES:
+            lang_code = template_node.template_parameters.get(1, "")
+ descendant_data["lang_code"] = lang_code
+ ruby_data, nodes_without_ruby = extract_ruby(
+ wxr, expanded_template.children
+ )
+ if len(ruby_data) > 0:
+ descendant_data["ruby"] = ruby_data
+ for child_index, child_node in enumerate(nodes_without_ruby):
+ if isinstance(child_node, str) and child_node.endswith(":"):
+ lang_name = child_node.strip(" :")
+ descendant_data["lang_name"] = lang_name
+ elif (
+ isinstance(child_node, WikiNode)
+ and child_node.kind == NodeKind.HTML
+ ):
+ if child_node.tag == "span":
+ class_names = child_node.attrs.get("class", "")
+ if (
+ "Latn" in class_names or "tr" in class_names
+ ) and "word" in descendant_data:
+ # template:ja-r
+ descendant_data["roman"] = clean_node(
+ wxr, None, child_node
+ )
+ elif "lang" in child_node.attrs:
+ if "word" in descendant_data:
+ parent_data["descendants"].append(descendant_data)
+ descendant_data = defaultdict(
+ list,
+ {
+ "lang_code": lang_code,
+ "lang_name": lang_name,
+ },
+ )
+ if len(ruby_data) > 0:
+ descendant_data["ruby"] = ruby_data
+ descendant_data["word"] = clean_node(
+ wxr, None, child_node
+ )
+ if "qualifier-content" in class_names:
+ descendant_data["tags"].append(
+ clean_node(wxr, None, child_node)
+ )
+ elif child_node.tag == "i":
+ # template:zh-l
+ for span_tag in child_node.find_html(
+ "span", attr_name="class", attr_value="Latn"
+ ):
+ descendant_data["roman"] = clean_node(
+ wxr, None, span_tag
+ )
+
+ if "word" in descendant_data:
+ parent_data["descendants"].append(descendant_data)
+
+ if list_item_node.contain_node(NodeKind.LIST):
+ extract_descendants(
+ wxr,
+ list_item_node,
+ descendant_data if "word" in descendant_data else parent_data,
+ )
diff --git a/wiktextract/extractor/zh/linkage.py b/wiktextract/extractor/zh/linkage.py
index ca98cd04..d85d2e48 100644
--- a/wiktextract/extractor/zh/linkage.py
+++ b/wiktextract/extractor/zh/linkage.py
@@ -13,6 +13,7 @@
split_chinese_variants,
strip_nodes,
)
+from .descendant import DESCENDANT_TEMPLATES, extract_descendant_list_item
def extract_linkages(
@@ -34,6 +35,7 @@ def extract_linkages(
append_to = find_similar_gloss(page_data, sense)
elif isinstance(node, WikiNode):
if node.kind == NodeKind.LIST_ITEM:
+ is_descendant = False
not_term_indexes = set()
filtered_children = list(node.filter_empty_str_child())
linkage_data = defaultdict(list)
@@ -57,6 +59,14 @@ def extract_linkages(
linkage_data["tags"].append(
clean_node(wxr, None, item_child).strip("()")
)
+ elif template_name.lower() in DESCENDANT_TEMPLATES:
+ extract_descendant_list_item(
+ wxr, node, page_data[-1]
+ )
+ is_descendant = True
+ break
+ if is_descendant:
+ continue
# sense template before entry and they are inside the same
# list item
terms = clean_node(
diff --git a/wiktextract/extractor/zh/page.py b/wiktextract/extractor/zh/page.py
index 3d726d0e..f8567099 100644
--- a/wiktextract/extractor/zh/page.py
+++ b/wiktextract/extractor/zh/page.py
@@ -10,6 +10,7 @@
from wiktextract.page import LEVEL_KINDS, clean_node
from wiktextract.wxr_context import WiktextractContext
+from .descendant import extract_descendants
from .gloss import extract_gloss
from .headword_line import extract_headword_line
from .inflection import extract_inflections
@@ -19,76 +20,12 @@
# Templates that are used to form panels on pages and that
# should be ignored in various positions
-PANEL_TEMPLATES = {
- "CJKV",
- "French personal pronouns",
- "French possessive adjectives",
- "French possessive pronouns",
- "Han etym",
- "Japanese demonstratives",
- "Latn-script",
- "Webster 1913",
- "attention",
- "attn",
- "character info",
- "character info/new",
- "character info/var",
- "delete",
- "dial syn",
- "dialect synonyms",
- "examples",
- "hu-corr",
- "hu-suff-pron",
- "interwiktionary",
- "ja-kanjitab",
- "ko-hanja-search",
- "maintenance box",
- "maintenance line",
- "merge",
- "morse links",
- "move",
- "multiple images",
- "picdic",
- "picdicimg",
- "picdiclabel",
- "punctuation",
- "reconstructed",
- "request box",
- "rfap",
- "rfc",
- "rfc-header",
- "rfc-level",
- "rfc-sense",
- "rfd",
- "rfdate",
- "rfdatek",
- "rfdef",
- "rfe",
- "rfe/dowork",
- "rfgender",
- "rfi",
- "rfinfl",
- "rfp",
- "rfquotek",
- "rfscript",
- "rftranslit",
- "selfref",
- "stroke order",
- "t-needed",
- "unblock",
- "unsupportedpage",
- "wrongtitle",
- "zh-forms",
- "zh-hanzi-box",
-}
+PANEL_TEMPLATES = set()
# Template name prefixes used for language-specific panel templates (i.e.,
# templates that create side boxes or notice boxes or that should generally
# be ignored).
-PANEL_PREFIXES = {
- "list:compass points/",
- "list:Gregorian calendar months/",
-}
+PANEL_PREFIXES = set()
# Additional templates to be expanded in the pre-expand phase
ADDITIONAL_EXPAND_TEMPLATES = {
@@ -174,6 +111,11 @@ def parse_section(
and subtitle in wxr.config.OTHER_SUBTITLES["inflection_sections"]
):
extract_inflections(wxr, page_data, node)
+ elif (
+ wxr.config.capture_descendants
+ and subtitle in wxr.config.OTHER_SUBTITLES["descendants"]
+ ):
+ extract_descendants(wxr, node, page_data[-1])
else:
wxr.wtp.debug(
f"Unhandled subtitle: {subtitle}",