Merge pull request #788 from xxyzz/zh

[zh] add config.json and improve extract example code
tatuylonen · Aug 26, 2024 · 122811a
2 parents c15dac4 + 7a5849e
commit 122811a
Show file tree

Hide file tree

Showing 8 changed files with 204 additions and 71 deletions.
diff --git a/src/wiktextract/data/zh/config.json b/src/wiktextract/data/zh/config.json
@@ -0,0 +1,6 @@
+{
+  "analyze_templates": false,
+  "extract_thesaurus_pages": true,
+  "save_ns_names": ["Main", "Template", "Module", "Thesaurus"],
+  "extract_ns_names": ["Main"]
+}
diff --git a/src/wiktextract/extractor/zh/etymology.py b/src/wiktextract/extractor/zh/etymology.py
@@ -27,10 +27,9 @@ def extract_etymology(
         level_node_index = next_level_index
         break
     for etymology_node in level_node.children[:level_node_index]:
-        if (
-            isinstance(etymology_node, TemplateNode)
-            and etymology_node.template_name == "zh-x"
-        ):
+        if isinstance(
+            etymology_node, TemplateNode
+        ) and etymology_node.template_name in ["zh-x", "zh-q"]:
             for example_data in extract_template_zh_x(
                 wxr, etymology_node, Example()
             ):
@@ -49,7 +48,7 @@ def extract_etymology(
             for template_node in etymology_node.find_child_recursively(
                 NodeKind.TEMPLATE
             ):
-                if template_node.template_name == "zh-x":
+                if template_node.template_name in ["zh-x", "zh-q"]:
                     has_zh_x = True
                     for example_data in extract_template_zh_x(
                         wxr, template_node, Example()

diff --git a/src/wiktextract/extractor/zh/example.py b/src/wiktextract/extractor/zh/example.py
@@ -49,12 +49,12 @@ def extract_examples(
                         extract_quote_templates(wxr, child, example_data)
                     elif template_name in {"ja-x", "ja-usex"}:
                         extract_template_ja_usex(wxr, child, example_data)
-                    elif template_name in {"zh-x", "zh-usex"}:
-                        for zh_x_example in extract_template_zh_x(
-                            wxr, child, example_data
-                        ):
-                            sense_data.examples.append(zh_x_example)
-                        clean_node(wxr, sense_data, child)
+                        clean_node(wxr, sense_data, child)  # add cat link
+                    elif template_name in {"zh-x", "zh-usex", "zh-q"}:
+                        sense_data.examples.extend(
+                            extract_template_zh_x(wxr, child, example_data)
+                        )
+                        clean_node(wxr, sense_data, child)  # add cat link
                     elif template_name in {"ux", "eg", "usex"}:
                         extract_template_ux(wxr, child, example_data)
                     elif template_name == "uxi":
@@ -124,26 +124,27 @@ def extract_quote_templates(
 
 
 def extract_template_ja_usex(
-    wxr: WiktextractContext, node: WikiNode, example_data: Example
+    wxr: WiktextractContext, node: TemplateNode, example_data: Example
 ) -> None:
     expanded_node = wxr.wtp.parse(
         wxr.wtp.node_to_wikitext(node), expand_all=True
     )
-    ruby_data, node_without_ruby = extract_ruby(wxr, expanded_node.children)
-    expanded_text = clean_node(wxr, None, node_without_ruby)
-    for line_num, expanded_line in enumerate(expanded_text.splitlines()):
-        if line_num == 0:
-            key = "text"
-        elif line_num == 1:
-            key = "roman"
-        else:
-            key = "translation"
-        if key == "text":
-            example_data.text = expanded_line
-        else:
-            setattr(example_data, key, expanded_line)
-    if len(ruby_data) > 0:
+    for span_tag in expanded_node.find_html(
+        "span", attr_name="class", attr_value="Jpan"
+    ):
+        ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
+        example_data.text = clean_node(wxr, None, node_without_ruby)
         example_data.ruby = ruby_data
+    for span_tag in expanded_node.find_html_recursively(
+        "span", attr_name="class", attr_value="tr"
+    ):
+        example_data.roman = clean_node(wxr, None, span_tag)
+    example_data.translation = clean_node(
+        wxr, None, node.template_parameters.get(3, "")
+    )
+    example_data.literal_meaning = clean_node(
+        wxr, None, node.template_parameters.get("lit", "")
+    )
 
 
 def extract_template_zh_x(
@@ -159,21 +160,27 @@ def extract_template_zh_x(
     for dl_tag in expanded_node.find_html_recursively("dl"):
         has_dl_tag = True
         ref = ""
-        pinyin = ""
+        roman = ""
         translation = ""
+        roman_raw_tags = []
         for dd_tag in dl_tag.find_html("dd"):
             dd_text = clean_node(wxr, None, dd_tag)
-            # Module:Zh-usex uses "出自：" now: https://zh.wiktionary.org/w/index.php?title=Module:Zh-usex&diff=prev&oldid=8430896
-            if dd_text.startswith(("出自：", "來自：")):
-                ref = dd_text.removeprefix("出自：").removeprefix("來自：")
+            if dd_text.startswith("出自："):
+                ref = dd_text.removeprefix("出自：")
             else:
-                is_pinyin = False
+                is_roman = False
                 for span_tag in dd_tag.find_html_recursively(
                     "span", attr_name="lang", attr_value="Latn"
                 ):
-                    pinyin = clean_node(wxr, None, span_tag)
-                    is_pinyin = True
-                if not is_pinyin:
+                    roman = clean_node(wxr, None, span_tag)
+                    is_roman = True
+                    for span_tag in dd_tag.find_html_recursively("span"):
+                        span_text = clean_node(wxr, None, span_tag)
+                        if span_text.startswith("[") and span_text.endswith(
+                            "]"
+                        ):
+                            roman_raw_tags.append(span_text.strip("[]"))
+                if not is_roman:
                     translation = dd_text
 
         example_text = ""
@@ -188,34 +195,50 @@ def extract_template_zh_x(
                     raw_tag = clean_node(wxr, None, span_tag)
                     example = parent_example.model_copy(deep=True)
                     example.text = example_text
-                    example.roman = pinyin
+                    example.roman = roman
                     example.translation = translation
                     example.raw_tags.extend(raw_tag.strip("[]").split("，"))
-                    if len(ref) > 0:
+                    example.raw_tags.extend(roman_raw_tags)
+                    if len(ref) > 0:  # don't override parent quote-* template
                         example.ref = ref
                     translate_raw_tags(example)
                     results.append(example)
 
     # no source, single line example
     if not has_dl_tag:
-        pinyin = ""
+        roman = ""
+        raw_tags = []
         for span_tag in expanded_node.find_html(
             "span", attr_name="lang", attr_value="Latn"
         ):
-            pinyin = clean_node(wxr, None, span_tag)
+            roman = clean_node(wxr, None, span_tag)
+        for span_tag in expanded_node.find_html("span"):
+            span_text = clean_node(wxr, None, span_tag)
+            if span_text.startswith("[") and span_text.endswith("]"):
+                raw_tags.append(span_text.strip("[]"))
+        translation = clean_node(
+            wxr, None, template_node.template_parameters.get(2, "")
+        )
+        literal_meaning = clean_node(
+            wxr, None, template_node.template_parameters.get("lit", "")
+        )
         for span_tag in expanded_node.find_html("span"):
             span_lang = span_tag.attrs.get("lang", "")
             if span_lang in ["zh-Hant", "zh-Hans"]:
                 example_text = clean_node(wxr, None, span_tag)
                 if len(example_text) > 0:
                     example_data = parent_example.model_copy(deep=True)
                     example_data.text = example_text
-                    example_data.roman = pinyin
+                    example_data.roman = roman
                     example_data.tags.append(
                         "Traditional Chinese"
                         if span_lang == "zh-Hant"
                         else "Simplified Chinese"
                     )
+                    example_data.translation = translation
+                    example_data.literal_meaning = literal_meaning
+                    example_data.raw_tags.extend(raw_tags)
+                    translate_raw_tags(example_data)
                     results.append(example_data)
     return results
 

diff --git a/src/wiktextract/extractor/zh/models.py b/src/wiktextract/extractor/zh/models.py
@@ -21,6 +21,7 @@ class Example(ChineseBaseModel):
     translation: str = Field(
         default="", description="Chinese translation of the example sentence"
     )
+    literal_meaning: str = ""
     roman: str = Field(
         default="", description="Romanization of the example sentence"
     )

diff --git a/src/wiktextract/extractor/zh/tags.py b/src/wiktextract/extractor/zh/tags.py
@@ -262,10 +262,18 @@
 
 # example sentence template
 # https://zh.wiktionary.org/wiki/Template:Zh-x
+# https://zh.wiktionary.org/wiki/Module:Zh-usex/data
 ZH_X_TAGS = {
     "繁體": "Traditional Chinese",
     "簡體": "Simplified Chinese",
     "繁體和簡體": ["Traditional Chinese", "Simplified Chinese"],
+    "漢語拼音": "Pinyin",
+    "粵拼": "Jyutping",
+    "現代標準漢語": "Standard Chinese",
+    "文言文": "Classical Chinese",
+    "官話白話文": "Written vernacular Chinese",
+    "粵語": "Cantonese",
+    "吳語": "Wu",
 }
 
 

diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py
@@ -1,12 +1,15 @@
 from typing import Optional, Union
 
 from mediawiki_langcodes import code_to_name, name_to_code
-from wikitextprocessor import NodeKind, WikiNode
-from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode
-
-from wiktextract.page import clean_node
-from wiktextract.wxr_context import WiktextractContext
-
+from wikitextprocessor.parser import (
+    LEVEL_KIND_FLAGS,
+    NodeKind,
+    TemplateNode,
+    WikiNode,
+)
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
 from .models import Translation, WordEntry
 from .section_titles import TRANSLATIONS_TITLES
 from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags
@@ -17,6 +20,7 @@ def extract_translation(
     page_data: list[WordEntry],
     level_node: WikiNode,
     sense: str = "",
+    is_subpage: bool = False,
 ) -> None:
     for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
         if isinstance(child, TemplateNode):
@@ -26,7 +30,10 @@ def extract_translation(
                 and 1 in child.template_parameters
             ):
                 sense = clean_node(wxr, None, child.template_parameters.get(1))
-            elif template_name in {"see translation subpage", "trans-see"}:
+            elif (
+                template_name in {"see translation subpage", "trans-see"}
+                and not is_subpage
+            ):
                 translation_subpage(wxr, page_data, child)
             elif template_name == "multitrans":
                 wikitext = "".join(
@@ -173,7 +180,7 @@ def translation_subpage(
     )
     translation_node = find_subpage_section(wxr, target_section_node)
     if translation_node is not None:
-        extract_translation(wxr, page_data, translation_node)
+        extract_translation(wxr, page_data, translation_node, is_subpage=True)
 
 
 def find_subpage_section(