Skip to content

Commit

Permalink
Merge pull request #788 from xxyzz/zh
Browse files: browse the repository at this point in the history
[zh] add config.json and improve extract example code
  • Loading branch information
xxyzz authored Aug 26, 2024
2 parents c15dac4 + 7a5849e commit 122811a
Show file tree
Hide file tree
Showing 8 changed files with 204 additions and 71 deletions.
6 changes: 6 additions & 0 deletions src/wiktextract/data/zh/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"analyze_templates": false,
"extract_thesaurus_pages": true,
"save_ns_names": ["Main", "Template", "Module", "Thesaurus"],
"extract_ns_names": ["Main"]
}
9 changes: 4 additions & 5 deletions src/wiktextract/extractor/zh/etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,9 @@ def extract_etymology(
level_node_index = next_level_index
break
for etymology_node in level_node.children[:level_node_index]:
if (
isinstance(etymology_node, TemplateNode)
and etymology_node.template_name == "zh-x"
):
if isinstance(
etymology_node, TemplateNode
) and etymology_node.template_name in ["zh-x", "zh-q"]:
for example_data in extract_template_zh_x(
wxr, etymology_node, Example()
):
Expand All @@ -49,7 +48,7 @@ def extract_etymology(
for template_node in etymology_node.find_child_recursively(
NodeKind.TEMPLATE
):
if template_node.template_name == "zh-x":
if template_node.template_name in ["zh-x", "zh-q"]:
has_zh_x = True
for example_data in extract_template_zh_x(
wxr, template_node, Example()
Expand Down
91 changes: 57 additions & 34 deletions src/wiktextract/extractor/zh/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,12 @@ def extract_examples(
extract_quote_templates(wxr, child, example_data)
elif template_name in {"ja-x", "ja-usex"}:
extract_template_ja_usex(wxr, child, example_data)
elif template_name in {"zh-x", "zh-usex"}:
for zh_x_example in extract_template_zh_x(
wxr, child, example_data
):
sense_data.examples.append(zh_x_example)
clean_node(wxr, sense_data, child)
clean_node(wxr, sense_data, child) # add cat link
elif template_name in {"zh-x", "zh-usex", "zh-q"}:
sense_data.examples.extend(
extract_template_zh_x(wxr, child, example_data)
)
clean_node(wxr, sense_data, child) # add cat link
elif template_name in {"ux", "eg", "usex"}:
extract_template_ux(wxr, child, example_data)
elif template_name == "uxi":
Expand Down Expand Up @@ -124,26 +124,27 @@ def extract_quote_templates(


def extract_template_ja_usex(
wxr: WiktextractContext, node: WikiNode, example_data: Example
wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(node), expand_all=True
)
ruby_data, node_without_ruby = extract_ruby(wxr, expanded_node.children)
expanded_text = clean_node(wxr, None, node_without_ruby)
for line_num, expanded_line in enumerate(expanded_text.splitlines()):
if line_num == 0:
key = "text"
elif line_num == 1:
key = "roman"
else:
key = "translation"
if key == "text":
example_data.text = expanded_line
else:
setattr(example_data, key, expanded_line)
if len(ruby_data) > 0:
for span_tag in expanded_node.find_html(
"span", attr_name="class", attr_value="Jpan"
):
ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
example_data.text = clean_node(wxr, None, node_without_ruby)
example_data.ruby = ruby_data
for span_tag in expanded_node.find_html_recursively(
"span", attr_name="class", attr_value="tr"
):
example_data.roman = clean_node(wxr, None, span_tag)
example_data.translation = clean_node(
wxr, None, node.template_parameters.get(3, "")
)
example_data.literal_meaning = clean_node(
wxr, None, node.template_parameters.get("lit", "")
)


def extract_template_zh_x(
Expand All @@ -159,21 +160,27 @@ def extract_template_zh_x(
for dl_tag in expanded_node.find_html_recursively("dl"):
has_dl_tag = True
ref = ""
pinyin = ""
roman = ""
translation = ""
roman_raw_tags = []
for dd_tag in dl_tag.find_html("dd"):
dd_text = clean_node(wxr, None, dd_tag)
# Module:Zh-usex uses "出自:" now: https://zh.wiktionary.org/w/index.php?title=Module:Zh-usex&diff=prev&oldid=8430896
if dd_text.startswith(("出自:", "來自:")):
ref = dd_text.removeprefix("出自:").removeprefix("來自:")
if dd_text.startswith("出自:"):
ref = dd_text.removeprefix("出自:")
else:
is_pinyin = False
is_roman = False
for span_tag in dd_tag.find_html_recursively(
"span", attr_name="lang", attr_value="Latn"
):
pinyin = clean_node(wxr, None, span_tag)
is_pinyin = True
if not is_pinyin:
roman = clean_node(wxr, None, span_tag)
is_roman = True
for span_tag in dd_tag.find_html_recursively("span"):
span_text = clean_node(wxr, None, span_tag)
if span_text.startswith("[") and span_text.endswith(
"]"
):
roman_raw_tags.append(span_text.strip("[]"))
if not is_roman:
translation = dd_text

example_text = ""
Expand All @@ -188,34 +195,50 @@ def extract_template_zh_x(
raw_tag = clean_node(wxr, None, span_tag)
example = parent_example.model_copy(deep=True)
example.text = example_text
example.roman = pinyin
example.roman = roman
example.translation = translation
example.raw_tags.extend(raw_tag.strip("[]").split(","))
if len(ref) > 0:
example.raw_tags.extend(roman_raw_tags)
if len(ref) > 0: # don't override parent quote-* template
example.ref = ref
translate_raw_tags(example)
results.append(example)

# no source, single line example
if not has_dl_tag:
pinyin = ""
roman = ""
raw_tags = []
for span_tag in expanded_node.find_html(
"span", attr_name="lang", attr_value="Latn"
):
pinyin = clean_node(wxr, None, span_tag)
roman = clean_node(wxr, None, span_tag)
for span_tag in expanded_node.find_html("span"):
span_text = clean_node(wxr, None, span_tag)
if span_text.startswith("[") and span_text.endswith("]"):
raw_tags.append(span_text.strip("[]"))
translation = clean_node(
wxr, None, template_node.template_parameters.get(2, "")
)
literal_meaning = clean_node(
wxr, None, template_node.template_parameters.get("lit", "")
)
for span_tag in expanded_node.find_html("span"):
span_lang = span_tag.attrs.get("lang", "")
if span_lang in ["zh-Hant", "zh-Hans"]:
example_text = clean_node(wxr, None, span_tag)
if len(example_text) > 0:
example_data = parent_example.model_copy(deep=True)
example_data.text = example_text
example_data.roman = pinyin
example_data.roman = roman
example_data.tags.append(
"Traditional Chinese"
if span_lang == "zh-Hant"
else "Simplified Chinese"
)
example_data.translation = translation
example_data.literal_meaning = literal_meaning
example_data.raw_tags.extend(raw_tags)
translate_raw_tags(example_data)
results.append(example_data)
return results

Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/zh/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class Example(ChineseBaseModel):
translation: str = Field(
default="", description="Chinese translation of the example sentence"
)
literal_meaning: str = ""
roman: str = Field(
default="", description="Romanization of the example sentence"
)
Expand Down
8 changes: 8 additions & 0 deletions src/wiktextract/extractor/zh/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,10 +262,18 @@

# Labels used by the zh-x example sentence template, keyed by the Chinese
# label text. Each value is the English tag for that label, or a list of
# tags when a single label stands for more than one.
# https://zh.wiktionary.org/wiki/Template:Zh-x
# https://zh.wiktionary.org/wiki/Module:Zh-usex/data
ZH_X_TAGS = {
    # script variants
    "繁體": "Traditional Chinese",
    "簡體": "Simplified Chinese",
    "繁體和簡體": ["Traditional Chinese", "Simplified Chinese"],
    # romanization systems
    "漢語拼音": "Pinyin",
    "粵拼": "Jyutping",
    # language varieties
    "現代標準漢語": "Standard Chinese",
    "文言文": "Classical Chinese",
    "官話白話文": "Written vernacular Chinese",
    "粵語": "Cantonese",
    "吳語": "Wu",
}


Expand Down
23 changes: 15 additions & 8 deletions src/wiktextract/extractor/zh/translation.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
from typing import Optional, Union

from mediawiki_langcodes import code_to_name, name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
NodeKind,
TemplateNode,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Translation, WordEntry
from .section_titles import TRANSLATIONS_TITLES
from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags
Expand All @@ -17,6 +20,7 @@ def extract_translation(
page_data: list[WordEntry],
level_node: WikiNode,
sense: str = "",
is_subpage: bool = False,
) -> None:
for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
if isinstance(child, TemplateNode):
Expand All @@ -26,7 +30,10 @@ def extract_translation(
and 1 in child.template_parameters
):
sense = clean_node(wxr, None, child.template_parameters.get(1))
elif template_name in {"see translation subpage", "trans-see"}:
elif (
template_name in {"see translation subpage", "trans-see"}
and not is_subpage
):
translation_subpage(wxr, page_data, child)
elif template_name == "multitrans":
wikitext = "".join(
Expand Down Expand Up @@ -173,7 +180,7 @@ def translation_subpage(
)
translation_node = find_subpage_section(wxr, target_section_node)
if translation_node is not None:
extract_translation(wxr, page_data, translation_node)
extract_translation(wxr, page_data, translation_node, is_subpage=True)


def find_subpage_section(
Expand Down
Loading

0 comments on commit 122811a

Please sign in to comment.