Skip to content

Commit

Permalink
Merge pull request #474 from xxyzz/zh
Browse files Browse the repository at this point in the history
Handle zh edition soft redirect templates and gloss sentence only pages
  • Loading branch information
xxyzz authored Jan 26, 2024
2 parents a1e399e + 63e4692 commit e4ac345
Show file tree
Hide file tree
Showing 9 changed files with 155 additions and 39 deletions.
7 changes: 7 additions & 0 deletions src/wiktextract/data/zh/pos_subtitles.json
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,9 @@
"諺語": {
"pos": "proverb"
},
"變位": {
"pos": "conj"
},
"词组": {
"pos": "phrase"
},
Expand Down Expand Up @@ -581,6 +584,10 @@
"部件": {
"pos": "component"
},
"釋義": {
"description": "Means 'definition', some pages don't have POS but use this title",
"pos": ""
},
"量詞": {
"pos": "classifier"
},
Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/fr/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,4 +174,3 @@ def find_alt_of_form(
alt_of = clean_node(wxr, None, link)
if len(alt_of) > 0:
gloss_data.alt_of.append(AltForm(word=alt_of))
gloss_data.tags.append("alt-of")
10 changes: 6 additions & 4 deletions src/wiktextract/extractor/zh/headword_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Union

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down Expand Up @@ -40,13 +41,14 @@
def extract_headword_line(
wxr: WiktextractContext,
page_data: list[WordEntry],
node: WikiNode,
node: TemplateNode,
lang_code: str,
) -> None:
template_name = node.template_name
if template_name != "head" and not template_name.startswith(
f"{lang_code}-"
):
if (
template_name != "head"
and not template_name.startswith(f"{lang_code}-")
) or template_name.endswith("-see"):
return

expanded_node = wxr.wtp.parse(
Expand Down
4 changes: 4 additions & 0 deletions src/wiktextract/extractor/zh/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,7 @@ class WordEntry(ChineseBaseModel):
notes: list[str] = []
tags: list[str] = []
descendants: list[Descendant] = []
redirects: list[str] = Field(
[],
description="Soft redirect page, extracted from template zh-see and ja-see",
)
46 changes: 42 additions & 4 deletions src/wiktextract/extractor/zh/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from mediawiki_langcodes import name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down Expand Up @@ -132,15 +132,18 @@ def process_pos_block(
node: WikiNode,
pos_text: str,
):
pos_type = wxr.config.POS_SUBTITLES[pos_text]["pos"]
pos_data = wxr.config.POS_SUBTITLES[pos_text]
pos_type = pos_data["pos"]
base_data.pos = pos_type
append_base_data(page_data, "pos", pos_type, base_data)
page_data[-1].tags.extend(pos_data.get("tags", []))
for index, child in enumerate(node.filter_empty_str_child()):
if isinstance(child, WikiNode):
if index == 0 and child.kind == NodeKind.TEMPLATE:
if index == 0 and isinstance(child, TemplateNode):
extract_headword_line(
wxr, page_data, child, base_data.lang_code
)
process_soft_redirect_template(wxr, child, page_data)
elif child.kind == NodeKind.LIST:
extract_gloss(wxr, page_data, child, Sense())
elif child.kind in LEVEL_KIND_FLAGS:
Expand Down Expand Up @@ -230,6 +233,41 @@ def parse_page(
)
base_data.categories = categories.get("categories", [])
page_data.append(base_data.model_copy(deep=True))
parse_section(wxr, page_data, base_data, level2_node.children)
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
parse_section(wxr, page_data, base_data, level3_node)
if not level2_node.contain_node(NodeKind.LEVEL3):
process_low_quality_page(wxr, level2_node, page_data)

return [d.model_dump(exclude_defaults=True) for d in page_data]


def process_low_quality_page(
    wxr: WiktextractContext,
    level_node: WikiNode,
    page_data: list[WordEntry],
) -> None:
    """Extract data from a language section that has no POS subsections.

    Such low-quality pages either contain soft-redirect templates
    (handled by ``process_soft_redirect_template``) or consist of a
    bare gloss sentence.
    """
    if not level_node.contain_node(NodeKind.TEMPLATE):
        # No templates at all: the section body is only a gloss sentence.
        gloss_text = clean_node(wxr, page_data[-1], level_node.children)
        page_data[-1].senses.append(Sense(glosses=[gloss_text]))
        return
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        process_soft_redirect_template(wxr, t_node, page_data)


def process_soft_redirect_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Record soft-redirect targets from "zh-see" and "ja-see" templates.

    The target page titles are appended to the last word entry's
    ``redirects`` list. "zh-see" takes a single positional target while
    "ja-see" may list several.
    """
    # https://zh.wiktionary.org/wiki/Template:Ja-see
    # https://zh.wiktionary.org/wiki/Template:Zh-see
    template_name = template_node.template_name.lower()  # compute once
    if template_name == "zh-see":
        word = clean_node(
            wxr, None, template_node.template_parameters.get(1, "")
        )
        # A malformed template without an argument would otherwise add an
        # empty-string redirect entry.
        if len(word) > 0:
            page_data[-1].redirects.append(word)
    elif template_name == "ja-see":
        for key, value in template_node.template_parameters.items():
            # Positional (integer-keyed) parameters are the redirect targets;
            # named parameters are display options and are skipped.
            if isinstance(key, int):
                word = clean_node(wxr, None, value)
                if len(word) > 0:
                    page_data[-1].redirects.append(word)
2 changes: 0 additions & 2 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,6 @@ def test_typographic_variant_alt_of_template(self):
"Orthographe par contrainte typographique par système h de abaĵuro."
],
"alt_of": [{"word": "abaĵuro"}],
"tags": ["alt-of"],
}
],
)
Expand All @@ -361,7 +360,6 @@ def test_typographic_variant_alt_of_text(self):
"Variante par contrainte typographique de alphœnix."
],
"alt_of": [{"word": "alphœnix"}],
"tags": ["alt-of"],
}
],
)
4 changes: 3 additions & 1 deletion tests/test_zh_descendant.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ def test_ruby(self):
10,
'<span class="Jpan" lang="ja">[[你好#日語|-{<ruby>你好<rp>(</rp><rt>ニイハオ</rt><rp>)</rp></ruby>}-]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span class="tr"><span class="mention-tr tr">nīhao</span></span><span class="mention-gloss-paren annotation-paren">)</span>',
)
root = self.wxr.wtp.parse("* {{desc|bor=1|ja|-}} {{ja-r|你好|ニイハオ}}")
root = self.wxr.wtp.parse(
"* {{desc|bor=1|ja|-}} {{ja-r|你好|ニイハオ}}"
)
page_data = WordEntry(word="你好", lang_code="ja", lang="日語")
extract_descendants(self.wxr, root, page_data)
self.assertEqual(
Expand Down
69 changes: 66 additions & 3 deletions tests/test_zh_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,22 @@
from wikitextprocessor import NodeKind, WikiNode, Wtp
from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.zh.models import Sense, WordEntry
from wiktextract.extractor.zh.page import extract_gloss, parse_section
from wiktextract.extractor.zh.page import (
extract_gloss,
parse_section,
parse_page,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


class TestExample(TestCase):
def setUp(self) -> None:
self.wxr = WiktextractContext(
Wtp(lang_code="zh"), WiktionaryConfig(dump_file_lang_code="zh")
Wtp(lang_code="zh"),
WiktionaryConfig(
capture_language_codes=None, dump_file_lang_code="zh"
),
)

def tearDown(self) -> None:
Expand Down Expand Up @@ -83,7 +90,9 @@ def test_pos_title_number(
mock_process_pos_block.assert_called()

@patch("wiktextract.extractor.zh.page.process_pos_block")
@patch("wiktextract.extractor.zh.page.clean_node", return_value="名詞(一)")
@patch(
"wiktextract.extractor.zh.page.clean_node", return_value="名詞(一)"
)
def test_pos_title_chinese_numeral(
self,
mock_clean_node,
Expand All @@ -93,3 +102,57 @@ def test_pos_title_chinese_numeral(
base_data = WordEntry(word="", lang_code="", lang="")
parse_section(self.wxr, [base_data], base_data, node)
mock_process_pos_block.assert_called()

def test_soft_redirect_zh_see(self):
    """A {{zh-see}} page should yield a single soft-redirect entry."""
    expected = [
        {
            "lang": "漢語",
            "lang_code": "zh",
            "redirects": ["別個"],
            "word": "別个",
        }
    ]
    result = parse_page(
        self.wxr,
        "別个",
        """==漢語==
{{zh-see|別個}}""",
    )
    self.assertEqual(result, expected)

def test_soft_redirect_ja_see(self):
    """Every positional {{ja-see}} argument becomes a redirect target."""
    expected = [
        {
            "lang": "日語",
            "lang_code": "ja",
            "redirects": ["如月", "二月", "更衣", "衣更着"],
            "word": "きさらぎ",
        }
    ]
    result = parse_page(
        self.wxr,
        "きさらぎ",
        """==日語==
{{ja-see|如月|二月|更衣|衣更着}}""",
    )
    self.assertEqual(result, expected)

def test_gloss_text_only_page(self):
    """A section holding only bare text is parsed as a single gloss."""
    result = parse_page(
        self.wxr,
        "paraphrase",
        """== 英语 ==
释义;意译""",
    )
    expected = [
        {
            "lang": "英语",
            "lang_code": "en",
            "senses": [{"glosses": ["释义;意译"]}],
            "word": "paraphrase",
        }
    ]
    self.assertEqual(result, expected)
51 changes: 27 additions & 24 deletions tests/test_zh_headword.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,20 @@ def tearDown(self):
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)

@patch(
"wikitextprocessor.Wtp.node_to_wikitext",
return_value='<strong class="Latn headword" lang="en">manga</strong> ([[可數|可數]] & [[不可數|不可數]],複數 <b class="Latn form-of lang-en p-form-of" lang="en"><strong class="selflink">manga</strong></b> <small>或</small> <b class="Latn form-of lang-en p-form-of" lang="en">[[mangas#英語|mangas]]</b>)',
)
def test_english_headword(self, mock_node_to_wikitext) -> None:
def test_english_headword(self) -> None:
# https://zh.wiktionary.org/wiki/manga#字源1
# wikitext: {{en-noun|~|manga|s}}
# expanded text: manga (可數 & 不可數,複數 manga 或 mangas)
node = Mock()
node.largs = [["en-noun"]]
self.wxr.wtp.start_page("manga")
self.wxr.wtp.add_page(
"Template:en-noun",
10,
'<strong class="Latn headword" lang="en">manga</strong> ([[可數|可數]] & [[不可數|不可數]],複數 <b class="Latn form-of lang-en p-form-of" lang="en"><strong class="selflink">manga</strong></b> <small>或</small> <b class="Latn form-of lang-en p-form-of" lang="en">[[mangas#英語|mangas]]</b>)',
)
root = self.wxr.wtp.parse("{{en-noun|~|manga|s}}")
page_data = [WordEntry(word="manga", lang_code="en", lang="英語")]
self.wxr.wtp.title = "manga"
extract_headword_line(self.wxr, page_data, node, "en")
extract_headword_line(self.wxr, page_data, root.children[0], "en")
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data],
[
Expand All @@ -47,19 +48,20 @@ def test_english_headword(self, mock_node_to_wikitext) -> None:
],
)

@patch(
"wikitextprocessor.Wtp.node_to_wikitext",
return_value='<strong class="Latn headword" lang="nl">manga</strong>&nbsp;<span class="gender"><abbr title="陽性名詞">m</abbr></span> (複數 <b class="Latn form-of lang-nl p-form-of" lang="nl">[[manga\'s#荷蘭語|manga\'s]]</b>,指小詞 <b class="Latn form-of lang-nl 指小詞-form-of" lang="nl">[[mangaatje#荷蘭語|mangaatje]]</b>&nbsp;<span class="gender"><abbr title="中性名詞">n</abbr></span>)',
)
def test_headword_gender(self, mock_node_to_wikitext) -> None:
def test_headword_gender(self) -> None:
# https://zh.wiktionary.org/wiki/manga#字源1_2
# wikitext: {{nl-noun|m|-'s|mangaatje}}
# expanded text: manga m (複數 manga's,指小詞 mangaatje n)
node = Mock()
node.largs = [["nl-noun"]]
self.wxr.wtp.start_page("manga")
self.wxr.wtp.add_page(
"Template:nl-noun",
10,
'<strong class="Latn headword" lang="nl">manga</strong>&nbsp;<span class="gender"><abbr title="陽性名詞">m</abbr></span> (複數 <b class="Latn form-of lang-nl p-form-of" lang="nl">[[manga\'s#荷蘭語|manga\'s]]</b>,指小詞 <b class="Latn form-of lang-nl 指小詞-form-of" lang="nl">[[mangaatje#荷蘭語|mangaatje]]</b>&nbsp;<span class="gender"><abbr title="中性名詞">n</abbr></span>)',
)
root = self.wxr.wtp.parse("{{nl-noun|m|-'s|mangaatje}}")
page_data = [WordEntry(word="manga", lang_code="en", lang="英語")]
self.wxr.wtp.title = "manga"
extract_headword_line(self.wxr, page_data, node, "nl")
extract_headword_line(self.wxr, page_data, root.children[0], "nl")
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data],
[
Expand All @@ -76,21 +78,22 @@ def test_headword_gender(self, mock_node_to_wikitext) -> None:
],
)

@patch(
"wikitextprocessor.Wtp.node_to_wikitext",
return_value='<strong class="polytonic headword" lang="grc">-κρατίᾱς</strong> (<span lang="grc-Latn" class="headword-tr tr Latn" dir="ltr">-kratíās</span>)&nbsp;<span class="gender"><abbr title="陰性名詞">f</abbr></span>',
)
def test_headword_roman(self, mock_node_to_wikitext) -> None:
def test_headword_roman(self) -> None:
# https://zh.wiktionary.org/wiki/-κρατίας
# wikitext: {{head|grc|後綴變格形|g=f|head=-κρατίᾱς}}
# expanded text: -κρατίᾱς (-kratíās) f
node = Mock()
node.largs = [["head"]]
self.wxr.wtp.start_page("-κρατίας")
self.wxr.wtp.add_page(
"Template:head",
10,
'<strong class="polytonic headword" lang="grc">-κρατίᾱς</strong> (<span lang="grc-Latn" class="headword-tr tr Latn" dir="ltr">-kratíās</span>)&nbsp;<span class="gender"><abbr title="陰性名詞">f</abbr></span>',
)
root = self.wxr.wtp.parse("{{head|grc|後綴變格形|g=f|head=-κρατίᾱς}}")
page_data = [
WordEntry(word="-κρατίας", lang_code="grc", lang="古希臘語")
]
self.wxr.wtp.title = "-κρατίας"
extract_headword_line(self.wxr, page_data, node, "grc")
extract_headword_line(self.wxr, page_data, root.children[0], "grc")
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data],
[
Expand Down

0 comments on commit e4ac345

Please sign in to comment.