Skip to content

Commit

Permalink
Merge pull request #531 from xxyzz/zh
Browse files Browse the repository at this point in the history
Make zh edition's pydantic model `WordEntry.pos` field required
  • Loading branch information
kristian-clausal authored Mar 6, 2024
2 parents 725e728 + 47a4b1c commit 30ae74b
Show file tree
Hide file tree
Showing 16 changed files with 139 additions and 35 deletions.
36 changes: 28 additions & 8 deletions src/wiktextract/extractor/en/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from mediawiki_langcodes import get_all_names, name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.core import TemplateArgs, TemplateFnCallable
from wikitextprocessor.parser import GeneralNode
from wikitextprocessor.parser import GeneralNode, TemplateNode
from wiktextract.clean import clean_template_args
from wiktextract.datautils import (
data_append,
Expand Down Expand Up @@ -3185,13 +3185,8 @@ def skip_template_fn(name, ht):
if not isinstance(node, WikiNode):
# print(" X{}".format(repr(node)[:40]))
continue
if node.kind == NodeKind.TEMPLATE:
template_name = node.largs[0][0]
if template_name == "zh-see":
# handle Chinese character variant redirect
# https://en.wikipedia.org/wiki/Variant_Chinese_characters
redirect_to = node.largs[1][0]
redirect_list.append(redirect_to)
if isinstance(node, TemplateNode):
process_soft_redirect_template(wxr, node, redirect_list)
continue

if node.kind not in LEVEL_KINDS:
Expand Down Expand Up @@ -3309,9 +3304,13 @@ def skip_template_fn(name, ht):
if len(redirect_list) > 0:
if len(pos_data) > 0:
pos_data["redirects"] = redirect_list
if "pos" not in pos_data:
pos_data["pos"] = "soft-redirect"
else:
new_page_data = base_data.copy()
new_page_data["redirects"] = redirect_list
if "pos" not in new_page_data:
new_page_data["pos"] = "soft-redirect"
page_datas.append(new_page_data)

def extract_examples(others, sense_base):
Expand Down Expand Up @@ -3929,3 +3928,24 @@ def parse_page(
)
x["original_title"] = word
return ret


def process_soft_redirect_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    redirect_pages: list[str],
) -> None:
    """Collect soft-redirect target titles from a redirect template.

    Only the ``zh-see`` and ``ja-see`` templates are handled; any other
    template leaves ``redirect_pages`` untouched.

    * ``zh-see``: the value of positional parameter ``1`` is used.
    * ``ja-see``: every parameter whose key is an ``int`` is used.

    Each candidate value is passed through ``clean_node`` and appended to
    ``redirect_pages`` (mutated in place) unless the cleaned title is the
    empty string.
    """
    name = template_node.template_name
    if name == "zh-see":
        # https://en.wiktionary.org/wiki/Template:zh-see
        candidates = (template_node.template_parameters.get(1, ""),)
    elif name == "ja-see":
        # https://en.wiktionary.org/wiki/Template:ja-see
        candidates = (
            value
            for key, value in template_node.template_parameters.items()
            if isinstance(key, int)
        )
    else:
        return

    for candidate in candidates:
        title = clean_node(wxr, None, candidate)
        if title == "":
            # Nothing usable was left after cleaning; skip this argument.
            continue
        redirect_pages.append(title)
1 change: 1 addition & 0 deletions src/wiktextract/extractor/fr/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@
# template text before gloss
SENSE_TAGS: dict[str, str] = {
# https://fr.wiktionary.org/wiki/Modèle:figuré
# https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_relation_entre_les_définitions
"sens figuré": "figuratively",
"enclise": "enclitic",
"idiotisme": "idiomatic",
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/zh/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ class WordEntry(ChineseBaseModel):
word: str = Field(description="Word string")
lang_code: str = Field(description="Wiktionary language code")
lang: str = Field(description="Localized language name")
pos: str = Field("", description="Part of speech type")
pos: str = Field(description="Part of speech type")
etymology_text: str = ""
senses: list[Sense] = Field([], description="Sense list")
forms: list[Form] = Field([], description="Inflection forms list")
Expand Down
11 changes: 10 additions & 1 deletion src/wiktextract/extractor/zh/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,10 @@ def parse_page(
continue
wxr.wtp.start_section(lang_name)
base_data = WordEntry(
word=wxr.wtp.title, lang_code=lang_code, lang=lang_name
word=wxr.wtp.title,
lang_code=lang_code,
lang=lang_name,
pos="unknown",
)
base_data.categories = categories.get("categories", [])
page_data.append(base_data.model_copy(deep=True))
Expand Down Expand Up @@ -278,11 +281,17 @@ def process_soft_redirect_template(
) -> None:
# https://zh.wiktionary.org/wiki/Template:Ja-see
# https://zh.wiktionary.org/wiki/Template:Zh-see
update_pos = False
if template_node.template_name.lower() == "zh-see":
page_data[-1].redirects.append(
clean_node(wxr, None, template_node.template_parameters.get(1, ""))
)
update_pos = True
elif template_node.template_name.lower() == "ja-see":
for key, value in template_node.template_parameters.items():
if isinstance(key, int):
page_data[-1].redirects.append(clean_node(wxr, None, value))
update_pos = True

if update_pos and page_data[-1].pos == "unknown":
page_data[-1].pos = "soft-redirect"
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/zh/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@
"部件": {"pos": "component"},
"釋義": {
# Means 'definition'; some pages don't have a POS but use this title
"pos": ""
"pos": "unknown"
},
"量詞": {"pos": "classifier"},
"量词": {"pos": "classifier"},
Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
WikiNode,
)
from wikitextprocessor.core import (
TemplateArgs,
TemplateFnCallable,
PostTemplateFnCallable,
)
Expand Down
8 changes: 7 additions & 1 deletion src/wiktextract/wiktionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,13 @@ def page_handler(page: Page) -> tuple[list[dict], dict]:
title = re.sub(r"[\s\000-\037]+", " ", page.title)
title = title.strip()
if page.redirect_to is not None:
page_data = [{"title": title, "redirect": page.redirect_to}]
page_data = [
{
"title": title,
"redirect": page.redirect_to,
"pos": "hard-redirect",
}
]
else:
# XXX Sign gloss pages?
start_t = time.time()
Expand Down
22 changes: 22 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,7 @@ def test_zh_see(self, mock_get_page):
{
"lang": "Chinese",
"lang_code": "zh",
"pos": "soft-redirect",
"redirects": ["你們", "妳們"],
"word": "你们",
}
Expand Down Expand Up @@ -655,3 +656,24 @@ def test_gloss_not_inside_list(self, mock_get_page):
}
],
)

def test_ja_see(self):
    # A page consisting only of a `ja-see` template should produce a single
    # entry whose "pos" is "soft-redirect" and whose "redirects" lists every
    # positional argument of the template.
    # https://en.wiktionary.org/wiki/ひとり
    title = "ひとり"
    wikitext = "==Japanese==\n{{ja-see|一人|独り}}"
    expected_entry = {
        "lang": "Japanese",
        "lang_code": "ja",
        "pos": "soft-redirect",
        "redirects": ["一人", "独り"],
        "word": "ひとり",
    }

    self.wxr.wtp.start_page(title)
    parsed = parse_page(self.wxr, title, wikitext)
    self.assertEqual(parsed, [expected_entry])
12 changes: 9 additions & 3 deletions tests/test_zh_descendant.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ def test_ruby(self):
root = self.wxr.wtp.parse(
"* {{desc|bor=1|ja|-}} {{ja-r|你好|ニイハオ}}"
)
page_data = WordEntry(word="你好", lang_code="ja", lang="日語")
page_data = WordEntry(
word="你好", lang_code="ja", lang="日語", pos="intj"
)
extract_descendants(self.wxr, root, page_data)
self.assertEqual(
page_data.descendants[0].model_dump(exclude_defaults=True),
Expand All @@ -55,7 +57,9 @@ def test_roman_only_list(self):
'<span class="desc-arr" title="仿譯詞">→</span> 壯語:<span class="Latn" lang="za">[[mwngz ndei#壯語|-{mwngz ndei}-]]</span> <span class="ib-brac qualifier-brac">(</span><span class="ib-content qualifier-content">仿譯</span><span class="ib-brac qualifier-brac">)</span>',
)
root = self.wxr.wtp.parse("* {{desc|za|mwngz ndei|cal=1}}")
page_data = WordEntry(word="你好", lang_code="zh", lang="漢語")
page_data = WordEntry(
word="你好", lang_code="zh", lang="漢語", pos="intj"
)
extract_descendants(self.wxr, root, page_data)
self.assertEqual(
page_data.descendants[0].model_dump(exclude_defaults=True),
Expand Down Expand Up @@ -86,7 +90,9 @@ def test_nested_list(self):
*:* {{desc|cmn|-|der=1}} {{zh-l|宅男}}
*:* {{desc|cmn|-|der=1}} {{zh-l|宅女}}"""
)
page_data = WordEntry(word="オタク", lang_code="ja", lang="日語")
page_data = WordEntry(
word="オタク", lang_code="ja", lang="日語", pos="noun"
)
extract_descendants(self.wxr, root, page_data)
self.assertEqual(
page_data.descendants[0].model_dump(exclude_defaults=True),
Expand Down
11 changes: 8 additions & 3 deletions tests/test_zh_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def test_example_list(self) -> None:
lang="日語",
lang_code="ja",
word="可笑しい",
pos="adj",
)
]
wikitext = """# [[好玩]]的:
Expand All @@ -45,7 +46,7 @@ def test_example_list(self) -> None:
## [[美味]]的
## [[漂亮]]的
## [[很好]]的,[[卓越]]的"""
self.wxr.wtp.start_page("test")
self.wxr.wtp.start_page("可笑しい")
self.wxr.wtp.add_page("Template:lb", 10, "({{{2|}}})")
node = self.wxr.wtp.parse(wikitext)
extract_gloss(self.wxr, page_data, node.children[0], Sense())
Expand Down Expand Up @@ -82,7 +83,7 @@ def test_pos_title_number(
mock_process_pos_block,
) -> None:
node = WikiNode(NodeKind.LEVEL3, 0)
base_data = WordEntry(word="", lang_code="", lang="")
base_data = WordEntry(word="", lang_code="", lang="", pos="")
parse_section(self.wxr, [base_data], base_data, node)
mock_process_pos_block.assert_called()

Expand All @@ -96,7 +97,7 @@ def test_pos_title_chinese_numeral(
mock_process_pos_block,
) -> None:
node = WikiNode(NodeKind.LEVEL3, 0)
base_data = WordEntry(word="", lang_code="", lang="")
base_data = WordEntry(word="", lang_code="", lang="", pos="")
parse_section(self.wxr, [base_data], base_data, node)
mock_process_pos_block.assert_called()

Expand All @@ -112,6 +113,7 @@ def test_soft_redirect_zh_see(self):
{
"lang": "漢語",
"lang_code": "zh",
"pos": "soft-redirect",
"redirects": ["別個"],
"word": "別个",
}
Expand All @@ -130,6 +132,7 @@ def test_soft_redirect_ja_see(self):
{
"lang": "日語",
"lang_code": "ja",
"pos": "soft-redirect",
"redirects": ["如月", "二月", "更衣", "衣更着"],
"word": "きさらぎ",
}
Expand All @@ -146,6 +149,7 @@ def test_gloss_text_only_page(self):
{
"lang": "英语",
"lang_code": "en",
"pos": "unknown",
"senses": [{"glosses": ["释义;意译"]}],
"word": "paraphrase",
}
Expand All @@ -158,6 +162,7 @@ def test_gloss_text_only_page(self):
{
"lang": "漢語",
"lang_code": "zh",
"pos": "unknown",
"senses": [
{
"glosses": [
Expand Down
15 changes: 12 additions & 3 deletions tests/test_zh_headword.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ def test_english_headword(self) -> None:
'<span class="headword-line"><strong class="Latn headword" lang="en">-{manga}-</strong> ([[可數|可數]] <small>和</small> [[不可數|不可數]]-{}-,複數-{ <b lang="en"><strong class="selflink">manga</strong></b> <small>或</small> <b>[[mangas#英語|-{mangas}-]]</b>}-)</span>',
)
root = self.wxr.wtp.parse("{{en-noun|~|manga|s}}")
page_data = [WordEntry(word="manga", lang_code="en", lang="英語")]
page_data = [
WordEntry(word="manga", lang_code="en", lang="英語", pos="noun")
]
self.wxr.wtp.title = "manga"
extract_headword_line(self.wxr, page_data, root.children[0], "en")
self.assertEqual(
Expand All @@ -44,6 +46,7 @@ def test_english_headword(self) -> None:
{"form": "mangas", "tags": ["plural"]},
],
"tags": ["countable", "uncountable"],
"pos": "noun",
}
],
)
Expand All @@ -59,7 +62,9 @@ def test_headword_gender(self) -> None:
'<span class="headword-line"><strong class="Latn headword" lang="nl">-{manga}-</strong>&nbsp;<span class="gender"><abbr title="陽性名詞">m</abbr></span> (複數-{ <b>[[manga\'s#荷蘭語|-{manga\'s}-]]</b>}-,指小詞-{ <b>[[mangaatje#荷蘭語|-{mangaatje}-]]</b>&nbsp;<span class="gender"><abbr title="中性名詞">n</abbr></span>}-)</span>',
)
root = self.wxr.wtp.parse("{{nl-noun|m|-'s|mangaatje}}")
page_data = [WordEntry(word="manga", lang_code="en", lang="英語")]
page_data = [
WordEntry(word="manga", lang_code="en", lang="英語", pos="noun")
]
self.wxr.wtp.title = "manga"
extract_headword_line(self.wxr, page_data, root.children[0], "nl")
self.assertEqual(
Expand All @@ -77,6 +82,7 @@ def test_headword_gender(self) -> None:
},
],
"tags": ["masculine"],
"pos": "noun",
}
],
)
Expand All @@ -93,7 +99,9 @@ def test_headword_roman(self) -> None:
)
root = self.wxr.wtp.parse("{{head|grc|後綴變格形|g=f|head=-κρατίᾱς}}")
page_data = [
WordEntry(word="-κρατίας", lang_code="grc", lang="古希臘語")
WordEntry(
word="-κρατίας", lang_code="grc", lang="古希臘語", pos="suffix"
)
]
self.wxr.wtp.title = "-κρατίας"
extract_headword_line(self.wxr, page_data, root.children[0], "grc")
Expand All @@ -108,6 +116,7 @@ def test_headword_roman(self) -> None:
{"form": "-kratíās", "tags": ["romanization"]},
],
"tags": ["feminine"],
"pos": "suffix",
}
],
)
4 changes: 3 additions & 1 deletion tests/test_zh_inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ def tearDown(self) -> None:
),
)
def test_ja_i_template(self, mock_get_page) -> None:
page_data = [WordEntry(lang="日語", lang_code="ja", word="可笑しい")]
page_data = [
WordEntry(lang="日語", lang_code="ja", word="可笑しい", pos="adj")
]
wikitext = "{{ja-i|可笑し|おかし|okashi}}"
self.wxr.wtp.start_page("可笑しい")
node = self.wxr.wtp.parse(wikitext)
Expand Down
9 changes: 7 additions & 2 deletions tests/test_zh_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def test_sense_term_list(self):
lang_code="mul",
word="%",
senses=[Sense(glosses=["百分比"])],
pos="symbol",
)
]
wikitext = "* {{sense|百分比}} {{l|mul|cU}}、[[centiuno]]"
Expand Down Expand Up @@ -56,7 +57,9 @@ def test_ja_r_template(self):
'<span class="Jpan" lang="ja">[[家主#日語|-{<ruby>家<rp>(</rp><rt>や</rt><rp>)</rp></ruby><ruby>主<rp>(</rp><rt>ぬし</rt><rp>)</rp></ruby>}-]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span class="tr"><span class="mention-tr tr">yanushi</span></span><span class="mention-gloss-paren annotation-paren">)</span>',
)
node = self.wxr.wtp.parse("{{s|房東}}\n* {{ja-r|家%主|や%ぬし}}")
page_data = [WordEntry(word="大家", lang_code="zh", lang="漢語")]
page_data = [
WordEntry(word="大家", lang_code="zh", lang="漢語", pos="noun")
]
extract_linkages(self.wxr, page_data, node.children, "synonyms", "")
self.assertEqual(
page_data[0].synonyms[0].model_dump(exclude_defaults=True),
Expand All @@ -69,7 +72,9 @@ def test_ja_r_template(self):
)

def test_qual_tag(self):
page_data = [WordEntry(lang="漢語", lang_code="zh", word="駱駝")]
page_data = [
WordEntry(lang="漢語", lang_code="zh", word="駱駝", pos="noun")
]
self.wxr.wtp.add_page("Template:qual", 10, "({{{1}}})")
self.wxr.wtp.add_page("Template:zh-l", 10, "{{{1}}}")
self.wxr.wtp.start_page("駱駝")
Expand Down
8 changes: 6 additions & 2 deletions tests/test_zh_note.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,18 @@ def test_note_list(self):
# https://zh.wiktionary.org/wiki/オタク
self.wxr.wtp.start_page("オタク")
root = self.wxr.wtp.parse("* note list 1\n* note list 2")
page_data = [WordEntry(word="オタク", lang_code="ja", lang="日語")]
page_data = [
WordEntry(word="オタク", lang_code="ja", lang="日語", pos="noun")
]
extract_note(self.wxr, page_data, root)
self.assertEqual(page_data[-1].notes, ["note list 1", "note list 2"])

def test_note_no_list(self):
# https://zh.wiktionary.org/wiki/clavarder
self.wxr.wtp.start_page("clavarder")
root = self.wxr.wtp.parse("note text")
page_data = [WordEntry(word="オタク", lang_code="fr", lang="法語")]
page_data = [
WordEntry(word="オタク", lang_code="fr", lang="法語", pos="verb")
]
extract_note(self.wxr, page_data, root)
self.assertEqual(page_data[-1].notes, ["note text"])
Loading

0 comments on commit 30ae74b

Please sign in to comment.