Merge pull request #531 from xxyzz/zh

Make zh edition's pydantic model `WordEntry.pos` field required
tatuylonen · Mar 6, 2024 · 30ae74b · 30ae74b
2 parents 725e728 + 47a4b1c
commit 30ae74b
Show file tree

Hide file tree

Showing 16 changed files with 139 additions and 35 deletions.
diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py
@@ -21,7 +21,7 @@
 from mediawiki_langcodes import get_all_names, name_to_code
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.core import TemplateArgs, TemplateFnCallable
-from wikitextprocessor.parser import GeneralNode
+from wikitextprocessor.parser import GeneralNode, TemplateNode
 from wiktextract.clean import clean_template_args
 from wiktextract.datautils import (
     data_append,
@@ -3185,13 +3185,8 @@ def skip_template_fn(name, ht):
             if not isinstance(node, WikiNode):
                 # print("  X{}".format(repr(node)[:40]))
                 continue
-            if node.kind == NodeKind.TEMPLATE:
-                template_name = node.largs[0][0]
-                if template_name == "zh-see":
-                    # handle Chinese character variant redirect
-                    # https://en.wikipedia.org/wiki/Variant_Chinese_characters
-                    redirect_to = node.largs[1][0]
-                    redirect_list.append(redirect_to)
+            if isinstance(node, TemplateNode):
+                process_soft_redirect_template(wxr, node, redirect_list)
                 continue
 
             if node.kind not in LEVEL_KINDS:
@@ -3309,9 +3304,13 @@ def skip_template_fn(name, ht):
         if len(redirect_list) > 0:
             if len(pos_data) > 0:
                 pos_data["redirects"] = redirect_list
+                if "pos" not in pos_data:
+                    pos_data["pos"] = "soft-redirect"
             else:
                 new_page_data = base_data.copy()
                 new_page_data["redirects"] = redirect_list
+                if "pos" not in new_page_data:
+                    new_page_data["pos"] = "soft-redirect"
                 page_datas.append(new_page_data)
 
     def extract_examples(others, sense_base):
@@ -3929,3 +3928,24 @@ def parse_page(
                 )
             x["original_title"] = word
     return ret
+
+
+def process_soft_redirect_template(
+    wxr: WiktextractContext,
+    template_node: TemplateNode,
+    redirect_pages: list[str],
+) -> None:
+    """Append the targets of a zh-see/ja-see template to `redirect_pages`."""
+    if template_node.template_name == "zh-see":
+        # https://en.wiktionary.org/wiki/Template:zh-see (only argument 1)
+        targets = [template_node.template_parameters.get(1, "")]
+    elif template_node.template_name == "ja-see":
+        # https://en.wiktionary.org/wiki/Template:ja-see (all positional)
+        args = template_node.template_parameters.items()
+        targets = [arg for key, arg in args if isinstance(key, int)]
+    else:
+        return  # any other template leaves the list untouched
+    for target in targets:
+        title = clean_node(wxr, None, target)
+        if title != "":  # skip arguments that clean to an empty string
+            redirect_pages.append(title)
diff --git a/src/wiktextract/extractor/fr/tags.py b/src/wiktextract/extractor/fr/tags.py
@@ -141,6 +141,7 @@
 # template text before gloss
 SENSE_TAGS: dict[str, str] = {
     # https://fr.wiktionary.org/wiki/Modèle:figuré
+    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_relation_entre_les_définitions
     "sens figuré": "figuratively",
     "enclise": "enclitic",
     "idiotisme": "idiomatic",

diff --git a/src/wiktextract/extractor/zh/models.py b/src/wiktextract/extractor/zh/models.py
@@ -105,7 +105,7 @@ class WordEntry(ChineseBaseModel):
     word: str = Field(description="Word string")
     lang_code: str = Field(description="Wiktionary language code")
     lang: str = Field(description="Localized language name")
-    pos: str = Field("", description="Part of speech type")
+    pos: str = Field(description="Part of speech type")
     etymology_text: str = ""
     senses: list[Sense] = Field([], description="Sense list")
     forms: list[Form] = Field([], description="Inflection forms list")

diff --git a/src/wiktextract/extractor/zh/page.py b/src/wiktextract/extractor/zh/page.py
@@ -244,7 +244,10 @@ def parse_page(
             continue
         wxr.wtp.start_section(lang_name)
         base_data = WordEntry(
-            word=wxr.wtp.title, lang_code=lang_code, lang=lang_name
+            word=wxr.wtp.title,
+            lang_code=lang_code,
+            lang=lang_name,
+            pos="unknown",
         )
         base_data.categories = categories.get("categories", [])
         page_data.append(base_data.model_copy(deep=True))
@@ -278,11 +281,17 @@ def process_soft_redirect_template(
 ) -> None:
     # https://zh.wiktionary.org/wiki/Template:Ja-see
     # https://zh.wiktionary.org/wiki/Template:Zh-see
+    update_pos = False
     if template_node.template_name.lower() == "zh-see":
         page_data[-1].redirects.append(
             clean_node(wxr, None, template_node.template_parameters.get(1, ""))
         )
+        update_pos = True
     elif template_node.template_name.lower() == "ja-see":
         for key, value in template_node.template_parameters.items():
             if isinstance(key, int):
                 page_data[-1].redirects.append(clean_node(wxr, None, value))
+        update_pos = True
+
+    if update_pos and page_data[-1].pos == "unknown":
+        page_data[-1].pos = "soft-redirect"
diff --git a/src/wiktextract/extractor/zh/section_titles.py b/src/wiktextract/extractor/zh/section_titles.py
@@ -166,7 +166,7 @@
     "部件": {"pos": "component"},
     "釋義": {
         # Means 'definition', some pages don't have POS but use this title
-        "pos": ""
+        "pos": "unknown"
     },
     "量詞": {"pos": "classifier"},
     "量词": {"pos": "classifier"},

diff --git a/src/wiktextract/page.py b/src/wiktextract/page.py
@@ -13,7 +13,6 @@
     WikiNode,
 )
 from wikitextprocessor.core import (
-    TemplateArgs,
     TemplateFnCallable,
     PostTemplateFnCallable,
 )

diff --git a/src/wiktextract/wiktionary.py b/src/wiktextract/wiktionary.py
@@ -48,7 +48,13 @@ def page_handler(page: Page) -> tuple[list[dict], dict]:
             title = re.sub(r"[\s\000-\037]+", " ", page.title)
             title = title.strip()
             if page.redirect_to is not None:
-                page_data = [{"title": title, "redirect": page.redirect_to}]
+                page_data = [
+                    {
+                        "title": title,
+                        "redirect": page.redirect_to,
+                        "pos": "hard-redirect",
+                    }
+                ]
             else:
                 # XXX Sign gloss pages?
                 start_t = time.time()

diff --git a/tests/test_page.py b/tests/test_page.py
@@ -487,6 +487,7 @@ def test_zh_see(self, mock_get_page):
                 {
                     "lang": "Chinese",
                     "lang_code": "zh",
+                    "pos": "soft-redirect",
                     "redirects": ["你們", "妳們"],
                     "word": "你们",
                 }
@@ -655,3 +656,24 @@ def test_gloss_not_inside_list(self, mock_get_page):
                 }
             ],
         )
+
+    def test_ja_see(self):
+        # https://en.wiktionary.org/wiki/ひとり
+        title = "ひとり"
+        wikitext = "==Japanese==\n{{ja-see|一人|独り}}"
+        # The page has no POS section, so the entry is expected to carry
+        # the "soft-redirect" placeholder POS; every positional argument
+        # of the template is expected to show up as a redirect target.
+        expected = [
+            {
+                "lang": "Japanese",
+                "lang_code": "ja",
+                "pos": "soft-redirect",
+                "redirects": ["一人", "独り"],
+                "word": "ひとり",
+            }
+        ]
+        # Parse the page under its own title, then compare the whole output.
+        self.wxr.wtp.start_page(title)
+        data = parse_page(self.wxr, title, wikitext)
+        self.assertEqual(data, expected)
diff --git a/tests/test_zh_descendant.py b/tests/test_zh_descendant.py
@@ -34,7 +34,9 @@ def test_ruby(self):
         root = self.wxr.wtp.parse(
             "* {{desc|bor=1|ja|-}} {{ja-r|你好|ニイハオ}}"
         )
-        page_data = WordEntry(word="你好", lang_code="ja", lang="日語")
+        page_data = WordEntry(
+            word="你好", lang_code="ja", lang="日語", pos="intj"
+        )
         extract_descendants(self.wxr, root, page_data)
         self.assertEqual(
             page_data.descendants[0].model_dump(exclude_defaults=True),
@@ -55,7 +57,9 @@ def test_roman_only_list(self):
             '<span class="desc-arr" title="仿譯詞">→</span> 壯語：<span class="Latn" lang="za">[[mwngz ndei#壯語|-{mwngz ndei}-]]</span> <span class="ib-brac qualifier-brac">(</span><span class="ib-content qualifier-content">仿譯</span><span class="ib-brac qualifier-brac">)</span>',
         )
         root = self.wxr.wtp.parse("* {{desc|za|mwngz ndei|cal=1}}")
-        page_data = WordEntry(word="你好", lang_code="zh", lang="漢語")
+        page_data = WordEntry(
+            word="你好", lang_code="zh", lang="漢語", pos="intj"
+        )
         extract_descendants(self.wxr, root, page_data)
         self.assertEqual(
             page_data.descendants[0].model_dump(exclude_defaults=True),
@@ -86,7 +90,9 @@ def test_nested_list(self):
 *:* {{desc|cmn|-|der=1}} {{zh-l|宅男}}
 *:* {{desc|cmn|-|der=1}} {{zh-l|宅女}}"""
         )
-        page_data = WordEntry(word="オタク", lang_code="ja", lang="日語")
+        page_data = WordEntry(
+            word="オタク", lang_code="ja", lang="日語", pos="noun"
+        )
         extract_descendants(self.wxr, root, page_data)
         self.assertEqual(
             page_data.descendants[0].model_dump(exclude_defaults=True),

diff --git a/tests/test_zh_gloss.py b/tests/test_zh_gloss.py
@@ -34,6 +34,7 @@ def test_example_list(self) -> None:
                 lang="日語",
                 lang_code="ja",
                 word="可笑しい",
+                pos="adj",
             )
         ]
         wikitext = """# [[好玩]]的：
@@ -45,7 +46,7 @@ def test_example_list(self) -> None:
 ## [[美味]]的
 ## [[漂亮]]的
 ## [[很好]]的，[[卓越]]的"""
-        self.wxr.wtp.start_page("test")
+        self.wxr.wtp.start_page("可笑しい")
         self.wxr.wtp.add_page("Template:lb", 10, "({{{2|}}})")
         node = self.wxr.wtp.parse(wikitext)
         extract_gloss(self.wxr, page_data, node.children[0], Sense())
@@ -82,7 +83,7 @@ def test_pos_title_number(
         mock_process_pos_block,
     ) -> None:
         node = WikiNode(NodeKind.LEVEL3, 0)
-        base_data = WordEntry(word="", lang_code="", lang="")
+        base_data = WordEntry(word="", lang_code="", lang="", pos="")
         parse_section(self.wxr, [base_data], base_data, node)
         mock_process_pos_block.assert_called()
 
@@ -96,7 +97,7 @@ def test_pos_title_chinese_numeral(
         mock_process_pos_block,
     ) -> None:
         node = WikiNode(NodeKind.LEVEL3, 0)
-        base_data = WordEntry(word="", lang_code="", lang="")
+        base_data = WordEntry(word="", lang_code="", lang="", pos="")
         parse_section(self.wxr, [base_data], base_data, node)
         mock_process_pos_block.assert_called()
 
@@ -112,6 +113,7 @@ def test_soft_redirect_zh_see(self):
                 {
                     "lang": "漢語",
                     "lang_code": "zh",
+                    "pos": "soft-redirect",
                     "redirects": ["別個"],
                     "word": "別个",
                 }
@@ -130,6 +132,7 @@ def test_soft_redirect_ja_see(self):
                 {
                     "lang": "日語",
                     "lang_code": "ja",
+                    "pos": "soft-redirect",
                     "redirects": ["如月", "二月", "更衣", "衣更着"],
                     "word": "きさらぎ",
                 }
@@ -146,6 +149,7 @@ def test_gloss_text_only_page(self):
                     {
                         "lang": "英语",
                         "lang_code": "en",
+                        "pos": "unknown",
                         "senses": [{"glosses": ["释义；意译"]}],
                         "word": "paraphrase",
                     }
@@ -158,6 +162,7 @@ def test_gloss_text_only_page(self):
                     {
                         "lang": "漢語",
                         "lang_code": "zh",
+                        "pos": "unknown",
                         "senses": [
                             {
                                 "glosses": [

diff --git a/tests/test_zh_headword.py b/tests/test_zh_headword.py
@@ -29,7 +29,9 @@ def test_english_headword(self) -> None:
             '<span class="headword-line"><strong class="Latn headword" lang="en">-{manga}-</strong> ([[可數|可數]] <small>和</small> [[不可數|不可數]]-{}-，複數-{ <b lang="en"><strong class="selflink">manga</strong></b> <small>或</small> <b>[[mangas#英語|-{mangas}-]]</b>}-)</span>',
         )
         root = self.wxr.wtp.parse("{{en-noun|~|manga|s}}")
-        page_data = [WordEntry(word="manga", lang_code="en", lang="英語")]
+        page_data = [
+            WordEntry(word="manga", lang_code="en", lang="英語", pos="noun")
+        ]
         self.wxr.wtp.title = "manga"
         extract_headword_line(self.wxr, page_data, root.children[0], "en")
         self.assertEqual(
@@ -44,6 +46,7 @@ def test_english_headword(self) -> None:
                         {"form": "mangas", "tags": ["plural"]},
                     ],
                     "tags": ["countable", "uncountable"],
+                    "pos": "noun",
                 }
             ],
         )
@@ -59,7 +62,9 @@ def test_headword_gender(self) -> None:
             '<span class="headword-line"><strong class="Latn headword" lang="nl">-{manga}-</strong>&nbsp;<span class="gender"><abbr title="陽性名詞">m</abbr></span> (複數-{ <b>[[manga\'s#荷蘭語|-{manga\'s}-]]</b>}-，指小詞-{ <b>[[mangaatje#荷蘭語|-{mangaatje}-]]</b>&nbsp;<span class="gender"><abbr title="中性名詞">n</abbr></span>}-)</span>',
         )
         root = self.wxr.wtp.parse("{{nl-noun|m|-'s|mangaatje}}")
-        page_data = [WordEntry(word="manga", lang_code="en", lang="英語")]
+        page_data = [
+            WordEntry(word="manga", lang_code="en", lang="英語", pos="noun")
+        ]
         self.wxr.wtp.title = "manga"
         extract_headword_line(self.wxr, page_data, root.children[0], "nl")
         self.assertEqual(
@@ -77,6 +82,7 @@ def test_headword_gender(self) -> None:
                         },
                     ],
                     "tags": ["masculine"],
+                    "pos": "noun",
                 }
             ],
         )
@@ -93,7 +99,9 @@ def test_headword_roman(self) -> None:
         )
         root = self.wxr.wtp.parse("{{head|grc|後綴變格形|g=f|head=-κρατίᾱς}}")
         page_data = [
-            WordEntry(word="-κρατίας", lang_code="grc", lang="古希臘語")
+            WordEntry(
+                word="-κρατίας", lang_code="grc", lang="古希臘語", pos="suffix"
+            )
         ]
         self.wxr.wtp.title = "-κρατίας"
         extract_headword_line(self.wxr, page_data, root.children[0], "grc")
@@ -108,6 +116,7 @@ def test_headword_roman(self) -> None:
                         {"form": "-kratíās", "tags": ["romanization"]},
                     ],
                     "tags": ["feminine"],
+                    "pos": "suffix",
                 }
             ],
         )
diff --git a/tests/test_zh_inflection.py b/tests/test_zh_inflection.py
@@ -39,7 +39,9 @@ def tearDown(self) -> None:
         ),
     )
     def test_ja_i_template(self, mock_get_page) -> None:
-        page_data = [WordEntry(lang="日語", lang_code="ja", word="可笑しい")]
+        page_data = [
+            WordEntry(lang="日語", lang_code="ja", word="可笑しい", pos="adj")
+        ]
         wikitext = "{{ja-i|可笑し|おかし|okashi}}"
         self.wxr.wtp.start_page("可笑しい")
         node = self.wxr.wtp.parse(wikitext)

diff --git a/tests/test_zh_linkage.py b/tests/test_zh_linkage.py
@@ -27,6 +27,7 @@ def test_sense_term_list(self):
                 lang_code="mul",
                 word="%",
                 senses=[Sense(glosses=["百分比"])],
+                pos="symbol",
             )
         ]
         wikitext = "* {{sense|百分比}} {{l|mul|cU}}、[[centiuno]]"
@@ -56,7 +57,9 @@ def test_ja_r_template(self):
             '<span class="Jpan" lang="ja">[[家主#日語|-{<ruby>家<rp>(</rp><rt>や</rt><rp>)</rp></ruby><ruby>主<rp>(</rp><rt>ぬし</rt><rp>)</rp></ruby>}-]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span class="tr"><span class="mention-tr tr">yanushi</span></span><span class="mention-gloss-paren annotation-paren">)</span>',
         )
         node = self.wxr.wtp.parse("{{s|房東}}\n* {{ja-r|家%主|や%ぬし}}")
-        page_data = [WordEntry(word="大家", lang_code="zh", lang="漢語")]
+        page_data = [
+            WordEntry(word="大家", lang_code="zh", lang="漢語", pos="noun")
+        ]
         extract_linkages(self.wxr, page_data, node.children, "synonyms", "")
         self.assertEqual(
             page_data[0].synonyms[0].model_dump(exclude_defaults=True),
@@ -69,7 +72,9 @@ def test_ja_r_template(self):
         )
 
     def test_qual_tag(self):
-        page_data = [WordEntry(lang="漢語", lang_code="zh", word="駱駝")]
+        page_data = [
+            WordEntry(lang="漢語", lang_code="zh", word="駱駝", pos="noun")
+        ]
         self.wxr.wtp.add_page("Template:qual", 10, "({{{1}}})")
         self.wxr.wtp.add_page("Template:zh-l", 10, "{{{1}}}")
         self.wxr.wtp.start_page("駱駝")

diff --git a/tests/test_zh_note.py b/tests/test_zh_note.py
@@ -22,14 +22,18 @@ def test_note_list(self):
         # https://zh.wiktionary.org/wiki/オタク
         self.wxr.wtp.start_page("オタク")
         root = self.wxr.wtp.parse("* note list 1\n* note list 2")
-        page_data = [WordEntry(word="オタク", lang_code="ja", lang="日語")]
+        page_data = [
+            WordEntry(word="オタク", lang_code="ja", lang="日語", pos="noun")
+        ]
         extract_note(self.wxr, page_data, root)
         self.assertEqual(page_data[-1].notes, ["note list 1", "note list 2"])
 
     def test_note_no_list(self):
         # https://zh.wiktionary.org/wiki/clavarder
         self.wxr.wtp.start_page("clavarder")
         root = self.wxr.wtp.parse("note text")
-        page_data = [WordEntry(word="オタク", lang_code="fr", lang="法語")]
+        page_data = [
+            WordEntry(word="オタク", lang_code="fr", lang="法語", pos="verb")
+        ]
         extract_note(self.wxr, page_data, root)
         self.assertEqual(page_data[-1].notes, ["note text"])