Skip to content

Commit

Permalink
Merge pull request #474 from xxyzz/zh
Browse files Browse the repository at this point in the history
Handle zh edition soft redirect templates and gloss sentence only pages
  • Loading branch information
xxyzz authored Jan 26, 2024
2 parents a1e399e + 63e4692 commit e4ac345
Show file tree
Hide file tree
Showing 9 changed files with 155 additions and 39 deletions.
7 changes: 7 additions & 0 deletions src/wiktextract/data/zh/pos_subtitles.json
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,9 @@
"諺語": {
"pos": "proverb"
},
"變位": {
"pos": "conj"
},
"词组": {
"pos": "phrase"
},
Expand Down Expand Up @@ -581,6 +584,10 @@
"部件": {
"pos": "component"
},
"釋義": {
"description": "Means 'definition', some pages don't have POS but use this title",
"pos": ""
},
"量詞": {
"pos": "classifier"
},
Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/fr/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,4 +174,3 @@ def find_alt_of_form(
alt_of = clean_node(wxr, None, link)
if len(alt_of) > 0:
gloss_data.alt_of.append(AltForm(word=alt_of))
gloss_data.tags.append("alt-of")
10 changes: 6 additions & 4 deletions src/wiktextract/extractor/zh/headword_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Union

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down Expand Up @@ -40,13 +41,14 @@
def extract_headword_line(
wxr: WiktextractContext,
page_data: list[WordEntry],
node: WikiNode,
node: TemplateNode,
lang_code: str,
) -> None:
template_name = node.template_name
if template_name != "head" and not template_name.startswith(
f"{lang_code}-"
):
if (
template_name != "head"
and not template_name.startswith(f"{lang_code}-")
) or template_name.endswith("-see"):
return

expanded_node = wxr.wtp.parse(
Expand Down
4 changes: 4 additions & 0 deletions src/wiktextract/extractor/zh/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,7 @@ class WordEntry(ChineseBaseModel):
notes: list[str] = []
tags: list[str] = []
descendants: list[Descendant] = []
redirects: list[str] = Field(
[],
description="Soft redirect page, extracted from template zh-see and ja-see",
)
46 changes: 42 additions & 4 deletions src/wiktextract/extractor/zh/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from mediawiki_langcodes import name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down Expand Up @@ -132,15 +132,18 @@ def process_pos_block(
node: WikiNode,
pos_text: str,
):
pos_type = wxr.config.POS_SUBTITLES[pos_text]["pos"]
pos_data = wxr.config.POS_SUBTITLES[pos_text]
pos_type = pos_data["pos"]
base_data.pos = pos_type
append_base_data(page_data, "pos", pos_type, base_data)
page_data[-1].tags.extend(pos_data.get("tags", []))
for index, child in enumerate(node.filter_empty_str_child()):
if isinstance(child, WikiNode):
if index == 0 and child.kind == NodeKind.TEMPLATE:
if index == 0 and isinstance(child, TemplateNode):
extract_headword_line(
wxr, page_data, child, base_data.lang_code
)
process_soft_redirect_template(wxr, child, page_data)
elif child.kind == NodeKind.LIST:
extract_gloss(wxr, page_data, child, Sense())
elif child.kind in LEVEL_KIND_FLAGS:
Expand Down Expand Up @@ -230,6 +233,41 @@ def parse_page(
)
base_data.categories = categories.get("categories", [])
page_data.append(base_data.model_copy(deep=True))
parse_section(wxr, page_data, base_data, level2_node.children)
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
parse_section(wxr, page_data, base_data, level3_node)
if not level2_node.contain_node(NodeKind.LEVEL3):
process_low_quality_page(wxr, level2_node, page_data)

return [d.model_dump(exclude_defaults=True) for d in page_data]


def process_low_quality_page(
    wxr: WiktextractContext,
    level_node: WikiNode,
    page_data: list[WordEntry],
) -> None:
    """Extract data from a language section that has no POS subsections.

    Such low-quality pages either contain soft-redirect templates
    (handled by ``process_soft_redirect_template``) or consist of a
    bare gloss sentence.
    """
    if not level_node.contain_node(NodeKind.TEMPLATE):
        # No templates at all: the section body is only a gloss sentence.
        gloss_text = clean_node(wxr, page_data[-1], level_node.children)
        page_data[-1].senses.append(Sense(glosses=[gloss_text]))
        return
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        process_soft_redirect_template(wxr, t_node, page_data)


def process_soft_redirect_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Record soft-redirect targets from "zh-see" and "ja-see" templates.

    The target page titles are appended to the last word entry's
    ``redirects`` list. "zh-see" takes a single positional target while
    "ja-see" may list several.
    """
    # https://zh.wiktionary.org/wiki/Template:Ja-see
    # https://zh.wiktionary.org/wiki/Template:Zh-see
    template_name = template_node.template_name.lower()  # compute once
    if template_name == "zh-see":
        word = clean_node(
            wxr, None, template_node.template_parameters.get(1, "")
        )
        # A malformed template without an argument would otherwise add an
        # empty-string redirect entry.
        if len(word) > 0:
            page_data[-1].redirects.append(word)
    elif template_name == "ja-see":
        for key, value in template_node.template_parameters.items():
            # Positional (integer-keyed) parameters are the redirect targets;
            # named parameters are display options and are skipped.
            if isinstance(key, int):
                word = clean_node(wxr, None, value)
                if len(word) > 0:
                    page_data[-1].redirects.append(word)
2 changes: 0 additions & 2 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,6 @@ def test_typographic_variant_alt_of_template(self):
"Orthographe par contrainte typographique par système h de abaĵuro."
],
"alt_of": [{"word": "abaĵuro"}],
"tags": ["alt-of"],
}
],
)
Expand All @@ -361,7 +360,6 @@ def test_typographic_variant_alt_of_text(self):
"Variante par contrainte typographique de alphœnix."
],
"alt_of": [{"word": "alphœnix"}],
"tags": ["alt-of"],
}
],
)
4 changes: 3 additions & 1 deletion tests/test_zh_descendant.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ def test_ruby(self):
10,
'<span class="Jpan" lang="ja">[[你好#日語|-{<ruby>你好<rp>(</rp><rt>ニイハオ</rt><rp>)</rp></ruby>}-]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span class="tr"><span class="mention-tr tr">nīhao</span></span><span class="mention-gloss-paren annotation-paren">)</span>',
)
root = self.wxr.wtp.parse("* {{desc|bor=1|ja|-}} {{ja-r|你好|ニイハオ}}")
root = self.wxr.wtp.parse(
"* {{desc|bor=1|ja|-}} {{ja-r|你好|ニイハオ}}"
)
page_data = WordEntry(word="你好", lang_code="ja", lang="日語")
extract_descendants(self.wxr, root, page_data)
self.assertEqual(
Expand Down
69 changes: 66 additions & 3 deletions tests/test_zh_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,22 @@
from wikitextprocessor import NodeKind, WikiNode, Wtp
from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.zh.models import Sense, WordEntry
from wiktextract.extractor.zh.page import extract_gloss, parse_section
from wiktextract.extractor.zh.page import (
extract_gloss,
parse_section,
parse_page,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


class TestExample(TestCase):
def setUp(self) -> None:
self.wxr = WiktextractContext(
Wtp(lang_code="zh"), WiktionaryConfig(dump_file_lang_code="zh")
Wtp(lang_code="zh"),
WiktionaryConfig(
capture_language_codes=None, dump_file_lang_code="zh"
),
)

def tearDown(self) -> None:
Expand Down Expand Up @@ -83,7 +90,9 @@ def test_pos_title_number(
mock_process_pos_block.assert_called()

@patch("wiktextract.extractor.zh.page.process_pos_block")
@patch("wiktextract.extractor.zh.page.clean_node", return_value="名詞(一)")
@patch(
"wiktextract.extractor.zh.page.clean_node", return_value="名詞(一)"
)
def test_pos_title_chinese_numeral(
self,
mock_clean_node,
Expand All @@ -93,3 +102,57 @@ def test_pos_title_chinese_numeral(
base_data = WordEntry(word="", lang_code="", lang="")
parse_section(self.wxr, [base_data], base_data, node)
mock_process_pos_block.assert_called()

def test_soft_redirect_zh_see(self):
    """A {{zh-see}} page should yield a single soft-redirect entry."""
    expected = [
        {
            "lang": "漢語",
            "lang_code": "zh",
            "redirects": ["別個"],
            "word": "別个",
        }
    ]
    result = parse_page(
        self.wxr,
        "別个",
        """==漢語==
{{zh-see|別個}}""",
    )
    self.assertEqual(result, expected)

def test_soft_redirect_ja_see(self):
    """Every positional {{ja-see}} argument becomes a redirect target."""
    expected = [
        {
            "lang": "日語",
            "lang_code": "ja",
            "redirects": ["如月", "二月", "更衣", "衣更着"],
            "word": "きさらぎ",
        }
    ]
    result = parse_page(
        self.wxr,
        "きさらぎ",
        """==日語==
{{ja-see|如月|二月|更衣|衣更着}}""",
    )
    self.assertEqual(result, expected)

def test_gloss_text_only_page(self):
    """A section holding only bare text is parsed as a single gloss."""
    result = parse_page(
        self.wxr,
        "paraphrase",
        """== 英语 ==
释义;意译""",
    )
    expected = [
        {
            "lang": "英语",
            "lang_code": "en",
            "senses": [{"glosses": ["释义;意译"]}],
            "word": "paraphrase",
        }
    ]
    self.assertEqual(result, expected)
51 changes: 27 additions & 24 deletions tests/test_zh_headword.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,20 @@ def tearDown(self):
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)

@patch(
"wikitextprocessor.Wtp.node_to_wikitext",
return_value='<strong class="Latn headword" lang="en">manga</strong> ([[可數|可數]] & [[不可數|不可數]],複數 <b class="Latn form-of lang-en p-form-of" lang="en"><strong class="selflink">manga</strong></b> <small>或</small> <b class="Latn form-of lang-en p-form-of" lang="en">[[mangas#英語|mangas]]</b>)',
)
def test_english_headword(self, mock_node_to_wikitext) -> None:
def test_english_headword(self) -> None:
# https://zh.wiktionary.org/wiki/manga#字源1
# wikitext: {{en-noun|~|manga|s}}
# expanded text: manga (可數 & 不可數,複數 manga 或 mangas)
node = Mock()
node.largs = [["en-noun"]]
self.wxr.wtp.start_page("manga")
self.wxr.wtp.add_page(
"Template:en-noun",
10,
'<strong class="Latn headword" lang="en">manga</strong> ([[可數|可數]] & [[不可數|不可數]],複數 <b class="Latn form-of lang-en p-form-of" lang="en"><strong class="selflink">manga</strong></b> <small>或</small> <b class="Latn form-of lang-en p-form-of" lang="en">[[mangas#英語|mangas]]</b>)',
)
root = self.wxr.wtp.parse("{{en-noun|~|manga|s}}")
page_data = [WordEntry(word="manga", lang_code="en", lang="英語")]
self.wxr.wtp.title = "manga"
extract_headword_line(self.wxr, page_data, node, "en")
extract_headword_line(self.wxr, page_data, root.children[0], "en")
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data],
[
Expand All @@ -47,19 +48,20 @@ def test_english_headword(self, mock_node_to_wikitext) -> None:
],
)

@patch(
"wikitextprocessor.Wtp.node_to_wikitext",
return_value='<strong class="Latn headword" lang="nl">manga</strong>&nbsp;<span class="gender"><abbr title="陽性名詞">m</abbr></span> (複數 <b class="Latn form-of lang-nl p-form-of" lang="nl">[[manga\'s#荷蘭語|manga\'s]]</b>,指小詞 <b class="Latn form-of lang-nl 指小詞-form-of" lang="nl">[[mangaatje#荷蘭語|mangaatje]]</b>&nbsp;<span class="gender"><abbr title="中性名詞">n</abbr></span>)',
)
def test_headword_gender(self, mock_node_to_wikitext) -> None:
def test_headword_gender(self) -> None:
# https://zh.wiktionary.org/wiki/manga#字源1_2
# wikitext: {{nl-noun|m|-'s|mangaatje}}
# expanded text: manga m (複數 manga's,指小詞 mangaatje n)
node = Mock()
node.largs = [["nl-noun"]]
self.wxr.wtp.start_page("manga")
self.wxr.wtp.add_page(
"Template:nl-noun",
10,
'<strong class="Latn headword" lang="nl">manga</strong>&nbsp;<span class="gender"><abbr title="陽性名詞">m</abbr></span> (複數 <b class="Latn form-of lang-nl p-form-of" lang="nl">[[manga\'s#荷蘭語|manga\'s]]</b>,指小詞 <b class="Latn form-of lang-nl 指小詞-form-of" lang="nl">[[mangaatje#荷蘭語|mangaatje]]</b>&nbsp;<span class="gender"><abbr title="中性名詞">n</abbr></span>)',
)
root = self.wxr.wtp.parse("{{nl-noun|m|-'s|mangaatje}}")
page_data = [WordEntry(word="manga", lang_code="en", lang="英語")]
self.wxr.wtp.title = "manga"
extract_headword_line(self.wxr, page_data, node, "nl")
extract_headword_line(self.wxr, page_data, root.children[0], "nl")
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data],
[
Expand All @@ -76,21 +78,22 @@ def test_headword_gender(self, mock_node_to_wikitext) -> None:
],
)

@patch(
"wikitextprocessor.Wtp.node_to_wikitext",
return_value='<strong class="polytonic headword" lang="grc">-κρατίᾱς</strong> (<span lang="grc-Latn" class="headword-tr tr Latn" dir="ltr">-kratíās</span>)&nbsp;<span class="gender"><abbr title="陰性名詞">f</abbr></span>',
)
def test_headword_roman(self, mock_node_to_wikitext) -> None:
def test_headword_roman(self) -> None:
# https://zh.wiktionary.org/wiki/-κρατίας
# wikitext: {{head|grc|後綴變格形|g=f|head=-κρατίᾱς}}
# expanded text: -κρατίᾱς (-kratíās) f
node = Mock()
node.largs = [["head"]]
self.wxr.wtp.start_page("-κρατίας")
self.wxr.wtp.add_page(
"Template:head",
10,
'<strong class="polytonic headword" lang="grc">-κρατίᾱς</strong> (<span lang="grc-Latn" class="headword-tr tr Latn" dir="ltr">-kratíās</span>)&nbsp;<span class="gender"><abbr title="陰性名詞">f</abbr></span>',
)
root = self.wxr.wtp.parse("{{head|grc|後綴變格形|g=f|head=-κρατίᾱς}}")
page_data = [
WordEntry(word="-κρατίας", lang_code="grc", lang="古希臘語")
]
self.wxr.wtp.title = "-κρατίας"
extract_headword_line(self.wxr, page_data, node, "grc")
extract_headword_line(self.wxr, page_data, root.children[0], "grc")
self.assertEqual(
[d.model_dump(exclude_defaults=True) for d in page_data],
[
Expand Down

0 comments on commit e4ac345

Please sign in to comment.