From 2de1f1a831e3fdcfb611c085fc9b34dea85cb095 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 15 Dec 2023 14:52:27 +0800 Subject: [PATCH] Use Pydantic in Chinese Wiktionary extractor --- .gitignore | 3 + Makefile | 1 - json_schema/zh.json | 376 ------------------ .../data/zh/linkage_subtitles.json | 26 +- src/wiktextract/datautils.py | 60 +-- src/wiktextract/extractor/zh/descendant.py | 54 ++- src/wiktextract/extractor/zh/example.py | 85 ++-- src/wiktextract/extractor/zh/gloss.py | 32 +- src/wiktextract/extractor/zh/headword_line.py | 88 ++-- src/wiktextract/extractor/zh/inflection.py | 31 +- src/wiktextract/extractor/zh/linkage.py | 124 +++--- src/wiktextract/extractor/zh/models.py | 127 ++++++ src/wiktextract/extractor/zh/note.py | 10 +- src/wiktextract/extractor/zh/page.py | 127 +++--- src/wiktextract/extractor/zh/pronunciation.py | 81 ++-- src/wiktextract/extractor/zh/thesaurus.py | 25 +- src/wiktextract/extractor/zh/translation.py | 42 +- src/wiktextract/extractor/zh/util.py | 26 ++ src/wiktextract/page.py | 24 +- tests/test_zh_descendant.py | 87 ++-- tests/test_zh_example.py | 38 +- tests/test_zh_gloss.py | 27 +- tests/test_zh_headword.py | 35 +- tests/test_zh_inflection.py | 19 +- tests/test_zh_linkage.py | 53 ++- tests/test_zh_note.py | 10 +- tests/test_zh_pronunciation.py | 34 +- tests/test_zh_translation.py | 14 +- 28 files changed, 651 insertions(+), 1008 deletions(-) delete mode 100644 json_schema/zh.json create mode 100644 src/wiktextract/extractor/zh/models.py create mode 100644 src/wiktextract/extractor/zh/util.py diff --git a/.gitignore b/.gitignore index 1271ff87..4ac776d8 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,6 @@ wikt-db* # GitHub Pages _site + +# Emacs files +*~ diff --git a/Makefile b/Makefile index 7ca5dfcb..9b21194d 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,6 @@ coverage_report: python -m coverage html github_pages: python tools/generate_schema.py - cp json_schema/*.json _site python tools/github_pages.py $(REPO) $(SHA) clean: python -m coverage erase diff --git a/json_schema/zh.json b/json_schema/zh.json deleted file mode 100644 index 9de1c86c..00000000 --- a/json_schema/zh.json +++ /dev/null @@ -1,376 +0,0 @@ -{ - "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://kaikki.org/zh.json", - "title": "Chinese Wiktionary", - "description": "JSON schema of the Chinese Wiktionary extractor", - "type": "object", - "properties": { - "lang_name": { - "description": "Localized language name of the word", - "type": "string" - }, - "lang_code": { - "description": "Wiktionary language code", - "type": "string" - }, - "word": { - "description": "word string", - "type": "string" - }, - "pos": { - "description": "Part of speech type", - "type": "string" - }, - "etymology_text": { - "type": "string" - }, - "senses": { - "description": "Sense list", - "type": "array", - "items": { - "$ref": "#/$defs/sense" - } - }, - "forms": { - "description": "Inflection forms list", - "type": "array", - "items": { - "$ref": "#/$defs/form" - } - }, - "sounds": { - "type": "array", - "items": { - "$ref": "#/$defs/sound" - } - }, - "translations": { - "type": "array", - "items": { - "$ref": "#/$defs/translation" - } - }, - "synonyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "hyponyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "hypernyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "holonyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - 
}, - "meronyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "derived": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "troponyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "paronyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "related": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "abbreviation": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "proverbs": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "antonyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "coordinate_terms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "various": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "title": { - "description": "Redirect page source title", - "type": "string" - }, - "redirect": { - "description": "Redirect page target title", - "type": "string" - }, - "categories": { - "type": "array", - "items": { - "type": "string" - } - }, - "descendants": { - "type": "array", - "items": { - "$ref": "#/$defs/descendant" - } - }, - "notes": { - "description": "Usage notes", - "type": "array", - "items": { - "type": "string" - } - } - }, - "$defs": { - "sense": { - "type": "object", - "properties": { - "glosses": { - "type": "array", - "items": { - "type": "string" - } - }, - "tags": { - "type": "array", - "items": { - "type": "string" - } - }, - "categories": { - "type": "array", - "items": { - "type": "string" - } - }, - "examples": { - "type": "array", - "items": { - "$ref": "#/$defs/example" - } - } - } - }, - "example": { - "type": "object", - "properties": { - "texts": { - "description": "Example usage sentences, some might have have both Simplified and Traditional Chinese forms", - "type": "array", - "items": { - "type": "string" - } - }, - "translation": { - "description": "Chinese translation of the example sentence", - "type": "string" - }, - "roman": { - "description": "Romanization of the example sentence", - "type": "string" - }, - "ref": { - "description": "Source of the sentence, like book title and page number", - "type": "string" - }, - "type": { - "description": "This value is 'quotation' if 'source' exists", - "type": "string", - "enum": [ - "example", - "quotation" - ] - } - } - }, - "form": { - "type": "object", - "properties": { - "form": { - "type": "string" - }, - "tags": { - "type": "array", - "items": { - "type": "string" - } - }, - "source": { - "type": "string" - }, - "ruby": { - "description": "Japanese Kanji and furigana", - "type": "array", - "items": { - "type": "array", - "items": { - "type": "string" - } - } - } - } - }, - "sound": { - "type": "object", - "properties": { - "zh-pron": { - "description": "Chinese word pronunciation", - "type": "string" - }, - "ipa": { - "description": "International Phonetic Alphabet", - "type": "string" - }, - "audio": { - "description": "Audio file name", - "type": "string" - }, - "wav_url": { - "type": "string" - }, - "ogg_url": { - "type": "string" - }, - "mp3_url": { - "type": "string" - }, - "homophone": { - "type": "string" - } - } - }, - "translation": { - "type": "object", - "properties": { - "lang_code": { - "description": "Wiktionary language code of the translation term", - "type": "string" - }, - "lang_name": { - "description": "Translation language name", - "type": "string" - }, - "word": { - "description": "Translation term", - "type": "string" - }, - 
"sense": { - "description": "Translation gloss", - "type": "string" - }, - "tags": { - "type": "array", - "items": { - "type": "string" - } - }, - "roman": { - "type": "string" - } - } - }, - "linkage": { - "type": "object", - "properties": { - "word": { - "type": "string" - }, - "tags": { - "type": "array", - "items": { - "type": "string" - } - }, - "roman": { - "type": "string" - }, - "language_variant": { - "description": "Chinese character variant", - "type": "string", - "enum": ["zh-Hant", "zh-Hans"] - } - } - }, - "descendant": { - "type": "object", - "properties": { - "lang_code": { - "description": "Wiktionary language code", - "type": "string" - }, - "lang_name": { - "type": "string" - }, - "word": { - "type": "string" - }, - "roman": { - "type": "string" - }, - "tags": { - "type": "array", - "items": { - "type": "string" - } - }, - "descendants": { - "type": "array", - "items": { - "$refs": "#/$defs/descendant" - } - }, - "ruby": { - "description": "Japanese Kanji and furigana", - "type": "array", - "items": { - "type": "array", - "items": { - "type": "string" - } - } - } - } - } - } -} diff --git a/src/wiktextract/data/zh/linkage_subtitles.json b/src/wiktextract/data/zh/linkage_subtitles.json index 369e2221..3b21d4cb 100644 --- a/src/wiktextract/data/zh/linkage_subtitles.json +++ b/src/wiktextract/data/zh/linkage_subtitles.json @@ -1,18 +1,18 @@ { - "上下位關係": "hypernym", - "上义词": "hypernym", - "上位詞": "hypernym", - "上位語": "hypernym", - "上位词": "hypernym", - "上義詞": "hypernym", - "下义词": "hyponym", - "下位詞": "hyponym", - "下位語": "hyponym", - "下位词": "hyponym", - "下层词": "hyponym", - "下属词": "hyponym", + "上下位關係": "hypernyms", + "上义词": "hypernyms", + "上位詞": "hypernyms", + "上位語": "hypernyms", + "上位词": "hypernyms", + "上義詞": "hypernyms", + "下义词": "hyponyms", + "下位詞": "hyponyms", + "下位語": "hyponyms", + "下位词": "hyponyms", + "下层词": "hyponyms", + "下属词": "hyponyms", "下層概念": "derived", - "下義詞": "hyponym", + "下義詞": "hyponyms", "俗语": "related", "关联词": "related", "关联词条": "related", diff --git a/src/wiktextract/datautils.py b/src/wiktextract/datautils.py index ff196c2f..96868893 100644 --- a/src/wiktextract/datautils.py +++ b/src/wiktextract/datautils.py @@ -1,13 +1,9 @@ # Utilities for manipulating word data structures # # Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org -import copy import re from collections import defaultdict -from functools import partial -from typing import Any, Dict, Iterable, List, Tuple - -from wiktextract.wxr_context import WiktextractContext +from typing import Any, Iterable # Keys in ``data`` that can only have string values (a list of them) STR_KEYS = frozenset({"tags", "glosses"}) @@ -26,7 +22,7 @@ ) -def data_append(data: Dict, key: str, value: Any) -> None: +def data_append(data: Any, key: str, value: Any) -> None: """Appends ``value`` under ``key`` in the dictionary ``data``. The key is created if it does not exist.""" assert isinstance(key, str) @@ -47,8 +43,10 @@ def data_append(data: Dict, key: str, value: Any) -> None: data[key] = list_value -def data_extend(data: Dict, key: str, values: Iterable) -> None: - """Appends all values in a list under ``key`` in the dictionary ``data``.""" +def data_extend(data: Any, key: str, values: Iterable) -> None: + """ + Appends all values in a list under ``key`` in the dictionary ``data``. 
+ """ assert isinstance(data, dict) assert isinstance(key, str) assert isinstance(values, (list, tuple)) @@ -63,7 +61,7 @@ def data_extend(data: Dict, key: str, values: Iterable) -> None: def split_at_comma_semi( text: str, separators=(",", ";", ",", "،"), extra=() -) -> List[str]: +) -> list[str]: """Splits the text at commas and semicolons, unless they are inside parenthesis. ``separators`` is default separators (setting it eliminates default separators). ``extra`` is extra separators to be used in addition @@ -203,7 +201,7 @@ def freeze(x): def ns_title_prefix_tuple( wxr, namespace: str, lower: bool = False -) -> Tuple[str, ...]: +) -> tuple[str, ...]: """Based on given namespace name, create a tuple of aliases""" if namespace in wxr.wtp.NAMESPACE_DATA: return tuple( @@ -215,45 +213,3 @@ def ns_title_prefix_tuple( ) else: return () - - -def find_similar_gloss(page_data: List[Dict], gloss: str) -> Dict: - """ - Return a sense dictionary if it has similar gloss, return the last - word dictionary if can't found such gloss. - """ - from rapidfuzz.fuzz import partial_token_set_ratio - from rapidfuzz.process import extractOne - from rapidfuzz.utils import default_process - - if len(gloss) == 0: - return page_data[-1] - - choices = [ - sense_dict.get("raw_glosses", sense_dict.get("glosses", [""]))[0] - for sense_dict in page_data[-1]["senses"] - ] - if match_result := extractOne( - gloss, - choices, - score_cutoff=85, - scorer=partial(partial_token_set_ratio, processor=default_process), - ): - return page_data[-1]["senses"][match_result[2]] - - return page_data[-1] - - -def append_base_data( - page_data: List[Dict], field: str, value: Any, base_data: Dict -) -> None: - if page_data[-1].get(field) is not None: - if len(page_data[-1]["senses"]) > 0: - # append new dictionary if the last dictionary has sense data and - # also has the same key - page_data.append(copy.deepcopy(base_data)) - page_data[-1][field] = value - elif isinstance(page_data[-1].get(field), list): - page_data[-1][field] += value - else: - page_data[-1][field] = value diff --git a/src/wiktextract/extractor/zh/descendant.py b/src/wiktextract/extractor/zh/descendant.py index e3ef77c0..9699f859 100644 --- a/src/wiktextract/extractor/zh/descendant.py +++ b/src/wiktextract/extractor/zh/descendant.py @@ -1,11 +1,11 @@ -from collections import defaultdict -from typing import Dict +from typing import Union from wikitextprocessor import NodeKind, WikiNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext from ..ruby import extract_ruby +from .models import Descendant, WordEntry DESCENDANT_TEMPLATES = frozenset(["desc", "descendant"]) @@ -13,7 +13,7 @@ def extract_descendants( wxr: WiktextractContext, level_node: WikiNode, - parent_data: Dict, + parent_data: Union[WordEntry, Descendant], ) -> None: for list_node in level_node.find_child(NodeKind.LIST): for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): @@ -23,57 +23,51 @@ def extract_descendants( def extract_descendant_list_item( wxr: WiktextractContext, list_item_node: WikiNode, - parent_data: Dict, + parent_data: Union[WordEntry, Descendant], ) -> None: lang_code = "" lang_name = "" - descendant_data = defaultdict(list) + descendant_data = Descendant() for template_node in list_item_node.find_child(NodeKind.TEMPLATE): expanded_template = wxr.wtp.parse( wxr.wtp.node_to_wikitext(template_node), expand_all=True ) if template_node.template_name.lower() in DESCENDANT_TEMPLATES: lang_code = template_node.template_parameters.get(1) - 
descendant_data["lang_code"] = lang_code + descendant_data.lang_code = lang_code ruby_data, nodes_without_ruby = extract_ruby( wxr, expanded_template.children ) if len(ruby_data) > 0: - descendant_data["ruby"] = ruby_data + descendant_data.ruby = ruby_data for child_index, child_node in enumerate(nodes_without_ruby): if isinstance(child_node, str) and child_node.endswith(":"): lang_name = child_node.strip(" :") - descendant_data["lang_name"] = lang_name + descendant_data.lang_name = lang_name elif ( isinstance(child_node, WikiNode) and child_node.kind == NodeKind.HTML ): if child_node.tag == "span": class_names = child_node.attrs.get("class", "") - if ( - "Latn" in class_names or "tr" in class_names - ) and "word" in descendant_data: + if ("Latn" in class_names or "tr" in class_names) and len( + descendant_data.word + ) > 0: # template:ja-r - descendant_data["roman"] = clean_node( + descendant_data.roman = clean_node( wxr, None, child_node ) elif "lang" in child_node.attrs: - if "word" in descendant_data: - parent_data["descendants"].append(descendant_data) - descendant_data = defaultdict( - list, - { - "lang_code": lang_code, - "lang_name": lang_name, - }, + if len(descendant_data.word) > 0: + parent_data.descendants.append(descendant_data) + descendant_data = Descendant( + lang_code=lang_code, lang_name=lang_name ) if len(ruby_data) > 0: - descendant_data["ruby"] = ruby_data - descendant_data["word"] = clean_node( - wxr, None, child_node - ) + descendant_data.ruby = ruby_data + descendant_data.word = clean_node(wxr, None, child_node) if "qualifier-content" in class_names: - descendant_data["tags"].append( + descendant_data.tags.append( clean_node(wxr, None, child_node) ) elif child_node.tag == "i": @@ -81,16 +75,14 @@ def extract_descendant_list_item( for span_tag in child_node.find_html( "span", attr_name="class", attr_value="Latn" ): - descendant_data["roman"] = clean_node( - wxr, None, span_tag - ) + descendant_data.roman = clean_node(wxr, None, span_tag) - if "word" in descendant_data: - parent_data["descendants"].append(descendant_data) + if len(descendant_data.word) > 0: + parent_data.descendants.append(descendant_data) if list_item_node.contain_node(NodeKind.LIST): extract_descendants( wxr, list_item_node, - descendant_data if "word" in descendant_data else parent_data, + descendant_data if len(descendant_data.word) > 0 else parent_data, ) diff --git a/src/wiktextract/extractor/zh/example.py b/src/wiktextract/extractor/zh/example.py index 5021e493..e1fee774 100644 --- a/src/wiktextract/extractor/zh/example.py +++ b/src/wiktextract/extractor/zh/example.py @@ -1,5 +1,4 @@ -from collections import defaultdict -from typing import Dict, List, Union +from typing import Union from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode @@ -7,19 +6,20 @@ from wiktextract.wxr_context import WiktextractContext from ..ruby import extract_ruby +from .models import Example, Sense def extract_examples( wxr: WiktextractContext, - sense_data: Dict, - node: Union[WikiNode, List[WikiNode]], + sense_data: Sense, + node: Union[WikiNode, list[WikiNode]], ) -> None: if isinstance(node, list): for n in node: extract_examples(wxr, sense_data, n) elif isinstance(node, WikiNode): if node.kind == NodeKind.LIST_ITEM: - example_data = defaultdict(list, {"type": "example"}) + example_data = Example() # example text in the nested list # https://zh.wiktionary.org/wiki/%, the second example if node.contain_node(NodeKind.LIST): @@ -39,53 +39,54 @@ def extract_examples( elif 
template_name == "uxi": extract_template_uxi(wxr, child, example_data) else: - example_data["text"] = clean_node(wxr, None, child) + example_data.texts = [clean_node(wxr, None, child)] - if "text" in example_data or "texts" in example_data: - sense_data["examples"].append(example_data) + if len(example_data.texts) > 0: + sense_data.examples.append(example_data) else: extract_examples(wxr, sense_data, node.children) def extract_example_list( - wxr: WiktextractContext, node: WikiNode, example_data: Dict + wxr: WiktextractContext, node: WikiNode, example_data: Example ) -> None: for index, child_node in enumerate(node.children): if ( isinstance(child_node, WikiNode) and child_node.kind == NodeKind.LIST ): - example_data["type"] = "quotation" - example_data["ref"] = clean_node(wxr, None, node.children[:index]) - example_data["text"] = clean_node( - wxr, None, child_node.children[0].children - ) + example_data.ref = clean_node(wxr, None, node.children[:index]) + example_data.texts = [ + clean_node(wxr, None, child_node.children[0].children) + ] def extract_quote_templates( - wxr: WiktextractContext, node: TemplateNode, example_data: Dict + wxr: WiktextractContext, node: TemplateNode, example_data: Example ) -> None: """ Process template `quote-book` and "RQ:*". """ - example_data["type"] = "quotation" expanded_text = clean_node(wxr, None, node) for line_num, expanded_line in enumerate(expanded_text.splitlines()): if line_num == 0: key = "ref" elif line_num == 1: - key = "text" + key = "texts" elif line_num == 2 and "transliteration" in node.template_parameters: key = "roman" else: key = "translation" if expanded_line != "(請為本引文添加中文翻譯)": - example_data[key] = expanded_line + if key == "texts": + example_data.texts.append(expanded_line) + else: + setattr(example_data, key, expanded_line) def extract_template_ja_usex( - wxr: WiktextractContext, node: WikiNode, example_data: Dict + wxr: WiktextractContext, node: WikiNode, example_data: Example ) -> None: expanded_node = wxr.wtp.parse( wxr.wtp.node_to_wikitext(node), expand_all=True @@ -94,27 +95,30 @@ def extract_template_ja_usex( expanded_text = clean_node(wxr, None, node_without_ruby) for line_num, expanded_line in enumerate(expanded_text.splitlines()): if line_num == 0: - key = "text" + key = "texts" elif line_num == 1: key = "roman" else: key = "translation" - example_data[key] = expanded_line + if key == "texts": + example_data.texts.append(expanded_line) + else: + setattr(example_data, key, expanded_line) if len(ruby_data) > 0: - example_data["ruby"] = ruby_data + example_data.ruby = ruby_data def extract_template_zh_usex( - wxr: WiktextractContext, node: WikiNode, example_data: Dict + wxr: WiktextractContext, node: WikiNode, example_data: Example ) -> None: expanded_text = clean_node(wxr, None, node) if "―" in expanded_text: for index, split_text in enumerate(expanded_text.split("―")): if index == 0: for example_text in split_text.split(" / "): - example_data["texts"].append(example_text.strip()) + example_data.texts.append(example_text.strip()) elif index == 1: - example_data["roman"] = split_text.strip() + example_data.roman = split_text.strip() return for expanded_line in expanded_text.splitlines(): @@ -122,18 +126,17 @@ def extract_template_zh_usex( # expanded simplified or traditional Chinese # example sentence usually ends with # "繁體]" or "簡體]" - example_data["texts"].append(expanded_line) + example_data.texts.append(expanded_line) elif expanded_line.endswith("]"): - example_data["roman"] = expanded_line + example_data.roman = 
expanded_line elif expanded_line.startswith("來自:"): - example_data["ref"] = expanded_line[3:] - example_data["type"] = "quotation" + example_data.ref = expanded_line[3:] else: - example_data["translation"] = expanded_line + example_data.translation = expanded_line def extract_template_ux( - wxr: WiktextractContext, node: WikiNode, example_data: Dict + wxr: WiktextractContext, node: WikiNode, example_data: Example ) -> None: expanded_text = clean_node(wxr, None, node) if " ― " in expanded_text: @@ -143,7 +146,7 @@ def extract_template_ux( lines = expanded_text.splitlines() for line_num, expanded_line in enumerate(lines): if line_num == 0: - key = "text" + key = "texts" elif line_num == 1: if line_num == len(lines) - 1: key = "translation" @@ -151,21 +154,26 @@ def extract_template_ux( key = "roman" else: key = "translation" - example_data[key] = expanded_line + if key == "texts": + example_data.texts.append(expanded_line) + else: + setattr(example_data, key, expanded_line) def extract_template_uxi( - wxr: WiktextractContext, node: WikiNode, example_data: Dict + wxr: WiktextractContext, node: WikiNode, example_data: Example ) -> None: expanded_text = clean_node(wxr, None, node) extract_template_uxi_text(expanded_text, example_data) -def extract_template_uxi_text(expanded_text: str, example_data: Dict) -> None: +def extract_template_uxi_text( + expanded_text: str, example_data: Example +) -> None: parts = expanded_text.split(" ― ") for index, part in enumerate(parts): if index == 0: - key = "text" + key = "texts" elif index == 1: if index == len(parts) - 1: key = "translation" @@ -173,4 +181,7 @@ def extract_template_uxi_text(expanded_text: str, example_data: Dict) -> None: key = "roman" else: key = "translation" - example_data[key] = part + if key == "texts": + example_data.texts.append(part) + else: + setattr(example_data, key, part) diff --git a/src/wiktextract/extractor/zh/gloss.py b/src/wiktextract/extractor/zh/gloss.py index a8c31da8..8b28a8ac 100644 --- a/src/wiktextract/extractor/zh/gloss.py +++ b/src/wiktextract/extractor/zh/gloss.py @@ -1,6 +1,4 @@ import re -from collections import defaultdict -from typing import Dict, List from wikitextprocessor import NodeKind, WikiNode from wiktextract.page import clean_node @@ -8,15 +6,16 @@ from ..ruby import extract_ruby from .example import extract_examples +from .models import Sense, WordEntry def extract_gloss( wxr: WiktextractContext, - page_data: List[Dict], + page_data: list[WordEntry], list_node: WikiNode, - gloss_data: Dict[str, List[str]], + gloss_data: Sense, ) -> None: - lang_code = page_data[-1].get("lang_code") + lang_code = page_data[-1].lang_code for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): gloss_nodes = [ child @@ -38,7 +37,7 @@ def extract_gloss( gloss_data, extract_gloss_and_tags(raw_gloss_text) ) if len(ruby_data) > 0: - new_gloss_data["ruby"] = ruby_data + new_gloss_data.ruby = ruby_data has_nested_gloss = False if list_item_node.contain_node(NodeKind.LIST): @@ -50,20 +49,19 @@ def extract_gloss( extract_examples(wxr, new_gloss_data, child_node) if not has_nested_gloss: - page_data[-1]["senses"].append(new_gloss_data) + page_data[-1].senses.append(new_gloss_data) -def merge_gloss_data( - data_a: Dict[str, List[str]], data_b: Dict[str, List[str]] -) -> Dict[str, List[str]]: - new_data = defaultdict(list) +def merge_gloss_data(data_a: Sense, data_b: Sense) -> Sense: + new_data = Sense() for data in data_a, data_b: - for key, value in data.items(): - new_data[key].extend(value) + for field in 
data.model_fields: + pre_data = getattr(new_data, field) + pre_data.extend(getattr(data, field)) return new_data -def extract_gloss_and_tags(raw_gloss: str) -> Dict[str, List[str]]: +def extract_gloss_and_tags(raw_gloss: str) -> Sense: left_brackets = ("(", "(") right_brackets = (")", ")") if raw_gloss.startswith(left_brackets) or raw_gloss.endswith( @@ -87,8 +85,6 @@ def extract_gloss_and_tags(raw_gloss: str) -> Dict[str, List[str]]: tags += re.split(split_tag_regex, rear_label) gloss = raw_gloss[front_tag_end + 1 : rear_tag_start].strip() - return defaultdict( - list, {"glosses": [gloss], "raw_glosses": [raw_gloss], "tags": tags} - ) + return Sense(glosses=[gloss], raw_glosses=[raw_gloss], tags=tags) else: - return defaultdict(list, {"glosses": [raw_gloss]}) + return Sense(glosses=[raw_gloss]) diff --git a/src/wiktextract/extractor/zh/headword_line.py b/src/wiktextract/extractor/zh/headword_line.py index 7ba19dcb..39cce406 100644 --- a/src/wiktextract/extractor/zh/headword_line.py +++ b/src/wiktextract/extractor/zh/headword_line.py @@ -1,5 +1,5 @@ import re -from typing import Dict, List, Union +from typing import Union from wikitextprocessor import NodeKind, WikiNode from wiktextract.page import clean_node @@ -7,6 +7,7 @@ from ..ruby import extract_ruby from ..share import strip_nodes +from .models import Form, WordEntry # https://zh.wiktionary.org/wiki/Module:Gender_and_number GENDERS = { @@ -36,37 +37,9 @@ } -FORM_TAGS = { - "不可數": ["uncountable"], - "通常不可數": ["uncountable"], - "可數": ["countable"], - "複數": ["plural"], - # en-verb - "第三人稱單數簡單現在時": ["third-person", "singular", "simple", "present"], - "現在分詞": ["present", "participle"], - "一般過去時及過去分詞": ["past", "participle"], - # fr-noun, fr-adj - # https://zh.wiktionary.org/wiki/Module:Fr-headword - "指小詞": ["diminutive"], - "陰性": ["feminine"], - "陽性": ["masculine"], - "陽性複數": ["masculine", "plural"], - "陰性複數": ["feminine", "plural"], - "陽性單數": ["masculine", "singular"], - "元音前陽性單數": ["masculine", "singular", "before-vowel"], - "比較級": ["comparative"], - "最高級": ["superlative"], - # voice - "主動": ["active"], - "被動": ["passive"], - "及物": ["transitive"], - "不規則": ["irregular"], -} - - def extract_headword_line( wxr: WiktextractContext, - page_data: List[Dict], + page_data: list[WordEntry], node: WikiNode, lang_code: str, ) -> None: @@ -88,30 +61,30 @@ def extract_headword_line( if "headword-tr" in class_names: forms_start_index = index + 1 - page_data[-1]["forms"].append( - { - "form": clean_node(wxr, page_data[-1], child), - "tags": ["romanization"], - } + page_data[-1].forms.append( + Form( + form=clean_node(wxr, page_data[-1], child), + tags=["romanization"], + ) ) elif "gender" in class_names: forms_start_index = index + 1 for abbr_tag in child.find_html("abbr"): gender = abbr_tag.children[0] - page_data[-1]["tags"].append(GENDERS.get(gender, gender)) + page_data[-1].tags.append(GENDERS.get(gender, gender)) if lang_code == "ja": for span_child in child.find_html( "strong", attr_name="class", attr_value="headword" ): ruby_data, node_without_ruby = extract_ruby(wxr, span_child) - page_data[-1]["forms"].append( - { - "form": clean_node( + page_data[-1].forms.append( + Form( + form=clean_node( wxr, page_data[-1], node_without_ruby ), - "ruby": ruby_data, - "tags": ["canonical"], - } + ruby=ruby_data, + tags=["canonical"], + ) ) elif child.tag == "b": # this is a form tag, already inside form parentheses @@ -124,8 +97,8 @@ def extract_headword_line( def extract_headword_forms( wxr: WiktextractContext, - page_data: List[Dict], - form_nodes: 
List[Union[WikiNode, str]], + page_data: list[WordEntry], + form_nodes: list[Union[WikiNode, str]], ) -> None: current_nodes = [] for node in form_nodes: @@ -141,18 +114,18 @@ def extract_headword_forms( def process_forms_text( wxr: WiktextractContext, - page_data: List[Dict], - form_nodes: List[Union[WikiNode, str]], + page_data: list[WordEntry], + form_nodes: list[Union[WikiNode, str]], ) -> None: tag_nodes = [] has_forms = False striped_nodes = list(strip_nodes(form_nodes)) - lang_code = page_data[-1].get("lang_code") + lang_code = page_data[-1].lang_code for index, node in enumerate(striped_nodes): if isinstance(node, WikiNode) and node.kind == NodeKind.HTML: if node.tag == "b": has_forms = True - ruby_data = None + ruby_data = [] if lang_code == "ja": ruby_data, node_without_ruby = extract_ruby(wxr, node) form = clean_node(wxr, None, node_without_ruby) @@ -173,18 +146,11 @@ def process_forms_text( gender = clean_node(wxr, None, next_node) form_tags.append(GENDERS.get(gender, gender)) - form_data = { - "form": form, - "tags": form_tags, - } - if ruby_data is not None: - form_data["ruby"] = ruby_data - page_data[-1]["forms"].append(form_data) + form_data = Form(form=form, tags=form_tags, ruby=ruby_data) + page_data[-1].forms.append(form_data) elif node.tag == "span" and "tr" in node.attrs.get("class", ""): # romanization of the previous form tag - page_data[-1]["forms"][-1]["roman"] = clean_node( - wxr, None, node - ) + page_data[-1].forms[-1].roman = clean_node(wxr, None, node) else: tag_nodes.append(node) else: @@ -195,15 +161,15 @@ def process_forms_text( clean_node(wxr, page_data[-1], tag_nodes).strip("() ") ) if len(tags_list) > 0: - page_data[-1]["tags"].extend(tags_list) + page_data[-1].tags.extend(tags_list) else: clean_node(wxr, page_data[-1], tag_nodes) # find categories -def extract_headword_tags(tags_str: str) -> List[str]: +def extract_headword_tags(tags_str: str) -> list[str]: tags = [] for tag_str in ( s.strip() for s in re.split("&|或", tags_str) if len(s.strip()) > 0 ): - tags.extend(FORM_TAGS.get(tag_str, [tag_str])) + tags.append(tag_str) return tags diff --git a/src/wiktextract/extractor/zh/inflection.py b/src/wiktextract/extractor/zh/inflection.py index 3a620379..0bfb2289 100644 --- a/src/wiktextract/extractor/zh/inflection.py +++ b/src/wiktextract/extractor/zh/inflection.py @@ -1,9 +1,9 @@ -from typing import Dict, List - from wikitextprocessor import NodeKind, WikiNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext +from .models import Form, WordEntry + # https://zh.wiktionary.org/wiki/Category:日語變格表模板 JAPANESE_INFLECTION_TEMPLATE_PREFIXES = ( "ja-i", @@ -21,21 +21,21 @@ def extract_inflections( wxr: WiktextractContext, - page_data: List[Dict], - node: WikiNode, + page_data: list[WordEntry], + level_node: WikiNode, ) -> None: - for child in node.find_child(NodeKind.TEMPLATE): + for child in level_node.find_child(NodeKind.TEMPLATE): template_name = child.template_name.lower() if template_name.startswith(JAPANESE_INFLECTION_TEMPLATE_PREFIXES): expanded_table = wxr.wtp.parse( - wxr.wtp.node_to_wikitext(node), expand_all=True + wxr.wtp.node_to_wikitext(level_node), expand_all=True ) extract_ja_i_template(wxr, page_data, expanded_table, "") def extract_ja_i_template( wxr: WiktextractContext, - page_data: List[Dict], + page_data: list[WordEntry], node: WikiNode, table_header: str, ) -> None: @@ -45,16 +45,15 @@ def extract_ja_i_template( if len(list(child.filter_empty_str_child())) == 1: table_header = clean_node(wxr, None, 
child.children) else: - inflection_data = { - "tags": [table_header], - "source": "inflection", - } + inflection_data = Form( + tags=[table_header], source="inflection" + ) cell_node_index = 0 keys = ["form", "hiragana", "roman"] for row_child in child.children: if isinstance(row_child, WikiNode): if row_child.kind == NodeKind.TABLE_HEADER_CELL: - inflection_data["tags"].append( + inflection_data.tags.append( clean_node(wxr, None, row_child) ) elif row_child.kind == NodeKind.TABLE_CELL: @@ -64,11 +63,13 @@ def extract_ja_i_template( if cell_node_index < len(keys): key = keys[cell_node_index] cell_node_index += 1 - inflection_data[key] = clean_node( - wxr, None, row_child + setattr( + inflection_data, + key, + clean_node(wxr, None, row_child), ) else: break - page_data[-1]["forms"].append(inflection_data) + page_data[-1].forms.append(inflection_data) else: extract_ja_i_template(wxr, page_data, child, table_header) diff --git a/src/wiktextract/extractor/zh/linkage.py b/src/wiktextract/extractor/zh/linkage.py index 04cdf3a5..13f187fd 100644 --- a/src/wiktextract/extractor/zh/linkage.py +++ b/src/wiktextract/extractor/zh/linkage.py @@ -1,10 +1,7 @@ -from collections import defaultdict -from copy import deepcopy -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Optional, Union from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode -from wiktextract.datautils import find_similar_gloss from wiktextract.page import LEVEL_KINDS, clean_node from wiktextract.wxr_context import WiktextractContext @@ -15,16 +12,16 @@ strip_nodes, ) from .descendant import DESCENDANT_TEMPLATES, extract_descendant_list_item +from .models import Linkage, WordEntry def extract_linkages( wxr: WiktextractContext, - page_data: List[Dict], - nodes: List[Union[WikiNode, str]], + page_data: list[WordEntry], + nodes: list[Union[WikiNode, str]], linkage_type: str, sense: str, - append_to: Dict, -) -> Optional[Tuple[str, Dict]]: +) -> Optional[str]: """ Return linkage sense text for `sense` template inside a list item node. 
""" @@ -33,12 +30,11 @@ def extract_linkages( for node in strip_nodes(nodes): if isinstance(node, str) and len(sense) == 0: sense = node.strip(strip_sense_chars) - append_to = find_similar_gloss(page_data, sense) elif isinstance(node, WikiNode): if node.kind == NodeKind.LIST_ITEM: not_term_indexes = set() filtered_children = list(node.filter_empty_str_child()) - linkage_data = defaultdict(list) + linkage_data = Linkage() for index, item_child in enumerate(filtered_children): if ( isinstance(item_child, WikiNode) @@ -50,13 +46,12 @@ def extract_linkages( sense = clean_node(wxr, None, item_child).strip( strip_sense_chars ) - append_to = find_similar_gloss(page_data, sense) if index == len(filtered_children) - 1: # sense template before entry list - return sense, append_to + return sense elif template_name in {"qualifier", "qual"}: not_term_indexes.add(index) - linkage_data["tags"].append( + linkage_data.tags.append( clean_node(wxr, None, item_child).strip("()") ) elif template_name.lower() in DESCENDANT_TEMPLATES: @@ -87,81 +82,67 @@ def extract_linkages( roman, terms = capture_text_in_parentheses(terms) roman = roman[0] if len(roman) > 0 else None if roman is not None: - linkage_data["roman"] = roman + linkage_data.roman = roman if len(sense) > 0: - linkage_data["sense"] = sense + linkage_data.sense = sense for term in terms.split("、"): for variant_type, variant_term in split_chinese_variants( term ): - final_linkage_data = deepcopy(linkage_data) - final_linkage_data["word"] = variant_term + final_linkage_data = linkage_data.model_copy(deep=True) + final_linkage_data.word = variant_term if variant_type is not None: - final_linkage_data[ - "language_variant" - ] = variant_type - if len(final_linkage_data["word"]) > 0: - append_to[linkage_type].append(final_linkage_data) + final_linkage_data.language_variant = variant_type + if len(final_linkage_data.word) > 0: + pre_data = getattr(page_data[-1], linkage_type) + pre_data.append(final_linkage_data) elif node.kind == NodeKind.TEMPLATE: template_name = node.template_name.lower() if template_name in sense_template_names: sense = clean_node(wxr, None, node).strip(strip_sense_chars) elif template_name.endswith("-saurus"): extract_saurus_template( - wxr, node, page_data, linkage_type, sense, append_to + wxr, node, page_data, linkage_type, sense ) elif template_name == "zh-dial": extract_zh_dial_template( - wxr, node, linkage_type, sense, append_to + wxr, page_data, node, linkage_type, sense ) else: expanded_node = wxr.wtp.parse( wxr.wtp.node_to_wikitext(node), expand_all=True ) extract_linkages( - wxr, - page_data, - [expanded_node], - linkage_type, - sense, - append_to, + wxr, page_data, [expanded_node], linkage_type, sense ) elif node.kind in LEVEL_KINDS: from .page import parse_section - base_data = defaultdict( - list, - { - "lang_name": page_data[-1].get("lang_name"), - "lang_code": page_data[-1].get("lang_code"), - "word": wxr.wtp.title, - }, + base_data = WordEntry( + lang_code=page_data[-1].lang_code, + lang_name=page_data[-1].lang_name, + word=page_data[-1].word, ) parse_section(wxr, page_data, base_data, node) elif len(node.children) > 0: - returned_values = extract_linkages( + returned_sense = extract_linkages( wxr, page_data, node.children, linkage_type, sense, - append_to, ) - if returned_values is not None: - returned_sense, returned_append_target = returned_values - if len(returned_sense) > 0: - sense = returned_sense - append_to = returned_append_target + if returned_sense is not None: + sense = returned_sense return None def 
extract_saurus_template( wxr: WiktextractContext, node: WikiNode, - page_data: Dict, + page_data: list[WordEntry], linkage_type: str, sense: str, - append_to: Dict, ) -> None: """ Extract data from template names end with "-saurus", like "zh-syn-saurus" @@ -174,49 +155,52 @@ def extract_saurus_template( for thesaurus in search_thesaurus( wxr.thesaurus_db_conn, thesaurus_page_title, - page_data[-1].get("lang_code"), - page_data[-1].get("pos"), + page_data[-1].lang_code, + page_data[-1].pos, linkage_type, ): if thesaurus.term == wxr.wtp.title: continue - linkage_data = {"word": thesaurus.term} + linkage_data = Linkage(word=thesaurus.term) if thesaurus.roman is not None: - linkage_data["roman"] = thesaurus.roman + linkage_data.roman = thesaurus.roman if thesaurus.tags is not None: - linkage_data["tags"] = thesaurus.tags.split("|") + linkage_data.tags = thesaurus.tags.split("|") if thesaurus.language_variant is not None: - linkage_data["language_variant"] = thesaurus.language_variant + linkage_data.language_variant = thesaurus.language_variant if len(sense) > 0: - linkage_data["sense"] = sense + linkage_data.sense = sense elif thesaurus.sense is not None: - linkage_data["sense"] = thesaurus.sense - append_to[linkage_type].append(linkage_data) + linkage_data.sense = thesaurus.sense + + pre_data = getattr(page_data[-1], linkage_type) + pre_data.append(linkage_data) def extract_zh_dial_template( wxr: WiktextractContext, + page_data: list[WordEntry], node: Union[WikiNode, str], linkage_type: str, sense: str, - append_to: Dict, ) -> None: dial_data = {} node = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True) extract_zh_dial_recursively(wxr, node, dial_data, None) for term, tags in dial_data.items(): - linkage_data = {"word": term} + linkage_data = Linkage(word=term) if len(sense) > 0: - linkage_data["sense"] = sense + linkage_data.sense = sense if len(tags) > 0: - linkage_data["tags"] = tags - append_to[linkage_type].append(linkage_data) + linkage_data.tags = tags + pre_data = getattr(page_data[-1], linkage_type) + pre_data.append(linkage_data) def extract_zh_dial_recursively( wxr: WiktextractContext, node: Union[WikiNode, str], - dial_data: Dict[str, List[str]], + dial_data: dict[str, list[str]], header_lang: Optional[str], ) -> str: if isinstance(node, WikiNode) and node.kind == NodeKind.TABLE_ROW: @@ -261,7 +245,7 @@ def extract_zh_dial_recursively( def process_ja_r_template( wxr: WiktextractContext, - page_data: Dict[str, Any], + page_data: list[WordEntry], template_node: TemplateNode, linkage_type: str, sense: str, @@ -270,17 +254,15 @@ def process_ja_r_template( expanded_node = wxr.wtp.parse( wxr.wtp.node_to_wikitext(template_node), expand_all=True ) - linkage_data = defaultdict(list) - if len(sense) > 0: - linkage_data["sense"] = sense + linkage_data = Linkage(sense=sense) for span_node in expanded_node.find_html("span"): if "lang" in span_node.attrs: ruby_data, no_ruby_nodes = extract_ruby(wxr, span_node) - linkage_data["word"] = clean_node(wxr, None, no_ruby_nodes) - if len(ruby_data) > 0: - linkage_data["ruby"] = ruby_data + linkage_data.word = clean_node(wxr, None, no_ruby_nodes) + linkage_data.ruby = ruby_data elif "tr" in span_node.attrs.get("class", ""): - linkage_data["roman"] = clean_node(wxr, None, span_node) + linkage_data.roman = clean_node(wxr, None, span_node) - if len(linkage_data.get("word", "")) > 0: - page_data[-1][linkage_type].append(linkage_data) + if len(linkage_data.word) > 0: + pre_data = getattr(page_data[-1], linkage_type) + 
pre_data.append(linkage_data)
diff --git a/src/wiktextract/extractor/zh/models.py b/src/wiktextract/extractor/zh/models.py
new file mode 100644
index 00000000..b9b665c5
--- /dev/null
+++ b/src/wiktextract/extractor/zh/models.py
@@ -0,0 +1,127 @@
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class ChineseBaseModel(BaseModel):
+ model_config = ConfigDict(
+ extra="ignore",
+ strict=True,
+ validate_assignment=True,
+ validate_default=True,
+ )
+
+
+class Example(ChineseBaseModel):
+ texts: list[str] = Field(
+ [],
+ description="Example usage sentences, some might have both "
+ "Simplified and Traditional Chinese forms",
+ )
+ translation: str = Field(
+ "", description="Chinese translation of the example sentence"
+ )
+ roman: str = Field("", description="Romanization of the example sentence")
+ ref: str = Field(
+ "",
+ description="Source of the sentence, like book title and page number",
+ )
+ ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+
+
+class Sense(ChineseBaseModel):
+ glosses: list[str] = []
+ raw_glosses: list[str] = Field([], description="Gloss text with tags")
+ tags: list[str] = []
+ categories: list[str] = []
+ examples: list[Example] = []
+ ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+
+
+class Form(ChineseBaseModel):
+ form: str = ""
+ tags: list[str] = []
+ source: str = ""
+ ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+ hiragana: str = ""
+ roman: str = ""
+
+
+class Sound(ChineseBaseModel):
+ zh_pron: str = Field("", description="Chinese word pronunciation")
+ ipa: str = Field("", description="International Phonetic Alphabet")
+ audio: str = Field("", description="Audio file name")
+ wav_url: str = ""
+ oga_url: str = ""
+ ogg_url: str = ""
+ mp3_url: str = ""
+ opus_url: str = ""
+ tags: list[str] = []
+ homophone: str = ""
+
+
+class Translation(ChineseBaseModel):
+ lang_code: str = Field(
+ "", description="Wiktionary language code of the translation term"
+ )
+ lang_name: str = Field("", description="Translation language name")
+ word: str = Field("", description="Translation term")
+ sense: str = Field("", description="Translation gloss")
+ tags: list[str] = []
+ roman: str = ""
+
+
+class Linkage(ChineseBaseModel):
+ word: str = ""
+ tags: list[str] = []
+ roman: str = ""
+ sense: str = ""
+ language_variant: Literal["", "zh-Hant", "zh-Hans"] = Field(
+ "", description="Chinese character variant"
+ )
+ ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+
+
+class Descendant(ChineseBaseModel):
+ lang_code: str = Field("", description="Wiktionary language code")
+ lang_name: str = Field("", description="Language name")
+ word: str = ""
+ roman: str = ""
+ tags: list[str] = []
+ descendants: list["Descendant"] = []
+ ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+
+
+class WordEntry(ChineseBaseModel):
+ model_config = ConfigDict(title="Chinese Wiktionary")
+
+ word: str = Field(description="Word string")
+ lang_code: str = Field(description="Wiktionary language code")
+ lang_name: str = Field(description="Localized language name")
+ pos: str = Field("", description="Part of speech type")
+ etymology_text: str = ""
+ senses: list[Sense] = Field([], description="Sense list")
+ forms: list[Form] = Field([], description="Inflection forms list")
+ sounds: list[Sound] = []
+ translations: list[Translation] = []
+ synonyms: list[Linkage] = []
+ hyponyms:
list[Linkage] = [] + hypernyms: list[Linkage] = [] + holonyms: list[Linkage] = [] + meronyms: list[Linkage] = [] + derived: list[Linkage] = [] + troponyms: list[Linkage] = [] + paronyms: list[Linkage] = [] + related: list[Linkage] = [] + abbreviation: list[Linkage] = [] + proverbs: list[Linkage] = [] + antonyms: list[Linkage] = [] + coordinate_terms: list[Linkage] = [] + various: list[Linkage] = [] + compounds: list[Linkage] = [] + title: str = Field("", description="Redirect page source title") + redirect: str = Field("", description="Redirect page target title") + categories: list[str] = [] + notes: list[str] = [] + tags: list[str] = [] + descendants: list[Descendant] = [] diff --git a/src/wiktextract/extractor/zh/note.py b/src/wiktextract/extractor/zh/note.py index 0b70ad02..10411591 100644 --- a/src/wiktextract/extractor/zh/note.py +++ b/src/wiktextract/extractor/zh/note.py @@ -1,21 +1,21 @@ -from typing import Any, Dict, List - from wikitextprocessor import NodeKind, WikiNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext +from .models import WordEntry + def extract_note( wxr: WiktextractContext, - page_data: List[Dict[str, Any]], + page_data: list[WordEntry], level_node: WikiNode, ) -> None: for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM): - page_data[-1]["notes"].append( + page_data[-1].notes.append( clean_node(wxr, page_data[-1], list_item.children) ) if not level_node.contain_node(NodeKind.LIST): - page_data[-1]["notes"].append( + page_data[-1].notes.append( clean_node(wxr, page_data[-1], level_node.children) ) diff --git a/src/wiktextract/extractor/zh/page.py b/src/wiktextract/extractor/zh/page.py index 95d84b7b..6c97de94 100644 --- a/src/wiktextract/extractor/zh/page.py +++ b/src/wiktextract/extractor/zh/page.py @@ -1,12 +1,9 @@ -import copy import logging import re -from collections import defaultdict -from typing import Dict, List, Union +from typing import Any, Union from mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, WikiNode -from wiktextract.datautils import append_base_data from wiktextract.page import LEVEL_KINDS, clean_node from wiktextract.wxr_context import WiktextractContext @@ -15,9 +12,11 @@ from .headword_line import extract_headword_line from .inflection import extract_inflections from .linkage import extract_linkages +from .models import Sense, WordEntry from .note import extract_note from .pronunciation import extract_pronunciation_recursively from .translation import extract_translation +from .util import append_base_data # Templates that are used to form panels on pages and that # should be ignored in various positions @@ -29,42 +28,42 @@ PANEL_PREFIXES = {} # Additional templates to be expanded in the pre-expand phase -ADDITIONAL_EXPAND_TEMPLATES = { - "multitrans", - "multitrans-nowiki", - "checktrans-top", - "checktrans-bottom", - "col1", - "col2", - "col3", - "col4", - "col5", - "col1-u", - "col2-u", - "col3-u", - "col4-u", - "col5-u", - "check deprecated lang param usage", - "deprecated code", - "ru-verb-alt-ё", - "ru-noun-alt-ё", - "ru-adj-alt-ё", - "ru-proper noun-alt-ё", - "ru-pos-alt-ё", - "ru-alt-ё", - # langhd is needed for pre-expanding language heading templates in the - # Chinese Wiktionary dump file: https://zh.wiktionary.org/wiki/Template:-en- - "langhd", - "zh-der", # col3 for Chinese - "der3", # redirects to col3 -} +ADDITIONAL_EXPAND_TEMPLATES = frozenset( + { + "multitrans", + "multitrans-nowiki", + "col1", + "col2", + "col3", + "col4", 
+ "col5", + "col1-u", + "col2-u", + "col3-u", + "col4-u", + "col5-u", + "check deprecated lang param usage", + "deprecated code", + "ru-verb-alt-ё", + "ru-noun-alt-ё", + "ru-adj-alt-ё", + "ru-proper noun-alt-ё", + "ru-pos-alt-ё", + "ru-alt-ё", + # langhd is needed for pre-expanding language heading templates: + # https://zh.wiktionary.org/wiki/Template:-en- + "langhd", + "zh-der", # col3 for Chinese + "der3", # redirects to col3 + } +) def parse_section( wxr: WiktextractContext, - page_data: List[Dict], - base_data: Dict, - node: Union[WikiNode, List[Union[WikiNode, str]]], + page_data: list[WordEntry], + base_data: WordEntry, + node: Union[WikiNode, list[Union[WikiNode, str]]], ) -> None: if isinstance(node, list): for x in node: @@ -100,7 +99,6 @@ def parse_section( node.children, wxr.config.LINKAGE_SUBTITLES[subtitle], "", - page_data[-1], ) elif ( wxr.config.capture_translations @@ -128,21 +126,22 @@ def parse_section( def process_pos_block( wxr: WiktextractContext, - page_data: List[Dict], - base_data: Dict, + page_data: list[WordEntry], + base_data: WordEntry, node: WikiNode, pos_text: str, ): pos_type = wxr.config.POS_SUBTITLES[pos_text]["pos"] - base_data["pos"] = pos_type + base_data.pos = pos_type append_base_data(page_data, "pos", pos_type, base_data) for index, child in enumerate(node.filter_empty_str_child()): if isinstance(child, WikiNode): if index == 0 and child.kind == NodeKind.TEMPLATE: - lang_code = base_data.get("lang_code") - extract_headword_line(wxr, page_data, child, lang_code) + extract_headword_line( + wxr, page_data, child, base_data.lang_code + ) elif child.kind == NodeKind.LIST: - extract_gloss(wxr, page_data, child, defaultdict(list)) + extract_gloss(wxr, page_data, child, Sense()) elif child.kind in LEVEL_KINDS: parse_section(wxr, page_data, base_data, child) else: @@ -151,9 +150,9 @@ def process_pos_block( def extract_etymology( wxr: WiktextractContext, - page_data: List[Dict], - base_data: Dict, - nodes: List[Union[WikiNode, str]], + page_data: list[WordEntry], + base_data: WordEntry, + nodes: list[Union[WikiNode, str]], ) -> None: level_node_index = -1 for index, node in enumerate(nodes): @@ -165,7 +164,7 @@ def extract_etymology( else: etymology = clean_node(wxr, page_data[-1], nodes) if len(etymology) > 0: - base_data["etymology_text"] = etymology + base_data.etymology_text = etymology append_base_data(page_data, "etymology_text", etymology, base_data) if level_node_index != -1: parse_section(wxr, page_data, base_data, nodes[level_node_index:]) @@ -173,11 +172,11 @@ def extract_etymology( def extract_pronunciation( wxr: WiktextractContext, - page_data: List[Dict], - base_data: Dict, - nodes: List[Union[WikiNode, str]], + page_data: list[WordEntry], + base_data: WordEntry, + nodes: list[Union[WikiNode, str]], ) -> None: - lang_code = base_data.get("lang_code") + lang_code = base_data.lang_code for index, node in enumerate(nodes): if isinstance(node, WikiNode): if node.kind in LEVEL_KINDS: @@ -194,7 +193,7 @@ def extract_pronunciation( def parse_page( wxr: WiktextractContext, page_title: str, page_text: str -) -> List[Dict[str, str]]: +) -> list[dict[str, Any]]: if wxr.config.verbose: logging.info(f"Parsing page: {page_title}") @@ -211,31 +210,25 @@ def parse_page( page_data = [] for level2_node in tree.find_child(NodeKind.LEVEL2): - categories_and_links = defaultdict(list) - lang_name = clean_node(wxr, categories_and_links, level2_node.largs) - if name_to_code(lang_name, "zh") == "": + categories = {} + lang_name = clean_node(wxr, categories, 
level2_node.largs) + lang_code = name_to_code(lang_name, "zh") + if lang_code == "": wxr.wtp.warning( f"Unrecognized language name: {lang_name}", sortid="extractor/zh/page/parse_page/509", ) - lang_code = name_to_code(lang_name, "zh") if ( wxr.config.capture_language_codes is not None and lang_code not in wxr.config.capture_language_codes ): continue wxr.wtp.start_section(lang_name) - - base_data = defaultdict( - list, - { - "lang_name": lang_name, - "lang_code": lang_code, - "word": wxr.wtp.title, - }, + base_data = WordEntry( + word=wxr.wtp.title, lang_code=lang_code, lang_name=lang_name ) - base_data.update(categories_and_links) - page_data.append(copy.deepcopy(base_data)) + base_data.categories = categories.get("categories", []) + page_data.append(base_data.model_copy(deep=True)) parse_section(wxr, page_data, base_data, level2_node.children) - return page_data + return [d.model_dump(exclude_defaults=True) for d in page_data] diff --git a/src/wiktextract/extractor/zh/pronunciation.py b/src/wiktextract/extractor/zh/pronunciation.py index c9e9014b..724f9a26 100644 --- a/src/wiktextract/extractor/zh/pronunciation.py +++ b/src/wiktextract/extractor/zh/pronunciation.py @@ -1,21 +1,23 @@ import re -from typing import Any, Dict, List, Optional, Union +from typing import Optional, Union from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import HTMLNode, TemplateNode -from wiktextract.datautils import append_base_data from wiktextract.extractor.share import create_audio_url_dict from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext +from .models import Sound, WordEntry +from .util import append_base_data + def extract_pronunciation_recursively( wxr: WiktextractContext, - page_data: List[Dict[str, Any]], - base_data: Dict[str, Any], + page_data: list[WordEntry], + base_data: WordEntry, lang_code: str, - node: Union[WikiNode, List[Union[WikiNode, str]]], - tags: List[str], + node: Union[WikiNode, list[Union[WikiNode, str]]], + tags: list[str], ) -> None: if isinstance(node, list): for x in node: @@ -39,20 +41,25 @@ def extract_pronunciation_recursively( # audio file usually after Pinyin # add back to previous Pinyin dictionary if it doesn't have # audio file data and they are sibling nodes(similar tags). 
- last_sounds_list = page_data[-1].get("sounds", []) + last_sounds_list = page_data[-1].sounds for index in range(len(last_sounds_list)): - if last_sounds_list[index].get("audio") is None and ( - tags == last_sounds_list[index].get("tags", [])[:-1] + if last_sounds_list[index].audio == "" and ( + tags == last_sounds_list[index].tags[:-1] or lang_code != "zh" ): - page_data[-1].get("sounds")[index].update( - create_audio_url_dict(data) - ) - elif isinstance(data, dict): + for key, value in create_audio_url_dict(data).items(): + if key in Sound.model_fields: + setattr(page_data[-1].sounds[index], key, value) + else: + wxr.wtp.warning( + f"{key=} not defined in Sound", + sortid="zh.pronunciation/56", + ) + elif isinstance(data, Sound): append_base_data( page_data, "sounds", - [data], + data, base_data, ) # list children could contain audio file @@ -62,7 +69,7 @@ def extract_pronunciation_recursively( base_data, lang_code, rest_children, - data.get("tags")[:-1], + data.tags[:-1], ) elif isinstance(data, list): # list item is a tag @@ -81,8 +88,8 @@ def extract_pronunciation_recursively( def combine_pronunciation_tags( - old_tags: List[str], new_tags: List[str] -) -> List[str]: + old_tags: list[str], new_tags: list[str] +) -> list[str]: combined_tags = old_tags[:] old_tags_set = set(old_tags) for tag in new_tags: @@ -91,7 +98,7 @@ def combine_pronunciation_tags( return combined_tags -def split_pronunciation_tags(text: str) -> List[str]: +def split_pronunciation_tags(text: str) -> list[str]: return list( filter( None, @@ -107,11 +114,11 @@ def split_pronunciation_tags(text: str) -> List[str]: def extract_pronunciation_item( wxr: WiktextractContext, - page_data: List[Dict], + page_data: list[WordEntry], lang_code: str, - node_children: List[WikiNode], - tags: List[str], -) -> Optional[Union[Dict[str, Any], str, List[str]]]: + node_children: list[WikiNode], + tags: list[str], +) -> Optional[Union[Sound, str, list[str]]]: """ Return audio file name(eg. 
"File:LL-Q1860 (eng)-Vealhurl-manga.wav") string or a dictionary contains IPA and tags @@ -138,9 +145,9 @@ def extract_pronunciation_item( tags, split_pronunciation_tags(sound_tags_text) ) if len(ipa) > 0: - data = {"tags": new_tags} - ipa_key = "zh-pron" if lang_code == "zh" else "ipa" - data[ipa_key] = ipa[0].strip() + data = Sound(tags=new_tags) + ipa_key = "zh_pron" if lang_code == "zh" else "ipa" + setattr(data, ipa_key, ipa[0].strip()) return data for child in filter( @@ -155,9 +162,9 @@ def extract_pronunciation_item( def process_homophone_data( wxr: WiktextractContext, - page_data: List[Dict], - node_children: List[WikiNode], - tags: List[str], + page_data: list[WordEntry], + node_children: list[WikiNode], + tags: list[str], ) -> None: # Process the collapsible homophone table created from "zh-pron" template # and the "homophones" template @@ -167,11 +174,10 @@ def process_homophone_data( for span_node in node.find_html_recursively( "span", attr_name="lang" ): - sound_data = { - "homophone": clean_node(wxr, None, span_node), - "tags": tags, - } - page_data[-1]["sounds"].append(sound_data) + sound_data = Sound( + homophone=clean_node(wxr, None, span_node), tags=tags + ) + page_data[-1].sounds.append(sound_data) elif ( isinstance(node, TemplateNode) and node.template_name == "homophones" @@ -182,8 +188,7 @@ def process_homophone_data( for span_node in expaned_template.find_html_recursively( "span", attr_name="lang" ): - sound_data = { - "homophone": clean_node(wxr, None, span_node), - "tags": tags, - } - page_data[-1]["sounds"].append(sound_data) + sound_data = Sound( + homophone=clean_node(wxr, None, span_node), tags=tags + ) + page_data[-1].sounds.append(sound_data) diff --git a/src/wiktextract/extractor/zh/thesaurus.py b/src/wiktextract/extractor/zh/thesaurus.py index 3f466b96..6c2d89e5 100644 --- a/src/wiktextract/extractor/zh/thesaurus.py +++ b/src/wiktextract/extractor/zh/thesaurus.py @@ -1,11 +1,12 @@ import logging import re -from typing import List, Optional, Union +from typing import Optional, Union from mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, Page, WikiNode from ...page import clean_node +from ...thesaurus import ThesaurusTerm from ...wxr_context import WiktextractContext from ..share import capture_text_in_parentheses, split_chinese_variants @@ -26,9 +27,7 @@ def parse_ja_thesaurus_term( sense: Optional[str], linkage: Optional[str], term_str: str, -) -> List["ThesaurusTerm"]: - from wiktextract.thesaurus import ThesaurusTerm - +) -> list[ThesaurusTerm]: tags = None roman = None if term_str.startswith("("): # has qualifier @@ -40,8 +39,8 @@ def parse_ja_thesaurus_term( for term_str in term_str.split("、"): # Example term_str from https://zh.wiktionary.org/wiki/Thesaurus:死ぬ # Fromat: (qualifer) term (roman, gloss) - # 'この世(よ)を去(さ)る (kono yo o saru, 字面意思為“to leave this world”)' - # '若死(わかじ)にする (wakajini suru, “还年轻时死去”)' + # この世(よ)を去(さ)る (kono yo o saru, 字面意思為“to leave this world”) + # 若死(わかじ)にする (wakajini suru, “还年轻时死去”) term_end = term_str.find(" (") term = term_str[:term_end] roman_and_gloss = term_str[term_end + 2 :].removesuffix(")").split(", ") @@ -70,9 +69,7 @@ def parse_zh_thesaurus_term( sense: Optional[str], linkage: Optional[str], term_str: str, -) -> List["ThesaurusTerm"]: - from wiktextract.thesaurus import ThesaurusTerm - +) -> list[ThesaurusTerm]: # Example term_str from https://zh.wiktionary.org/wiki/Thesaurus:安置 # Fromat: traditional/simplified (pinyin) (tags) # 施設/施设 (shīshè) (書面) @@ -112,9 +109,7 @@ def 
parse_thesaurus_term( sense: Optional[str], linkage: Optional[str], node: WikiNode, -) -> List["ThesaurusTerm"]: - from wiktextract.thesaurus import ThesaurusTerm - +) -> list[ThesaurusTerm]: node_str = clean_node(wxr, None, node) node_str = node_str.removeprefix("* ") # remove list wikitext @@ -146,8 +141,8 @@ def recursive_parse( pos: Optional[str], sense: Optional[str], linkage: Optional[str], - node: Union[WikiNode, List[Union[WikiNode, str]]], -) -> Optional[List["ThesaurusTerm"]]: + node: Union[WikiNode, list[Union[WikiNode, str]]], +) -> Optional[list[ThesaurusTerm]]: if isinstance(node, list): thesaurus = [] for x in node: @@ -225,7 +220,7 @@ def recursive_parse( def extract_thesaurus_page( wxr: WiktextractContext, page: Page -) -> Optional[List["ThesaurusTerm"]]: +) -> Optional[list[ThesaurusTerm]]: entry = page.title[page.title.find(":") + 1 :] wxr.wtp.start_page(page.title) root = wxr.wtp.parse(page.body, additional_expand={"ws", "zh-syn-list"}) diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py index e11fec28..a9086e59 100644 --- a/src/wiktextract/extractor/zh/translation.py +++ b/src/wiktextract/extractor/zh/translation.py @@ -1,21 +1,19 @@ import re -from collections import defaultdict -from typing import Dict, List, Optional, Union +from typing import Optional, Union from mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, WikiNode -from wiktextract.datautils import find_similar_gloss from wiktextract.page import LEVEL_KINDS, clean_node from wiktextract.wxr_context import WiktextractContext from ..share import capture_text_in_parentheses +from .models import Translation, WordEntry def extract_translation( - wxr: WiktextractContext, page_data: List[Dict], node: WikiNode + wxr: WiktextractContext, page_data: list[WordEntry], node: WikiNode ) -> None: sense_text = "" - append_to = page_data[-1] for child in node.children: if isinstance(child, WikiNode): if child.kind == NodeKind.TEMPLATE: @@ -27,7 +25,6 @@ def extract_translation( sense_text = clean_node( wxr, None, child.template_parameters.get(1) ) - append_to = find_similar_gloss(page_data, sense_text) elif template_name == "checktrans-top": return elif template_name == "see translation subpage": @@ -42,7 +39,6 @@ def extract_translation( page_data, clean_node(wxr, None, list_item_node.children), sense_text, - append_to, ) else: nested_list_index = 0 @@ -65,7 +61,6 @@ def extract_translation( list_item_node.children[:nested_list_index], ), sense_text, - append_to, ) for nested_list_node in list_item_node.find_child( NodeKind.LIST @@ -80,16 +75,14 @@ def extract_translation( wxr, None, nested_list_item.children ), sense_text, - append_to, ) def process_translation_list_item( wxr: WiktextractContext, - page_data: List[Dict], + page_data: list[WordEntry], expanded_text: str, sense: str, - append_to: Dict, ) -> None: from .headword_line import GENDERS @@ -107,38 +100,33 @@ def process_translation_list_item( for word_and_tags in re.split(r"[,;、](?![^(]*\))\s*", words_text): tags, word = capture_text_in_parentheses(word_and_tags) tags = [tag for tag in tags if tag != lang_code] # rm Wiktionary link - translation_data = defaultdict( - list, - { - "lang_code": lang_code, - "lang_name": lang_text, - "word": word, - }, + translation_data = Translation( + lang_code=lang_code, lang_name=lang_text, word=word ) tags_without_roman = [] for tag in tags: if re.search(r"[a-z]", tag): - translation_data["roman"] = tag + translation_data.roman = tag else: 
tags_without_roman.append(tag) if len(tags_without_roman) > 0: - translation_data["tags"] = tags_without_roman + translation_data.tags = tags_without_roman gender = word.split(" ")[-1] if gender in GENDERS: - translation_data["word"] = word.removesuffix(f" {gender}") - translation_data["tags"].append(GENDERS.get(gender)) + translation_data.word = word.removesuffix(f" {gender}") + translation_data.tags.append(GENDERS.get(gender)) if len(sense) > 0: - translation_data["sense"] = sense - append_to["translations"].append(translation_data) + translation_data.sense = sense + page_data[-1].translations.append(translation_data) def translation_subpage( wxr: WiktextractContext, - page_data: List[Dict], - template_args: Dict[str, str], + page_data: list[WordEntry], + template_args: dict[str, str], ) -> None: from .page import ADDITIONAL_EXPAND_TEMPLATES @@ -174,7 +162,7 @@ def translation_subpage( def find_subpage_section( wxr: WiktextractContext, node: Union[WikiNode, str], - target_section: Union[str, List[str]], + target_section: Union[str, list[str]], ) -> Optional[WikiNode]: if isinstance(node, WikiNode): if node.kind in LEVEL_KINDS: diff --git a/src/wiktextract/extractor/zh/util.py b/src/wiktextract/extractor/zh/util.py new file mode 100644 index 00000000..0cb1ae79 --- /dev/null +++ b/src/wiktextract/extractor/zh/util.py @@ -0,0 +1,26 @@ +from typing import Any + +from .models import WordEntry + + +def append_base_data( + page_data: list[WordEntry], field: str, value: Any, base_data: WordEntry +) -> None: + """ + Chinese Wiktionary's POS sections could under other sections or at the same + level of other sections. This function is to decide whether append a new + WordEntry data. + """ + if len(page_data) == 0 or ( + len(getattr(page_data[-1], field)) > 0 and len(page_data[-1].senses) > 0 + ): + # Append new entry if last data has same field and also has gloss data + page_data.append(base_data.model_copy(deep=True)) + + # Don't append new WordEntry if POS section is not processed + # Example page "kirin", "北庫爾德語" section + pre_data = getattr(page_data[-1], field) + if isinstance(pre_data, list): + pre_data.append(value) + else: + setattr(page_data[-1], field, value) diff --git a/src/wiktextract/page.py b/src/wiktextract/page.py index 590552b1..1e525364 100644 --- a/src/wiktextract/page.py +++ b/src/wiktextract/page.py @@ -5,7 +5,7 @@ import re from collections import defaultdict from copy import copy -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union from mediawiki_langcodes import get_all_names, name_to_code from wikitextprocessor import NodeKind, WikiNode @@ -28,7 +28,7 @@ def parse_page( wxr: WiktextractContext, page_title: str, page_text: str -) -> List[Dict[str, str]]: +) -> list[dict[str, Any]]: """Parses the text of a Wiktionary page and returns a list of dictionaries, one for each word/part-of-speech defined on the page for the languages specified by ``capture_language_codes`` (None means @@ -56,9 +56,9 @@ def is_panel_template(wxr: WiktextractContext, template_name: str) -> bool: def recursively_extract( - contents: Union[WikiNode, List[WikiNode]], - fn: Callable[[Union[WikiNode, List[WikiNode]]], bool], -) -> Tuple[List[WikiNode], List[WikiNode]]: + contents: Union[WikiNode, list[WikiNode]], + fn: Callable[[Union[WikiNode, list[WikiNode]]], bool], +) -> tuple[list[WikiNode], list[WikiNode]]: """Recursively extracts elements from contents for which ``fn`` returns True. 
This returns two lists, the extracted elements and the remaining content (with the extracted elements removed at each level). Only @@ -146,7 +146,7 @@ def recursively_extract( return extracted, new_contents -def inject_linkages(wxr: WiktextractContext, page_data: List[Dict]) -> None: +def inject_linkages(wxr: WiktextractContext, page_data: list[dict]) -> None: # Inject linkages from thesaurus entries from .thesaurus import search_thesaurus @@ -183,7 +183,7 @@ def inject_linkages(wxr: WiktextractContext, page_data: List[Dict]) -> None: data_append(data, term.linkage, dt) -def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None: +def process_categories(wxr: WiktextractContext, page_data: list[dict]) -> None: # Categories are not otherwise disambiguated, but if there is only # one sense and only one data in ret for the same language, move # categories to the only sense. Note that categories are commonly @@ -275,7 +275,7 @@ def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None: data["categories"] = new_cats -def remove_duplicate_data(page_data: Dict) -> None: +def remove_duplicate_data(page_data: dict) -> None: # Remove duplicates from tags, categories, etc. for data in page_data: for field in ("categories", "topics", "tags", "wikidata", "wikipedia"): @@ -310,10 +310,10 @@ def remove_duplicate_data(page_data: Dict) -> None: def clean_node( wxr: WiktextractContext, - sense_data: Optional[Dict], - wikinode: Union[str, WikiNode, List[Union[str, WikiNode, List]]], - template_fn: Optional[Callable[[str, Dict], str]] = None, - post_template_fn: Optional[Callable[[str, Dict, str], str]] = None, + sense_data: Optional[Any], + wikinode: Union[str, WikiNode, list[Union[str, WikiNode]]], + template_fn: Optional[Callable[[str, dict], str]] = None, + post_template_fn: Optional[Callable[[str, dict, str], str]] = None, collect_links: bool = False, ) -> str: """ diff --git a/tests/test_zh_descendant.py b/tests/test_zh_descendant.py index dc63e8a4..a0a97639 100644 --- a/tests/test_zh_descendant.py +++ b/tests/test_zh_descendant.py @@ -1,9 +1,9 @@ -from collections import defaultdict from unittest import TestCase from unittest.mock import Mock from wikitextprocessor import Wtp from wiktextract.extractor.zh.descendant import extract_descendants +from wiktextract.extractor.zh.models import WordEntry from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext @@ -32,19 +32,17 @@ def test_ruby(self): '[[你好#日語|-{你好(ニイハオ)}-]] (nīhao)', ) root = self.wxr.wtp.parse("* {{desc|bor=1|ja|-}} {{ja-r|你好|ニイハオ}}") - page_data = defaultdict(list) + page_data = WordEntry(word="你好", lang_code="ja", lang_name="日語") extract_descendants(self.wxr, root, page_data) self.assertEqual( - page_data.get("descendants"), - [ - { - "lang_code": "ja", - "lang_name": "日語", - "roman": "nīhao", - "ruby": [("你好", "ニイハオ")], - "word": "你好", - } - ], + page_data.descendants[0].model_dump(exclude_defaults=True), + { + "lang_code": "ja", + "lang_name": "日語", + "roman": "nīhao", + "ruby": [["你好", "ニイハオ"]], + "word": "你好", + }, ) def test_roman_only_list(self): @@ -55,22 +53,21 @@ def test_roman_only_list(self): ' 壯語:[[mwngz ndei#壯語|-{mwngz ndei}-]] (仿譯)', ) root = self.wxr.wtp.parse("* {{desc|za|mwngz ndei|cal=1}}") - page_data = defaultdict(list) + page_data = WordEntry(word="你好", lang_code="zh", lang_name="漢語") extract_descendants(self.wxr, root, page_data) self.assertEqual( - page_data.get("descendants"), - [ - { - "lang_code": "za", - "lang_name": "壯語", - 
"tags": ["仿譯"], - "word": "mwngz ndei", - } - ], + page_data.descendants[0].model_dump(exclude_defaults=True), + { + "lang_code": "za", + "lang_name": "壯語", + "tags": ["仿譯"], + "word": "mwngz ndei", + }, ) def test_nested_list(self): # https://zh.wiktionary.org/wiki/オタク + self.maxDiff = None self.wxr.wtp.start_page("オタク") self.wxr.wtp.add_page( "Template:desc", @@ -87,30 +84,28 @@ def test_nested_list(self): *:* {{desc|cmn|-|der=1}} {{zh-l|宅男}} *:* {{desc|cmn|-|der=1}} {{zh-l|宅女}}""" ) - page_data = defaultdict(list) + page_data = WordEntry(word="オタク", lang_code="ja", lang_name="日語") extract_descendants(self.wxr, root, page_data) self.assertEqual( - page_data.get("descendants"), - [ - { - "descendants": [ - { - "lang_code": "cmn", - "lang_name": "官話", - "roman": "宅男", - "word": "宅男", - }, - { - "lang_code": "cmn", - "lang_name": "官話", - "roman": "宅女", - "word": "宅女", - }, - ], - "lang_code": "cmn", - "lang_name": "官話", - "roman": "御宅族", - "word": "御宅族", - } - ], + page_data.descendants[0].model_dump(exclude_defaults=True), + { + "descendants": [ + { + "lang_code": "cmn", + "lang_name": "官話", + "roman": "宅男", + "word": "宅男", + }, + { + "lang_code": "cmn", + "lang_name": "官話", + "roman": "宅女", + "word": "宅女", + }, + ], + "lang_code": "cmn", + "lang_name": "官話", + "roman": "御宅族", + "word": "御宅族", + }, ) diff --git a/tests/test_zh_example.py b/tests/test_zh_example.py index 0cb0f6d2..18448bb0 100644 --- a/tests/test_zh_example.py +++ b/tests/test_zh_example.py @@ -1,15 +1,15 @@ -import unittest -from collections import defaultdict +from unittest import TestCase from unittest.mock import patch from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig from wiktextract.extractor.zh.example import extract_examples +from wiktextract.extractor.zh.models import Sense from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext -class TestExample(unittest.TestCase): +class TestExample(TestCase): def setUp(self) -> None: self.wxr = WiktextractContext( Wtp(lang_code="zh"), WiktionaryConfig(dump_file_lang_code="zh") @@ -22,7 +22,7 @@ def tearDown(self) -> None: ) def test_example_list(self) -> None: - sense_data = defaultdict(list) + sense_data = Sense() wikitext = """ #* ref text #*: example text @@ -31,14 +31,11 @@ def test_example_list(self) -> None: node = self.wxr.wtp.parse(wikitext) extract_examples(self.wxr, sense_data, node) self.assertEqual( - sense_data.get("examples"), - [ - { - "ref": "ref text", - "text": "example text", - "type": "quotation", - }, - ], + sense_data.examples[0].model_dump(exclude_defaults=True), + { + "ref": "ref text", + "texts": ["example text"], + }, ) @patch( @@ -48,19 +45,16 @@ def test_example_list(self) -> None: translation text""", ) def test_quote_example(self, mock_clean_node) -> None: - sense_data = defaultdict(list) + sense_data = Sense() wikitext = "#* {{RQ:Schuster Hepaticae}}" self.wxr.wtp.start_page("test") node = self.wxr.wtp.parse(wikitext) extract_examples(self.wxr, sense_data, node) self.assertEqual( - sense_data.get("examples"), - [ - { - "ref": "ref text", - "text": "quote text", - "translation": "translation text", - "type": "quotation", - }, - ], + sense_data.examples[0].model_dump(exclude_defaults=True), + { + "ref": "ref text", + "texts": ["quote text"], + "translation": "translation text", + }, ) diff --git a/tests/test_zh_gloss.py b/tests/test_zh_gloss.py index ae8eb794..b44a70b6 100644 --- a/tests/test_zh_gloss.py +++ b/tests/test_zh_gloss.py @@ -1,15 +1,15 @@ -import unittest 
-from collections import defaultdict +from unittest import TestCase from unittest.mock import patch from wikitextprocessor import NodeKind, WikiNode, Wtp from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.zh.models import Sense, WordEntry from wiktextract.extractor.zh.page import extract_gloss, parse_section from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext -class TestExample(unittest.TestCase): +class TestExample(TestCase): def setUp(self) -> None: self.wxr = WiktextractContext( Wtp(lang_code="zh"), WiktionaryConfig(dump_file_lang_code="zh") @@ -23,13 +23,10 @@ def tearDown(self) -> None: def test_example_list(self) -> None: page_data = [ - defaultdict( - list, - { - "lang_name": "日語", - "lang_code": "ja", - "word": "可笑しい", - }, + WordEntry( + lang_name="日語", + lang_code="ja", + word="可笑しい", ) ] wikitext = """# [[好玩]]的: @@ -43,9 +40,9 @@ def test_example_list(self) -> None: ## [[很好]]的,[[卓越]]的""" self.wxr.wtp.start_page("test") node = self.wxr.wtp.parse(wikitext) - extract_gloss(self.wxr, page_data, node.children[0], {}) + extract_gloss(self.wxr, page_data, node.children[0], Sense()) self.assertEqual( - page_data[0]["senses"], + [s.model_dump(exclude_defaults=True) for s in page_data[0].senses], [ {"glosses": ["好玩的:", "有趣的,滑稽的,可笑的"]}, {"glosses": ["好玩的:", "奇怪的,不正常的"]}, @@ -81,7 +78,8 @@ def test_pos_title_number( mock_process_pos_block, ) -> None: node = WikiNode(NodeKind.LEVEL3, 0) - parse_section(self.wxr, [{}], {}, node) + base_data = WordEntry(word="", lang_code="", lang_name="") + parse_section(self.wxr, [base_data], base_data, node) mock_process_pos_block.assert_called() @patch("wiktextract.extractor.zh.page.process_pos_block") @@ -92,5 +90,6 @@ def test_pos_title_chinese_numeral( mock_process_pos_block, ) -> None: node = WikiNode(NodeKind.LEVEL3, 0) - parse_section(self.wxr, [{}], {}, node) + base_data = WordEntry(word="", lang_code="", lang_name="") + parse_section(self.wxr, [base_data], base_data, node) mock_process_pos_block.assert_called() diff --git a/tests/test_zh_headword.py b/tests/test_zh_headword.py index b1f31fdf..d651cadf 100644 --- a/tests/test_zh_headword.py +++ b/tests/test_zh_headword.py @@ -1,9 +1,9 @@ -from collections import defaultdict from unittest import TestCase from unittest.mock import Mock, patch from wikitextprocessor import Wtp from wiktextract.extractor.zh.headword_line import extract_headword_line +from wiktextract.extractor.zh.models import WordEntry from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext @@ -28,18 +28,21 @@ def test_english_headword(self, mock_node_to_wikitext) -> None: # expanded text: manga (可數 & 不可數,複數 manga 或 mangas) node = Mock() node.largs = [["en-noun"]] - page_data = [defaultdict(list)] + page_data = [WordEntry(word="manga", lang_code="en", lang_name="英語")] self.wxr.wtp.title = "manga" extract_headword_line(self.wxr, page_data, node, "en") self.assertEqual( - page_data, + [d.model_dump(exclude_defaults=True) for d in page_data], [ { + "word": "manga", + "lang_code": "en", + "lang_name": "英語", "forms": [ - {"form": "manga", "tags": ["plural"]}, - {"form": "mangas", "tags": ["plural"]}, + {"form": "manga", "tags": ["複數"]}, + {"form": "mangas", "tags": ["複數"]}, ], - "tags": ["countable", "uncountable"], + "tags": ["可數", "不可數"], } ], ) @@ -54,16 +57,19 @@ def test_headword_gender(self, mock_node_to_wikitext) -> None: # expanded text: manga m (複數 manga's,指小詞 mangaatje n) node = Mock() node.largs = 
[["nl-noun"]] - page_data = [defaultdict(list)] + page_data = [WordEntry(word="manga", lang_code="en", lang_name="英語")] self.wxr.wtp.title = "manga" extract_headword_line(self.wxr, page_data, node, "nl") self.assertEqual( - page_data, + [d.model_dump(exclude_defaults=True) for d in page_data], [ { + "word": "manga", + "lang_code": "en", + "lang_name": "英語", "forms": [ - {"form": "manga's", "tags": ["plural"]}, - {"form": "mangaatje", "tags": ["diminutive", "neuter"]}, + {"form": "manga's", "tags": ["複數"]}, + {"form": "mangaatje", "tags": ["指小詞", "neuter"]}, ], "tags": ["masculine"], } @@ -80,13 +86,18 @@ def test_headword_roman(self, mock_node_to_wikitext) -> None: # expanded text: -κρατίᾱς (-kratíās) f node = Mock() node.largs = [["head"]] - page_data = [defaultdict(list)] + page_data = [ + WordEntry(word="-κρατίας", lang_code="grc", lang_name="古希臘語") + ] self.wxr.wtp.title = "-κρατίας" extract_headword_line(self.wxr, page_data, node, "grc") self.assertEqual( - page_data, + [d.model_dump(exclude_defaults=True) for d in page_data], [ { + "word": "-κρατίας", + "lang_code": "grc", + "lang_name": "古希臘語", "forms": [ {"form": "-kratíās", "tags": ["romanization"]}, ], diff --git a/tests/test_zh_inflection.py b/tests/test_zh_inflection.py index d3464b7f..64230fed 100644 --- a/tests/test_zh_inflection.py +++ b/tests/test_zh_inflection.py @@ -1,15 +1,15 @@ -import unittest -from collections import defaultdict +from unittest import TestCase from unittest.mock import patch from wikitextprocessor import Page, Wtp from wiktextract.config import WiktionaryConfig from wiktextract.extractor.zh.inflection import extract_inflections +from wiktextract.extractor.zh.models import WordEntry from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext -class TestInflection(unittest.TestCase): +class TestInflection(TestCase): def setUp(self) -> None: self.wxr = WiktextractContext( Wtp(lang_code="zh"), WiktionaryConfig(dump_file_lang_code="zh") @@ -39,22 +39,13 @@ def tearDown(self) -> None: ), ) def test_ja_i_template(self, mock_get_page) -> None: - page_data = [ - defaultdict( - list, - { - "lang": "日語", - "lang_code": "ja", - "word": "可笑しい", - }, - ) - ] + page_data = [WordEntry(lang_name="日語", lang_code="ja", word="可笑しい")] wikitext = "{{ja-i|可笑し|おかし|okashi}}" self.wxr.wtp.start_page("可笑しい") node = self.wxr.wtp.parse(wikitext) extract_inflections(self.wxr, page_data, node) self.assertEqual( - page_data[0].get("forms"), + [d.model_dump(exclude_defaults=True) for d in page_data[0].forms], [ { "form": "可笑しかろ", diff --git a/tests/test_zh_linkage.py b/tests/test_zh_linkage.py index e736fb9a..d316d372 100644 --- a/tests/test_zh_linkage.py +++ b/tests/test_zh_linkage.py @@ -1,14 +1,14 @@ -import unittest -from collections import defaultdict +from unittest import TestCase from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig from wiktextract.extractor.zh.linkage import extract_linkages +from wiktextract.extractor.zh.models import Sense, WordEntry from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext -class TestLinkage(unittest.TestCase): +class TestLinkage(TestCase): def setUp(self): self.wxr = WiktextractContext( Wtp(lang_code="zh"), WiktionaryConfig(dump_file_lang_code="zh") @@ -22,12 +22,12 @@ def tearDown(self): def test_sense_term_list(self): page_data = [ - { - "lang": "跨語言", - "lang_code": "mul", - "word": "%", - "senses": [defaultdict(list, {"glosses": ["百分比"]})], - } + WordEntry( + 
lang_name="跨語言", + lang_code="mul", + word="%", + senses=[Sense(glosses=["百分比"])], + ) ] wikitext = "* {{sense|百分比}} {{l|mul|cU}}、[[centiuno]]" self.wxr.wtp.add_page("Template:Sense", 10, "{{{1}}}") @@ -35,11 +35,12 @@ def test_sense_term_list(self): self.wxr.wtp.db_conn.commit() self.wxr.wtp.start_page("%") node = self.wxr.wtp.parse(wikitext) - extract_linkages( - self.wxr, page_data, node.children, "synonyms", "", page_data[-1] - ) + extract_linkages(self.wxr, page_data, node.children, "synonyms", "") self.assertEqual( - page_data[0]["senses"][0].get("synonyms"), + [ + s.model_dump(exclude_defaults=True) + for s in page_data[0].synonyms + ], [ {"sense": "百分比", "word": "cU"}, {"sense": "百分比", "word": "centiuno"}, @@ -55,22 +56,14 @@ def test_ja_r_template(self): '[[家主#日語|-{()(ぬし)}-]] (yanushi)', ) node = self.wxr.wtp.parse("{{s|房東}}\n* {{ja-r|家%主|や%ぬし}}") - page_data = [defaultdict(list)] - extract_linkages( - self.wxr, page_data, node.children, "synonyms", "", page_data[-1] - ) + page_data = [WordEntry(word="大家", lang_code="zh", lang_name="漢語")] + extract_linkages(self.wxr, page_data, node.children, "synonyms", "") self.assertEqual( - page_data, - [ - { - "synonyms": [ - { - "roman": "yanushi", - "ruby": [("家", "や"), ("主", "ぬし")], - "sense": "房東", - "word": "家主", - } - ] - } - ], + page_data[0].synonyms[0].model_dump(exclude_defaults=True), + { + "roman": "yanushi", + "ruby": [["家", "や"], ["主", "ぬし"]], + "sense": "房東", + "word": "家主", + }, ) diff --git a/tests/test_zh_note.py b/tests/test_zh_note.py index 04a3406b..c41aa623 100644 --- a/tests/test_zh_note.py +++ b/tests/test_zh_note.py @@ -1,8 +1,8 @@ -from collections import defaultdict from unittest import TestCase from unittest.mock import Mock from wikitextprocessor import Wtp +from wiktextract.extractor.zh.models import WordEntry from wiktextract.extractor.zh.note import extract_note from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext @@ -22,14 +22,14 @@ def test_note_list(self): # https://zh.wiktionary.org/wiki/オタク self.wxr.wtp.start_page("オタク") root = self.wxr.wtp.parse("* note list 1\n* note list 2") - page_data = [defaultdict(list)] + page_data = [WordEntry(word="オタク", lang_code="ja", lang_name="日語")] extract_note(self.wxr, page_data, root) - self.assertEqual(page_data, [{"notes": ["note list 1", "note list 2"]}]) + self.assertEqual(page_data[-1].notes, ["note list 1", "note list 2"]) def test_note_no_list(self): # https://zh.wiktionary.org/wiki/clavarder self.wxr.wtp.start_page("clavarder") root = self.wxr.wtp.parse("note text") - page_data = [defaultdict(list)] + page_data = [WordEntry(word="オタク", lang_code="fr", lang_name="法語")] extract_note(self.wxr, page_data, root) - self.assertEqual(page_data, [{"notes": ["note text"]}]) + self.assertEqual(page_data[-1].notes, ["note text"]) diff --git a/tests/test_zh_pronunciation.py b/tests/test_zh_pronunciation.py index 9a677af5..82c5d9aa 100644 --- a/tests/test_zh_pronunciation.py +++ b/tests/test_zh_pronunciation.py @@ -1,8 +1,8 @@ -from collections import defaultdict from unittest import TestCase from unittest.mock import Mock from wikitextprocessor import Wtp +from wiktextract.extractor.zh.models import WordEntry from wiktextract.extractor.zh.pronunciation import ( extract_pronunciation_recursively, ) @@ -25,19 +25,16 @@ def test_homophone_table(self): root = self.wxr.wtp.parse( """* 同音詞
[展開/摺疊]
[[大姑#漢語|大姑]]
[[小姑#漢語|小姑]]
""" ) - page_data = [defaultdict(list)] + base_data = WordEntry(word="大家", lang_code="zh", lang_name="漢語") + page_data = [base_data.model_copy(deep=True)] extract_pronunciation_recursively( - self.wxr, page_data, {}, "zh", root, [] + self.wxr, page_data, base_data, "zh", root, [] ) self.assertEqual( - page_data, + [d.model_dump(exclude_defaults=True) for d in page_data[0].sounds], [ - { - "sounds": [ - {"homophone": "大姑", "tags": ["同音詞"]}, - {"homophone": "小姑", "tags": ["同音詞"]}, - ] - } + {"homophone": "大姑", "tags": ["同音詞"]}, + {"homophone": "小姑", "tags": ["同音詞"]}, ], ) @@ -49,19 +46,16 @@ def test_homophone_template(self): '[[Appendix:Glossary#同音词|同音词]]:[[大矢#日語|-{大矢}-]], [[大宅#日語|-{大宅}-]], [[大谷#日語|-{大谷}-]][[Category:有同音詞的日語詞]]', ) root = self.wxr.wtp.parse("* {{homophones|ja|大矢|大宅|大谷}}") - page_data = [defaultdict(list)] + base_data = WordEntry(word="大家", lang_code="zh", lang_name="漢語") + page_data = [base_data.model_copy(deep=True)] extract_pronunciation_recursively( - self.wxr, page_data, {}, "ja", root, [] + self.wxr, page_data, base_data, "ja", root, [] ) self.assertEqual( - page_data, + [d.model_dump(exclude_defaults=True) for d in page_data[0].sounds], [ - { - "sounds": [ - {"homophone": "大矢", "tags": ["同音詞"]}, - {"homophone": "大宅", "tags": ["同音詞"]}, - {"homophone": "大谷", "tags": ["同音詞"]}, - ] - } + {"homophone": "大矢", "tags": ["同音詞"]}, + {"homophone": "大宅", "tags": ["同音詞"]}, + {"homophone": "大谷", "tags": ["同音詞"]}, ], ) diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py index d7d63771..e1752700 100644 --- a/tests/test_zh_translation.py +++ b/tests/test_zh_translation.py @@ -1,15 +1,15 @@ -import unittest -from collections import defaultdict +from unittest import TestCase from unittest.mock import patch from wikitextprocessor import Page, Wtp from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.zh.models import WordEntry from wiktextract.extractor.zh.translation import extract_translation from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext -class TestTranslation(unittest.TestCase): +class TestZhTranslation(TestCase): def setUp(self) -> None: self.wxr = WiktextractContext( Wtp(lang_code="zh"), WiktionaryConfig(dump_file_lang_code="zh") @@ -27,7 +27,7 @@ def tearDown(self) -> None: ) def test_normal(self, mock_get_page) -> None: # test wikitext from page "你好" and "這裡" - page_data = [defaultdict(list)] + page_data = [WordEntry(word="你好", lang_code="zh", lang_name="漢語")] wikitext = """ {{trans-top|靠近說話者的地方}} * 阿爾巴尼亞語:këtu (sq) @@ -43,7 +43,10 @@ def test_normal(self, mock_get_page) -> None: node = self.wxr.wtp.parse(wikitext) extract_translation(self.wxr, page_data, node) self.assertEqual( - page_data[0].get("translations"), + [ + d.model_dump(exclude_defaults=True) + for d in page_data[0].translations + ], [ { "lang_code": "sq", @@ -52,7 +55,6 @@ def test_normal(self, mock_get_page) -> None: "word": "këtu", }, { - "lang_code": "", "lang_name": "西阿帕切語", "sense": "靠近說話者的地方", "word": "kú",