From 2de1f1a831e3fdcfb611c085fc9b34dea85cb095 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 15 Dec 2023 14:52:27 +0800 Subject: [PATCH] Use Pydantic in Chinese Wiktionary extractor --- .gitignore | 3 + Makefile | 1 - json_schema/zh.json | 376 ------------------ .../data/zh/linkage_subtitles.json | 26 +- src/wiktextract/datautils.py | 60 +-- src/wiktextract/extractor/zh/descendant.py | 54 ++- src/wiktextract/extractor/zh/example.py | 85 ++-- src/wiktextract/extractor/zh/gloss.py | 32 +- src/wiktextract/extractor/zh/headword_line.py | 88 ++-- src/wiktextract/extractor/zh/inflection.py | 31 +- src/wiktextract/extractor/zh/linkage.py | 124 +++--- src/wiktextract/extractor/zh/models.py | 127 ++++++ src/wiktextract/extractor/zh/note.py | 10 +- src/wiktextract/extractor/zh/page.py | 127 +++--- src/wiktextract/extractor/zh/pronunciation.py | 81 ++-- src/wiktextract/extractor/zh/thesaurus.py | 25 +- src/wiktextract/extractor/zh/translation.py | 42 +- src/wiktextract/extractor/zh/util.py | 26 ++ src/wiktextract/page.py | 24 +- tests/test_zh_descendant.py | 87 ++-- tests/test_zh_example.py | 38 +- tests/test_zh_gloss.py | 27 +- tests/test_zh_headword.py | 35 +- tests/test_zh_inflection.py | 19 +- tests/test_zh_linkage.py | 53 ++- tests/test_zh_note.py | 10 +- tests/test_zh_pronunciation.py | 34 +- tests/test_zh_translation.py | 14 +- 28 files changed, 651 insertions(+), 1008 deletions(-) delete mode 100644 json_schema/zh.json create mode 100644 src/wiktextract/extractor/zh/models.py create mode 100644 src/wiktextract/extractor/zh/util.py diff --git a/.gitignore b/.gitignore index 1271ff87..4ac776d8 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,6 @@ wikt-db* # GitHub Pages _site + +# Emacs files +*~ diff --git a/Makefile b/Makefile index 7ca5dfcb..9b21194d 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,6 @@ coverage_report: python -m coverage html github_pages: python tools/generate_schema.py - cp json_schema/*.json _site python tools/github_pages.py $(REPO) $(SHA) clean: python -m coverage erase diff --git a/json_schema/zh.json b/json_schema/zh.json deleted file mode 100644 index 9de1c86c..00000000 --- a/json_schema/zh.json +++ /dev/null @@ -1,376 +0,0 @@ -{ - "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://kaikki.org/zh.json", - "title": "Chinese Wiktionary", - "description": "JSON schema of the Chinese Wiktionary extractor", - "type": "object", - "properties": { - "lang_name": { - "description": "Localized language name of the word", - "type": "string" - }, - "lang_code": { - "description": "Wiktionary language code", - "type": "string" - }, - "word": { - "description": "word string", - "type": "string" - }, - "pos": { - "description": "Part of speech type", - "type": "string" - }, - "etymology_text": { - "type": "string" - }, - "senses": { - "description": "Sense list", - "type": "array", - "items": { - "$ref": "#/$defs/sense" - } - }, - "forms": { - "description": "Inflection forms list", - "type": "array", - "items": { - "$ref": "#/$defs/form" - } - }, - "sounds": { - "type": "array", - "items": { - "$ref": "#/$defs/sound" - } - }, - "translations": { - "type": "array", - "items": { - "$ref": "#/$defs/translation" - } - }, - "synonyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "hyponyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "hypernyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "holonyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - 
}, - "meronyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "derived": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "troponyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "paronyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "related": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "abbreviation": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "proverbs": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "antonyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "coordinate_terms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "various": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "title": { - "description": "Redirect page source title", - "type": "string" - }, - "redirect": { - "description": "Redirect page target title", - "type": "string" - }, - "categories": { - "type": "array", - "items": { - "type": "string" - } - }, - "descendants": { - "type": "array", - "items": { - "$ref": "#/$defs/descendant" - } - }, - "notes": { - "description": "Usage notes", - "type": "array", - "items": { - "type": "string" - } - } - }, - "$defs": { - "sense": { - "type": "object", - "properties": { - "glosses": { - "type": "array", - "items": { - "type": "string" - } - }, - "tags": { - "type": "array", - "items": { - "type": "string" - } - }, - "categories": { - "type": "array", - "items": { - "type": "string" - } - }, - "examples": { - "type": "array", - "items": { - "$ref": "#/$defs/example" - } - } - } - }, - "example": { - "type": "object", - "properties": { - "texts": { - "description": "Example usage sentences, some might have have both Simplified and Traditional Chinese forms", - "type": "array", - "items": { - "type": "string" - } - }, - "translation": { - "description": "Chinese translation of the example sentence", - "type": "string" - }, - "roman": { - "description": "Romanization of the example sentence", - "type": "string" - }, - "ref": { - "description": "Source of the sentence, like book title and page number", - "type": "string" - }, - "type": { - "description": "This value is 'quotation' if 'source' exists", - "type": "string", - "enum": [ - "example", - "quotation" - ] - } - } - }, - "form": { - "type": "object", - "properties": { - "form": { - "type": "string" - }, - "tags": { - "type": "array", - "items": { - "type": "string" - } - }, - "source": { - "type": "string" - }, - "ruby": { - "description": "Japanese Kanji and furigana", - "type": "array", - "items": { - "type": "array", - "items": { - "type": "string" - } - } - } - } - }, - "sound": { - "type": "object", - "properties": { - "zh-pron": { - "description": "Chinese word pronunciation", - "type": "string" - }, - "ipa": { - "description": "International Phonetic Alphabet", - "type": "string" - }, - "audio": { - "description": "Audio file name", - "type": "string" - }, - "wav_url": { - "type": "string" - }, - "ogg_url": { - "type": "string" - }, - "mp3_url": { - "type": "string" - }, - "homophone": { - "type": "string" - } - } - }, - "translation": { - "type": "object", - "properties": { - "lang_code": { - "description": "Wiktionary language code of the translation term", - "type": "string" - }, - "lang_name": { - "description": "Translation language name", - "type": "string" - }, - "word": { - "description": "Translation term", - "type": "string" - }, - 
"sense": { - "description": "Translation gloss", - "type": "string" - }, - "tags": { - "type": "array", - "items": { - "type": "string" - } - }, - "roman": { - "type": "string" - } - } - }, - "linkage": { - "type": "object", - "properties": { - "word": { - "type": "string" - }, - "tags": { - "type": "array", - "items": { - "type": "string" - } - }, - "roman": { - "type": "string" - }, - "language_variant": { - "description": "Chinese character variant", - "type": "string", - "enum": ["zh-Hant", "zh-Hans"] - } - } - }, - "descendant": { - "type": "object", - "properties": { - "lang_code": { - "description": "Wiktionary language code", - "type": "string" - }, - "lang_name": { - "type": "string" - }, - "word": { - "type": "string" - }, - "roman": { - "type": "string" - }, - "tags": { - "type": "array", - "items": { - "type": "string" - } - }, - "descendants": { - "type": "array", - "items": { - "$refs": "#/$defs/descendant" - } - }, - "ruby": { - "description": "Japanese Kanji and furigana", - "type": "array", - "items": { - "type": "array", - "items": { - "type": "string" - } - } - } - } - } - } -} diff --git a/src/wiktextract/data/zh/linkage_subtitles.json b/src/wiktextract/data/zh/linkage_subtitles.json index 369e2221..3b21d4cb 100644 --- a/src/wiktextract/data/zh/linkage_subtitles.json +++ b/src/wiktextract/data/zh/linkage_subtitles.json @@ -1,18 +1,18 @@ { - "上下位關係": "hypernym", - "上义词": "hypernym", - "上位詞": "hypernym", - "上位語": "hypernym", - "上位词": "hypernym", - "上義詞": "hypernym", - "下义词": "hyponym", - "下位詞": "hyponym", - "下位語": "hyponym", - "下位词": "hyponym", - "下层词": "hyponym", - "下属词": "hyponym", + "上下位關係": "hypernyms", + "上义词": "hypernyms", + "上位詞": "hypernyms", + "上位語": "hypernyms", + "上位词": "hypernyms", + "上義詞": "hypernyms", + "下义词": "hyponyms", + "下位詞": "hyponyms", + "下位語": "hyponyms", + "下位词": "hyponyms", + "下层词": "hyponyms", + "下属词": "hyponyms", "下層概念": "derived", - "下義詞": "hyponym", + "下義詞": "hyponyms", "俗语": "related", "关联词": "related", "关联词条": "related", diff --git a/src/wiktextract/datautils.py b/src/wiktextract/datautils.py index ff196c2f..96868893 100644 --- a/src/wiktextract/datautils.py +++ b/src/wiktextract/datautils.py @@ -1,13 +1,9 @@ # Utilities for manipulating word data structures # # Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org -import copy import re from collections import defaultdict -from functools import partial -from typing import Any, Dict, Iterable, List, Tuple - -from wiktextract.wxr_context import WiktextractContext +from typing import Any, Iterable # Keys in ``data`` that can only have string values (a list of them) STR_KEYS = frozenset({"tags", "glosses"}) @@ -26,7 +22,7 @@ ) -def data_append(data: Dict, key: str, value: Any) -> None: +def data_append(data: Any, key: str, value: Any) -> None: """Appends ``value`` under ``key`` in the dictionary ``data``. The key is created if it does not exist.""" assert isinstance(key, str) @@ -47,8 +43,10 @@ def data_append(data: Dict, key: str, value: Any) -> None: data[key] = list_value -def data_extend(data: Dict, key: str, values: Iterable) -> None: - """Appends all values in a list under ``key`` in the dictionary ``data``.""" +def data_extend(data: Any, key: str, values: Iterable) -> None: + """ + Appends all values in a list under ``key`` in the dictionary ``data``. 
+ """ assert isinstance(data, dict) assert isinstance(key, str) assert isinstance(values, (list, tuple)) @@ -63,7 +61,7 @@ def data_extend(data: Dict, key: str, values: Iterable) -> None: def split_at_comma_semi( text: str, separators=(",", ";", ",", "،"), extra=() -) -> List[str]: +) -> list[str]: """Splits the text at commas and semicolons, unless they are inside parenthesis. ``separators`` is default separators (setting it eliminates default separators). ``extra`` is extra separators to be used in addition @@ -203,7 +201,7 @@ def freeze(x): def ns_title_prefix_tuple( wxr, namespace: str, lower: bool = False -) -> Tuple[str, ...]: +) -> tuple[str, ...]: """Based on given namespace name, create a tuple of aliases""" if namespace in wxr.wtp.NAMESPACE_DATA: return tuple( @@ -215,45 +213,3 @@ def ns_title_prefix_tuple( ) else: return () - - -def find_similar_gloss(page_data: List[Dict], gloss: str) -> Dict: - """ - Return a sense dictionary if it has similar gloss, return the last - word dictionary if can't found such gloss. - """ - from rapidfuzz.fuzz import partial_token_set_ratio - from rapidfuzz.process import extractOne - from rapidfuzz.utils import default_process - - if len(gloss) == 0: - return page_data[-1] - - choices = [ - sense_dict.get("raw_glosses", sense_dict.get("glosses", [""]))[0] - for sense_dict in page_data[-1]["senses"] - ] - if match_result := extractOne( - gloss, - choices, - score_cutoff=85, - scorer=partial(partial_token_set_ratio, processor=default_process), - ): - return page_data[-1]["senses"][match_result[2]] - - return page_data[-1] - - -def append_base_data( - page_data: List[Dict], field: str, value: Any, base_data: Dict -) -> None: - if page_data[-1].get(field) is not None: - if len(page_data[-1]["senses"]) > 0: - # append new dictionary if the last dictionary has sense data and - # also has the same key - page_data.append(copy.deepcopy(base_data)) - page_data[-1][field] = value - elif isinstance(page_data[-1].get(field), list): - page_data[-1][field] += value - else: - page_data[-1][field] = value diff --git a/src/wiktextract/extractor/zh/descendant.py b/src/wiktextract/extractor/zh/descendant.py index e3ef77c0..9699f859 100644 --- a/src/wiktextract/extractor/zh/descendant.py +++ b/src/wiktextract/extractor/zh/descendant.py @@ -1,11 +1,11 @@ -from collections import defaultdict -from typing import Dict +from typing import Union from wikitextprocessor import NodeKind, WikiNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext from ..ruby import extract_ruby +from .models import Descendant, WordEntry DESCENDANT_TEMPLATES = frozenset(["desc", "descendant"]) @@ -13,7 +13,7 @@ def extract_descendants( wxr: WiktextractContext, level_node: WikiNode, - parent_data: Dict, + parent_data: Union[WordEntry, Descendant], ) -> None: for list_node in level_node.find_child(NodeKind.LIST): for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): @@ -23,57 +23,51 @@ def extract_descendants( def extract_descendant_list_item( wxr: WiktextractContext, list_item_node: WikiNode, - parent_data: Dict, + parent_data: Union[WordEntry, Descendant], ) -> None: lang_code = "" lang_name = "" - descendant_data = defaultdict(list) + descendant_data = Descendant() for template_node in list_item_node.find_child(NodeKind.TEMPLATE): expanded_template = wxr.wtp.parse( wxr.wtp.node_to_wikitext(template_node), expand_all=True ) if template_node.template_name.lower() in DESCENDANT_TEMPLATES: lang_code = template_node.template_parameters.get(1) - 
descendant_data["lang_code"] = lang_code + descendant_data.lang_code = lang_code ruby_data, nodes_without_ruby = extract_ruby( wxr, expanded_template.children ) if len(ruby_data) > 0: - descendant_data["ruby"] = ruby_data + descendant_data.ruby = ruby_data for child_index, child_node in enumerate(nodes_without_ruby): if isinstance(child_node, str) and child_node.endswith(":"): lang_name = child_node.strip(" :") - descendant_data["lang_name"] = lang_name + descendant_data.lang_name = lang_name elif ( isinstance(child_node, WikiNode) and child_node.kind == NodeKind.HTML ): if child_node.tag == "span": class_names = child_node.attrs.get("class", "") - if ( - "Latn" in class_names or "tr" in class_names - ) and "word" in descendant_data: + if ("Latn" in class_names or "tr" in class_names) and len( + descendant_data.word + ) > 0: # template:ja-r - descendant_data["roman"] = clean_node( + descendant_data.roman = clean_node( wxr, None, child_node ) elif "lang" in child_node.attrs: - if "word" in descendant_data: - parent_data["descendants"].append(descendant_data) - descendant_data = defaultdict( - list, - { - "lang_code": lang_code, - "lang_name": lang_name, - }, + if len(descendant_data.word) > 0: + parent_data.descendants.append(descendant_data) + descendant_data = Descendant( + lang_code=lang_code, lang_name=lang_name ) if len(ruby_data) > 0: - descendant_data["ruby"] = ruby_data - descendant_data["word"] = clean_node( - wxr, None, child_node - ) + descendant_data.ruby = ruby_data + descendant_data.word = clean_node(wxr, None, child_node) if "qualifier-content" in class_names: - descendant_data["tags"].append( + descendant_data.tags.append( clean_node(wxr, None, child_node) ) elif child_node.tag == "i": @@ -81,16 +75,14 @@ def extract_descendant_list_item( for span_tag in child_node.find_html( "span", attr_name="class", attr_value="Latn" ): - descendant_data["roman"] = clean_node( - wxr, None, span_tag - ) + descendant_data.roman = clean_node(wxr, None, span_tag) - if "word" in descendant_data: - parent_data["descendants"].append(descendant_data) + if len(descendant_data.word) > 0: + parent_data.descendants.append(descendant_data) if list_item_node.contain_node(NodeKind.LIST): extract_descendants( wxr, list_item_node, - descendant_data if "word" in descendant_data else parent_data, + descendant_data if len(descendant_data.word) > 0 else parent_data, ) diff --git a/src/wiktextract/extractor/zh/example.py b/src/wiktextract/extractor/zh/example.py index 5021e493..e1fee774 100644 --- a/src/wiktextract/extractor/zh/example.py +++ b/src/wiktextract/extractor/zh/example.py @@ -1,5 +1,4 @@ -from collections import defaultdict -from typing import Dict, List, Union +from typing import Union from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode @@ -7,19 +6,20 @@ from wiktextract.wxr_context import WiktextractContext from ..ruby import extract_ruby +from .models import Example, Sense def extract_examples( wxr: WiktextractContext, - sense_data: Dict, - node: Union[WikiNode, List[WikiNode]], + sense_data: Sense, + node: Union[WikiNode, list[WikiNode]], ) -> None: if isinstance(node, list): for n in node: extract_examples(wxr, sense_data, n) elif isinstance(node, WikiNode): if node.kind == NodeKind.LIST_ITEM: - example_data = defaultdict(list, {"type": "example"}) + example_data = Example() # example text in the nested list # https://zh.wiktionary.org/wiki/%, the second example if node.contain_node(NodeKind.LIST): @@ -39,53 +39,54 @@ def extract_examples( elif 
template_name == "uxi": extract_template_uxi(wxr, child, example_data) else: - example_data["text"] = clean_node(wxr, None, child) + example_data.texts = [clean_node(wxr, None, child)] - if "text" in example_data or "texts" in example_data: - sense_data["examples"].append(example_data) + if len(example_data.texts) > 0: + sense_data.examples.append(example_data) else: extract_examples(wxr, sense_data, node.children) def extract_example_list( - wxr: WiktextractContext, node: WikiNode, example_data: Dict + wxr: WiktextractContext, node: WikiNode, example_data: Example ) -> None: for index, child_node in enumerate(node.children): if ( isinstance(child_node, WikiNode) and child_node.kind == NodeKind.LIST ): - example_data["type"] = "quotation" - example_data["ref"] = clean_node(wxr, None, node.children[:index]) - example_data["text"] = clean_node( - wxr, None, child_node.children[0].children - ) + example_data.ref = clean_node(wxr, None, node.children[:index]) + example_data.texts = [ + clean_node(wxr, None, child_node.children[0].children) + ] def extract_quote_templates( - wxr: WiktextractContext, node: TemplateNode, example_data: Dict + wxr: WiktextractContext, node: TemplateNode, example_data: Example ) -> None: """ Process template `quote-book` and "RQ:*". """ - example_data["type"] = "quotation" expanded_text = clean_node(wxr, None, node) for line_num, expanded_line in enumerate(expanded_text.splitlines()): if line_num == 0: key = "ref" elif line_num == 1: - key = "text" + key = "texts" elif line_num == 2 and "transliteration" in node.template_parameters: key = "roman" else: key = "translation" if expanded_line != "(請為本引文添加中文翻譯)": - example_data[key] = expanded_line + if key == "texts": + example_data.texts.append(expanded_line) + else: + setattr(example_data, key, expanded_line) def extract_template_ja_usex( - wxr: WiktextractContext, node: WikiNode, example_data: Dict + wxr: WiktextractContext, node: WikiNode, example_data: Example ) -> None: expanded_node = wxr.wtp.parse( wxr.wtp.node_to_wikitext(node), expand_all=True @@ -94,27 +95,30 @@ def extract_template_ja_usex( expanded_text = clean_node(wxr, None, node_without_ruby) for line_num, expanded_line in enumerate(expanded_text.splitlines()): if line_num == 0: - key = "text" + key = "texts" elif line_num == 1: key = "roman" else: key = "translation" - example_data[key] = expanded_line + if key == "texts": + example_data.texts.append(expanded_line) + else: + setattr(example_data, key, expanded_line) if len(ruby_data) > 0: - example_data["ruby"] = ruby_data + example_data.ruby = ruby_data def extract_template_zh_usex( - wxr: WiktextractContext, node: WikiNode, example_data: Dict + wxr: WiktextractContext, node: WikiNode, example_data: Example ) -> None: expanded_text = clean_node(wxr, None, node) if "―" in expanded_text: for index, split_text in enumerate(expanded_text.split("―")): if index == 0: for example_text in split_text.split(" / "): - example_data["texts"].append(example_text.strip()) + example_data.texts.append(example_text.strip()) elif index == 1: - example_data["roman"] = split_text.strip() + example_data.roman = split_text.strip() return for expanded_line in expanded_text.splitlines(): @@ -122,18 +126,17 @@ def extract_template_zh_usex( # expanded simplified or traditional Chinese # example sentence usually ends with # "繁體]" or "簡體]" - example_data["texts"].append(expanded_line) + example_data.texts.append(expanded_line) elif expanded_line.endswith("]"): - example_data["roman"] = expanded_line + example_data.roman = 
expanded_line elif expanded_line.startswith("來自:"): - example_data["ref"] = expanded_line[3:] - example_data["type"] = "quotation" + example_data.ref = expanded_line[3:] else: - example_data["translation"] = expanded_line + example_data.translation = expanded_line def extract_template_ux( - wxr: WiktextractContext, node: WikiNode, example_data: Dict + wxr: WiktextractContext, node: WikiNode, example_data: Example ) -> None: expanded_text = clean_node(wxr, None, node) if " ― " in expanded_text: @@ -143,7 +146,7 @@ def extract_template_ux( lines = expanded_text.splitlines() for line_num, expanded_line in enumerate(lines): if line_num == 0: - key = "text" + key = "texts" elif line_num == 1: if line_num == len(lines) - 1: key = "translation" @@ -151,21 +154,26 @@ def extract_template_ux( key = "roman" else: key = "translation" - example_data[key] = expanded_line + if key == "texts": + example_data.texts.append(expanded_line) + else: + setattr(example_data, key, expanded_line) def extract_template_uxi( - wxr: WiktextractContext, node: WikiNode, example_data: Dict + wxr: WiktextractContext, node: WikiNode, example_data: Example ) -> None: expanded_text = clean_node(wxr, None, node) extract_template_uxi_text(expanded_text, example_data) -def extract_template_uxi_text(expanded_text: str, example_data: Dict) -> None: +def extract_template_uxi_text( + expanded_text: str, example_data: Example +) -> None: parts = expanded_text.split(" ― ") for index, part in enumerate(parts): if index == 0: - key = "text" + key = "texts" elif index == 1: if index == len(parts) - 1: key = "translation" @@ -173,4 +181,7 @@ def extract_template_uxi_text(expanded_text: str, example_data: Dict) -> None: key = "roman" else: key = "translation" - example_data[key] = part + if key == "texts": + example_data.texts.append(part) + else: + setattr(example_data, key, part) diff --git a/src/wiktextract/extractor/zh/gloss.py b/src/wiktextract/extractor/zh/gloss.py index a8c31da8..8b28a8ac 100644 --- a/src/wiktextract/extractor/zh/gloss.py +++ b/src/wiktextract/extractor/zh/gloss.py @@ -1,6 +1,4 @@ import re -from collections import defaultdict -from typing import Dict, List from wikitextprocessor import NodeKind, WikiNode from wiktextract.page import clean_node @@ -8,15 +6,16 @@ from ..ruby import extract_ruby from .example import extract_examples +from .models import Sense, WordEntry def extract_gloss( wxr: WiktextractContext, - page_data: List[Dict], + page_data: list[WordEntry], list_node: WikiNode, - gloss_data: Dict[str, List[str]], + gloss_data: Sense, ) -> None: - lang_code = page_data[-1].get("lang_code") + lang_code = page_data[-1].lang_code for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): gloss_nodes = [ child @@ -38,7 +37,7 @@ def extract_gloss( gloss_data, extract_gloss_and_tags(raw_gloss_text) ) if len(ruby_data) > 0: - new_gloss_data["ruby"] = ruby_data + new_gloss_data.ruby = ruby_data has_nested_gloss = False if list_item_node.contain_node(NodeKind.LIST): @@ -50,20 +49,19 @@ def extract_gloss( extract_examples(wxr, new_gloss_data, child_node) if not has_nested_gloss: - page_data[-1]["senses"].append(new_gloss_data) + page_data[-1].senses.append(new_gloss_data) -def merge_gloss_data( - data_a: Dict[str, List[str]], data_b: Dict[str, List[str]] -) -> Dict[str, List[str]]: - new_data = defaultdict(list) +def merge_gloss_data(data_a: Sense, data_b: Sense) -> Sense: + new_data = Sense() for data in data_a, data_b: - for key, value in data.items(): - new_data[key].extend(value) + for field in 
data.model_fields: + pre_data = getattr(new_data, field) + pre_data.extend(getattr(data, field)) return new_data -def extract_gloss_and_tags(raw_gloss: str) -> Dict[str, List[str]]: +def extract_gloss_and_tags(raw_gloss: str) -> Sense: left_brackets = ("(", "(") right_brackets = (")", ")") if raw_gloss.startswith(left_brackets) or raw_gloss.endswith( @@ -87,8 +85,6 @@ def extract_gloss_and_tags(raw_gloss: str) -> Dict[str, List[str]]: tags += re.split(split_tag_regex, rear_label) gloss = raw_gloss[front_tag_end + 1 : rear_tag_start].strip() - return defaultdict( - list, {"glosses": [gloss], "raw_glosses": [raw_gloss], "tags": tags} - ) + return Sense(glosses=[gloss], raw_glosses=[raw_gloss], tags=tags) else: - return defaultdict(list, {"glosses": [raw_gloss]}) + return Sense(glosses=[raw_gloss]) diff --git a/src/wiktextract/extractor/zh/headword_line.py b/src/wiktextract/extractor/zh/headword_line.py index 7ba19dcb..39cce406 100644 --- a/src/wiktextract/extractor/zh/headword_line.py +++ b/src/wiktextract/extractor/zh/headword_line.py @@ -1,5 +1,5 @@ import re -from typing import Dict, List, Union +from typing import Union from wikitextprocessor import NodeKind, WikiNode from wiktextract.page import clean_node @@ -7,6 +7,7 @@ from ..ruby import extract_ruby from ..share import strip_nodes +from .models import Form, WordEntry # https://zh.wiktionary.org/wiki/Module:Gender_and_number GENDERS = { @@ -36,37 +37,9 @@ } -FORM_TAGS = { - "不可數": ["uncountable"], - "通常不可數": ["uncountable"], - "可數": ["countable"], - "複數": ["plural"], - # en-verb - "第三人稱單數簡單現在時": ["third-person", "singular", "simple", "present"], - "現在分詞": ["present", "participle"], - "一般過去時及過去分詞": ["past", "participle"], - # fr-noun, fr-adj - # https://zh.wiktionary.org/wiki/Module:Fr-headword - "指小詞": ["diminutive"], - "陰性": ["feminine"], - "陽性": ["masculine"], - "陽性複數": ["masculine", "plural"], - "陰性複數": ["feminine", "plural"], - "陽性單數": ["masculine", "singular"], - "元音前陽性單數": ["masculine", "singular", "before-vowel"], - "比較級": ["comparative"], - "最高級": ["superlative"], - # voice - "主動": ["active"], - "被動": ["passive"], - "及物": ["transitive"], - "不規則": ["irregular"], -} - - def extract_headword_line( wxr: WiktextractContext, - page_data: List[Dict], + page_data: list[WordEntry], node: WikiNode, lang_code: str, ) -> None: @@ -88,30 +61,30 @@ def extract_headword_line( if "headword-tr" in class_names: forms_start_index = index + 1 - page_data[-1]["forms"].append( - { - "form": clean_node(wxr, page_data[-1], child), - "tags": ["romanization"], - } + page_data[-1].forms.append( + Form( + form=clean_node(wxr, page_data[-1], child), + tags=["romanization"], + ) ) elif "gender" in class_names: forms_start_index = index + 1 for abbr_tag in child.find_html("abbr"): gender = abbr_tag.children[0] - page_data[-1]["tags"].append(GENDERS.get(gender, gender)) + page_data[-1].tags.append(GENDERS.get(gender, gender)) if lang_code == "ja": for span_child in child.find_html( "strong", attr_name="class", attr_value="headword" ): ruby_data, node_without_ruby = extract_ruby(wxr, span_child) - page_data[-1]["forms"].append( - { - "form": clean_node( + page_data[-1].forms.append( + Form( + form=clean_node( wxr, page_data[-1], node_without_ruby ), - "ruby": ruby_data, - "tags": ["canonical"], - } + ruby=ruby_data, + tags=["canonical"], + ) ) elif child.tag == "b": # this is a form tag, already inside form parentheses @@ -124,8 +97,8 @@ def extract_headword_line( def extract_headword_forms( wxr: WiktextractContext, - page_data: List[Dict], - form_nodes: 
List[Union[WikiNode, str]], + page_data: list[WordEntry], + form_nodes: list[Union[WikiNode, str]], ) -> None: current_nodes = [] for node in form_nodes: @@ -141,18 +114,18 @@ def extract_headword_forms( def process_forms_text( wxr: WiktextractContext, - page_data: List[Dict], - form_nodes: List[Union[WikiNode, str]], + page_data: list[WordEntry], + form_nodes: list[Union[WikiNode, str]], ) -> None: tag_nodes = [] has_forms = False striped_nodes = list(strip_nodes(form_nodes)) - lang_code = page_data[-1].get("lang_code") + lang_code = page_data[-1].lang_code for index, node in enumerate(striped_nodes): if isinstance(node, WikiNode) and node.kind == NodeKind.HTML: if node.tag == "b": has_forms = True - ruby_data = None + ruby_data = [] if lang_code == "ja": ruby_data, node_without_ruby = extract_ruby(wxr, node) form = clean_node(wxr, None, node_without_ruby) @@ -173,18 +146,11 @@ def process_forms_text( gender = clean_node(wxr, None, next_node) form_tags.append(GENDERS.get(gender, gender)) - form_data = { - "form": form, - "tags": form_tags, - } - if ruby_data is not None: - form_data["ruby"] = ruby_data - page_data[-1]["forms"].append(form_data) + form_data = Form(form=form, tags=form_tags, ruby=ruby_data) + page_data[-1].forms.append(form_data) elif node.tag == "span" and "tr" in node.attrs.get("class", ""): # romanization of the previous form tag - page_data[-1]["forms"][-1]["roman"] = clean_node( - wxr, None, node - ) + page_data[-1].forms[-1].roman = clean_node(wxr, None, node) else: tag_nodes.append(node) else: @@ -195,15 +161,15 @@ def process_forms_text( clean_node(wxr, page_data[-1], tag_nodes).strip("() ") ) if len(tags_list) > 0: - page_data[-1]["tags"].extend(tags_list) + page_data[-1].tags.extend(tags_list) else: clean_node(wxr, page_data[-1], tag_nodes) # find categories -def extract_headword_tags(tags_str: str) -> List[str]: +def extract_headword_tags(tags_str: str) -> list[str]: tags = [] for tag_str in ( s.strip() for s in re.split("&|或", tags_str) if len(s.strip()) > 0 ): - tags.extend(FORM_TAGS.get(tag_str, [tag_str])) + tags.append(tag_str) return tags diff --git a/src/wiktextract/extractor/zh/inflection.py b/src/wiktextract/extractor/zh/inflection.py index 3a620379..0bfb2289 100644 --- a/src/wiktextract/extractor/zh/inflection.py +++ b/src/wiktextract/extractor/zh/inflection.py @@ -1,9 +1,9 @@ -from typing import Dict, List - from wikitextprocessor import NodeKind, WikiNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext +from .models import Form, WordEntry + # https://zh.wiktionary.org/wiki/Category:日語變格表模板 JAPANESE_INFLECTION_TEMPLATE_PREFIXES = ( "ja-i", @@ -21,21 +21,21 @@ def extract_inflections( wxr: WiktextractContext, - page_data: List[Dict], - node: WikiNode, + page_data: list[WordEntry], + level_node: WikiNode, ) -> None: - for child in node.find_child(NodeKind.TEMPLATE): + for child in level_node.find_child(NodeKind.TEMPLATE): template_name = child.template_name.lower() if template_name.startswith(JAPANESE_INFLECTION_TEMPLATE_PREFIXES): expanded_table = wxr.wtp.parse( - wxr.wtp.node_to_wikitext(node), expand_all=True + wxr.wtp.node_to_wikitext(level_node), expand_all=True ) extract_ja_i_template(wxr, page_data, expanded_table, "") def extract_ja_i_template( wxr: WiktextractContext, - page_data: List[Dict], + page_data: list[WordEntry], node: WikiNode, table_header: str, ) -> None: @@ -45,16 +45,15 @@ def extract_ja_i_template( if len(list(child.filter_empty_str_child())) == 1: table_header = clean_node(wxr, None, 
child.children) else: - inflection_data = { - "tags": [table_header], - "source": "inflection", - } + inflection_data = Form( + tags=[table_header], source="inflection" + ) cell_node_index = 0 keys = ["form", "hiragana", "roman"] for row_child in child.children: if isinstance(row_child, WikiNode): if row_child.kind == NodeKind.TABLE_HEADER_CELL: - inflection_data["tags"].append( + inflection_data.tags.append( clean_node(wxr, None, row_child) ) elif row_child.kind == NodeKind.TABLE_CELL: @@ -64,11 +63,13 @@ def extract_ja_i_template( if cell_node_index < len(keys): key = keys[cell_node_index] cell_node_index += 1 - inflection_data[key] = clean_node( - wxr, None, row_child + setattr( + inflection_data, + key, + clean_node(wxr, None, row_child), ) else: break - page_data[-1]["forms"].append(inflection_data) + page_data[-1].forms.append(inflection_data) else: extract_ja_i_template(wxr, page_data, child, table_header) diff --git a/src/wiktextract/extractor/zh/linkage.py b/src/wiktextract/extractor/zh/linkage.py index 04cdf3a5..13f187fd 100644 --- a/src/wiktextract/extractor/zh/linkage.py +++ b/src/wiktextract/extractor/zh/linkage.py @@ -1,10 +1,7 @@ -from collections import defaultdict -from copy import deepcopy -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Optional, Union from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode -from wiktextract.datautils import find_similar_gloss from wiktextract.page import LEVEL_KINDS, clean_node from wiktextract.wxr_context import WiktextractContext @@ -15,16 +12,16 @@ strip_nodes, ) from .descendant import DESCENDANT_TEMPLATES, extract_descendant_list_item +from .models import Linkage, WordEntry def extract_linkages( wxr: WiktextractContext, - page_data: List[Dict], - nodes: List[Union[WikiNode, str]], + page_data: list[WordEntry], + nodes: list[Union[WikiNode, str]], linkage_type: str, sense: str, - append_to: Dict, -) -> Optional[Tuple[str, Dict]]: +) -> Optional[str]: """ Return linkage sense text for `sense` template inside a list item node. 
""" @@ -33,12 +30,11 @@ def extract_linkages( for node in strip_nodes(nodes): if isinstance(node, str) and len(sense) == 0: sense = node.strip(strip_sense_chars) - append_to = find_similar_gloss(page_data, sense) elif isinstance(node, WikiNode): if node.kind == NodeKind.LIST_ITEM: not_term_indexes = set() filtered_children = list(node.filter_empty_str_child()) - linkage_data = defaultdict(list) + linkage_data = Linkage() for index, item_child in enumerate(filtered_children): if ( isinstance(item_child, WikiNode) @@ -50,13 +46,12 @@ def extract_linkages( sense = clean_node(wxr, None, item_child).strip( strip_sense_chars ) - append_to = find_similar_gloss(page_data, sense) if index == len(filtered_children) - 1: # sense template before entry list - return sense, append_to + return sense elif template_name in {"qualifier", "qual"}: not_term_indexes.add(index) - linkage_data["tags"].append( + linkage_data.tags.append( clean_node(wxr, None, item_child).strip("()") ) elif template_name.lower() in DESCENDANT_TEMPLATES: @@ -87,81 +82,67 @@ def extract_linkages( roman, terms = capture_text_in_parentheses(terms) roman = roman[0] if len(roman) > 0 else None if roman is not None: - linkage_data["roman"] = roman + linkage_data.roman = roman if len(sense) > 0: - linkage_data["sense"] = sense + linkage_data.sense = sense for term in terms.split("、"): for variant_type, variant_term in split_chinese_variants( term ): - final_linkage_data = deepcopy(linkage_data) - final_linkage_data["word"] = variant_term + final_linkage_data = linkage_data.model_copy(deep=True) + final_linkage_data.word = variant_term if variant_type is not None: - final_linkage_data[ - "language_variant" - ] = variant_type - if len(final_linkage_data["word"]) > 0: - append_to[linkage_type].append(final_linkage_data) + final_linkage_data.language_variant = variant_type + if len(final_linkage_data.word) > 0: + pre_data = getattr(page_data[-1], linkage_type) + pre_data.append(final_linkage_data) elif node.kind == NodeKind.TEMPLATE: template_name = node.template_name.lower() if template_name in sense_template_names: sense = clean_node(wxr, None, node).strip(strip_sense_chars) elif template_name.endswith("-saurus"): extract_saurus_template( - wxr, node, page_data, linkage_type, sense, append_to + wxr, node, page_data, linkage_type, sense ) elif template_name == "zh-dial": extract_zh_dial_template( - wxr, node, linkage_type, sense, append_to + wxr, page_data, node, linkage_type, sense ) else: expanded_node = wxr.wtp.parse( wxr.wtp.node_to_wikitext(node), expand_all=True ) extract_linkages( - wxr, - page_data, - [expanded_node], - linkage_type, - sense, - append_to, + wxr, page_data, [expanded_node], linkage_type, sense ) elif node.kind in LEVEL_KINDS: from .page import parse_section - base_data = defaultdict( - list, - { - "lang_name": page_data[-1].get("lang_name"), - "lang_code": page_data[-1].get("lang_code"), - "word": wxr.wtp.title, - }, + base_data = WordEntry( + lang_code=page_data[-1].lang_code, + lang_name=page_data[-1].lang_name, + word=page_data[-1].word, ) parse_section(wxr, page_data, base_data, node) elif len(node.children) > 0: - returned_values = extract_linkages( + returned_sense = extract_linkages( wxr, page_data, node.children, linkage_type, sense, - append_to, ) - if returned_values is not None: - returned_sense, returned_append_target = returned_values - if len(returned_sense) > 0: - sense = returned_sense - append_to = returned_append_target + if returned_sense is not None: + sense = returned_sense return None def 
extract_saurus_template( wxr: WiktextractContext, node: WikiNode, - page_data: Dict, + page_data: list[WordEntry], linkage_type: str, sense: str, - append_to: Dict, ) -> None: """ Extract data from template names end with "-saurus", like "zh-syn-saurus" @@ -174,49 +155,52 @@ def extract_saurus_template( for thesaurus in search_thesaurus( wxr.thesaurus_db_conn, thesaurus_page_title, - page_data[-1].get("lang_code"), - page_data[-1].get("pos"), + page_data[-1].lang_code, + page_data[-1].pos, linkage_type, ): if thesaurus.term == wxr.wtp.title: continue - linkage_data = {"word": thesaurus.term} + linkage_data = Linkage(word=thesaurus.term) if thesaurus.roman is not None: - linkage_data["roman"] = thesaurus.roman + linkage_data.roman = thesaurus.roman if thesaurus.tags is not None: - linkage_data["tags"] = thesaurus.tags.split("|") + linkage_data.tags = thesaurus.tags.split("|") if thesaurus.language_variant is not None: - linkage_data["language_variant"] = thesaurus.language_variant + linkage_data.language_variant = thesaurus.language_variant if len(sense) > 0: - linkage_data["sense"] = sense + linkage_data.sense = sense elif thesaurus.sense is not None: - linkage_data["sense"] = thesaurus.sense - append_to[linkage_type].append(linkage_data) + linkage_data.sense = thesaurus.sense + + pre_data = getattr(page_data[-1], linkage_type) + pre_data.append(linkage_data) def extract_zh_dial_template( wxr: WiktextractContext, + page_data: list[WordEntry], node: Union[WikiNode, str], linkage_type: str, sense: str, - append_to: Dict, ) -> None: dial_data = {} node = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True) extract_zh_dial_recursively(wxr, node, dial_data, None) for term, tags in dial_data.items(): - linkage_data = {"word": term} + linkage_data = Linkage(word=term) if len(sense) > 0: - linkage_data["sense"] = sense + linkage_data.sense = sense if len(tags) > 0: - linkage_data["tags"] = tags - append_to[linkage_type].append(linkage_data) + linkage_data.tags = tags + pre_data = getattr(page_data[-1], linkage_type) + pre_data.append(linkage_data) def extract_zh_dial_recursively( wxr: WiktextractContext, node: Union[WikiNode, str], - dial_data: Dict[str, List[str]], + dial_data: dict[str, list[str]], header_lang: Optional[str], ) -> str: if isinstance(node, WikiNode) and node.kind == NodeKind.TABLE_ROW: @@ -261,7 +245,7 @@ def extract_zh_dial_recursively( def process_ja_r_template( wxr: WiktextractContext, - page_data: Dict[str, Any], + page_data: list[WordEntry], template_node: TemplateNode, linkage_type: str, sense: str, @@ -270,17 +254,15 @@ def process_ja_r_template( expanded_node = wxr.wtp.parse( wxr.wtp.node_to_wikitext(template_node), expand_all=True ) - linkage_data = defaultdict(list) - if len(sense) > 0: - linkage_data["sense"] = sense + linkage_data = Linkage(sense=sense) for span_node in expanded_node.find_html("span"): if "lang" in span_node.attrs: ruby_data, no_ruby_nodes = extract_ruby(wxr, span_node) - linkage_data["word"] = clean_node(wxr, None, no_ruby_nodes) - if len(ruby_data) > 0: - linkage_data["ruby"] = ruby_data + linkage_data.word = clean_node(wxr, None, no_ruby_nodes) + linkage_data.ruby = ruby_data elif "tr" in span_node.attrs.get("class", ""): - linkage_data["roman"] = clean_node(wxr, None, span_node) + linkage_data.roman = clean_node(wxr, None, span_node) - if len(linkage_data.get("word", "")) > 0: - page_data[-1][linkage_type].append(linkage_data) + if len(linkage_data.word) > 0: + pre_data = getattr(page_data[-1], linkage_type) + 
pre_data.append(linkage_data)
diff --git a/src/wiktextract/extractor/zh/models.py b/src/wiktextract/extractor/zh/models.py
new file mode 100644
index 00000000..b9b665c5
--- /dev/null
+++ b/src/wiktextract/extractor/zh/models.py
@@ -0,0 +1,127 @@
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class ChineseBaseModel(BaseModel):
+ model_config = ConfigDict(
+ extra="ignore",
+ strict=True,
+ validate_assignment=True,
+ validate_default=True,
+ )
+
+
+class Example(ChineseBaseModel):
+ texts: list[str] = Field(
+ [],
+ description="Example usage sentences, some might have both "
+ "Simplified and Traditional Chinese forms",
+ )
+ translation: str = Field(
+ "", description="Chinese translation of the example sentence"
+ )
+ roman: str = Field("", description="Romanization of the example sentence")
+ ref: str = Field(
+ "",
+ description="Source of the sentence, like book title and page number",
+ )
+ ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+
+
+class Sense(ChineseBaseModel):
+ glosses: list[str] = []
+ raw_glosses: list[str] = Field([], description="Gloss text with tags")
+ tags: list[str] = []
+ categories: list[str] = []
+ examples: list[Example] = []
+ ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+
+
+class Form(ChineseBaseModel):
+ form: str = ""
+ tags: list[str] = []
+ source: str = ""
+ ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+ hiragana: str = ""
+ roman: str = ""
+
+
+class Sound(ChineseBaseModel):
+ zh_pron: str = Field("", description="Chinese word pronunciation")
+ ipa: str = Field("", description="International Phonetic Alphabet")
+ audio: str = Field("", description="Audio file name")
+ wav_url: str = ""
+ oga_url: str = ""
+ ogg_url: str = ""
+ mp3_url: str = ""
+ opus_url: str = ""
+ tags: list[str] = []
+ homophone: str = ""
+
+
+class Translation(ChineseBaseModel):
+ lang_code: str = Field(
+ "", description="Wiktionary language code of the translation term"
+ )
+ lang_name: str = Field("", description="Translation language name")
+ word: str = Field("", description="Translation term")
+ sense: str = Field("", description="Translation gloss")
+ tags: list[str] = []
+ roman: str = ""
+
+
+class Linkage(ChineseBaseModel):
+ word: str = ""
+ tags: list[str] = []
+ roman: str = ""
+ sense: str = ""
+ language_variant: Literal["", "zh-Hant", "zh-Hans"] = Field(
+ "", description="Chinese character variant"
+ )
+ ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+
+
+class Descendant(ChineseBaseModel):
+ lang_code: str = Field("", description="Wiktionary language code")
+ lang_name: str = Field("", description="Language name")
+ word: str = ""
+ roman: str = ""
+ tags: list[str] = []
+ descendants: list["Descendant"] = []
+ ruby: list[list[str]] = Field([], description="Japanese Kanji and furigana")
+
+
+class WordEntry(ChineseBaseModel):
+ model_config = ConfigDict(title="Chinese Wiktionary")
+
+ word: str = Field(description="Word string")
+ lang_code: str = Field(description="Wiktionary language code")
+ lang_name: str = Field(description="Localized language name")
+ pos: str = Field("", description="Part of speech type")
+ etymology_text: str = ""
+ senses: list[Sense] = Field([], description="Sense list")
+ forms: list[Form] = Field([], description="Inflection forms list")
+ sounds: list[Sound] = []
+ translations: list[Translation] = []
+ synonyms: list[Linkage] = []
+ hyponyms:
list[Linkage] = [] + hypernyms: list[Linkage] = [] + holonyms: list[Linkage] = [] + meronyms: list[Linkage] = [] + derived: list[Linkage] = [] + troponyms: list[Linkage] = [] + paronyms: list[Linkage] = [] + related: list[Linkage] = [] + abbreviation: list[Linkage] = [] + proverbs: list[Linkage] = [] + antonyms: list[Linkage] = [] + coordinate_terms: list[Linkage] = [] + various: list[Linkage] = [] + compounds: list[Linkage] = [] + title: str = Field("", description="Redirect page source title") + redirect: str = Field("", description="Redirect page target title") + categories: list[str] = [] + notes: list[str] = [] + tags: list[str] = [] + descendants: list[Descendant] = [] diff --git a/src/wiktextract/extractor/zh/note.py b/src/wiktextract/extractor/zh/note.py index 0b70ad02..10411591 100644 --- a/src/wiktextract/extractor/zh/note.py +++ b/src/wiktextract/extractor/zh/note.py @@ -1,21 +1,21 @@ -from typing import Any, Dict, List - from wikitextprocessor import NodeKind, WikiNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext +from .models import WordEntry + def extract_note( wxr: WiktextractContext, - page_data: List[Dict[str, Any]], + page_data: list[WordEntry], level_node: WikiNode, ) -> None: for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM): - page_data[-1]["notes"].append( + page_data[-1].notes.append( clean_node(wxr, page_data[-1], list_item.children) ) if not level_node.contain_node(NodeKind.LIST): - page_data[-1]["notes"].append( + page_data[-1].notes.append( clean_node(wxr, page_data[-1], level_node.children) ) diff --git a/src/wiktextract/extractor/zh/page.py b/src/wiktextract/extractor/zh/page.py index 95d84b7b..6c97de94 100644 --- a/src/wiktextract/extractor/zh/page.py +++ b/src/wiktextract/extractor/zh/page.py @@ -1,12 +1,9 @@ -import copy import logging import re -from collections import defaultdict -from typing import Dict, List, Union +from typing import Any, Union from mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, WikiNode -from wiktextract.datautils import append_base_data from wiktextract.page import LEVEL_KINDS, clean_node from wiktextract.wxr_context import WiktextractContext @@ -15,9 +12,11 @@ from .headword_line import extract_headword_line from .inflection import extract_inflections from .linkage import extract_linkages +from .models import Sense, WordEntry from .note import extract_note from .pronunciation import extract_pronunciation_recursively from .translation import extract_translation +from .util import append_base_data # Templates that are used to form panels on pages and that # should be ignored in various positions @@ -29,42 +28,42 @@ PANEL_PREFIXES = {} # Additional templates to be expanded in the pre-expand phase -ADDITIONAL_EXPAND_TEMPLATES = { - "multitrans", - "multitrans-nowiki", - "checktrans-top", - "checktrans-bottom", - "col1", - "col2", - "col3", - "col4", - "col5", - "col1-u", - "col2-u", - "col3-u", - "col4-u", - "col5-u", - "check deprecated lang param usage", - "deprecated code", - "ru-verb-alt-ё", - "ru-noun-alt-ё", - "ru-adj-alt-ё", - "ru-proper noun-alt-ё", - "ru-pos-alt-ё", - "ru-alt-ё", - # langhd is needed for pre-expanding language heading templates in the - # Chinese Wiktionary dump file: https://zh.wiktionary.org/wiki/Template:-en- - "langhd", - "zh-der", # col3 for Chinese - "der3", # redirects to col3 -} +ADDITIONAL_EXPAND_TEMPLATES = frozenset( + { + "multitrans", + "multitrans-nowiki", + "col1", + "col2", + "col3", + "col4", 
+ "col5", + "col1-u", + "col2-u", + "col3-u", + "col4-u", + "col5-u", + "check deprecated lang param usage", + "deprecated code", + "ru-verb-alt-ё", + "ru-noun-alt-ё", + "ru-adj-alt-ё", + "ru-proper noun-alt-ё", + "ru-pos-alt-ё", + "ru-alt-ё", + # langhd is needed for pre-expanding language heading templates: + # https://zh.wiktionary.org/wiki/Template:-en- + "langhd", + "zh-der", # col3 for Chinese + "der3", # redirects to col3 + } +) def parse_section( wxr: WiktextractContext, - page_data: List[Dict], - base_data: Dict, - node: Union[WikiNode, List[Union[WikiNode, str]]], + page_data: list[WordEntry], + base_data: WordEntry, + node: Union[WikiNode, list[Union[WikiNode, str]]], ) -> None: if isinstance(node, list): for x in node: @@ -100,7 +99,6 @@ def parse_section( node.children, wxr.config.LINKAGE_SUBTITLES[subtitle], "", - page_data[-1], ) elif ( wxr.config.capture_translations @@ -128,21 +126,22 @@ def parse_section( def process_pos_block( wxr: WiktextractContext, - page_data: List[Dict], - base_data: Dict, + page_data: list[WordEntry], + base_data: WordEntry, node: WikiNode, pos_text: str, ): pos_type = wxr.config.POS_SUBTITLES[pos_text]["pos"] - base_data["pos"] = pos_type + base_data.pos = pos_type append_base_data(page_data, "pos", pos_type, base_data) for index, child in enumerate(node.filter_empty_str_child()): if isinstance(child, WikiNode): if index == 0 and child.kind == NodeKind.TEMPLATE: - lang_code = base_data.get("lang_code") - extract_headword_line(wxr, page_data, child, lang_code) + extract_headword_line( + wxr, page_data, child, base_data.lang_code + ) elif child.kind == NodeKind.LIST: - extract_gloss(wxr, page_data, child, defaultdict(list)) + extract_gloss(wxr, page_data, child, Sense()) elif child.kind in LEVEL_KINDS: parse_section(wxr, page_data, base_data, child) else: @@ -151,9 +150,9 @@ def process_pos_block( def extract_etymology( wxr: WiktextractContext, - page_data: List[Dict], - base_data: Dict, - nodes: List[Union[WikiNode, str]], + page_data: list[WordEntry], + base_data: WordEntry, + nodes: list[Union[WikiNode, str]], ) -> None: level_node_index = -1 for index, node in enumerate(nodes): @@ -165,7 +164,7 @@ def extract_etymology( else: etymology = clean_node(wxr, page_data[-1], nodes) if len(etymology) > 0: - base_data["etymology_text"] = etymology + base_data.etymology_text = etymology append_base_data(page_data, "etymology_text", etymology, base_data) if level_node_index != -1: parse_section(wxr, page_data, base_data, nodes[level_node_index:]) @@ -173,11 +172,11 @@ def extract_etymology( def extract_pronunciation( wxr: WiktextractContext, - page_data: List[Dict], - base_data: Dict, - nodes: List[Union[WikiNode, str]], + page_data: list[WordEntry], + base_data: WordEntry, + nodes: list[Union[WikiNode, str]], ) -> None: - lang_code = base_data.get("lang_code") + lang_code = base_data.lang_code for index, node in enumerate(nodes): if isinstance(node, WikiNode): if node.kind in LEVEL_KINDS: @@ -194,7 +193,7 @@ def extract_pronunciation( def parse_page( wxr: WiktextractContext, page_title: str, page_text: str -) -> List[Dict[str, str]]: +) -> list[dict[str, Any]]: if wxr.config.verbose: logging.info(f"Parsing page: {page_title}") @@ -211,31 +210,25 @@ def parse_page( page_data = [] for level2_node in tree.find_child(NodeKind.LEVEL2): - categories_and_links = defaultdict(list) - lang_name = clean_node(wxr, categories_and_links, level2_node.largs) - if name_to_code(lang_name, "zh") == "": + categories = {} + lang_name = clean_node(wxr, categories, 
level2_node.largs) + lang_code = name_to_code(lang_name, "zh") + if lang_code == "": wxr.wtp.warning( f"Unrecognized language name: {lang_name}", sortid="extractor/zh/page/parse_page/509", ) - lang_code = name_to_code(lang_name, "zh") if ( wxr.config.capture_language_codes is not None and lang_code not in wxr.config.capture_language_codes ): continue wxr.wtp.start_section(lang_name) - - base_data = defaultdict( - list, - { - "lang_name": lang_name, - "lang_code": lang_code, - "word": wxr.wtp.title, - }, + base_data = WordEntry( + word=wxr.wtp.title, lang_code=lang_code, lang_name=lang_name ) - base_data.update(categories_and_links) - page_data.append(copy.deepcopy(base_data)) + base_data.categories = categories.get("categories", []) + page_data.append(base_data.model_copy(deep=True)) parse_section(wxr, page_data, base_data, level2_node.children) - return page_data + return [d.model_dump(exclude_defaults=True) for d in page_data] diff --git a/src/wiktextract/extractor/zh/pronunciation.py b/src/wiktextract/extractor/zh/pronunciation.py index c9e9014b..724f9a26 100644 --- a/src/wiktextract/extractor/zh/pronunciation.py +++ b/src/wiktextract/extractor/zh/pronunciation.py @@ -1,21 +1,23 @@ import re -from typing import Any, Dict, List, Optional, Union +from typing import Optional, Union from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import HTMLNode, TemplateNode -from wiktextract.datautils import append_base_data from wiktextract.extractor.share import create_audio_url_dict from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext +from .models import Sound, WordEntry +from .util import append_base_data + def extract_pronunciation_recursively( wxr: WiktextractContext, - page_data: List[Dict[str, Any]], - base_data: Dict[str, Any], + page_data: list[WordEntry], + base_data: WordEntry, lang_code: str, - node: Union[WikiNode, List[Union[WikiNode, str]]], - tags: List[str], + node: Union[WikiNode, list[Union[WikiNode, str]]], + tags: list[str], ) -> None: if isinstance(node, list): for x in node: @@ -39,20 +41,25 @@ def extract_pronunciation_recursively( # audio file usually after Pinyin # add back to previous Pinyin dictionary if it doesn't have # audio file data and they are sibling nodes(similar tags). 
- last_sounds_list = page_data[-1].get("sounds", []) + last_sounds_list = page_data[-1].sounds for index in range(len(last_sounds_list)): - if last_sounds_list[index].get("audio") is None and ( - tags == last_sounds_list[index].get("tags", [])[:-1] + if last_sounds_list[index].audio == "" and ( + tags == last_sounds_list[index].tags[:-1] or lang_code != "zh" ): - page_data[-1].get("sounds")[index].update( - create_audio_url_dict(data) - ) - elif isinstance(data, dict): + for key, value in create_audio_url_dict(data).items(): + if key in Sound.model_fields: + setattr(page_data[-1].sounds[index], key, value) + else: + wxr.wtp.warning( + f"{key=} not defined in Sound", + sortid="zh.pronunciation/56", + ) + elif isinstance(data, Sound): append_base_data( page_data, "sounds", - [data], + data, base_data, ) # list children could contain audio file @@ -62,7 +69,7 @@ def extract_pronunciation_recursively( base_data, lang_code, rest_children, - data.get("tags")[:-1], + data.tags[:-1], ) elif isinstance(data, list): # list item is a tag @@ -81,8 +88,8 @@ def extract_pronunciation_recursively( def combine_pronunciation_tags( - old_tags: List[str], new_tags: List[str] -) -> List[str]: + old_tags: list[str], new_tags: list[str] +) -> list[str]: combined_tags = old_tags[:] old_tags_set = set(old_tags) for tag in new_tags: @@ -91,7 +98,7 @@ def combine_pronunciation_tags( return combined_tags -def split_pronunciation_tags(text: str) -> List[str]: +def split_pronunciation_tags(text: str) -> list[str]: return list( filter( None, @@ -107,11 +114,11 @@ def split_pronunciation_tags(text: str) -> List[str]: def extract_pronunciation_item( wxr: WiktextractContext, - page_data: List[Dict], + page_data: list[WordEntry], lang_code: str, - node_children: List[WikiNode], - tags: List[str], -) -> Optional[Union[Dict[str, Any], str, List[str]]]: + node_children: list[WikiNode], + tags: list[str], +) -> Optional[Union[Sound, str, list[str]]]: """ Return audio file name(eg. 
"File:LL-Q1860 (eng)-Vealhurl-manga.wav") string or a dictionary contains IPA and tags @@ -138,9 +145,9 @@ def extract_pronunciation_item( tags, split_pronunciation_tags(sound_tags_text) ) if len(ipa) > 0: - data = {"tags": new_tags} - ipa_key = "zh-pron" if lang_code == "zh" else "ipa" - data[ipa_key] = ipa[0].strip() + data = Sound(tags=new_tags) + ipa_key = "zh_pron" if lang_code == "zh" else "ipa" + setattr(data, ipa_key, ipa[0].strip()) return data for child in filter( @@ -155,9 +162,9 @@ def extract_pronunciation_item( def process_homophone_data( wxr: WiktextractContext, - page_data: List[Dict], - node_children: List[WikiNode], - tags: List[str], + page_data: list[WordEntry], + node_children: list[WikiNode], + tags: list[str], ) -> None: # Process the collapsible homophone table created from "zh-pron" template # and the "homophones" template @@ -167,11 +174,10 @@ def process_homophone_data( for span_node in node.find_html_recursively( "span", attr_name="lang" ): - sound_data = { - "homophone": clean_node(wxr, None, span_node), - "tags": tags, - } - page_data[-1]["sounds"].append(sound_data) + sound_data = Sound( + homophone=clean_node(wxr, None, span_node), tags=tags + ) + page_data[-1].sounds.append(sound_data) elif ( isinstance(node, TemplateNode) and node.template_name == "homophones" @@ -182,8 +188,7 @@ def process_homophone_data( for span_node in expaned_template.find_html_recursively( "span", attr_name="lang" ): - sound_data = { - "homophone": clean_node(wxr, None, span_node), - "tags": tags, - } - page_data[-1]["sounds"].append(sound_data) + sound_data = Sound( + homophone=clean_node(wxr, None, span_node), tags=tags + ) + page_data[-1].sounds.append(sound_data) diff --git a/src/wiktextract/extractor/zh/thesaurus.py b/src/wiktextract/extractor/zh/thesaurus.py index 3f466b96..6c2d89e5 100644 --- a/src/wiktextract/extractor/zh/thesaurus.py +++ b/src/wiktextract/extractor/zh/thesaurus.py @@ -1,11 +1,12 @@ import logging import re -from typing import List, Optional, Union +from typing import Optional, Union from mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, Page, WikiNode from ...page import clean_node +from ...thesaurus import ThesaurusTerm from ...wxr_context import WiktextractContext from ..share import capture_text_in_parentheses, split_chinese_variants @@ -26,9 +27,7 @@ def parse_ja_thesaurus_term( sense: Optional[str], linkage: Optional[str], term_str: str, -) -> List["ThesaurusTerm"]: - from wiktextract.thesaurus import ThesaurusTerm - +) -> list[ThesaurusTerm]: tags = None roman = None if term_str.startswith("("): # has qualifier @@ -40,8 +39,8 @@ def parse_ja_thesaurus_term( for term_str in term_str.split("、"): # Example term_str from https://zh.wiktionary.org/wiki/Thesaurus:死ぬ # Fromat: (qualifer) term (roman, gloss) - # 'この世(よ)を去(さ)る (kono yo o saru, 字面意思為“to leave this world”)' - # '若死(わかじ)にする (wakajini suru, “还年轻时死去”)' + # この世(よ)を去(さ)る (kono yo o saru, 字面意思為“to leave this world”) + # 若死(わかじ)にする (wakajini suru, “还年轻时死去”) term_end = term_str.find(" (") term = term_str[:term_end] roman_and_gloss = term_str[term_end + 2 :].removesuffix(")").split(", ") @@ -70,9 +69,7 @@ def parse_zh_thesaurus_term( sense: Optional[str], linkage: Optional[str], term_str: str, -) -> List["ThesaurusTerm"]: - from wiktextract.thesaurus import ThesaurusTerm - +) -> list[ThesaurusTerm]: # Example term_str from https://zh.wiktionary.org/wiki/Thesaurus:安置 # Fromat: traditional/simplified (pinyin) (tags) # 施設/施设 (shīshè) (書面) @@ -112,9 +109,7 @@ def 
parse_thesaurus_term( sense: Optional[str], linkage: Optional[str], node: WikiNode, -) -> List["ThesaurusTerm"]: - from wiktextract.thesaurus import ThesaurusTerm - +) -> list[ThesaurusTerm]: node_str = clean_node(wxr, None, node) node_str = node_str.removeprefix("* ") # remove list wikitext @@ -146,8 +141,8 @@ def recursive_parse( pos: Optional[str], sense: Optional[str], linkage: Optional[str], - node: Union[WikiNode, List[Union[WikiNode, str]]], -) -> Optional[List["ThesaurusTerm"]]: + node: Union[WikiNode, list[Union[WikiNode, str]]], +) -> Optional[list[ThesaurusTerm]]: if isinstance(node, list): thesaurus = [] for x in node: @@ -225,7 +220,7 @@ def recursive_parse( def extract_thesaurus_page( wxr: WiktextractContext, page: Page -) -> Optional[List["ThesaurusTerm"]]: +) -> Optional[list[ThesaurusTerm]]: entry = page.title[page.title.find(":") + 1 :] wxr.wtp.start_page(page.title) root = wxr.wtp.parse(page.body, additional_expand={"ws", "zh-syn-list"}) diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py index e11fec28..a9086e59 100644 --- a/src/wiktextract/extractor/zh/translation.py +++ b/src/wiktextract/extractor/zh/translation.py @@ -1,21 +1,19 @@ import re -from collections import defaultdict -from typing import Dict, List, Optional, Union +from typing import Optional, Union from mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, WikiNode -from wiktextract.datautils import find_similar_gloss from wiktextract.page import LEVEL_KINDS, clean_node from wiktextract.wxr_context import WiktextractContext from ..share import capture_text_in_parentheses +from .models import Translation, WordEntry def extract_translation( - wxr: WiktextractContext, page_data: List[Dict], node: WikiNode + wxr: WiktextractContext, page_data: list[WordEntry], node: WikiNode ) -> None: sense_text = "" - append_to = page_data[-1] for child in node.children: if isinstance(child, WikiNode): if child.kind == NodeKind.TEMPLATE: @@ -27,7 +25,6 @@ def extract_translation( sense_text = clean_node( wxr, None, child.template_parameters.get(1) ) - append_to = find_similar_gloss(page_data, sense_text) elif template_name == "checktrans-top": return elif template_name == "see translation subpage": @@ -42,7 +39,6 @@ def extract_translation( page_data, clean_node(wxr, None, list_item_node.children), sense_text, - append_to, ) else: nested_list_index = 0 @@ -65,7 +61,6 @@ def extract_translation( list_item_node.children[:nested_list_index], ), sense_text, - append_to, ) for nested_list_node in list_item_node.find_child( NodeKind.LIST @@ -80,16 +75,14 @@ def extract_translation( wxr, None, nested_list_item.children ), sense_text, - append_to, ) def process_translation_list_item( wxr: WiktextractContext, - page_data: List[Dict], + page_data: list[WordEntry], expanded_text: str, sense: str, - append_to: Dict, ) -> None: from .headword_line import GENDERS @@ -107,38 +100,33 @@ def process_translation_list_item( for word_and_tags in re.split(r"[,;、](?![^(]*\))\s*", words_text): tags, word = capture_text_in_parentheses(word_and_tags) tags = [tag for tag in tags if tag != lang_code] # rm Wiktionary link - translation_data = defaultdict( - list, - { - "lang_code": lang_code, - "lang_name": lang_text, - "word": word, - }, + translation_data = Translation( + lang_code=lang_code, lang_name=lang_text, word=word ) tags_without_roman = [] for tag in tags: if re.search(r"[a-z]", tag): - translation_data["roman"] = tag + translation_data.roman = tag else: 
tags_without_roman.append(tag) if len(tags_without_roman) > 0: - translation_data["tags"] = tags_without_roman + translation_data.tags = tags_without_roman gender = word.split(" ")[-1] if gender in GENDERS: - translation_data["word"] = word.removesuffix(f" {gender}") - translation_data["tags"].append(GENDERS.get(gender)) + translation_data.word = word.removesuffix(f" {gender}") + translation_data.tags.append(GENDERS.get(gender)) if len(sense) > 0: - translation_data["sense"] = sense - append_to["translations"].append(translation_data) + translation_data.sense = sense + page_data[-1].translations.append(translation_data) def translation_subpage( wxr: WiktextractContext, - page_data: List[Dict], - template_args: Dict[str, str], + page_data: list[WordEntry], + template_args: dict[str, str], ) -> None: from .page import ADDITIONAL_EXPAND_TEMPLATES @@ -174,7 +162,7 @@ def translation_subpage( def find_subpage_section( wxr: WiktextractContext, node: Union[WikiNode, str], - target_section: Union[str, List[str]], + target_section: Union[str, list[str]], ) -> Optional[WikiNode]: if isinstance(node, WikiNode): if node.kind in LEVEL_KINDS: diff --git a/src/wiktextract/extractor/zh/util.py b/src/wiktextract/extractor/zh/util.py new file mode 100644 index 00000000..0cb1ae79 --- /dev/null +++ b/src/wiktextract/extractor/zh/util.py @@ -0,0 +1,26 @@ +from typing import Any + +from .models import WordEntry + + +def append_base_data( + page_data: list[WordEntry], field: str, value: Any, base_data: WordEntry +) -> None: + """ + Chinese Wiktionary's POS sections could under other sections or at the same + level of other sections. This function is to decide whether append a new + WordEntry data. + """ + if len(page_data) == 0 or ( + len(getattr(page_data[-1], field)) > 0 and len(page_data[-1].senses) > 0 + ): + # Append new entry if last data has same field and also has gloss data + page_data.append(base_data.model_copy(deep=True)) + + # Don't append new WordEntry if POS section is not processed + # Example page "kirin", "北庫爾德語" section + pre_data = getattr(page_data[-1], field) + if isinstance(pre_data, list): + pre_data.append(value) + else: + setattr(page_data[-1], field, value) diff --git a/src/wiktextract/page.py b/src/wiktextract/page.py index 590552b1..1e525364 100644 --- a/src/wiktextract/page.py +++ b/src/wiktextract/page.py @@ -5,7 +5,7 @@ import re from collections import defaultdict from copy import copy -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union from mediawiki_langcodes import get_all_names, name_to_code from wikitextprocessor import NodeKind, WikiNode @@ -28,7 +28,7 @@ def parse_page( wxr: WiktextractContext, page_title: str, page_text: str -) -> List[Dict[str, str]]: +) -> list[dict[str, Any]]: """Parses the text of a Wiktionary page and returns a list of dictionaries, one for each word/part-of-speech defined on the page for the languages specified by ``capture_language_codes`` (None means @@ -56,9 +56,9 @@ def is_panel_template(wxr: WiktextractContext, template_name: str) -> bool: def recursively_extract( - contents: Union[WikiNode, List[WikiNode]], - fn: Callable[[Union[WikiNode, List[WikiNode]]], bool], -) -> Tuple[List[WikiNode], List[WikiNode]]: + contents: Union[WikiNode, list[WikiNode]], + fn: Callable[[Union[WikiNode, list[WikiNode]]], bool], +) -> tuple[list[WikiNode], list[WikiNode]]: """Recursively extracts elements from contents for which ``fn`` returns True. 
This returns two lists, the extracted elements and the remaining content (with the extracted elements removed at each level). Only @@ -146,7 +146,7 @@ def recursively_extract( return extracted, new_contents -def inject_linkages(wxr: WiktextractContext, page_data: List[Dict]) -> None: +def inject_linkages(wxr: WiktextractContext, page_data: list[dict]) -> None: # Inject linkages from thesaurus entries from .thesaurus import search_thesaurus @@ -183,7 +183,7 @@ def inject_linkages(wxr: WiktextractContext, page_data: List[Dict]) -> None: data_append(data, term.linkage, dt) -def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None: +def process_categories(wxr: WiktextractContext, page_data: list[dict]) -> None: # Categories are not otherwise disambiguated, but if there is only # one sense and only one data in ret for the same language, move # categories to the only sense. Note that categories are commonly @@ -275,7 +275,7 @@ def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None: data["categories"] = new_cats -def remove_duplicate_data(page_data: Dict) -> None: +def remove_duplicate_data(page_data: dict) -> None: # Remove duplicates from tags, categories, etc. for data in page_data: for field in ("categories", "topics", "tags", "wikidata", "wikipedia"): @@ -310,10 +310,10 @@ def remove_duplicate_data(page_data: Dict) -> None: def clean_node( wxr: WiktextractContext, - sense_data: Optional[Dict], - wikinode: Union[str, WikiNode, List[Union[str, WikiNode, List]]], - template_fn: Optional[Callable[[str, Dict], str]] = None, - post_template_fn: Optional[Callable[[str, Dict, str], str]] = None, + sense_data: Optional[Any], + wikinode: Union[str, WikiNode, list[Union[str, WikiNode]]], + template_fn: Optional[Callable[[str, dict], str]] = None, + post_template_fn: Optional[Callable[[str, dict, str], str]] = None, collect_links: bool = False, ) -> str: """ diff --git a/tests/test_zh_descendant.py b/tests/test_zh_descendant.py index dc63e8a4..a0a97639 100644 --- a/tests/test_zh_descendant.py +++ b/tests/test_zh_descendant.py @@ -1,9 +1,9 @@ -from collections import defaultdict from unittest import TestCase from unittest.mock import Mock from wikitextprocessor import Wtp from wiktextract.extractor.zh.descendant import extract_descendants +from wiktextract.extractor.zh.models import WordEntry from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext @@ -32,19 +32,17 @@ def test_ruby(self): '[[你好#日語|-{你好(ニイハオ)}-]] (nīhao)', ) root = self.wxr.wtp.parse("* {{desc|bor=1|ja|-}} {{ja-r|你好|ニイハオ}}") - page_data = defaultdict(list) + page_data = WordEntry(word="你好", lang_code="ja", lang_name="日語") extract_descendants(self.wxr, root, page_data) self.assertEqual( - page_data.get("descendants"), - [ - { - "lang_code": "ja", - "lang_name": "日語", - "roman": "nīhao", - "ruby": [("你好", "ニイハオ")], - "word": "你好", - } - ], + page_data.descendants[0].model_dump(exclude_defaults=True), + { + "lang_code": "ja", + "lang_name": "日語", + "roman": "nīhao", + "ruby": [["你好", "ニイハオ"]], + "word": "你好", + }, ) def test_roman_only_list(self): @@ -55,22 +53,21 @@ def test_roman_only_list(self): ' 壯語:[[mwngz ndei#壯語|-{mwngz ndei}-]] (仿譯)', ) root = self.wxr.wtp.parse("* {{desc|za|mwngz ndei|cal=1}}") - page_data = defaultdict(list) + page_data = WordEntry(word="你好", lang_code="zh", lang_name="漢語") extract_descendants(self.wxr, root, page_data) self.assertEqual( - page_data.get("descendants"), - [ - { - "lang_code": "za", - "lang_name": "壯語", - 
"tags": ["仿譯"], - "word": "mwngz ndei", - } - ], + page_data.descendants[0].model_dump(exclude_defaults=True), + { + "lang_code": "za", + "lang_name": "壯語", + "tags": ["仿譯"], + "word": "mwngz ndei", + }, ) def test_nested_list(self): # https://zh.wiktionary.org/wiki/オタク + self.maxDiff = None self.wxr.wtp.start_page("オタク") self.wxr.wtp.add_page( "Template:desc", @@ -87,30 +84,28 @@ def test_nested_list(self): *:* {{desc|cmn|-|der=1}} {{zh-l|宅男}} *:* {{desc|cmn|-|der=1}} {{zh-l|宅女}}""" ) - page_data = defaultdict(list) + page_data = WordEntry(word="オタク", lang_code="ja", lang_name="日語") extract_descendants(self.wxr, root, page_data) self.assertEqual( - page_data.get("descendants"), - [ - { - "descendants": [ - { - "lang_code": "cmn", - "lang_name": "官話", - "roman": "宅男", - "word": "宅男", - }, - { - "lang_code": "cmn", - "lang_name": "官話", - "roman": "宅女", - "word": "宅女", - }, - ], - "lang_code": "cmn", - "lang_name": "官話", - "roman": "御宅族", - "word": "御宅族", - } - ], + page_data.descendants[0].model_dump(exclude_defaults=True), + { + "descendants": [ + { + "lang_code": "cmn", + "lang_name": "官話", + "roman": "宅男", + "word": "宅男", + }, + { + "lang_code": "cmn", + "lang_name": "官話", + "roman": "宅女", + "word": "宅女", + }, + ], + "lang_code": "cmn", + "lang_name": "官話", + "roman": "御宅族", + "word": "御宅族", + }, ) diff --git a/tests/test_zh_example.py b/tests/test_zh_example.py index 0cb0f6d2..18448bb0 100644 --- a/tests/test_zh_example.py +++ b/tests/test_zh_example.py @@ -1,15 +1,15 @@ -import unittest -from collections import defaultdict +from unittest import TestCase from unittest.mock import patch from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig from wiktextract.extractor.zh.example import extract_examples +from wiktextract.extractor.zh.models import Sense from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext -class TestExample(unittest.TestCase): +class TestExample(TestCase): def setUp(self) -> None: self.wxr = WiktextractContext( Wtp(lang_code="zh"), WiktionaryConfig(dump_file_lang_code="zh") @@ -22,7 +22,7 @@ def tearDown(self) -> None: ) def test_example_list(self) -> None: - sense_data = defaultdict(list) + sense_data = Sense() wikitext = """ #* ref text #*: example text @@ -31,14 +31,11 @@ def test_example_list(self) -> None: node = self.wxr.wtp.parse(wikitext) extract_examples(self.wxr, sense_data, node) self.assertEqual( - sense_data.get("examples"), - [ - { - "ref": "ref text", - "text": "example text", - "type": "quotation", - }, - ], + sense_data.examples[0].model_dump(exclude_defaults=True), + { + "ref": "ref text", + "texts": ["example text"], + }, ) @patch( @@ -48,19 +45,16 @@ def test_example_list(self) -> None: translation text""", ) def test_quote_example(self, mock_clean_node) -> None: - sense_data = defaultdict(list) + sense_data = Sense() wikitext = "#* {{RQ:Schuster Hepaticae}}" self.wxr.wtp.start_page("test") node = self.wxr.wtp.parse(wikitext) extract_examples(self.wxr, sense_data, node) self.assertEqual( - sense_data.get("examples"), - [ - { - "ref": "ref text", - "text": "quote text", - "translation": "translation text", - "type": "quotation", - }, - ], + sense_data.examples[0].model_dump(exclude_defaults=True), + { + "ref": "ref text", + "texts": ["quote text"], + "translation": "translation text", + }, ) diff --git a/tests/test_zh_gloss.py b/tests/test_zh_gloss.py index ae8eb794..b44a70b6 100644 --- a/tests/test_zh_gloss.py +++ b/tests/test_zh_gloss.py @@ -1,15 +1,15 @@ -import unittest 
-from collections import defaultdict +from unittest import TestCase from unittest.mock import patch from wikitextprocessor import NodeKind, WikiNode, Wtp from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.zh.models import Sense, WordEntry from wiktextract.extractor.zh.page import extract_gloss, parse_section from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext -class TestExample(unittest.TestCase): +class TestExample(TestCase): def setUp(self) -> None: self.wxr = WiktextractContext( Wtp(lang_code="zh"), WiktionaryConfig(dump_file_lang_code="zh") @@ -23,13 +23,10 @@ def tearDown(self) -> None: def test_example_list(self) -> None: page_data = [ - defaultdict( - list, - { - "lang_name": "日語", - "lang_code": "ja", - "word": "可笑しい", - }, + WordEntry( + lang_name="日語", + lang_code="ja", + word="可笑しい", ) ] wikitext = """# [[好玩]]的: @@ -43,9 +40,9 @@ def test_example_list(self) -> None: ## [[很好]]的,[[卓越]]的""" self.wxr.wtp.start_page("test") node = self.wxr.wtp.parse(wikitext) - extract_gloss(self.wxr, page_data, node.children[0], {}) + extract_gloss(self.wxr, page_data, node.children[0], Sense()) self.assertEqual( - page_data[0]["senses"], + [s.model_dump(exclude_defaults=True) for s in page_data[0].senses], [ {"glosses": ["好玩的:", "有趣的,滑稽的,可笑的"]}, {"glosses": ["好玩的:", "奇怪的,不正常的"]}, @@ -81,7 +78,8 @@ def test_pos_title_number( mock_process_pos_block, ) -> None: node = WikiNode(NodeKind.LEVEL3, 0) - parse_section(self.wxr, [{}], {}, node) + base_data = WordEntry(word="", lang_code="", lang_name="") + parse_section(self.wxr, [base_data], base_data, node) mock_process_pos_block.assert_called() @patch("wiktextract.extractor.zh.page.process_pos_block") @@ -92,5 +90,6 @@ def test_pos_title_chinese_numeral( mock_process_pos_block, ) -> None: node = WikiNode(NodeKind.LEVEL3, 0) - parse_section(self.wxr, [{}], {}, node) + base_data = WordEntry(word="", lang_code="", lang_name="") + parse_section(self.wxr, [base_data], base_data, node) mock_process_pos_block.assert_called() diff --git a/tests/test_zh_headword.py b/tests/test_zh_headword.py index b1f31fdf..d651cadf 100644 --- a/tests/test_zh_headword.py +++ b/tests/test_zh_headword.py @@ -1,9 +1,9 @@ -from collections import defaultdict from unittest import TestCase from unittest.mock import Mock, patch from wikitextprocessor import Wtp from wiktextract.extractor.zh.headword_line import extract_headword_line +from wiktextract.extractor.zh.models import WordEntry from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext @@ -28,18 +28,21 @@ def test_english_headword(self, mock_node_to_wikitext) -> None: # expanded text: manga (可數 & 不可數,複數 manga 或 mangas) node = Mock() node.largs = [["en-noun"]] - page_data = [defaultdict(list)] + page_data = [WordEntry(word="manga", lang_code="en", lang_name="英語")] self.wxr.wtp.title = "manga" extract_headword_line(self.wxr, page_data, node, "en") self.assertEqual( - page_data, + [d.model_dump(exclude_defaults=True) for d in page_data], [ { + "word": "manga", + "lang_code": "en", + "lang_name": "英語", "forms": [ - {"form": "manga", "tags": ["plural"]}, - {"form": "mangas", "tags": ["plural"]}, + {"form": "manga", "tags": ["複數"]}, + {"form": "mangas", "tags": ["複數"]}, ], - "tags": ["countable", "uncountable"], + "tags": ["可數", "不可數"], } ], ) @@ -54,16 +57,19 @@ def test_headword_gender(self, mock_node_to_wikitext) -> None: # expanded text: manga m (複數 manga's,指小詞 mangaatje n) node = Mock() node.largs = 
[["nl-noun"]] - page_data = [defaultdict(list)] + page_data = [WordEntry(word="manga", lang_code="en", lang_name="英語")] self.wxr.wtp.title = "manga" extract_headword_line(self.wxr, page_data, node, "nl") self.assertEqual( - page_data, + [d.model_dump(exclude_defaults=True) for d in page_data], [ { + "word": "manga", + "lang_code": "en", + "lang_name": "英語", "forms": [ - {"form": "manga's", "tags": ["plural"]}, - {"form": "mangaatje", "tags": ["diminutive", "neuter"]}, + {"form": "manga's", "tags": ["複數"]}, + {"form": "mangaatje", "tags": ["指小詞", "neuter"]}, ], "tags": ["masculine"], } @@ -80,13 +86,18 @@ def test_headword_roman(self, mock_node_to_wikitext) -> None: # expanded text: -κρατίᾱς (-kratíās) f node = Mock() node.largs = [["head"]] - page_data = [defaultdict(list)] + page_data = [ + WordEntry(word="-κρατίας", lang_code="grc", lang_name="古希臘語") + ] self.wxr.wtp.title = "-κρατίας" extract_headword_line(self.wxr, page_data, node, "grc") self.assertEqual( - page_data, + [d.model_dump(exclude_defaults=True) for d in page_data], [ { + "word": "-κρατίας", + "lang_code": "grc", + "lang_name": "古希臘語", "forms": [ {"form": "-kratíās", "tags": ["romanization"]}, ], diff --git a/tests/test_zh_inflection.py b/tests/test_zh_inflection.py index d3464b7f..64230fed 100644 --- a/tests/test_zh_inflection.py +++ b/tests/test_zh_inflection.py @@ -1,15 +1,15 @@ -import unittest -from collections import defaultdict +from unittest import TestCase from unittest.mock import patch from wikitextprocessor import Page, Wtp from wiktextract.config import WiktionaryConfig from wiktextract.extractor.zh.inflection import extract_inflections +from wiktextract.extractor.zh.models import WordEntry from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext -class TestInflection(unittest.TestCase): +class TestInflection(TestCase): def setUp(self) -> None: self.wxr = WiktextractContext( Wtp(lang_code="zh"), WiktionaryConfig(dump_file_lang_code="zh") @@ -39,22 +39,13 @@ def tearDown(self) -> None: ), ) def test_ja_i_template(self, mock_get_page) -> None: - page_data = [ - defaultdict( - list, - { - "lang": "日語", - "lang_code": "ja", - "word": "可笑しい", - }, - ) - ] + page_data = [WordEntry(lang_name="日語", lang_code="ja", word="可笑しい")] wikitext = "{{ja-i|可笑し|おかし|okashi}}" self.wxr.wtp.start_page("可笑しい") node = self.wxr.wtp.parse(wikitext) extract_inflections(self.wxr, page_data, node) self.assertEqual( - page_data[0].get("forms"), + [d.model_dump(exclude_defaults=True) for d in page_data[0].forms], [ { "form": "可笑しかろ", diff --git a/tests/test_zh_linkage.py b/tests/test_zh_linkage.py index e736fb9a..d316d372 100644 --- a/tests/test_zh_linkage.py +++ b/tests/test_zh_linkage.py @@ -1,14 +1,14 @@ -import unittest -from collections import defaultdict +from unittest import TestCase from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig from wiktextract.extractor.zh.linkage import extract_linkages +from wiktextract.extractor.zh.models import Sense, WordEntry from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext -class TestLinkage(unittest.TestCase): +class TestLinkage(TestCase): def setUp(self): self.wxr = WiktextractContext( Wtp(lang_code="zh"), WiktionaryConfig(dump_file_lang_code="zh") @@ -22,12 +22,12 @@ def tearDown(self): def test_sense_term_list(self): page_data = [ - { - "lang": "跨語言", - "lang_code": "mul", - "word": "%", - "senses": [defaultdict(list, {"glosses": ["百分比"]})], - } + WordEntry( + 
lang_name="跨語言", + lang_code="mul", + word="%", + senses=[Sense(glosses=["百分比"])], + ) ] wikitext = "* {{sense|百分比}} {{l|mul|cU}}、[[centiuno]]" self.wxr.wtp.add_page("Template:Sense", 10, "{{{1}}}") @@ -35,11 +35,12 @@ def test_sense_term_list(self): self.wxr.wtp.db_conn.commit() self.wxr.wtp.start_page("%") node = self.wxr.wtp.parse(wikitext) - extract_linkages( - self.wxr, page_data, node.children, "synonyms", "", page_data[-1] - ) + extract_linkages(self.wxr, page_data, node.children, "synonyms", "") self.assertEqual( - page_data[0]["senses"][0].get("synonyms"), + [ + s.model_dump(exclude_defaults=True) + for s in page_data[0].synonyms + ], [ {"sense": "百分比", "word": "cU"}, {"sense": "百分比", "word": "centiuno"}, @@ -55,22 +56,14 @@ def test_ja_r_template(self): '[[家主#日語|-{()(ぬし)}-]] (yanushi)', ) node = self.wxr.wtp.parse("{{s|房東}}\n* {{ja-r|家%主|や%ぬし}}") - page_data = [defaultdict(list)] - extract_linkages( - self.wxr, page_data, node.children, "synonyms", "", page_data[-1] - ) + page_data = [WordEntry(word="大家", lang_code="zh", lang_name="漢語")] + extract_linkages(self.wxr, page_data, node.children, "synonyms", "") self.assertEqual( - page_data, - [ - { - "synonyms": [ - { - "roman": "yanushi", - "ruby": [("家", "や"), ("主", "ぬし")], - "sense": "房東", - "word": "家主", - } - ] - } - ], + page_data[0].synonyms[0].model_dump(exclude_defaults=True), + { + "roman": "yanushi", + "ruby": [["家", "や"], ["主", "ぬし"]], + "sense": "房東", + "word": "家主", + }, ) diff --git a/tests/test_zh_note.py b/tests/test_zh_note.py index 04a3406b..c41aa623 100644 --- a/tests/test_zh_note.py +++ b/tests/test_zh_note.py @@ -1,8 +1,8 @@ -from collections import defaultdict from unittest import TestCase from unittest.mock import Mock from wikitextprocessor import Wtp +from wiktextract.extractor.zh.models import WordEntry from wiktextract.extractor.zh.note import extract_note from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext @@ -22,14 +22,14 @@ def test_note_list(self): # https://zh.wiktionary.org/wiki/オタク self.wxr.wtp.start_page("オタク") root = self.wxr.wtp.parse("* note list 1\n* note list 2") - page_data = [defaultdict(list)] + page_data = [WordEntry(word="オタク", lang_code="ja", lang_name="日語")] extract_note(self.wxr, page_data, root) - self.assertEqual(page_data, [{"notes": ["note list 1", "note list 2"]}]) + self.assertEqual(page_data[-1].notes, ["note list 1", "note list 2"]) def test_note_no_list(self): # https://zh.wiktionary.org/wiki/clavarder self.wxr.wtp.start_page("clavarder") root = self.wxr.wtp.parse("note text") - page_data = [defaultdict(list)] + page_data = [WordEntry(word="オタク", lang_code="fr", lang_name="法語")] extract_note(self.wxr, page_data, root) - self.assertEqual(page_data, [{"notes": ["note text"]}]) + self.assertEqual(page_data[-1].notes, ["note text"]) diff --git a/tests/test_zh_pronunciation.py b/tests/test_zh_pronunciation.py index 9a677af5..82c5d9aa 100644 --- a/tests/test_zh_pronunciation.py +++ b/tests/test_zh_pronunciation.py @@ -1,8 +1,8 @@ -from collections import defaultdict from unittest import TestCase from unittest.mock import Mock from wikitextprocessor import Wtp +from wiktextract.extractor.zh.models import WordEntry from wiktextract.extractor.zh.pronunciation import ( extract_pronunciation_recursively, ) @@ -25,19 +25,16 @@ def test_homophone_table(self): root = self.wxr.wtp.parse( """* 同音詞
[展開/摺疊]
[[大姑#漢語|大姑]]
[[小姑#漢語|小姑]]
""" ) - page_data = [defaultdict(list)] + base_data = WordEntry(word="大家", lang_code="zh", lang_name="漢語") + page_data = [base_data.model_copy(deep=True)] extract_pronunciation_recursively( - self.wxr, page_data, {}, "zh", root, [] + self.wxr, page_data, base_data, "zh", root, [] ) self.assertEqual( - page_data, + [d.model_dump(exclude_defaults=True) for d in page_data[0].sounds], [ - { - "sounds": [ - {"homophone": "大姑", "tags": ["同音詞"]}, - {"homophone": "小姑", "tags": ["同音詞"]}, - ] - } + {"homophone": "大姑", "tags": ["同音詞"]}, + {"homophone": "小姑", "tags": ["同音詞"]}, ], ) @@ -49,19 +46,16 @@ def test_homophone_template(self): '[[Appendix:Glossary#同音词|同音词]]:[[大矢#日語|-{大矢}-]], [[大宅#日語|-{大宅}-]], [[大谷#日語|-{大谷}-]][[Category:有同音詞的日語詞]]', ) root = self.wxr.wtp.parse("* {{homophones|ja|大矢|大宅|大谷}}") - page_data = [defaultdict(list)] + base_data = WordEntry(word="大家", lang_code="zh", lang_name="漢語") + page_data = [base_data.model_copy(deep=True)] extract_pronunciation_recursively( - self.wxr, page_data, {}, "ja", root, [] + self.wxr, page_data, base_data, "ja", root, [] ) self.assertEqual( - page_data, + [d.model_dump(exclude_defaults=True) for d in page_data[0].sounds], [ - { - "sounds": [ - {"homophone": "大矢", "tags": ["同音詞"]}, - {"homophone": "大宅", "tags": ["同音詞"]}, - {"homophone": "大谷", "tags": ["同音詞"]}, - ] - } + {"homophone": "大矢", "tags": ["同音詞"]}, + {"homophone": "大宅", "tags": ["同音詞"]}, + {"homophone": "大谷", "tags": ["同音詞"]}, ], ) diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py index d7d63771..e1752700 100644 --- a/tests/test_zh_translation.py +++ b/tests/test_zh_translation.py @@ -1,15 +1,15 @@ -import unittest -from collections import defaultdict +from unittest import TestCase from unittest.mock import patch from wikitextprocessor import Page, Wtp from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.zh.models import WordEntry from wiktextract.extractor.zh.translation import extract_translation from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext -class TestTranslation(unittest.TestCase): +class TestZhTranslation(TestCase): def setUp(self) -> None: self.wxr = WiktextractContext( Wtp(lang_code="zh"), WiktionaryConfig(dump_file_lang_code="zh") @@ -27,7 +27,7 @@ def tearDown(self) -> None: ) def test_normal(self, mock_get_page) -> None: # test wikitext from page "你好" and "這裡" - page_data = [defaultdict(list)] + page_data = [WordEntry(word="你好", lang_code="zh", lang_name="漢語")] wikitext = """ {{trans-top|靠近說話者的地方}} * 阿爾巴尼亞語:këtu (sq) @@ -43,7 +43,10 @@ def test_normal(self, mock_get_page) -> None: node = self.wxr.wtp.parse(wikitext) extract_translation(self.wxr, page_data, node) self.assertEqual( - page_data[0].get("translations"), + [ + d.model_dump(exclude_defaults=True) + for d in page_data[0].translations + ], [ { "lang_code": "sq", @@ -52,7 +55,6 @@ def test_normal(self, mock_get_page) -> None: "word": "këtu", }, { - "lang_code": "", "lang_name": "西阿帕切語", "sense": "靠近說話者的地方", "word": "kú",