From 529b763d76d3fe22c0b5a46f6de77ec4dd7f8c23 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Thu, 4 Jan 2024 13:26:00 +0100 Subject: [PATCH] Use linkage model in Russian Wiktionary --- src/wiktextract/extractor/ru/linkage.py | 4 +- src/wiktextract/extractor/ru/models.py | 41 +++++++------------ src/wiktextract/extractor/ru/pronunciation.py | 6 ++- tests/test_ru_pronunciation.py | 8 ++-- 4 files changed, 25 insertions(+), 34 deletions(-) diff --git a/src/wiktextract/extractor/ru/linkage.py b/src/wiktextract/extractor/ru/linkage.py index 0b87d17e..d63e0ef0 100644 --- a/src/wiktextract/extractor/ru/linkage.py +++ b/src/wiktextract/extractor/ru/linkage.py @@ -1,6 +1,6 @@ from wikitextprocessor import NodeKind, WikiNode -from wiktextract.extractor.ru.models import WordEntry +from wiktextract.extractor.ru.models import Linkage, WordEntry from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -20,4 +20,4 @@ def extract_linkages( for link_node in level_node.find_child_recursively(NodeKind.LINK): word = clean_node(wxr, {}, link_node).strip() if word: - getattr(word_entry, linkage_type).append(word) + getattr(word_entry, linkage_type).append(Linkage(word=word)) diff --git a/src/wiktextract/extractor/ru/models.py b/src/wiktextract/extractor/ru/models.py index 861e4a7b..07e7aab7 100644 --- a/src/wiktextract/extractor/ru/models.py +++ b/src/wiktextract/extractor/ru/models.py @@ -21,6 +21,10 @@ class Translation(BaseModelWrap): ) +class Linkage(BaseModelWrap): + word: str = "" + + class Sound(BaseModelWrap): ipa: Optional[str] = Field( default=None, description="International Phonetic Alphabet" @@ -34,7 +38,7 @@ class Sound(BaseModelWrap): tags: Optional[list[str]] = Field( default=[], description="Specifying the variant of the pronunciation" ) - homophones: Optional[list[str]] = Field( + homophones: list[Linkage] = Field( default=[], description="Words with same pronunciation" ) @@ -118,34 +122,19 @@ class WordEntry(BaseModelWrap): sounds: Optional[list[Sound]] = [] senses: Optional[list[Sense]] = [] translations: Optional[list[Translation]] = [] - - antonyms: Optional[list[str]] = Field( - default=[], description="List of antonyms" - ) - anagrams: Optional[list[str]] = Field( - default=[], description="List of anagrams" - ) - variants: Optional[list[str]] = Field( - default=[], description="List of variants" - ) - hypernyms: Optional[list[str]] = Field( + antonyms: list[Linkage] = Field(default=[], description="List of antonyms") + anagrams: list[Linkage] = Field(default=[], description="List of anagrams") + variants: list[Linkage] = Field(default=[], description="List of variants") + hypernyms: list[Linkage] = Field( default=[], description="List of hypernyms" ) - hyponyms: Optional[list[str]] = Field( - default=[], description="List of hyponyms" - ) - derived: Optional[list[str]] = Field( + hyponyms: list[Linkage] = Field(default=[], description="List of hyponyms") + derived: list[Linkage] = Field( default=[], description="List of derived terms" ) - meronyms: Optional[list[str]] = Field( - default=[], description="List of meronyms" - ) - synonyms: Optional[list[str]] = Field( - default=[], description="List of synonyms" - ) - coordinate_terms: Optional[list[str]] = Field( + meronyms: list[Linkage] = Field(default=[], description="List of meronyms") + synonyms: list[Linkage] = Field(default=[], description="List of synonyms") + coordinate_terms: list[Linkage] = Field( default=[], description="List of coordinate terms" ) - holonyms: Optional[list[str]] = Field( - default=[], description="List of holonyms" - ) + holonyms: list[Linkage] = Field(default=[], description="List of holonyms") diff --git a/src/wiktextract/extractor/ru/pronunciation.py b/src/wiktextract/extractor/ru/pronunciation.py index 14a981a0..5c60b69d 100644 --- a/src/wiktextract/extractor/ru/pronunciation.py +++ b/src/wiktextract/extractor/ru/pronunciation.py @@ -4,7 +4,7 @@ from wikitextprocessor import NodeKind from wikitextprocessor.parser import LevelNode, WikiNode, WikiNodeChildrenList -from wiktextract.extractor.ru.models import Sound, WordEntry +from wiktextract.extractor.ru.models import Linkage, Sound, WordEntry from wiktextract.extractor.share import create_audio_url_dict from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -208,7 +208,9 @@ def extract_homophones( template_params: dict[str, WikiNode], ): homophones_raw = clean_node(wxr, {}, template_params.get("омофоны", "")) - homophones = [h.strip() for h in homophones_raw.split(",") if h.strip()] + homophones = [ + Linkage(word=h.strip()) for h in homophones_raw.split(",") if h.strip() + ] if homophones: if isinstance(sounds, list): for sound in sounds: diff --git a/tests/test_ru_pronunciation.py b/tests/test_ru_pronunciation.py index fc1df342..9a29ff4f 100644 --- a/tests/test_ru_pronunciation.py +++ b/tests/test_ru_pronunciation.py @@ -69,7 +69,7 @@ def test_process_transcription_template(self): "expected": { "ipa": "vot", "audio": "Ru-вот.ogg", - "homophones": ["вод"], + "homophones": [{"word": "вод"}], }, }, ] @@ -103,7 +103,7 @@ def test_process_transcriptions_template(self): "expected": [ { "ipa": "bɐˈlʲit", - "homophones": ["болит"], + "homophones": [{"word": "болит"}], "tags": ["singular"], }, { @@ -154,7 +154,7 @@ def test_process_transcription_ru_template_2(self): { "ipa": "vot", "audio": "Ru-вот.ogg", - "homophones": ["вод"], + "homophones": [{"word": "вод"}], } ], ) @@ -194,7 +194,7 @@ def test_process_transcriptions_ru_template_2(self): [ { "ipa": "bɐˈlʲit", - "homophones": ["болит"], + "homophones": [{"word": "болит"}], "tags": ["singular"], }, {