Skip to content

Commit

Permalink
Merge pull request #461 from xxyzz/ru
Browse files: browse the repository at this point in the history
Remove optional types in Russian pydantic model
  • Loading branch information
xxyzz authored Jan 16, 2024
2 parents 909ec05 + c5e4225 commit e330bb9
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 73 deletions.
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/de/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,5 +99,5 @@ def process_link(
semantic_links.append(Linkage(word=clean_link))


def contains_dash(text: str):
return re.search(r"[–—―‒-]", text)
def contains_dash(text: str) -> bool:
    """Return True if *text* contains at least one dash-like character.

    The characters checked are the en dash, em dash, horizontal bar,
    figure dash and the ASCII hyphen-minus.  The result is a strict
    ``bool`` rather than the ``re.Match`` object (or ``None``) that
    ``re.search`` itself returns.
    """
    # The hyphen-minus is placed last in the character class so it is
    # treated as a literal character rather than as a range operator.
    return re.search(r"[–—―‒-]", text) is not None
23 changes: 9 additions & 14 deletions src/wiktextract/extractor/ru/example.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from wikitextprocessor import WikiNode

from wiktextract.extractor.ru.models import Example, Reference, Sense
from wiktextract.extractor.ru.models import Example, Sense
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand All @@ -26,33 +26,28 @@ def process_example_template(
sense: Sense,
template_node: WikiNode,
):
# https://ru.wiktionary.org/wiki/Шаблон:пример
example = Example()
reference = Reference()
for key, value_raw in template_node.template_parameters.items():
value = clean_node(wxr, {}, value_raw).strip()
if not value:
value = clean_node(wxr, None, value_raw)
if len(value) == 0:
continue
if isinstance(key, int) and key == 1:
example.text = value

else:
key = clean_node(wxr, {}, key) if not isinstance(key, int) else key
if key == "текст":
example.text = value
elif key == "перевод":
example.translation = value
elif key in EXAMPLE_TEMPLATE_KEY_MAPPING:
field_name = EXAMPLE_TEMPLATE_KEY_MAPPING.get(key, key)
if field_name in reference.model_fields:
setattr(reference, field_name, value)
if field_name in example.model_fields:
setattr(example, field_name, value)
else:
wxr.wtp.debug(
f"Unknown key {key} in example template {template_node}",
sortid="extractor/ru/example/process_example_template/54",
f"Unknown {key=} in example template {template_node}",
sortid="ru/example/process_example_template/54",
)

if example.model_dump(exclude_defaults=True) != {}:
if reference.model_dump(exclude_defaults=True) != {}:
example.ref = reference

if len(example.model_dump(exclude_defaults=True)) > 0:
sense.examples.append(example)
85 changes: 38 additions & 47 deletions src/wiktextract/extractor/ru/models.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field


class BaseModelWrap(BaseModel):
model_config = ConfigDict(validate_assignment=True, extra="forbid")
model_config = ConfigDict(
extra="forbid",
strict=True,
validate_assignment=True,
validate_default=True,
)


class Translation(BaseModelWrap):
Expand All @@ -15,8 +18,8 @@ class Translation(BaseModelWrap):
lang: str = Field(
description="Localized language name of the translation term"
)
sense: Optional[str] = Field(
default=None,
sense: str = Field(
default="",
description="An optional gloss describing the sense translated",
)

Expand All @@ -26,55 +29,42 @@ class Linkage(BaseModelWrap):


class Sound(BaseModelWrap):
ipa: Optional[str] = Field(
default=None, description="International Phonetic Alphabet"
)
audio: Optional[str] = Field(default=None, description="Audio file name")
wav_url: Optional[str] = Field(default=None)
ogg_url: Optional[str] = Field(default=None)
oga_url: Optional[str] = Field(default=None)
mp3_url: Optional[str] = Field(default=None)
flac_url: Optional[str] = Field(default=None)
tags: Optional[list[str]] = Field(
ipa: str = Field(default="", description="International Phonetic Alphabet")
audio: str = Field(default="", description="Audio file name")
wav_url: str = ""
ogg_url: str = ""
oga_url: str = ""
mp3_url: str = ""
flac_url: str = ""
tags: list[str] = Field(
default=[], description="Specifying the variant of the pronunciation"
)
homophones: list[Linkage] = Field(
default=[], description="Words with same pronunciation"
)


class Reference(BaseModelWrap):
author: Optional[str] = Field(default=None, description="Author's name")
title: Optional[str] = Field(
default=None, description="Title of the reference"
)
date: Optional[str] = Field(default=None, description="Original date")
date_published: Optional[str] = Field(
default=None, description="Date of publication"
)

collection: Optional[str] = Field(
default=None,
class Example(BaseModelWrap):
text: str = Field(default="", description="Example usage sentence")
translation: str = Field(
default="", description="Spanish translation of the example sentence"
)
author: str = Field(default="", description="Author's name")
title: str = Field(default="", description="Title of the reference")
date: str = Field(default="", description="Original date")
date_published: str = Field(default="", description="Date of publication")
collection: str = Field(
default="",
description="Name of the collection the example was taken from",
)
editor: Optional[str] = Field(default=None, description="Editor")
translator: Optional[str] = Field(default=None, description="Translator")
source: Optional[str] = Field(
default=None,
editor: str = Field(default="", description="Editor")
translator: str = Field(default="", description="Translator")
source: str = Field(
default="",
description="Source of reference, corresponds to template parameter 'источник'",
)


class Example(BaseModelWrap):
text: Optional[str] = Field(
default=None, description="Example usage sentence"
)
translation: Optional[str] = Field(
default=None, description="Spanish translation of the example sentence"
)
ref: Optional[Reference] = Field(default=None, description="")


class Sense(BaseModelWrap):
raw_glosses: list[str] = Field(
default=[],
Expand All @@ -101,14 +91,15 @@ class Sense(BaseModelWrap):

class WordEntry(BaseModelWrap):
"""
WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.
WordEntry is a dictionary containing lexical information of a single word
extracted from Wiktionary with wiktextract.
"""

model_config = ConfigDict(title="Russian Wiktionary")

word: str = Field(description="word string")
pos: str = Field(default=None, description="Part of speech type")
pos_title: str = Field(default=None, description="Original POS title")
pos: str = Field(default="", description="Part of speech type")
pos_title: str = Field(default="", description="Original POS title")
lang_code: str = Field(
description="Wiktionary language code", examples=["ru"]
)
Expand All @@ -119,9 +110,9 @@ class WordEntry(BaseModelWrap):
default=[],
description="list of non-disambiguated categories for the word",
)
sounds: Optional[list[Sound]] = []
senses: Optional[list[Sense]] = []
translations: Optional[list[Translation]] = []
sounds: list[Sound] = []
senses: list[Sense] = []
translations: list[Translation] = []
antonyms: list[Linkage] = Field(default=[], description="List of antonyms")
anagrams: list[Linkage] = Field(default=[], description="List of anagrams")
variants: list[Linkage] = Field(default=[], description="List of variants")
Expand Down
6 changes: 3 additions & 3 deletions src/wiktextract/extractor/ru/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ def extract_translations(
word_entry: WordEntry,
level3_node: WikiNode,
):
sense = None
sense = ""
for template_node in level3_node.find_child(NodeKind.TEMPLATE):
if template_node.template_name == "перев-блок":
gloss_nodes = template_node.template_parameters.get(1, [])
if gloss_nodes:
sense = clean_node(wxr, {}, gloss_nodes).strip()
sense = clean_node(wxr, None, gloss_nodes)
for key, raw_value in template_node.template_parameters.items():
if isinstance(key, str):
lang_code = key
Expand All @@ -38,7 +38,7 @@ def extract_translations(
lang_code=lang_code,
lang=lang,
word=word,
sense=sense if sense else None,
sense=sense,
)
)
# XXX: Extract non link content such as tags
Expand Down
12 changes: 5 additions & 7 deletions tests/test_ru_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,11 @@ def test_ru_extract_example(self):
"input": "{{пример|Недолго думая, отправляю овощ в рот.|М. И. Саитов|Островки||Бельские Просторы|2010|источник=НКРЯ}}",
"expected": [
{
"ref": {
"author": "М. И. Саитов",
"collection": "Бельские Просторы",
"date_published": "2010",
"source": "НКРЯ",
"title": "Островки",
},
"author": "М. И. Саитов",
"collection": "Бельские Просторы",
"date_published": "2010",
"source": "НКРЯ",
"title": "Островки",
"text": "Недолго думая, отправляю овощ в рот.",
}
],
Expand Down

0 comments on commit e330bb9

Please sign in to comment.