Skip to content

Commit

Permalink
Implement code review suggestions for Russian Wiktionary
Browse files Browse the repository at this point in the history
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
  • Loading branch information
empiriker committed Dec 8, 2023
1 parent f5f39fc commit 9cc660f
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 51 deletions.
23 changes: 9 additions & 14 deletions src/wiktextract/extractor/ru/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@
"ответственный": "editor",
"перев": "translator",
"источник": "source",
2: "author",
3: "title",
4: "date",
5: "collection",
6: "date_published",
}


Expand All @@ -27,21 +32,11 @@ def process_example_template(
value = clean_node(wxr, {}, value_raw).strip()
if not value:
continue
if isinstance(key, int):
if int(key) == 1:
example.text = value
elif int(key) == 2:
reference.author = value
elif int(key) == 3:
reference.title = value
elif int(key) == 4:
reference.date = value
elif int(key) == 5:
reference.collection = value
elif int(key) == 6:
reference.date_published = value
if isinstance(key, int) and key == 1:
example.text = value

else:
key = clean_node(wxr, {}, key)
key = clean_node(wxr, {}, key) if not isinstance(key, int) else key
if key == "текст":
example.text = value
elif key == "перевод":
Expand Down
58 changes: 25 additions & 33 deletions src/wiktextract/extractor/ru/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,38 +39,6 @@ class Sound(BaseModelWrap):
)


class Sense(BaseModelWrap):
    # Data model for a single word sense: the gloss (raw and cleaned) plus
    # the tags, notes, categories and examples that belong to it.
    # NOTE(review): documented with `#` comments instead of a docstring;
    # BaseModelWrap is presumably a pydantic model, where a class docstring
    # would leak into the generated schema description -- confirm.

    # Unprocessed gloss; may still carry tags and other markup.
    raw_gloss: Optional[str] = Field(
        description=(
            "Raw gloss string for the word sense. "
            "This might contain tags and other markup."
        ),
        default=None,
    )
    # Cleaned-up gloss text.
    gloss: Optional[str] = Field(
        description=(
            "Gloss string for the word sense. "
            "This has been cleaned, and should be "
            "straightforward text with no tags."
        ),
        default=None,
    )
    tags: list[str] = Field(
        description="List of tags affecting the word sense.",
        default=[],
    )
    notes: list[str] = Field(
        description=(
            "List of notes for the word sense. Usually describing usage."
        ),
        default=[],
    )
    categories: list[str] = Field(
        description=(
            "list of sense-disambiguated category names extracted from "
            "(a subset) of the Category links on the page"
        ),
        default=[],
    )
    # "Example" is referenced by name (string annotation) rather than as a
    # class object.
    examples: list["Example"] = Field(
        description="List of examples",
        default=[],
    )
    # Fields below are currently disabled:
    # subsenses: list["Sense"] = Field(
    #     default=[], description="List of subsenses"
    # )
    # senseid: Optional[int] = Field(
    #     default=None, description="Sense number used in Wiktionary"
    # )


class Reference(BaseModelWrap):
author: Optional[str] = Field(default=None, description="Author's name")
title: Optional[str] = Field(
Expand Down Expand Up @@ -100,7 +68,31 @@ class Example(BaseModelWrap):
translation: Optional[str] = Field(
default=None, description="Spanish translation of the example sentence"
)
ref: Optional["Reference"] = Field(default=None, description="")
ref: Optional[Reference] = Field(default=None, description="")


class Sense(BaseModelWrap):
    # Data model for a single word sense: the gloss (raw and cleaned) plus
    # the tags, notes, categories and examples that belong to it.
    # NOTE(review): documented with `#` comments instead of a docstring;
    # BaseModelWrap is presumably a pydantic model, where a class docstring
    # would leak into the generated schema description -- confirm.

    # Unprocessed gloss; may still carry tags and other markup.
    raw_gloss: Optional[str] = Field(
        description=(
            "Raw gloss string for the word sense. "
            "This might contain tags and other markup."
        ),
        default=None,
    )
    # Cleaned-up gloss text.
    gloss: Optional[str] = Field(
        description=(
            "Gloss string for the word sense. "
            "This has been cleaned, and should be "
            "straightforward text with no tags."
        ),
        default=None,
    )
    tags: list[str] = Field(
        description="List of tags affecting the word sense.",
        default=[],
    )
    notes: list[str] = Field(
        description=(
            "List of notes for the word sense. Usually describing usage."
        ),
        default=[],
    )
    categories: list[str] = Field(
        description=(
            "list of sense-disambiguated category names extracted from "
            "(a subset) of the Category links on the page"
        ),
        default=[],
    )
    # Uses the Example class object directly, not a string forward reference.
    examples: list[Example] = Field(
        description="List of examples",
        default=[],
    )


class WordEntry(BaseModelWrap):
Expand Down
1 change: 0 additions & 1 deletion src/wiktextract/extractor/ru/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,6 @@ def parse_page(
wxr.config.capture_language_codes is not None
and lang_code not in wxr.config.capture_language_codes
):
print(f"Skipping language {lang_code}")
continue

categories = {"categories": []}
Expand Down
6 changes: 3 additions & 3 deletions tests/test_ru_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,11 @@ def test_ru_extract_example(self):
},
# https://ru.wiktionary.org/wiki/house
{
"input": "{{пример|This is my {{выдел|house}} and my family’s ancestral home.||перевод=Это мой {{выдел|дом}} и поселение моих семейных предков.}}",
"input": "{{пример|This is my house and my family’s ancestral home.||перевод=Это мой дом и поселение моих семейных предков.}}",
"expected": [
{
"text": "This is my and my family’s ancestral home.",
"translation": "Это мой и поселение моих семейных предков.",
"text": "This is my house and my family’s ancestral home.",
"translation": "Это мой дом и поселение моих семейных предков.",
}
],
},
Expand Down

0 comments on commit 9cc660f

Please sign in to comment.