From b01244b9605d36d9460645808042fb105f32a2ca Mon Sep 17 00:00:00 2001 From: Empiriker Date: Thu, 7 Dec 2023 14:16:02 +0100 Subject: [PATCH 1/7] Extract glosses from Russian Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This extracts raw_glosses and attempts to extract clean glosses (without notes and tags). Currently only some tags are recognized and moved to the tags field. The rest of the tags are left in the gloss. This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. --- .../data/ru/linkage_subtitles.json | 12 ++ src/wiktextract/extractor/ru/example.py | 13 ++ src/wiktextract/extractor/ru/gloss.py | 122 ++++++++++++++++++ src/wiktextract/extractor/ru/linkage.py | 13 ++ src/wiktextract/extractor/ru/models.py | 34 +++++ src/wiktextract/extractor/ru/page.py | 27 +++- tests/test_ru_gloss.py | 74 +++++++++++ tests/test_ru_page.py | 55 ++++++++ 8 files changed, 348 insertions(+), 2 deletions(-) create mode 100644 src/wiktextract/data/ru/linkage_subtitles.json create mode 100644 src/wiktextract/extractor/ru/example.py create mode 100644 src/wiktextract/extractor/ru/gloss.py create mode 100644 src/wiktextract/extractor/ru/linkage.py create mode 100644 tests/test_ru_gloss.py create mode 100644 tests/test_ru_page.py diff --git a/src/wiktextract/data/ru/linkage_subtitles.json b/src/wiktextract/data/ru/linkage_subtitles.json new file mode 100644 index 00000000..11ac6776 --- /dev/null +++ b/src/wiktextract/data/ru/linkage_subtitles.json @@ -0,0 +1,12 @@ +{ + "антонимы": "antonyms", + "анаграммы": "anagrams", + "варианты": "variants", + "гиперонимы": "hypernyms", + "гипонимы": "hyponyms", + "дериваты": "derived", + "меронимы": "meronyms", + "синонимы": "synonyms", + "согипонимы": "coordinate_terms", + "холонимы": "holonyms" +} diff --git a/src/wiktextract/extractor/ru/example.py b/src/wiktextract/extractor/ru/example.py new file mode 100644 index 00000000..9fc6cf40 --- /dev/null +++ b/src/wiktextract/extractor/ru/example.py @@ -0,0 +1,13 @@ +from wikitextprocessor import WikiNode + +from wiktextract.extractor.ru.models import WordEntry +from wiktextract.wxr_context import WiktextractContext + + +def process_example_template( + wxr: WiktextractContext, + word_entry: WordEntry, + template_node: WikiNode, +): + pass + # wxr.wtp.debug(str(template_node), sortid="example") diff --git a/src/wiktextract/extractor/ru/gloss.py b/src/wiktextract/extractor/ru/gloss.py new file mode 100644 index 00000000..e4c36104 --- /dev/null +++ b/src/wiktextract/extractor/ru/gloss.py @@ -0,0 +1,122 @@ +from wikitextprocessor import NodeKind, WikiNode +from wikitextprocessor.parser import WikiNodeChildrenList + +from wiktextract.extractor.ru.example import process_example_template +from wiktextract.extractor.ru.models import Sense, WordEntry +from wiktextract.page import clean_node +from wiktextract.wxr_context import WiktextractContext + +TAGS_TEMPLATE_NAMES = { + # XXX: This list is incomplete. There are many more tag templates. Perhaps it would be better to assume all templates that are not recognized as something else are tags? 
+ "жарг.", + "зоол.", + "искусств.", + "истор.", + "ихтиол.", + "книжн.", + "кулин.", + "ласк.", + "лингв.", + "матем.", + "мед.", + "минер.", + "минерал.", + "миф.", + "мифол.", + "неодобр.", + "п.", + "перен.", + "полит.", + "поэт.", + "пренебр.", + "прост.", + "разг.", + "религ.", + "техн.", + "устар.", + "фарм.", + "физ.", + "физиол.", + "филол.", + "филос.", + "фолькл.", + "хим.", + "церк.", + "шутл.", + "эвф.", + "экон.", + "юр.", +} + + +def extract_gloss( + wxr: WiktextractContext, + word_entry: WordEntry, + item_node: WikiNode, +): + sense = Sense() + + raw_gloss_children: WikiNodeChildrenList = [] + clean_gloss_children: WikiNodeChildrenList = [] + tag_templates: list[WikiNode] = [] + note_templates: list[WikiNode] = [] + + for child in item_node.children: + if isinstance(child, WikiNode) and child.kind == NodeKind.TEMPLATE: + if child.template_name == "пример": + process_example_template(wxr, word_entry, child) + + elif child.template_name in TAGS_TEMPLATE_NAMES: + tag_templates.append(child) + raw_gloss_children.append(child) + + elif child.template_name == "помета": + note_templates.append(child) + raw_gloss_children.append(child) + + else: + clean_gloss_children.append(child) + raw_gloss_children.append(child) + + wxr.wtp.debug( + f"Found template '{child.template_name}' in gloss that could be a tag", + sortid="extractor/ru/gloss/extract_gloss/75", + ) + else: + clean_gloss_children.append(child) + raw_gloss_children.append(child) + + remove_obsolete_leading_nodes(raw_gloss_children) + remove_obsolete_leading_nodes(clean_gloss_children) + + if raw_gloss_children: + raw_gloss = clean_node(wxr, {}, raw_gloss_children).strip() + if raw_gloss: + sense.raw_gloss = raw_gloss + + if clean_gloss_children: + gloss = clean_node(wxr, {}, clean_gloss_children).strip() + if gloss: + sense.gloss = gloss + + for tag_template in tag_templates: + tag = clean_node(wxr, {}, tag_template).strip() + if tag: + sense.tags.append(tag) + + for note_template in note_templates: + note = clean_node(wxr, {}, note_template).strip() + if note: + sense.notes.append(note) + + if sense.model_dump(exclude_defaults=True) != {}: + word_entry.senses.append(sense) + + +def remove_obsolete_leading_nodes(nodes: WikiNodeChildrenList): + while ( + nodes + and isinstance(nodes[0], str) + and nodes[0].strip() in ["", "и", "или", ",", ".", ";", ":", "\n"] + ): + nodes.pop(0) diff --git a/src/wiktextract/extractor/ru/linkage.py b/src/wiktextract/extractor/ru/linkage.py new file mode 100644 index 00000000..0fff9cf4 --- /dev/null +++ b/src/wiktextract/extractor/ru/linkage.py @@ -0,0 +1,13 @@ +from wikitextprocessor import WikiNode + +from wiktextract.extractor.ru.models import WordEntry +from wiktextract.wxr_context import WiktextractContext + + +def extract_linkages( + wxr: WiktextractContext, + word_entry: WordEntry, + linkage_type: str, + level_node: WikiNode, +): + pass diff --git a/src/wiktextract/extractor/ru/models.py b/src/wiktextract/extractor/ru/models.py index f8da0213..da2ae69a 100644 --- a/src/wiktextract/extractor/ru/models.py +++ b/src/wiktextract/extractor/ru/models.py @@ -1,4 +1,5 @@ from typing import Optional + from pydantic import BaseModel, ConfigDict, Field @@ -24,6 +25,38 @@ class Sound(BaseModelWrap): ) +class Sense(BaseModelWrap): + raw_gloss: Optional[str] = Field( + default=None, + description="Raw gloss string for the word sense. This might contain tags and other markup.", + ) + gloss: Optional[str] = Field( + default=None, + description="Gloss string for the word sense. 
This has been cleaned, and should be straightforward text with no tags.", + ) + tags: list[str] = Field( + default=[], + description="List of tags affecting the word sense.", + ) + notes: list[str] = Field( + default=[], + description="List of notes for the word sense. Usually describing usage.", + ) + categories: list[str] = Field( + default=[], + description="list of sense-disambiguated category names extracted from (a subset) of the Category links on the page", + ) + # examples: list["Example"] = Field( + # default=[], description="List of examples" + # ) + # subsenses: list["Sense"] = Field( + # default=[], description="List of subsenses" + # ) + # senseid: Optional[int] = Field( + # default=None, description="Sense number used in Wiktionary" + # ) + + class WordEntry(BaseModelWrap): """ WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract. @@ -45,3 +78,4 @@ class WordEntry(BaseModelWrap): description="list of non-disambiguated categories for the word", ) sounds: Optional[list[Sound]] = [] + senses: Optional[list[Sense]] = [] diff --git a/src/wiktextract/extractor/ru/page.py b/src/wiktextract/extractor/ru/page.py index ef6cef1b..572db35a 100644 --- a/src/wiktextract/extractor/ru/page.py +++ b/src/wiktextract/extractor/ru/page.py @@ -4,6 +4,8 @@ from wikitextprocessor import NodeKind, WikiNode +from wiktextract.extractor.ru.gloss import extract_gloss +from wiktextract.extractor.ru.linkage import extract_linkages from wiktextract.extractor.ru.models import WordEntry from wiktextract.extractor.ru.pronunciation import extract_pronunciation from wiktextract.page import clean_node @@ -27,7 +29,24 @@ def process_semantic_section( page_data: list[WordEntry], semantic_level_node: WikiNode, ): - pass + for level4_node in semantic_level_node.find_child(NodeKind.LEVEL4): + section_title = clean_node(wxr, {}, level4_node.largs).lower() + if section_title == "значение": + for list_item in level4_node.find_child_recursively( + NodeKind.LIST_ITEM + ): + extract_gloss(wxr, page_data[-1], list_item) + + elif section_title in wxr.config.LINKAGE_SUBTITLES: + linkage_type = wxr.config.LINKAGE_SUBTITLES.get(section_title) + extract_linkages(wxr, page_data[-1], linkage_type, level4_node) + else: + wxr.wtp.debug( + f"Unprocessed section {section_title} in semantic section", + sortid="extractor/ru/page/process_semantic_section/35", + ) + + # XXX: Process non level4 nodes such as illustration templates "{илл|...}", cf. 
https://ru.wiktionary.org/wiki/овощ def get_pos( @@ -178,6 +197,7 @@ def parse_page( wxr.config.capture_language_codes is not None and lang_code not in wxr.config.capture_language_codes ): + print(f"Skipping language {lang_code}") continue categories = {"categories": []} @@ -221,8 +241,11 @@ def parse_page( for level3_node in level2_node.find_child(NodeKind.LEVEL3): parse_section(wxr, page_data, level3_node) - page_data.append(copy.deepcopy(base_data)) + is_first_level2_node = True for level3_node in level1_node.find_child(NodeKind.LEVEL3): + if is_first_level2_node: + page_data.append(copy.deepcopy(base_data)) + is_first_level2_node = False parse_section(wxr, page_data, level3_node) return [d.model_dump(exclude_defaults=True) for d in page_data] diff --git a/tests/test_ru_gloss.py b/tests/test_ru_gloss.py new file mode 100644 index 00000000..2882c90e --- /dev/null +++ b/tests/test_ru_gloss.py @@ -0,0 +1,74 @@ +import unittest + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.ru.gloss import extract_gloss +from wiktextract.extractor.ru.models import WordEntry +from wiktextract.wxr_context import WiktextractContext + + +class TestRUGloss(unittest.TestCase): + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="ru"), + WiktionaryConfig(dump_file_lang_code="ru"), + ) + + def tearDown(self) -> None: + self.wxr.wtp.close_db_conn() + + def get_default_page_data(self) -> list[WordEntry]: + return [WordEntry(word="пример", lang_code="ru", lang_name="Русский")] + + def test_ru_extract_gloss(self): + # https://ru.wiktionary.org/wiki/овощ + test_cases = [ + # Cleans examples from gloss and raw_gloss + { + "input": "# [[съедобный|съедобная]] [[часть]] овоща [1] {{пример|Недолго думая, отправляю овощ в рот.|М. И. Саитов|Островки||Бельские Просторы|2010|источник=НКРЯ}}", + "expected": { + "raw_gloss": "съедобная часть овоща [1]", + "gloss": "съедобная часть овоща [1]", + }, + }, + # Extracts tags + { + "input": "# {{разг.|ru}}, {{неодобр.|ru}} или {{пренебр.|ru}} [[бесхарактерный]], [[безвольный]] человек, лишённый активной жизненной позиции {{пример|}}", + "expected": { + "tags": ["разг.", "неодобр.", "пренебр."], + "gloss": "бесхарактерный, безвольный человек, лишённый активной жизненной позиции", + "raw_gloss": "разг., неодобр. или пренебр. бесхарактерный, безвольный человек, лишённый активной жизненной позиции", + }, + }, + # Extracts notes + { + "input": "# {{помета|часто мн}} обобщающее [[название]] растительной пищи, не включающей [[фрукт]]ы ''и'' [[крупа|крупы]]", + "expected": { + "notes": ["часто мн. ч."], + "gloss": "обобщающее название растительной пищи, не включающей фрукты и крупы", + "raw_gloss": "часто мн. ч. обобщающее название растительной пищи, не включающей фрукты и крупы", + }, + }, + ] + + self.wxr.wtp.add_page("Шаблон:разг.", 10, "разг.") + self.wxr.wtp.add_page("Шаблон:неодобр.", 10, "неодобр.") + self.wxr.wtp.add_page("Шаблон:пренебр.", 10, "пренебр.") + self.wxr.wtp.add_page("Шаблон:помета", 10, "часто мн. 
ч.") + + for case in test_cases: + with self.subTest(case=case): + self.wxr.wtp.start_page("") + page_data = self.get_default_page_data() + + root = self.wxr.wtp.parse(case["input"]) + + extract_gloss( + self.wxr, page_data[-1], root.children[0].children[0] + ) + + new_sense = ( + page_data[-1].senses[-1].model_dump(exclude_defaults=True) + ) + self.assertEqual(new_sense, case["expected"]) diff --git a/tests/test_ru_page.py b/tests/test_ru_page.py new file mode 100644 index 00000000..9d34857c --- /dev/null +++ b/tests/test_ru_page.py @@ -0,0 +1,55 @@ +import unittest + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.ru.page import parse_page +from wiktextract.wxr_context import WiktextractContext + + +class TestRUPage(unittest.TestCase): + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="ru"), + WiktionaryConfig( + dump_file_lang_code="ru", capture_language_codes={"ru"} + ), + ) + + def tearDown(self) -> None: + self.wxr.wtp.close_db_conn() + + # def get_default_page_data(self) -> list[WordEntry]: + # return [WordEntry(word="test", lang_code="es", lang_name="Language")] + + def test_ru_parse_page_1(self): + # Navigates homonyms/homographs + # E.g. https://ru.wiktionary.org/wiki/овощ + + self.wxr.wtp.add_page("Шаблон:-ru-", 10, "") + self.wxr.wtp.add_page("Шаблон:з", 10, "") + + page_text = """= {{-ru-}} = +== {{з|I}} == +=== Морфологические и синтаксические свойства === +== {{з|II}} == +=== Морфологические и синтаксические свойства === +""" + + page_data_dicts = parse_page(self.wxr, "овощ", page_text) + + self.assertEqual(len(page_data_dicts), 2) + + def test_ru_parse_page_2(self): + # Navigates in case of absence of H2 headings (homonyms/homographs) + # E.g. https://ru.wiktionary.org/wiki/сарлык + + self.wxr.wtp.add_page("Шаблон:-ru-", 10, "") + + page_text = """= {{-ru-}} = +=== Морфологические и синтаксические свойства === +""" + + page_data_dicts = parse_page(self.wxr, "овощ", page_text) + + self.assertEqual(len(page_data_dicts), 1) From 85c1328c4f5c6098077d0aed9bcd108aa68e9df9 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Thu, 7 Dec 2023 15:04:34 +0100 Subject: [PATCH 2/7] Extract examples from Russian Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. 
--- src/wiktextract/extractor/ru/example.py | 58 +++++++++++++++++++-- src/wiktextract/extractor/ru/gloss.py | 2 +- src/wiktextract/extractor/ru/models.py | 38 ++++++++++++-- tests/test_ru_example.py | 69 +++++++++++++++++++++++++ 4 files changed, 159 insertions(+), 8 deletions(-) create mode 100644 tests/test_ru_example.py diff --git a/src/wiktextract/extractor/ru/example.py b/src/wiktextract/extractor/ru/example.py index 9fc6cf40..f32946df 100644 --- a/src/wiktextract/extractor/ru/example.py +++ b/src/wiktextract/extractor/ru/example.py @@ -1,13 +1,63 @@ from wikitextprocessor import WikiNode -from wiktextract.extractor.ru.models import WordEntry +from wiktextract.extractor.ru.models import Example, Reference, Sense +from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext +EXAMPLE_TEMPLATE_KEY_MAPPING = { + "автор": "author", + "титул": "title", + "дата": "date", + "издание": "collection", + "дата издания": "date_published", + "ответственный": "editor", + "перев": "translator", + "источник": "source", +} + def process_example_template( wxr: WiktextractContext, - word_entry: WordEntry, + sense: Sense, template_node: WikiNode, ): - pass - # wxr.wtp.debug(str(template_node), sortid="example") + example = Example() + reference = Reference() + for key, value_raw in template_node.template_parameters.items(): + value = clean_node(wxr, {}, value_raw).strip() + if not value: + continue + if isinstance(key, int): + if int(key) == 1: + example.text = value + elif int(key) == 2: + reference.author = value + elif int(key) == 3: + reference.title = value + elif int(key) == 4: + reference.date = value + elif int(key) == 5: + reference.collection = value + elif int(key) == 6: + reference.date_published = value + else: + key = clean_node(wxr, {}, key) + if key == "текст": + example.text = value + elif key == "перевод": + example.translation = value + elif key in EXAMPLE_TEMPLATE_KEY_MAPPING: + field_name = EXAMPLE_TEMPLATE_KEY_MAPPING.get(key, key) + if field_name in reference.model_fields: + setattr(reference, field_name, value) + else: + wxr.wtp.debug( + f"Unknown key {key} in example template {template_node}", + sortid="wiktextract/extractor/ru/example/process_example_template/54", + ) + + if example.model_dump(exclude_defaults=True) != {}: + if reference.model_dump(exclude_defaults=True) != {}: + example.ref = reference + + sense.examples.append(example) diff --git a/src/wiktextract/extractor/ru/gloss.py b/src/wiktextract/extractor/ru/gloss.py index e4c36104..da07388b 100644 --- a/src/wiktextract/extractor/ru/gloss.py +++ b/src/wiktextract/extractor/ru/gloss.py @@ -64,7 +64,7 @@ def extract_gloss( for child in item_node.children: if isinstance(child, WikiNode) and child.kind == NodeKind.TEMPLATE: if child.template_name == "пример": - process_example_template(wxr, word_entry, child) + process_example_template(wxr, sense, child) elif child.template_name in TAGS_TEMPLATE_NAMES: tag_templates.append(child) diff --git a/src/wiktextract/extractor/ru/models.py b/src/wiktextract/extractor/ru/models.py index da2ae69a..348f4a50 100644 --- a/src/wiktextract/extractor/ru/models.py +++ b/src/wiktextract/extractor/ru/models.py @@ -46,9 +46,9 @@ class Sense(BaseModelWrap): default=[], description="list of sense-disambiguated category names extracted from (a subset) of the Category links on the page", ) - # examples: list["Example"] = Field( - # default=[], description="List of examples" - # ) + examples: list["Example"] = Field( + default=[], description="List of examples" + ) 
# subsenses: list["Sense"] = Field( # default=[], description="List of subsenses" # ) @@ -57,6 +57,38 @@ class Sense(BaseModelWrap): # ) +class Reference(BaseModelWrap): + author: Optional[str] = Field(default=None, description="Author's name") + title: Optional[str] = Field( + default=None, description="Title of the reference" + ) + date: Optional[str] = Field(default=None, description="Original date") + date_published: Optional[str] = Field( + default=None, description="Date of publication" + ) + + collection: Optional[str] = Field( + default=None, + description="Name of the collection the example was taken from", + ) + editor: Optional[str] = Field(default=None, description="Editor") + translator: Optional[str] = Field(default=None, description="Translator") + source: Optional[str] = Field( + default=None, + description="Source of reference, corresponds to template parameter 'источник'", + ) + + +class Example(BaseModelWrap): + text: Optional[str] = Field( + default=None, description="Example usage sentence" + ) + translation: Optional[str] = Field( + default=None, description="Spanish translation of the example sentence" + ) + ref: Optional["Reference"] = Field(default=None, description="") + + class WordEntry(BaseModelWrap): """ WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract. diff --git a/tests/test_ru_example.py b/tests/test_ru_example.py new file mode 100644 index 00000000..d2a0524e --- /dev/null +++ b/tests/test_ru_example.py @@ -0,0 +1,69 @@ +import unittest + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.ru.example import process_example_template +from wiktextract.extractor.ru.models import Sense +from wiktextract.wxr_context import WiktextractContext + + +class TestRUGloss(unittest.TestCase): + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="ru"), + WiktionaryConfig(dump_file_lang_code="ru"), + ) + + def tearDown(self) -> None: + self.wxr.wtp.close_db_conn() + + def get_default_sense_data(self) -> Sense: + return Sense() + + def test_ru_extract_gloss(self): + test_cases = [ + # Ignores empty template + {"input": "{{пример|}}", "expected": []}, + # https://ru.wiktionary.org/wiki/Красная_Шапочка + { + "input": "{{пример|Недолго думая, отправляю овощ в рот.|М. И. Саитов|Островки||Бельские Просторы|2010|источник=НКРЯ}}", + "expected": [ + { + "ref": { + "author": "М. И. 
Саитов", + "collection": "Бельские Просторы", + "date_published": "2010", + "source": "НКРЯ", + "title": "Островки", + }, + "text": "Недолго думая, отправляю овощ в рот.", + } + ], + }, + # https://ru.wiktionary.org/wiki/house + { + "input": "{{пример|This is my {{выдел|house}} and my family’s ancestral home.||перевод=Это мой {{выдел|дом}} и поселение моих семейных предков.}}", + "expected": [ + { + "text": "This is my and my family’s ancestral home.", + "translation": "Это мой и поселение моих семейных предков.", + } + ], + }, + ] + + for case in test_cases: + with self.subTest(case=case): + self.wxr.wtp.start_page("") + sense_data = self.get_default_sense_data() + + root = self.wxr.wtp.parse(case["input"]) + + process_example_template(self.wxr, sense_data, root.children[0]) + + examples = [ + e.model_dump(exclude_defaults=True) + for e in sense_data.examples + ] + self.assertEqual(examples, case["expected"]) From a2084e3de4322e8f0d6149b06b605413138a5cc1 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Thu, 7 Dec 2023 15:19:10 +0100 Subject: [PATCH 3/7] Fix test_ru_gloss.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. --- tests/test_ru_gloss.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_ru_gloss.py b/tests/test_ru_gloss.py index 2882c90e..7b708371 100644 --- a/tests/test_ru_gloss.py +++ b/tests/test_ru_gloss.py @@ -69,6 +69,8 @@ def test_ru_extract_gloss(self): ) new_sense = ( - page_data[-1].senses[-1].model_dump(exclude_defaults=True) + page_data[-1] + .senses[-1] + .model_dump(exclude_defaults=True, exclude={"examples"}) ) self.assertEqual(new_sense, case["expected"]) From b4aebf24bdcb8cbe8d709f4c5f86e6d240f533e5 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Thu, 7 Dec 2023 16:08:17 +0100 Subject: [PATCH 4/7] Extract translations from Russian Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. 
--- src/wiktextract/extractor/ru/models.py | 15 +++ src/wiktextract/extractor/ru/page.py | 3 +- src/wiktextract/extractor/ru/translation.py | 51 ++++++++++ tests/test_ru_example.py | 4 +- tests/test_ru_translation.py | 107 ++++++++++++++++++++ 5 files changed, 177 insertions(+), 3 deletions(-) create mode 100644 src/wiktextract/extractor/ru/translation.py create mode 100644 tests/test_ru_translation.py diff --git a/src/wiktextract/extractor/ru/models.py b/src/wiktextract/extractor/ru/models.py index 348f4a50..0458fc6c 100644 --- a/src/wiktextract/extractor/ru/models.py +++ b/src/wiktextract/extractor/ru/models.py @@ -7,6 +7,20 @@ class BaseModelWrap(BaseModel): model_config = ConfigDict(validate_assignment=True, extra="forbid") +class Translation(BaseModelWrap): + word: str = Field(description="Translation term") + lang_code: str = Field( + description="Wiktionary language code of the translation term" + ) + lang_name: str = Field( + description="Localized language name of the translation term" + ) + sense: Optional[str] = Field( + default=None, + description="An optional gloss describing the sense translated", + ) + + class Sound(BaseModelWrap): ipa: Optional[str] = Field( default=None, description="International Phonetic Alphabet" @@ -111,3 +125,4 @@ class WordEntry(BaseModelWrap): ) sounds: Optional[list[Sound]] = [] senses: Optional[list[Sense]] = [] + translations: Optional[list[Translation]] = [] diff --git a/src/wiktextract/extractor/ru/page.py b/src/wiktextract/extractor/ru/page.py index 572db35a..17cdf856 100644 --- a/src/wiktextract/extractor/ru/page.py +++ b/src/wiktextract/extractor/ru/page.py @@ -8,6 +8,7 @@ from wiktextract.extractor.ru.linkage import extract_linkages from wiktextract.extractor.ru.models import WordEntry from wiktextract.extractor.ru.pronunciation import extract_pronunciation +from wiktextract.extractor.ru.translation import extract_translations from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -147,7 +148,7 @@ def parse_section( pass elif section_title == "Перевод": if wxr.config.capture_translations: - pass + extract_translations(wxr, page_data[-1], level3_node) elif section_title in ["Анаграммы", "Метаграммы", "Синонимы", "Антонимы"]: pass elif section_title == "Библиография": diff --git a/src/wiktextract/extractor/ru/translation.py b/src/wiktextract/extractor/ru/translation.py new file mode 100644 index 00000000..e1f1a0b0 --- /dev/null +++ b/src/wiktextract/extractor/ru/translation.py @@ -0,0 +1,51 @@ +from mediawiki_langcodes import code_to_name +from wikitextprocessor import NodeKind, WikiNode + +from wiktextract.extractor.ru.models import Translation, WordEntry +from wiktextract.page import clean_node +from wiktextract.wxr_context import WiktextractContext + + +def extract_translations( + wxr: WiktextractContext, + word_entry: WordEntry, + level3_node: WikiNode, +): + sense = None + for template_node in level3_node.find_child(NodeKind.TEMPLATE): + if template_node.template_name == "перев-блок": + gloss_nodes = template_node.template_parameters.get(1, []) + if gloss_nodes: + sense = clean_node(wxr, {}, gloss_nodes).strip() + for key, raw_value in template_node.template_parameters.items(): + if isinstance(key, str): + lang_code = key + lang_name = code_to_name(lang_code, "ru") + + for value_node in ( + raw_value + if isinstance(raw_value, list) + else [raw_value] + ): + if ( + isinstance(value_node, WikiNode) + and value_node.kind == NodeKind.LINK + ): + word = clean_node(wxr, {}, value_node).strip() + if 
word: + word_entry.translations.append( + Translation( + lang_code=lang_code, + lang_name=lang_name, + word=word, + sense=sense if sense else None, + ) + ) + # XXX: Extract non link content such as tags + + else: + wxr.wtp.debug( + f"Found unexpected template {template_node.template_name} in translation section", + sortid="extractor/ru/translation/extract_translations/100", + ) + pass diff --git a/tests/test_ru_example.py b/tests/test_ru_example.py index d2a0524e..1aa3ce77 100644 --- a/tests/test_ru_example.py +++ b/tests/test_ru_example.py @@ -8,7 +8,7 @@ from wiktextract.wxr_context import WiktextractContext -class TestRUGloss(unittest.TestCase): +class TestRUExample(unittest.TestCase): def setUp(self) -> None: self.wxr = WiktextractContext( Wtp(lang_code="ru"), @@ -21,7 +21,7 @@ def tearDown(self) -> None: def get_default_sense_data(self) -> Sense: return Sense() - def test_ru_extract_gloss(self): + def test_ru_extract_example(self): test_cases = [ # Ignores empty template {"input": "{{пример|}}", "expected": []}, diff --git a/tests/test_ru_translation.py b/tests/test_ru_translation.py new file mode 100644 index 00000000..7fdd5595 --- /dev/null +++ b/tests/test_ru_translation.py @@ -0,0 +1,107 @@ +import unittest + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.ru.models import WordEntry +from wiktextract.extractor.ru.translation import extract_translations +from wiktextract.wxr_context import WiktextractContext + + +class TestRUTranslation(unittest.TestCase): + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="ru"), + WiktionaryConfig(dump_file_lang_code="ru"), + ) + + def tearDown(self) -> None: + self.wxr.wtp.close_db_conn() + + def get_default_word_entry(self) -> WordEntry: + return WordEntry(word="test", lang_code="ru", lang_name="русский") + + def test_ru_extract_gloss(self): + # Test cases adapted from: https://ru.wiktionary.org/wiki/дом + test_cases = [ + { + # No translations + "input": "{{перев-блок|ab=}}", + "expected": [], + }, + { + # No translations but gloss + "input": "{{перев-блок|сооружение|ab=}}", + "expected": [], + }, + { + # Translations, no gloss + "input": "{{перев-блок|en=[[house]]|ar=[[بيت]]}}", + "expected": [ + { + "word": "house", + "lang_code": "en", + "lang_name": "английский", + }, + {"word": "بيت", "lang_code": "ar", "lang_name": "арабский"}, + ], + }, + { + # Ignore tags for now + "input": "{{перев-блок|сооружение|ab=|en=[[house]]|ar=[[بيت]]}}", + "expected": [ + { + "word": "house", + "lang_code": "en", + "lang_name": "английский", + "sense": "сооружение", + }, + { + "word": "بيت", + "lang_code": "ar", + "lang_name": "арабский", + "sense": "сооружение", + }, + ], + }, + { + "input": "{{перев-блок||br=[[ti]] {{m}}|grc=[[αὐλή]] {{f}}; [[δόμος]] {{m}}; [[δῶμα]] {{n}}}}", + "expected": [ + { + "word": "ti", + "lang_code": "br", + "lang_name": "бретонский", + }, + { + "word": "αὐλή", + "lang_code": "grc", + "lang_name": "древнегреческий", + }, + { + "word": "δόμος", + "lang_code": "grc", + "lang_name": "древнегреческий", + }, + { + "word": "δῶμα", + "lang_code": "grc", + "lang_name": "древнегреческий", + }, + ], + }, + ] + + for case in test_cases: + with self.subTest(case=case): + self.wxr.wtp.start_page("") + word_entry = self.get_default_word_entry() + + root = self.wxr.wtp.parse(case["input"]) + + extract_translations(self.wxr, word_entry, root) + + translations = [ + t.model_dump(exclude_defaults=True) + for t in word_entry.translations + ] + 
self.assertEqual(translations, case["expected"]) From 46d77f53da7c6a540d5b98e0bde84511503e9a2d Mon Sep 17 00:00:00 2001 From: Empiriker Date: Thu, 7 Dec 2023 16:38:00 +0100 Subject: [PATCH 5/7] Extract linkages from Russian Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. --- src/wiktextract/extractor/ru/linkage.py | 14 +++++++++-- src/wiktextract/extractor/ru/models.py | 31 +++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/src/wiktextract/extractor/ru/linkage.py b/src/wiktextract/extractor/ru/linkage.py index 0fff9cf4..0b87d17e 100644 --- a/src/wiktextract/extractor/ru/linkage.py +++ b/src/wiktextract/extractor/ru/linkage.py @@ -1,6 +1,7 @@ -from wikitextprocessor import WikiNode +from wikitextprocessor import NodeKind, WikiNode from wiktextract.extractor.ru.models import WordEntry +from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -10,4 +11,13 @@ def extract_linkages( linkage_type: str, level_node: WikiNode, ): - pass + if not linkage_type in word_entry.model_fields: + wxr.wtp.debug( + f"Linkage type {linkage_type} not defined for word entry", + sortid="extractor/ru/linkage/extract_linkages/10", + ) + return + for link_node in level_node.find_child_recursively(NodeKind.LINK): + word = clean_node(wxr, {}, link_node).strip() + if word: + getattr(word_entry, linkage_type).append(word) diff --git a/src/wiktextract/extractor/ru/models.py b/src/wiktextract/extractor/ru/models.py index 0458fc6c..20d0bb24 100644 --- a/src/wiktextract/extractor/ru/models.py +++ b/src/wiktextract/extractor/ru/models.py @@ -126,3 +126,34 @@ class WordEntry(BaseModelWrap): sounds: Optional[list[Sound]] = [] senses: Optional[list[Sense]] = [] translations: Optional[list[Translation]] = [] + + antonyms: Optional[list[str]] = Field( + default=[], description="List of antonyms" + ) + anagrams: Optional[list[str]] = Field( + default=[], description="List of anagrams" + ) + variants: Optional[list[str]] = Field( + default=[], description="List of variants" + ) + hypernyms: Optional[list[str]] = Field( + default=[], description="List of hypernyms" + ) + hyponyms: Optional[list[str]] = Field( + default=[], description="List of hyponyms" + ) + derived: Optional[list[str]] = Field( + default=[], description="List of derived terms" + ) + meronyms: Optional[list[str]] = Field( + default=[], description="List of meronyms" + ) + synonyms: Optional[list[str]] = Field( + default=[], description="List of synonyms" + ) + coordinate_terms: Optional[list[str]] = Field( + default=[], description="List of coordinate terms" + ) + holonyms: Optional[list[str]] = Field( + default=[], description="List of holonyms" + ) From f5f39fcb6ab4d73708020933c9a9fafe1d851542 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Thu, 7 Dec 2023 17:59:08 +0100 Subject: [PATCH 6/7] Improve extraction of tags from glosses in Russian Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la 
Recherche (ANR) in France. --- src/wiktextract/extractor/ru/gloss.py | 112 +++++++++++++++----------- 1 file changed, 63 insertions(+), 49 deletions(-) diff --git a/src/wiktextract/extractor/ru/gloss.py b/src/wiktextract/extractor/ru/gloss.py index da07388b..8196c6a9 100644 --- a/src/wiktextract/extractor/ru/gloss.py +++ b/src/wiktextract/extractor/ru/gloss.py @@ -6,48 +6,58 @@ from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext -TAGS_TEMPLATE_NAMES = { - # XXX: This list is incomplete. There are many more tag templates. Perhaps it would be better to assume all templates that are not recognized as something else are tags? - "жарг.", - "зоол.", - "искусств.", - "истор.", - "ихтиол.", - "книжн.", - "кулин.", +# Wiktioniary intern templates that can be ignores +META_TEMPLATES = { + "помета.", + "Нужен перевод", + "?", +} + +# Templates that are part of the clean gloss when expanded +GLOSS_TEMPLATES = { + "-", + "=", + "===", + "english surname example", + "lang", + "аббр.", + "выдел", + "гипокор.", + "дееприч.", + "действие", + "женск.", "ласк.", - "лингв.", - "матем.", - "мед.", - "минер.", - "минерал.", - "миф.", - "мифол.", - "неодобр.", - "п.", - "перен.", - "полит.", - "поэт.", - "пренебр.", - "прост.", - "разг.", - "религ.", - "техн.", - "устар.", - "фарм.", - "физ.", - "физиол.", - "филол.", - "филос.", - "фолькл.", - "хим.", - "церк.", - "шутл.", - "эвф.", - "экон.", - "юр.", + "мн", + "морфема", + "нареч.", + "наречие", + "однокр.", + "отн.", + "по.", + "по", + "превосх.", + "прич.", + "свойство", + "совершить", + "сокр.", + "сокращ", + "соотн.", + "сравн.", + "страд.", + "то же", + "увелич.", + "уменьш.", + "умласк", + "умласк.", + "унич.", + "уничиж.", + "хим-элем", + "элемент", } +# Templates that specify a note for the gloss +NOTE_TEMPLATES = {"пример", "помета", "??", "as ru"} + def extract_gloss( wxr: WiktextractContext, @@ -66,22 +76,25 @@ def extract_gloss( if child.template_name == "пример": process_example_template(wxr, sense, child) - elif child.template_name in TAGS_TEMPLATE_NAMES: - tag_templates.append(child) - raw_gloss_children.append(child) - - elif child.template_name == "помета": + elif child.template_name == "семантика": + # https://ru.wiktionary.org/wiki/Шаблон:семантика + # XXX: Extract semantic templates to linkages + continue + elif child.template_name in NOTE_TEMPLATES: note_templates.append(child) raw_gloss_children.append(child) - else: + elif child.template_name in META_TEMPLATES: + continue + + elif child.template_name in GLOSS_TEMPLATES: clean_gloss_children.append(child) raw_gloss_children.append(child) + else: + # Assume node is tag template + tag_templates.append(child) + raw_gloss_children.append(child) - wxr.wtp.debug( - f"Found template '{child.template_name}' in gloss that could be a tag", - sortid="extractor/ru/gloss/extract_gloss/75", - ) else: clean_gloss_children.append(child) raw_gloss_children.append(child) @@ -100,6 +113,7 @@ def extract_gloss( sense.gloss = gloss for tag_template in tag_templates: + # XXX: Expanded tags are mostly still abbreviations. In Wiktionary, however, they show the full word on hover. Perhaps it's possible to extract the full word from the template? 
tag = clean_node(wxr, {}, tag_template).strip() if tag: sense.tags.append(tag) From 9cc660f67c5fa2f5ca1f8dfc547e4343fb4063ad Mon Sep 17 00:00:00 2001 From: Empiriker Date: Fri, 8 Dec 2023 10:28:23 +0100 Subject: [PATCH 7/7] Implement code review suggestions for Russian Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. --- src/wiktextract/extractor/ru/example.py | 23 ++++------ src/wiktextract/extractor/ru/models.py | 58 +++++++++++-------------- src/wiktextract/extractor/ru/page.py | 1 - tests/test_ru_example.py | 6 +-- 4 files changed, 37 insertions(+), 51 deletions(-) diff --git a/src/wiktextract/extractor/ru/example.py b/src/wiktextract/extractor/ru/example.py index f32946df..a3dd1967 100644 --- a/src/wiktextract/extractor/ru/example.py +++ b/src/wiktextract/extractor/ru/example.py @@ -13,6 +13,11 @@ "ответственный": "editor", "перев": "translator", "источник": "source", + 2: "author", + 3: "title", + 4: "date", + 5: "collection", + 6: "date_published", } @@ -27,21 +32,11 @@ def process_example_template( value = clean_node(wxr, {}, value_raw).strip() if not value: continue - if isinstance(key, int): - if int(key) == 1: - example.text = value - elif int(key) == 2: - reference.author = value - elif int(key) == 3: - reference.title = value - elif int(key) == 4: - reference.date = value - elif int(key) == 5: - reference.collection = value - elif int(key) == 6: - reference.date_published = value + if isinstance(key, int) and key == 1: + example.text = value + else: - key = clean_node(wxr, {}, key) + key = clean_node(wxr, {}, key) if not isinstance(key, int) else key if key == "текст": example.text = value elif key == "перевод": diff --git a/src/wiktextract/extractor/ru/models.py b/src/wiktextract/extractor/ru/models.py index 20d0bb24..861e4a7b 100644 --- a/src/wiktextract/extractor/ru/models.py +++ b/src/wiktextract/extractor/ru/models.py @@ -39,38 +39,6 @@ class Sound(BaseModelWrap): ) -class Sense(BaseModelWrap): - raw_gloss: Optional[str] = Field( - default=None, - description="Raw gloss string for the word sense. This might contain tags and other markup.", - ) - gloss: Optional[str] = Field( - default=None, - description="Gloss string for the word sense. This has been cleaned, and should be straightforward text with no tags.", - ) - tags: list[str] = Field( - default=[], - description="List of tags affecting the word sense.", - ) - notes: list[str] = Field( - default=[], - description="List of notes for the word sense. 
Usually describing usage.", - ) - categories: list[str] = Field( - default=[], - description="list of sense-disambiguated category names extracted from (a subset) of the Category links on the page", - ) - examples: list["Example"] = Field( - default=[], description="List of examples" - ) - # subsenses: list["Sense"] = Field( - # default=[], description="List of subsenses" - # ) - # senseid: Optional[int] = Field( - # default=None, description="Sense number used in Wiktionary" - # ) - - class Reference(BaseModelWrap): author: Optional[str] = Field(default=None, description="Author's name") title: Optional[str] = Field( @@ -100,7 +68,31 @@ class Example(BaseModelWrap): translation: Optional[str] = Field( default=None, description="Spanish translation of the example sentence" ) - ref: Optional["Reference"] = Field(default=None, description="") + ref: Optional[Reference] = Field(default=None, description="") + + +class Sense(BaseModelWrap): + raw_gloss: Optional[str] = Field( + default=None, + description="Raw gloss string for the word sense. This might contain tags and other markup.", + ) + gloss: Optional[str] = Field( + default=None, + description="Gloss string for the word sense. This has been cleaned, and should be straightforward text with no tags.", + ) + tags: list[str] = Field( + default=[], + description="List of tags affecting the word sense.", + ) + notes: list[str] = Field( + default=[], + description="List of notes for the word sense. Usually describing usage.", + ) + categories: list[str] = Field( + default=[], + description="list of sense-disambiguated category names extracted from (a subset) of the Category links on the page", + ) + examples: list[Example] = Field(default=[], description="List of examples") class WordEntry(BaseModelWrap): diff --git a/src/wiktextract/extractor/ru/page.py b/src/wiktextract/extractor/ru/page.py index 17cdf856..cdfa00d3 100644 --- a/src/wiktextract/extractor/ru/page.py +++ b/src/wiktextract/extractor/ru/page.py @@ -198,7 +198,6 @@ def parse_page( wxr.config.capture_language_codes is not None and lang_code not in wxr.config.capture_language_codes ): - print(f"Skipping language {lang_code}") continue categories = {"categories": []} diff --git a/tests/test_ru_example.py b/tests/test_ru_example.py index 1aa3ce77..dcdea45d 100644 --- a/tests/test_ru_example.py +++ b/tests/test_ru_example.py @@ -43,11 +43,11 @@ def test_ru_extract_example(self): }, # https://ru.wiktionary.org/wiki/house { - "input": "{{пример|This is my {{выдел|house}} and my family’s ancestral home.||перевод=Это мой {{выдел|дом}} и поселение моих семейных предков.}}", + "input": "{{пример|This is my house and my family’s ancestral home.||перевод=Это мой дом и поселение моих семейных предков.}}", "expected": [ { - "text": "This is my and my family’s ancestral home.", - "translation": "Это мой и поселение моих семейных предков.", + "text": "This is my house and my family’s ancestral home.", + "translation": "Это мой дом и поселение моих семейных предков.", } ], },