Merge pull request #423 from empiriker/ru

Extract glosses, examples, translations and linkages from Russian Wiktionary
tatuylonen · Dec 8, 2023 · 8cd256b · 8cd256b
2 parents 3214765 + 9cc660f
commit 8cd256b
Show file tree

Hide file tree

Showing 11 changed files with 717 additions and 3 deletions.
diff --git a/src/wiktextract/data/ru/linkage_subtitles.json b/src/wiktextract/data/ru/linkage_subtitles.json
@@ -0,0 +1,12 @@
+{
+  "антонимы": "antonyms",
+  "анаграммы": "anagrams",
+  "варианты": "variants",
+  "гиперонимы": "hypernyms",
+  "гипонимы": "hyponyms",
+  "дериваты": "derived",
+  "меронимы": "meronyms",
+  "синонимы": "synonyms",
+  "согипонимы": "coordinate_terms",
+  "холонимы": "holonyms"
+}
diff --git a/src/wiktextract/extractor/ru/example.py b/src/wiktextract/extractor/ru/example.py
@@ -0,0 +1,58 @@
+from wikitextprocessor import WikiNode
+
+from wiktextract.extractor.ru.models import Example, Reference, Sense
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
+
+# Maps parameters of the example template to attribute names of the
+# `Reference` model. String keys are named template parameters, integer keys
+# are positional parameters. Positional parameter 1 is not listed because
+# `process_example_template` stores it as the example text instead.
+EXAMPLE_TEMPLATE_KEY_MAPPING = {
+    "автор": "author",
+    "титул": "title",
+    "дата": "date",
+    "издание": "collection",
+    "дата издания": "date_published",
+    "ответственный": "editor",
+    "перев": "translator",
+    "источник": "source",
+    2: "author",
+    3: "title",
+    4: "date",
+    5: "collection",
+    6: "date_published",
+}
+
+
+def process_example_template(
+    wxr: WiktextractContext,
+    sense: Sense,
+    template_node: WikiNode,
+):
+    """Extract an example and its reference from an example template.
+
+    Each non-empty template parameter is cleaned and then routed:
+
+    * positional parameter 1 or ``текст`` -> ``Example.text``
+    * ``перевод`` -> ``Example.translation``
+    * keys listed in ``EXAMPLE_TEMPLATE_KEY_MAPPING`` -> the matching
+      ``Reference`` field
+    * anything else is reported through ``wxr.wtp.debug`` and skipped
+
+    Args:
+        wxr: Extraction context, used for cleaning nodes and debug output.
+        sense: Sense whose ``examples`` list receives the new example.
+        template_node: The example template node to read parameters from.
+
+    The example is appended only if at least one of its fields was set; the
+    reference is attached only if at least one of its fields was set.
+    """
+    example = Example()
+    reference = Reference()
+    for key, value_raw in template_node.template_parameters.items():
+        value = clean_node(wxr, {}, value_raw).strip()
+        if not value:
+            continue
+
+        # Positional keys are ints and are used as-is; named keys are
+        # cleaned before they are compared.
+        if not isinstance(key, int):
+            key = clean_node(wxr, {}, key)
+
+        if key == 1 or key == "текст":
+            example.text = value
+        elif key == "перевод":
+            example.translation = value
+        else:
+            # `.get` yields None for unmapped keys, which is never a model
+            # field, so unknown keys fall through to the debug message
+            # instead of being dropped silently.
+            field_name = EXAMPLE_TEMPLATE_KEY_MAPPING.get(key)
+            if field_name in Reference.model_fields:
+                setattr(reference, field_name, value)
+            else:
+                wxr.wtp.debug(
+                    f"Unknown key {key} in example template {template_node}",
+                    sortid="wiktextract/extractor/ru/example/process_example_template/54",
+                )
+
+    if example.model_dump(exclude_defaults=True) != {}:
+        if reference.model_dump(exclude_defaults=True) != {}:
+            example.ref = reference
+
+        sense.examples.append(example)
diff --git a/src/wiktextract/extractor/ru/gloss.py b/src/wiktextract/extractor/ru/gloss.py
@@ -0,0 +1,136 @@
+from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import WikiNodeChildrenList
+
+from wiktextract.extractor.ru.example import process_example_template
+from wiktextract.extractor.ru.models import Sense, WordEntry
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
+
+# Wiktionary-internal templates that can be ignored; `extract_gloss` skips
+# them without adding anything to the sense.
+META_TEMPLATES = {
+    "помета.",
+    "Нужен перевод",
+    "?",
+}
+
+# Templates that are part of the clean gloss when expanded. `extract_gloss`
+# keeps them in both the raw gloss and the clean gloss instead of treating
+# them as tags.
+GLOSS_TEMPLATES = {
+    "-",
+    "=",
+    "===",
+    "english surname example",
+    "lang",
+    "аббр.",
+    "выдел",
+    "гипокор.",
+    "дееприч.",
+    "действие",
+    "женск.",
+    "ласк.",
+    "мн",
+    "морфема",
+    "нареч.",
+    "наречие",
+    "однокр.",
+    "отн.",
+    "по.",
+    "по",
+    "превосх.",
+    "прич.",
+    "свойство",
+    "совершить",
+    "сокр.",
+    "сокращ",
+    "соотн.",
+    "сравн.",
+    "страд.",
+    "то же",
+    "увелич.",
+    "уменьш.",
+    "умласк",
+    "умласк.",
+    "унич.",
+    "уничиж.",
+    "хим-элем",
+    "элемент",
+}
+
+# Templates that specify a note for the gloss. `extract_gloss` keeps them in
+# the raw gloss and stores their expanded text in `Sense.notes`.
+# NOTE(review): "пример" is listed here, but `extract_gloss` hands that
+# template to `process_example_template` before it consults this set, so the
+# entry looks unreachable - confirm whether it is intentional.
+NOTE_TEMPLATES = {"пример", "помета", "??", "as ru"}
+
+
+def extract_gloss(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    item_node: WikiNode,
+):
+    """Build a ``Sense`` from one gloss list item and add it to the entry.
+
+    The children of ``item_node`` are sorted into raw-gloss nodes,
+    clean-gloss nodes, tag templates and note templates, while example
+    templates are handed to ``process_example_template``. The sense is
+    appended to ``word_entry.senses`` only if at least one field was set.
+    """
+    sense = Sense()
+
+    raw_nodes: WikiNodeChildrenList = []
+    clean_nodes: WikiNodeChildrenList = []
+    tag_nodes: list[WikiNode] = []
+    note_nodes: list[WikiNode] = []
+
+    for node in item_node.children:
+        if not (isinstance(node, WikiNode) and node.kind == NodeKind.TEMPLATE):
+            # Plain text and non-template nodes are gloss content.
+            clean_nodes.append(node)
+            raw_nodes.append(node)
+            continue
+
+        name = node.template_name
+        if name == "пример":
+            process_example_template(wxr, sense, node)
+            continue
+        if name == "семантика":
+            # https://ru.wiktionary.org/wiki/Шаблон:семантика
+            # XXX: Extract semantic templates to linkages
+            continue
+
+        if name in NOTE_TEMPLATES:
+            note_nodes.append(node)
+        elif name in META_TEMPLATES:
+            continue
+        elif name in GLOSS_TEMPLATES:
+            clean_nodes.append(node)
+        else:
+            # Assume node is tag template
+            tag_nodes.append(node)
+        # Note, gloss and tag templates all remain part of the raw gloss.
+        raw_nodes.append(node)
+
+    remove_obsolete_leading_nodes(raw_nodes)
+    remove_obsolete_leading_nodes(clean_nodes)
+
+    gloss_fields = (("raw_gloss", raw_nodes), ("gloss", clean_nodes))
+    for field_name, nodes in gloss_fields:
+        if not nodes:
+            continue
+        text = clean_node(wxr, {}, nodes).strip()
+        if text:
+            setattr(sense, field_name, text)
+
+    # XXX: Expanded tags are mostly still abbreviations. In Wiktionary,
+    # however, they show the full word on hover. Perhaps it's possible to
+    # extract the full word from the template?
+    template_targets = ((tag_nodes, sense.tags), (note_nodes, sense.notes))
+    for templates, target in template_targets:
+        for template in templates:
+            text = clean_node(wxr, {}, template).strip()
+            if text:
+                target.append(text)
+
+    if sense.model_dump(exclude_defaults=True):
+        word_entry.senses.append(sense)
+
+
+def remove_obsolete_leading_nodes(nodes: WikiNodeChildrenList):
+    """Strip leading filler strings from ``nodes`` in place.
+
+    Leading string items that, once stripped of whitespace, are empty or a
+    bare conjunction/punctuation mark are removed. Scanning stops at the
+    first item that is not a string or that carries any other text.
+    """
+    obsolete = ("", "и", "или", ",", ".", ";", ":", "\n")
+    keep_from = 0
+    for node in nodes:
+        if not isinstance(node, str) or node.strip() not in obsolete:
+            break
+        keep_from += 1
+    del nodes[:keep_from]
diff --git a/src/wiktextract/extractor/ru/linkage.py b/src/wiktextract/extractor/ru/linkage.py
@@ -0,0 +1,23 @@
+from wikitextprocessor import NodeKind, WikiNode
+
+from wiktextract.extractor.ru.models import WordEntry
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
+
+
+def extract_linkages(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    linkage_type: str,
+    level_node: WikiNode,
+):
+    """Collect the link texts below ``level_node`` as linkages of one type.
+
+    Args:
+        wxr: Extraction context, used for cleaning nodes and debug output.
+        word_entry: Entry whose ``linkage_type`` list receives the words.
+        linkage_type: Name of the ``WordEntry`` field to append to. If the
+            model has no such field, a debug message is emitted and nothing
+            is extracted.
+        level_node: Node that is searched recursively for link nodes.
+    """
+    # Fields are declared on the model class, so look them up there rather
+    # than through the instance.
+    if linkage_type not in type(word_entry).model_fields:
+        wxr.wtp.debug(
+            f"Linkage type {linkage_type} not defined for word entry",
+            sortid="extractor/ru/linkage/extract_linkages/10",
+        )
+        return
+    linkages = getattr(word_entry, linkage_type)
+    for link_node in level_node.find_child_recursively(NodeKind.LINK):
+        word = clean_node(wxr, {}, link_node).strip()
+        if word:
+            linkages.append(word)
diff --git a/src/wiktextract/extractor/ru/models.py b/src/wiktextract/extractor/ru/models.py
@@ -1,11 +1,26 @@
 from typing import Optional
+
 from pydantic import BaseModel, ConfigDict, Field
 
 
 class BaseModelWrap(BaseModel):
+    # Base class the other models in this module derive from; it carries the
+    # shared pydantic configuration (validate_assignment=True and
+    # extra="forbid").
     model_config = ConfigDict(validate_assignment=True, extra="forbid")
 
 
+class Translation(BaseModelWrap):
+    # A single translation entry: the translated term, the code and
+    # localized name of its language, and an optional gloss naming the sense
+    # that is translated. `word`, `lang_code` and `lang_name` have no
+    # default and are therefore required.
+    word: str = Field(description="Translation term")
+    lang_code: str = Field(
+        description="Wiktionary language code of the translation term"
+    )
+    lang_name: str = Field(
+        description="Localized language name of the translation term"
+    )
+    sense: Optional[str] = Field(
+        default=None,
+        description="An optional gloss describing the sense translated",
+    )
+
+
 class Sound(BaseModelWrap):
     ipa: Optional[str] = Field(
         default=None, description="International Phonetic Alphabet"
@@ -24,6 +39,62 @@ class Sound(BaseModelWrap):
     )
 
 
+class Reference(BaseModelWrap):
+    # Source details of a quoted example. Every field is optional and
+    # defaults to None; `process_example_template` sets them from template
+    # parameters by field name.
+    author: Optional[str] = Field(default=None, description="Author's name")
+    title: Optional[str] = Field(
+        default=None, description="Title of the reference"
+    )
+    date: Optional[str] = Field(default=None, description="Original date")
+    date_published: Optional[str] = Field(
+        default=None, description="Date of publication"
+    )
+
+    collection: Optional[str] = Field(
+        default=None,
+        description="Name of the collection the example was taken from",
+    )
+    editor: Optional[str] = Field(default=None, description="Editor")
+    translator: Optional[str] = Field(default=None, description="Translator")
+    source: Optional[str] = Field(
+        default=None,
+        description="Source of reference, corresponds to template parameter 'источник'",
+    )
+
+
+class Example(BaseModelWrap):
+    # An example usage of a word sense, optionally with a translation and
+    # the reference it was quoted from. This is the Russian Wiktionary
+    # extractor, so the translation description names Russian.
+    text: Optional[str] = Field(
+        default=None, description="Example usage sentence"
+    )
+    translation: Optional[str] = Field(
+        default=None, description="Russian translation of the example sentence"
+    )
+    ref: Optional[Reference] = Field(
+        default=None, description="Source reference of the example sentence"
+    )
+
+
+class Sense(BaseModelWrap):
+    # One word sense. `extract_gloss` fills `raw_gloss`, `gloss`, `tags` and
+    # `notes`; `process_example_template` appends to `examples`.
+    raw_gloss: Optional[str] = Field(
+        default=None,
+        description="Raw gloss string for the word sense. This might contain tags and other markup.",
+    )
+    gloss: Optional[str] = Field(
+        default=None,
+        description="Gloss string for the word sense. This has been cleaned, and should be straightforward text with no tags.",
+    )
+    tags: list[str] = Field(
+        default=[],
+        description="List of tags affecting the word sense.",
+    )
+    notes: list[str] = Field(
+        default=[],
+        description="List of notes for the word sense. Usually describing usage.",
+    )
+    categories: list[str] = Field(
+        default=[],
+        description="list of sense-disambiguated category names extracted from (a subset) of the Category links on the page",
+    )
+    examples: list[Example] = Field(default=[], description="List of examples")
+
+
 class WordEntry(BaseModelWrap):
     """
     WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.
@@ -45,3 +116,36 @@ class WordEntry(BaseModelWrap):
         description="list of non-disambiguated categories for the word",
     )
     sounds: Optional[list[Sound]] = []
+    senses: Optional[list[Sense]] = []
+    translations: Optional[list[Translation]] = []
+
+    antonyms: Optional[list[str]] = Field(
+        default=[], description="List of antonyms"
+    )
+    anagrams: Optional[list[str]] = Field(
+        default=[], description="List of anagrams"
+    )
+    variants: Optional[list[str]] = Field(
+        default=[], description="List of variants"
+    )
+    hypernyms: Optional[list[str]] = Field(
+        default=[], description="List of hypernyms"
+    )
+    hyponyms: Optional[list[str]] = Field(
+        default=[], description="List of hyponyms"
+    )
+    derived: Optional[list[str]] = Field(
+        default=[], description="List of derived terms"
+    )
+    meronyms: Optional[list[str]] = Field(
+        default=[], description="List of meronyms"
+    )
+    synonyms: Optional[list[str]] = Field(
+        default=[], description="List of synonyms"
+    )
+    coordinate_terms: Optional[list[str]] = Field(
+        default=[], description="List of coordinate terms"
+    )
+    holonyms: Optional[list[str]] = Field(
+        default=[], description="List of holonyms"
+    )