Skip to content

Commit

Permalink
Merge pull request #423 from empiriker/ru
Browse files Browse the repository at this point in the history
Extract glosses, examples, translations and linkages from Russian Wiktionary
  • Loading branch information
xxyzz authored Dec 8, 2023
2 parents 3214765 + 9cc660f commit 8cd256b
Show file tree
Hide file tree
Showing 11 changed files with 717 additions and 3 deletions.
12 changes: 12 additions & 0 deletions src/wiktextract/data/ru/linkage_subtitles.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"антонимы": "antonyms",
"анаграммы": "anagrams",
"варианты": "variants",
"гиперонимы": "hypernyms",
"гипонимы": "hyponyms",
"дериваты": "derived",
"меронимы": "meronyms",
"синонимы": "synonyms",
"согипонимы": "coordinate_terms",
"холонимы": "holonyms"
}
58 changes: 58 additions & 0 deletions src/wiktextract/extractor/ru/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from wikitextprocessor import WikiNode

from wiktextract.extractor.ru.models import Example, Reference, Sense
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

# Translates parameters of the example template into the field names that
# process_example_template sets on a Reference. Named parameters are keyed by
# their Russian name, positional parameters by their integer position.
EXAMPLE_TEMPLATE_KEY_MAPPING = {
    # Named parameters
    "автор": "author",
    "титул": "title",
    "дата": "date",
    "издание": "collection",
    "дата издания": "date_published",
    "ответственный": "editor",
    "перев": "translator",
    "источник": "source",
    # Positional parameters; position 1 is not listed here because
    # process_example_template treats it as the example text.
    2: "author",
    3: "title",
    4: "date",
    5: "collection",
    6: "date_published",
}


def process_example_template(
    wxr: WiktextractContext,
    sense: Sense,
    template_node: WikiNode,
) -> None:
    """Turn one example template into an Example attached to ``sense``.

    Every template parameter value is cleaned with ``clean_node`` and
    stripped; parameters whose cleaned value is empty are skipped.  The
    remaining parameters are dispatched by key:

    * positional parameter 1 or the named parameter "текст" sets the
      example text,
    * "перевод" sets the translation,
    * keys listed in ``EXAMPLE_TEMPLATE_KEY_MAPPING`` set the mapped field
      of the reference,
    * any other key is ignored.

    The example is appended to ``sense.examples`` only when it differs from
    a default ``Example``; the reference is attached to it only when it
    differs from a default ``Reference``.

    Args:
        wxr: Extraction context, passed to ``clean_node`` and used for debug
            logging.
        sense: Sense that receives the example.
        template_node: The example template node to read parameters from.
    """
    example = Example()
    reference = Reference()
    # Read the field table from the class: accessing ``model_fields`` through
    # an instance is deprecated in recent Pydantic releases.
    reference_fields = Reference.model_fields

    for raw_key, raw_value in template_node.template_parameters.items():
        value = clean_node(wxr, {}, raw_value).strip()
        if not value:
            continue

        # Positional keys stay integers; named keys are cleaned to plain text.
        key = (
            raw_key if isinstance(raw_key, int) else clean_node(wxr, {}, raw_key)
        )

        if key in (1, "текст"):
            example.text = value
        elif key == "перевод":
            example.translation = value
        elif key in EXAMPLE_TEMPLATE_KEY_MAPPING:
            field_name = EXAMPLE_TEMPLATE_KEY_MAPPING[key]
            if field_name in reference_fields:
                setattr(reference, field_name, value)
            else:
                # Only reached if the mapping names a field that Reference
                # does not declare.
                wxr.wtp.debug(
                    f"Unknown key {key} in example template {template_node}",
                    sortid="wiktextract/extractor/ru/example/process_example_template/54",
                )

    if example.model_dump(exclude_defaults=True) == {}:
        # Nothing was extracted for the example itself; drop it (and any
        # reference data collected along the way).
        return

    if reference.model_dump(exclude_defaults=True) != {}:
        example.ref = reference

    sense.examples.append(example)
136 changes: 136 additions & 0 deletions src/wiktextract/extractor/ru/gloss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import WikiNodeChildrenList

from wiktextract.extractor.ru.example import process_example_template
from wiktextract.extractor.ru.models import Sense, WordEntry
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

# Wiktionary-internal templates that can be ignored: extract_gloss skips them
# without adding them to the gloss, the tags or the notes.
META_TEMPLATES = {
    "помета.",
    "Нужен перевод",
    "?",
}

# Templates whose expansion is part of the gloss text itself: extract_gloss
# keeps them in both the clean gloss and the raw gloss.
GLOSS_TEMPLATES = {
    "-",
    "=",
    "===",
    "english surname example",
    "lang",
    "аббр.",
    "выдел",
    "гипокор.",
    "дееприч.",
    "действие",
    "женск.",
    "ласк.",
    "мн",
    "морфема",
    "нареч.",
    "наречие",
    "однокр.",
    "отн.",
    "по.",
    "по",
    "превосх.",
    "прич.",
    "свойство",
    "совершить",
    "сокр.",
    "сокращ",
    "соотн.",
    "сравн.",
    "страд.",
    "то же",
    "увелич.",
    "уменьш.",
    "умласк",
    "умласк.",
    "унич.",
    "уничиж.",
    "хим-элем",
    "элемент",
}

# Templates whose expansion is recorded as a note on the sense. extract_gloss
# checks for "пример" before consulting this set, so that entry is handled as
# an example there rather than as a note.
NOTE_TEMPLATES = {
    "пример",
    "помета",
    "??",
    "as ru",
}


def extract_gloss(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    item_node: WikiNode,
):
    """Build a Sense from one gloss item and append it to ``word_entry``.

    The children of ``item_node`` are sorted into four buckets:

    * nodes that make up the raw gloss (everything that is kept at all),
    * nodes that make up the clean gloss (non-template nodes and templates
      listed in ``GLOSS_TEMPLATES``),
    * templates listed in ``NOTE_TEMPLATES``, which become notes,
    * all remaining templates, which are assumed to be tags.

    "пример" templates are handed to ``process_example_template``;
    "семантика" templates and ``META_TEMPLATES`` are dropped.  The sense is
    only appended when it differs from a default ``Sense``.
    """
    sense = Sense()

    raw_children: WikiNodeChildrenList = []
    clean_children: WikiNodeChildrenList = []
    tag_nodes: list[WikiNode] = []
    note_nodes: list[WikiNode] = []

    for node in item_node.children:
        if not (isinstance(node, WikiNode) and node.kind == NodeKind.TEMPLATE):
            # Anything that is not a template belongs to both gloss variants.
            clean_children.append(node)
            raw_children.append(node)
            continue

        name = node.template_name

        if name == "пример":
            process_example_template(wxr, sense, node)
            continue

        if name == "семантика":
            # https://ru.wiktionary.org/wiki/Шаблон:семантика
            # XXX: Extract semantic templates to linkages
            continue

        if name in NOTE_TEMPLATES:
            note_nodes.append(node)
        elif name in META_TEMPLATES:
            continue
        elif name in GLOSS_TEMPLATES:
            clean_children.append(node)
        else:
            # Assume node is tag template
            tag_nodes.append(node)

        # Every template that was not skipped above stays in the raw gloss.
        raw_children.append(node)

    remove_obsolete_leading_nodes(raw_children)
    remove_obsolete_leading_nodes(clean_children)

    def text_of(nodes) -> str:
        # Expand/clean the given node(s) and trim surrounding whitespace.
        return clean_node(wxr, {}, nodes).strip()

    if raw_children and (raw_gloss := text_of(raw_children)):
        sense.raw_gloss = raw_gloss

    if clean_children and (gloss := text_of(clean_children)):
        sense.gloss = gloss

    # XXX: Expanded tags are mostly still abbreviations. In Wiktionary,
    # however, they show the full word on hover. Perhaps it's possible to
    # extract the full word from the template?
    for tag_node in tag_nodes:
        if tag := text_of(tag_node):
            sense.tags.append(tag)

    for note_node in note_nodes:
        if note := text_of(note_node):
            sense.notes.append(note)

    if sense.model_dump(exclude_defaults=True) != {}:
        word_entry.senses.append(sense)


def remove_obsolete_leading_nodes(nodes: WikiNodeChildrenList) -> None:
    """Drop insignificant leading string nodes from ``nodes`` in place.

    A leading node is removed when it is a string that, once stripped of
    surrounding whitespace, is empty or is one of the connectives and
    punctuation marks "и", "или", ",", ".", ";", ":".  Removal stops at the
    first node that is not such a string; later nodes are never touched.

    Args:
        nodes: List of node children to trim; modified in place.
    """
    # Whitespace-only strings (including "\n") strip down to "", so no
    # separate newline entry is needed.
    obsolete = ("", "и", "или", ",", ".", ";", ":")

    keep_from = 0
    for node in nodes:
        if not (isinstance(node, str) and node.strip() in obsolete):
            break
        keep_from += 1

    # One slice deletion instead of repeated pop(0), which shifts the whole
    # list on every call.
    del nodes[:keep_from]
23 changes: 23 additions & 0 deletions src/wiktextract/extractor/ru/linkage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from wikitextprocessor import NodeKind, WikiNode

from wiktextract.extractor.ru.models import WordEntry
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext


def extract_linkages(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    level_node: WikiNode,
) -> None:
    """Collect the link targets of a linkage section into ``word_entry``.

    Every LINK node found recursively below ``level_node`` is cleaned with
    ``clean_node`` and stripped; non-empty results are appended to the
    ``word_entry`` attribute named by ``linkage_type``.  If the word entry
    model declares no field of that name, a debug message is logged and
    nothing is extracted.

    Args:
        wxr: Extraction context, passed to ``clean_node`` and used for debug
            logging.
        word_entry: Entry whose linkage list is extended.
        linkage_type: Name of the list field on ``word_entry`` to fill.
        level_node: Section node to search for links.
    """
    # Look the field table up on the class: accessing ``model_fields``
    # through an instance is deprecated in recent Pydantic releases.
    if linkage_type not in type(word_entry).model_fields:
        wxr.wtp.debug(
            f"Linkage type {linkage_type} not defined for word entry",
            sortid="extractor/ru/linkage/extract_linkages/10",
        )
        return

    for link_node in level_node.find_child_recursively(NodeKind.LINK):
        word = clean_node(wxr, {}, link_node).strip()
        if word:
            getattr(word_entry, linkage_type).append(word)
104 changes: 104 additions & 0 deletions src/wiktextract/extractor/ru/models.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,26 @@
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field


class BaseModelWrap(BaseModel):
    # Common base of the models in this module: values are validated on
    # assignment as well as on construction, and fields that a model does
    # not declare are rejected.
    model_config = ConfigDict(extra="forbid", validate_assignment=True)


class Translation(BaseModelWrap):
    # One translation of the entry's word. `word`, `lang_code` and
    # `lang_name` are required; `sense` is optional.
    word: str = Field(description="Translation term")
    lang_code: str = Field(
        description="Wiktionary language code of the translation term"
    )
    lang_name: str = Field(
        description="Localized language name of the translation term"
    )
    sense: Optional[str] = Field(
        default=None,
        description="An optional gloss describing the sense translated",
    )


class Sound(BaseModelWrap):
ipa: Optional[str] = Field(
default=None, description="International Phonetic Alphabet"
Expand All @@ -24,6 +39,62 @@ class Sound(BaseModelWrap):
)


class Reference(BaseModelWrap):
    # Bibliographic details of the source an example was taken from.
    # Every field is optional and defaults to None.

    # Who wrote it and what it is called
    author: Optional[str] = Field(default=None, description="Author's name")
    title: Optional[str] = Field(
        default=None, description="Title of the reference"
    )

    # When it was written / published
    date: Optional[str] = Field(default=None, description="Original date")
    date_published: Optional[str] = Field(
        default=None, description="Date of publication"
    )

    # Where it appeared and who else was involved
    collection: Optional[str] = Field(
        default=None,
        description="Name of the collection the example was taken from",
    )
    editor: Optional[str] = Field(default=None, description="Editor")
    translator: Optional[str] = Field(default=None, description="Translator")
    source: Optional[str] = Field(
        default=None,
        description="Source of reference, corresponds to template parameter 'источник'",
    )


class Example(BaseModelWrap):
    # A usage example of a sense: the example sentence, an optional
    # translation of it, and an optional reference to where it was taken
    # from. All fields default to None.
    text: Optional[str] = Field(
        default=None, description="Example usage sentence"
    )
    # This extractor targets the Russian Wiktionary, so the translation is
    # described as Russian (the description previously said "Spanish").
    translation: Optional[str] = Field(
        default=None, description="Russian translation of the example sentence"
    )
    ref: Optional[Reference] = Field(
        default=None,
        description="Reference to the source of the example sentence",
    )


class Sense(BaseModelWrap):
    # One word sense: its gloss in raw and cleaned form, plus the tags,
    # notes, categories and examples collected for it. The gloss fields
    # default to None and the list fields to an empty list.

    # Gloss text
    raw_gloss: Optional[str] = Field(
        default=None,
        description="Raw gloss string for the word sense. This might contain tags and other markup.",
    )
    gloss: Optional[str] = Field(
        default=None,
        description="Gloss string for the word sense. This has been cleaned, and should be straightforward text with no tags.",
    )

    # Annotations attached to the sense
    tags: list[str] = Field(
        default=[],
        description="List of tags affecting the word sense.",
    )
    notes: list[str] = Field(
        default=[],
        description="List of notes for the word sense. Usually describing usage.",
    )
    categories: list[str] = Field(
        default=[],
        description="list of sense-disambiguated category names extracted from (a subset) of the Category links on the page",
    )

    # Usage examples
    examples: list[Example] = Field(default=[], description="List of examples")


class WordEntry(BaseModelWrap):
"""
WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.
Expand All @@ -45,3 +116,36 @@ class WordEntry(BaseModelWrap):
description="list of non-disambiguated categories for the word",
)
sounds: Optional[list[Sound]] = []
senses: Optional[list[Sense]] = []
translations: Optional[list[Translation]] = []

antonyms: Optional[list[str]] = Field(
default=[], description="List of antonyms"
)
anagrams: Optional[list[str]] = Field(
default=[], description="List of anagrams"
)
variants: Optional[list[str]] = Field(
default=[], description="List of variants"
)
hypernyms: Optional[list[str]] = Field(
default=[], description="List of hypernyms"
)
hyponyms: Optional[list[str]] = Field(
default=[], description="List of hyponyms"
)
derived: Optional[list[str]] = Field(
default=[], description="List of derived terms"
)
meronyms: Optional[list[str]] = Field(
default=[], description="List of meronyms"
)
synonyms: Optional[list[str]] = Field(
default=[], description="List of synonyms"
)
coordinate_terms: Optional[list[str]] = Field(
default=[], description="List of coordinate terms"
)
holonyms: Optional[list[str]] = Field(
default=[], description="List of holonyms"
)
Loading

0 comments on commit 8cd256b

Please sign in to comment.