-
Notifications
You must be signed in to change notification settings - Fork 87
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #423 from empiriker/ru
Extract glosses, examples, translations and linkages from Russian Wiktionary
- Loading branch information
Showing
11 changed files
with
717 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
{ | ||
"антонимы": "antonyms", | ||
"анаграммы": "anagrams", | ||
"варианты": "variants", | ||
"гиперонимы": "hypernyms", | ||
"гипонимы": "hyponyms", | ||
"дериваты": "derived", | ||
"меронимы": "meronyms", | ||
"синонимы": "synonyms", | ||
"согипонимы": "coordinate_terms", | ||
"холонимы": "holonyms" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from wikitextprocessor import WikiNode | ||
|
||
from wiktextract.extractor.ru.models import Example, Reference, Sense | ||
from wiktextract.page import clean_node | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
# Maps parameters of the example template to the attribute names that
# process_example_template sets on the Reference model. String keys are named
# template parameters, integer keys are positional ones. Positional parameter 1
# is absent because process_example_template stores it as the example text.
EXAMPLE_TEMPLATE_KEY_MAPPING = {
    "автор": "author",
    "титул": "title",
    "дата": "date",
    "издание": "collection",
    "дата издания": "date_published",
    "ответственный": "editor",
    "перев": "translator",
    "источник": "source",
    2: "author",
    3: "title",
    4: "date",
    5: "collection",
    6: "date_published",
}
|
||
|
||
def process_example_template(
    wxr: WiktextractContext,
    sense: Sense,
    template_node: WikiNode,
):
    """Extract one usage example from an example template into ``sense``.

    Positional parameter 1 and the named parameter ``текст`` provide the
    example text, ``перевод`` provides its translation.  Every other
    parameter is looked up in ``EXAMPLE_TEMPLATE_KEY_MAPPING`` and stored on
    a ``Reference``.  Parameters whose cleaned value is empty are skipped.

    The example is appended to ``sense.examples`` only if at least one of its
    own fields was set; the reference is attached only if it is non-empty.

    Args:
        wxr: Extraction context, used for cleaning nodes and debug logging.
        sense: Sense that receives the extracted example.
        template_node: The template node whose parameters are read.
    """
    example = Example()
    reference = Reference()
    # Pydantic deprecates reading ``model_fields`` through an instance, so
    # fetch the field table from the class once, outside the loop.
    reference_fields = type(reference).model_fields

    for key, value_raw in template_node.template_parameters.items():
        value = clean_node(wxr, {}, value_raw).strip()
        if not value:
            continue

        # Positional parameters arrive as ints; only named ones need cleaning.
        if not isinstance(key, int):
            key = clean_node(wxr, {}, key)

        if key in (1, "текст"):
            example.text = value
        elif key == "перевод":
            example.translation = value
        else:
            # Single lookup; ``None`` (unmapped key) is never a field name,
            # so unmapped keys and mapped-but-missing fields are both logged
            # instead of being dropped silently.
            field_name = EXAMPLE_TEMPLATE_KEY_MAPPING.get(key)
            if field_name in reference_fields:
                setattr(reference, field_name, value)
            else:
                wxr.wtp.debug(
                    f"Unknown key {key} in example template {template_node}",
                    sortid="wiktextract/extractor/ru/example/process_example_template/54",
                )

    if example.model_dump(exclude_defaults=True) != {}:
        if reference.model_dump(exclude_defaults=True) != {}:
            example.ref = reference

        sense.examples.append(example)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
from wikitextprocessor import NodeKind, WikiNode | ||
from wikitextprocessor.parser import WikiNodeChildrenList | ||
|
||
from wiktextract.extractor.ru.example import process_example_template | ||
from wiktextract.extractor.ru.models import Sense, WordEntry | ||
from wiktextract.page import clean_node | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
# Wiktionary-internal templates that can be ignored
META_TEMPLATES = {
    "помета.",
    "Нужен перевод",
    "?",
}
|
||
# Templates whose expansion is part of the gloss text: extract_gloss keeps
# them in both the raw gloss and the clean gloss.
GLOSS_TEMPLATES = {
    "-",
    "=",
    "===",
    "english surname example",
    "lang",
    "аббр.",
    "выдел",
    "гипокор.",
    "дееприч.",
    "действие",
    "женск.",
    "ласк.",
    "мн",
    "морфема",
    "нареч.",
    "наречие",
    "однокр.",
    "отн.",
    "по.",
    "по",
    "превосх.",
    "прич.",
    "свойство",
    "совершить",
    "сокр.",
    "сокращ",
    "соотн.",
    "сравн.",
    "страд.",
    "то же",
    "увелич.",
    "уменьш.",
    "умласк",
    "умласк.",
    "унич.",
    "уничиж.",
    "хим-элем",
    "элемент",
}
|
||
# Templates that specify a note for the gloss
# NOTE(review): "пример" never reaches the note branch in extract_gloss, which
# hands that template to process_example_template first - confirm whether the
# entry is still needed here.
NOTE_TEMPLATES = {"пример", "помета", "??", "as ru"}
|
||
|
||
def extract_gloss(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    item_node: WikiNode,
):
    """Build a ``Sense`` from one gloss item and append it to ``word_entry``.

    The children of ``item_node`` are sorted into four buckets: nodes that
    make up the raw gloss, nodes that make up the clean gloss, tag templates
    and note templates.  Example templates are handed to
    ``process_example_template``; ``семантика`` and meta templates are
    dropped.  The sense is appended only if any of its fields was set.
    """
    sense = Sense()

    raw_parts: WikiNodeChildrenList = []
    clean_parts: WikiNodeChildrenList = []
    tag_nodes: list[WikiNode] = []
    note_nodes: list[WikiNode] = []

    for node in item_node.children:
        if not (isinstance(node, WikiNode) and node.kind == NodeKind.TEMPLATE):
            # Plain text and non-template nodes belong to both gloss variants.
            clean_parts.append(node)
            raw_parts.append(node)
            continue

        name = node.template_name
        if name == "пример":
            process_example_template(wxr, sense, node)
            continue
        if name == "семантика":
            # https://ru.wiktionary.org/wiki/Шаблон:семантика
            # XXX: Extract semantic templates to linkages
            continue

        if name in NOTE_TEMPLATES:
            note_nodes.append(node)
        elif name in META_TEMPLATES:
            continue
        elif name in GLOSS_TEMPLATES:
            clean_parts.append(node)
        else:
            # Assume node is tag template
            tag_nodes.append(node)
        # Every template that was not skipped above stays in the raw gloss.
        raw_parts.append(node)

    remove_obsolete_leading_nodes(raw_parts)
    remove_obsolete_leading_nodes(clean_parts)

    if raw_parts:
        raw_text = clean_node(wxr, {}, raw_parts).strip()
        if raw_text:
            sense.raw_gloss = raw_text

    if clean_parts:
        clean_text = clean_node(wxr, {}, clean_parts).strip()
        if clean_text:
            sense.gloss = clean_text

    # XXX: Expanded tags are mostly still abbreviations. In Wiktionary,
    # however, they show the full word on hover. Perhaps it's possible to
    # extract the full word from the template?
    for templates, target in ((tag_nodes, sense.tags), (note_nodes, sense.notes)):
        for template in templates:
            text = clean_node(wxr, {}, template).strip()
            if text:
                target.append(text)

    if sense.model_dump(exclude_defaults=True):
        word_entry.senses.append(sense)
|
||
|
||
def remove_obsolete_leading_nodes(nodes: WikiNodeChildrenList) -> None:
    """Strip leading filler strings from ``nodes`` in place.

    Leading string items that, after ``str.strip()``, are empty or consist
    only of a conjunction ("и", "или") or one of the punctuation marks
    ``, . ; :`` are removed.  Scanning stops at the first item that is not a
    string or that carries other text; everything from there on is kept.

    Args:
        nodes: List of strings and nodes; modified in place.
    """
    # Whitespace-only strings (including "\n") strip down to "".
    obsolete = {"", "и", "или", ",", ".", ";", ":"}

    leading = 0
    for node in nodes:
        if not isinstance(node, str) or node.strip() not in obsolete:
            break
        leading += 1

    # One slice deletion instead of repeated O(n) ``pop(0)`` calls.
    del nodes[:leading]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
from wikitextprocessor import NodeKind, WikiNode | ||
|
||
from wiktextract.extractor.ru.models import WordEntry | ||
from wiktextract.page import clean_node | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
def extract_linkages(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    level_node: WikiNode,
):
    """Append the text of every link below ``level_node`` to a linkage list.

    If ``linkage_type`` is not a field of the word entry's model, a debug
    message is logged and nothing is extracted.  Otherwise each link node
    found recursively under ``level_node`` is cleaned, and non-empty results
    are appended to the ``linkage_type`` attribute of ``word_entry``.

    Args:
        wxr: Extraction context, used for cleaning nodes and debug logging.
        word_entry: Entry whose linkage attribute is extended.
        linkage_type: Name of the attribute on ``word_entry`` to append to.
        level_node: Section node that is searched for link nodes.
    """
    # Pydantic deprecates reading ``model_fields`` through an instance.
    if linkage_type not in type(word_entry).model_fields:
        wxr.wtp.debug(
            f"Linkage type {linkage_type} not defined for word entry",
            sortid="extractor/ru/linkage/extract_linkages/10",
        )
        return

    # The target attribute does not change while iterating; look it up once.
    linkage_words = getattr(word_entry, linkage_type)
    for link_node in level_node.find_child_recursively(NodeKind.LINK):
        word = clean_node(wxr, {}, link_node).strip()
        if word:
            linkage_words.append(word)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.