Skip to content

Commit

Permalink
Merge pull request #465 from xxyzz/ru
Browse files Browse the repository at this point in the history
Extract linkage tags for ru edition
  • Loading branch information
xxyzz authored Jan 24, 2024
2 parents e04c498 + dcc30a2 commit 4f08e34
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 4 deletions.
41 changes: 37 additions & 4 deletions src/wiktextract/extractor/ru/linkage.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wiktextract.extractor.ru.models import Linkage, WordEntry
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand All @@ -16,7 +17,39 @@ def extract_linkages(
sortid="extractor/ru/linkage/extract_linkages/10",
)
return
for link_node in level_node.find_child_recursively(NodeKind.LINK):
word = clean_node(wxr, {}, link_node).strip()
if word:
getattr(word_entry, linkage_type).append(Linkage(word=word))
for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
linkage = Linkage()
for node in list_item.children:
if isinstance(node, WikiNode):
if node.kind == NodeKind.LINK:
linkage.word = clean_node(wxr, None, node)
elif isinstance(node, TemplateNode):
find_linkage_tag(wxr, linkage, node)
elif isinstance(node, str) and node.strip() in (";", ","):
if len(linkage.word) > 0:
getattr(word_entry, linkage_type).append(linkage)
tags = linkage.tags
linkage = Linkage()
if node.strip() == ",":
linkage.tags = tags

if len(linkage.word) > 0:
getattr(word_entry, linkage_type).append(linkage)
linkage = Linkage()


def find_linkage_tag(
    wxr: WiktextractContext,
    linkage: Linkage,
    template_node: TemplateNode,
) -> None:
    """Append the tags produced by a label template to ``linkage.tags``.

    The template node is serialised back to wikitext and re-parsed with
    ``expand_all=True``. Every ``<span>`` found in the expanded tree then
    contributes one tag: the value of its ``title`` attribute when the
    span has one, otherwise the span's cleaned text. Empty tags are
    skipped. ``linkage`` is modified in place; nothing is returned.
    """
    template_text = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_text, expand_all=True)
    for span in expanded_root.find_html_recursively("span"):
        span_attrs = span.attrs
        # The title attribute wins over the visible span text.
        label = (
            span_attrs["title"]
            if "title" in span_attrs
            else clean_node(wxr, None, span)
        )
        if len(label) == 0:
            continue
        linkage.tags.append(label)
1 change: 1 addition & 0 deletions src/wiktextract/extractor/ru/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class Translation(BaseModelWrap):

class Linkage(BaseModelWrap):
    # One linked word attached to a word entry (e.g. a synonym), together
    # with the labels that qualify it.
    # NOTE(review): the shared `[]` default below is only safe if
    # BaseModelWrap copies field defaults per instance (as pydantic models
    # do) - confirm against the BaseModelWrap definition.
    word: str = ""  # text of the linked term; empty until one is assigned
    tags: list[str] = []  # label strings collected for this word


class Sound(BaseModelWrap):
Expand Down
47 changes: 47 additions & 0 deletions tests/test_ru_linkage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from unittest import TestCase

from wikitextprocessor import Wtp
from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.ru.linkage import extract_linkages
from wiktextract.extractor.ru.models import WordEntry
from wiktextract.wxr_context import WiktextractContext


class TestLinkage(TestCase):
    """Checks linkage extraction for the Russian edition extractor."""

    def setUp(self) -> None:
        # Build a fresh context for every test, configured for "ru".
        wtp = Wtp(lang_code="ru")
        config = WiktionaryConfig(dump_file_lang_code="ru")
        self.wxr = WiktextractContext(wtp, config)

    def tearDown(self) -> None:
        # Release the database connection opened by the Wtp instance.
        self.wxr.wtp.close_db_conn()

    def test_linkage(self):
        # One list item mixing label templates, links and "," / ";"
        # separators. The expected data below shows the first label applied
        # to all three comma-separated words before the ";", and both
        # labels after the ";" applied to the final word.
        entry = WordEntry(
            word="русский", pos="adj", lang_code="ru", lang="Русский"
        )
        wtp = self.wxr.wtp
        wtp.start_page("русский")

        # (page title, page body) pairs for the label templates used in the
        # wikitext; 10 is presumably the template namespace id.
        label_templates = (
            ("Шаблон:помета", "<span>экзоэтнонимы</span>"),
            (
                "Шаблон:собир.",
                '[[Викисловарь:Условные сокращения|<span title="собирательное">собир.</span>]]',
            ),
            (
                "Шаблон:уничиж.",
                '[[Викисловарь:Условные сокращения|<span title="уничижительное">уничиж.</span>]]',
            ),
        )
        for page_title, page_body in label_templates:
            wtp.add_page(page_title, 10, page_body)

        root = wtp.parse(
            "# {{помета|экзоэтнонимы}}: [[кацап]], [[москаль]], [[шурави]]; {{собир.|-}}, {{уничиж.|-}}: [[русня]]"
        )
        extract_linkages(self.wxr, entry, "synonyms", root.children[0])

        actual = [
            item.model_dump(exclude_defaults=True) for item in entry.synonyms
        ]
        expected = [
            {"word": "кацап", "tags": ["экзоэтнонимы"]},
            {"word": "москаль", "tags": ["экзоэтнонимы"]},
            {"word": "шурави", "tags": ["экзоэтнонимы"]},
            {"word": "русня", "tags": ["собирательное", "уничижительное"]},
        ]
        self.assertEqual(actual, expected)

0 comments on commit 4f08e34

Please sign in to comment.