Skip to content

Commit

Permalink
Merge pull request #567 from xxyzz/de
Browse files Browse the repository at this point in the history
Handle de edition's "Ähnlichkeiten Umschrift" template
  • Loading branch information
xxyzz authored Apr 2, 2024
2 parents 2d30b31 + 82f52fc commit cdbac03
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 1 deletion.
1 change: 1 addition & 0 deletions src/wiktextract/extractor/de/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,4 @@ class WordEntry(BaseModelWrap):
tags: list[str] = []
raw_tags: list[str] = []
categories: list[str] = []
redirects: list[str] = []
30 changes: 29 additions & 1 deletion src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from mediawiki_langcodes import name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wikitextprocessor.parser import LevelNode, TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down Expand Up @@ -199,8 +199,36 @@ def parse_page(
)
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
parse_section(wxr, page_data, base_data, level3_node)
for template_node in level2_node.find_child(NodeKind.TEMPLATE):
if template_node.template_name == "Ähnlichkeiten Umschrift":
process_umschrift_template(
wxr, page_data, base_data, template_node
)

for data in page_data:
if len(data.senses) == 0:
data.senses.append(Sense(tags=["no-gloss"]))
return [d.model_dump(exclude_defaults=True) for d in page_data]


def process_umschrift_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Build a "soft-redirect" entry from an "Ähnlichkeiten Umschrift" template.

    A deep copy of ``base_data`` is made and its ``pos`` is set to
    ``"soft-redirect"``. Every integer-keyed template parameter contributes
    one redirect target: the cleaned text of the matching ``link<N>``
    parameter when that is non-empty, otherwise the cleaned text of the
    parameter itself. Empty targets are dropped.

    The copy is appended to ``page_data`` only if at least one redirect
    target was collected; otherwise ``page_data`` is left untouched.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Ähnlichkeiten_Umschrift
    # soft-redirect template, similar to en edition's "zh-see"
    entry = base_data.model_copy(deep=True)
    entry.pos = "soft-redirect"
    params = template_node.template_parameters
    for name, raw_target in params.items():
        # Only the integer-keyed arguments name redirect targets; string
        # keys such as "link<N>" are looked up per target below.
        if not isinstance(name, int):
            continue
        target = clean_node(wxr, None, raw_target)
        link_override = clean_node(wxr, None, params.get(f"link{name}", ""))
        # A non-empty "link<N>" value takes precedence over the raw argument.
        target = link_override or target
        if target:
            entry.redirects.append(target)
    if entry.redirects:
        page_data.append(entry)
26 changes: 26 additions & 0 deletions tests/test_de_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,29 @@ def test_multiple_pos(self):
}
],
)

def test_umschrift(self):
    """An "Ähnlichkeiten Umschrift" template yields one soft-redirect entry."""
    # Stub template page whose body is just its first argument.
    self.wxr.wtp.add_page("Vorlage:Sprache", 10, "{{{1}}}")
    self.wxr.wtp.start_page("iku")
    wikitext = """== hiki ({{Sprache|Umschrift}}) ==
{{Ähnlichkeiten Umschrift
|1=行く|spr1=ja
|2=幾|spr2=ja
|3=𒃷#𒃷 (iku) (Sumerisch)|spr3=sux|link3=𒃷
}}"""
    # The third redirect is expected to come from "link3", not from the
    # longer positional value.
    expected = [
        {
            "word": "iku",
            "lang": "Umschrift",
            "lang_code": "",
            "pos": "soft-redirect",
            "redirects": ["行く", "幾", "𒃷"],
            "senses": [{"tags": ["no-gloss"]}],
        }
    ]
    self.assertEqual(parse_page(self.wxr, "iku", wikitext), expected)

0 comments on commit cdbac03

Please sign in to comment.