Skip to content

Commit

Permalink
Handle de edition's "Ähnlichkeiten Umschrift" template
Browse files Browse the repository at this point in the history
2934 pages use this template to list transcription words.
  • Loading branch information
xxyzz committed Apr 2, 2024
1 parent 2d30b31 commit 82f52fc
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 1 deletion.
1 change: 1 addition & 0 deletions src/wiktextract/extractor/de/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,4 @@ class WordEntry(BaseModelWrap):
tags: list[str] = []
raw_tags: list[str] = []
categories: list[str] = []
redirects: list[str] = []
30 changes: 29 additions & 1 deletion src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from mediawiki_langcodes import name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LevelNode
from wikitextprocessor.parser import LevelNode, TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand Down Expand Up @@ -199,8 +199,36 @@ def parse_page(
)
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
parse_section(wxr, page_data, base_data, level3_node)
for template_node in level2_node.find_child(NodeKind.TEMPLATE):
if template_node.template_name == "Ähnlichkeiten Umschrift":
process_umschrift_template(
wxr, page_data, base_data, template_node
)

for data in page_data:
if len(data.senses) == 0:
data.senses.append(Sense(tags=["no-gloss"]))
return [d.model_dump(exclude_defaults=True) for d in page_data]


def process_umschrift_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Turn an "Ähnlichkeiten Umschrift" template into a soft-redirect entry.

    See https://de.wiktionary.org/wiki/Vorlage:Ähnlichkeiten_Umschrift
    This is a soft-redirect template, similar to the en edition's "zh-see".

    A deep copy of ``base_data`` is made and its ``pos`` is set to
    ``"soft-redirect"``. Every integer-keyed template parameter contributes
    one redirect target: the cleaned text of the matching ``link<N>``
    parameter when that is non-empty, otherwise the cleaned text of the
    parameter itself. Empty targets are dropped. The copy is appended to
    ``page_data`` only if at least one target was collected.
    """
    params = template_node.template_parameters
    entry = base_data.model_copy(deep=True)
    entry.pos = "soft-redirect"

    for arg_name, arg_value in params.items():
        # Only numbered arguments name target pages; string-keyed ones
        # (e.g. "spr1", "link3") are skipped here.
        if not isinstance(arg_name, int):
            continue
        page_title = clean_node(wxr, None, arg_value)
        link_title = clean_node(
            wxr, None, params.get(f"link{arg_name}", "")
        )
        # An explicit "link<N>" value takes precedence over the argument.
        target = link_title or page_title
        if target:
            entry.redirects.append(target)

    if entry.redirects:
        page_data.append(entry)
26 changes: 26 additions & 0 deletions tests/test_de_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,29 @@ def test_multiple_pos(self):
}
],
)

def test_umschrift(self):
    """Check that "Ähnlichkeiten Umschrift" yields a soft-redirect entry.

    The third argument carries a "link3" parameter, so the expected
    redirect is the link value rather than the full argument text.
    """
    self.wxr.wtp.add_page("Vorlage:Sprache", 10, "{{{1}}}")
    self.wxr.wtp.start_page("iku")
    wikitext = """== hiki ({{Sprache|Umschrift}}) ==
{{Ähnlichkeiten Umschrift
|1=行く|spr1=ja
|2=幾|spr2=ja
|3=𒃷#𒃷 (iku) (Sumerisch)|spr3=sux|link3=𒃷
}}"""
    expected = [
        {
            "lang_code": "",
            "lang": "Umschrift",
            "pos": "soft-redirect",
            "redirects": ["行く", "幾", "𒃷"],
            "senses": [{"tags": ["no-gloss"]}],
            "word": "iku",
        }
    ]
    self.assertEqual(parse_page(self.wxr, "iku", wikitext), expected)

0 comments on commit 82f52fc

Please sign in to comment.