Skip to content

Commit

Permalink
[pt] extract translation section
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Dec 3, 2024
1 parent b518906 commit b07ffde
Show file tree
Hide file tree
Showing 4 changed files with 371 additions and 0 deletions.
17 changes: 17 additions & 0 deletions src/wiktextract/extractor/pt/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,22 @@ class Sense(PortugueseBaseModel):
examples: list[Example] = []


class Translation(PortugueseBaseModel):
    # One translated term collected from a "Tradução" section.
    # NOTE: documented with comments rather than a class docstring, since
    # pydantic copies a model's docstring into its JSON schema description.

    # Target language: Wiktionary code and display name.
    lang_code: str = Field(
        default="",
        description="Wiktionary language code of the translation term",
    )
    lang: str = Field(default="", description="Translation language name")

    # The translated term itself.
    word: str = Field(default="", description="Translation term")

    # Gloss and number of the definition the translation belongs to; the
    # index keeps its default of 0 when no number was given.
    sense: str = Field(default="", description="Translation gloss")
    sense_index: int = Field(
        default=0,
        ge=0,
        description="Number of the definition, start from 1",
    )

    # Normalized tags and the raw, unmapped tag strings.
    tags: list[str] = []
    raw_tags: list[str] = []

    # Parenthesized text or template argument stored next to the term
    # (presumably a romanization - confirm against the templates' docs).
    roman: str = ""


class WordEntry(PortugueseBaseModel):
model_config = ConfigDict(title="Portuguese Wiktionary")
word: str = Field(description="Word string", min_length=1)
Expand All @@ -36,3 +52,4 @@ class WordEntry(PortugueseBaseModel):
categories: list[str] = []
tags: list[str] = []
raw_tags: list[str] = []
translations: list[Translation] = []
17 changes: 17 additions & 0 deletions src/wiktextract/extractor/pt/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA
from .translation import extract_translation_section


def parse_section(
Expand All @@ -30,13 +31,29 @@ def parse_section(
title_text,
cats.get("categories", []),
)
elif title_text == "Tradução":
extract_translation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)

cats = {}
for link_node in level_node.find_child(NodeKind.LINK):
clean_node(wxr, cats, link_node)
for data in page_data:
if data.lang_code == page_data[-1].lang_code:
data.categories.extend(cats.get("categories", []))

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)


def parse_page(
wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
# page layout
# https://pt.wiktionary.org/wiki/Wikcionário:Livro_de_estilo
if "/traduções" in page_title: # skip translation page
return []
wxr.wtp.start_page(page_title)
tree = wxr.wtp.parse(page_text)
page_data: list[WordEntry] = []
Expand Down
232 changes: 232 additions & 0 deletions src/wiktextract/extractor/pt/translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
import re

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Translation, WordEntry


def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract translations from the template and list children of a node.

    A ``tradini`` template sets the gloss and sense number that are passed
    along with every list item that follows it, until another ``tradini``
    template replaces them. Other templates at this level are ignored.
    The list-item extractor adds the results to ``word_entry``.
    """
    current_sense = ""
    current_sense_index = 0
    wanted_kinds = NodeKind.TEMPLATE | NodeKind.LIST
    for child in level_node.find_child(wanted_kinds):
        if child.kind == NodeKind.TEMPLATE:
            if child.template_name == "tradini":
                current_sense, current_sense_index = (
                    extract_tradini_template(wxr, child)
                )
        elif child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr,
                    word_entry,
                    item,
                    current_sense,
                    current_sense_index,
                )


def extract_tradini_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[str, int]:
    """Read the gloss and sense number from a ``tradini`` template.

    Only the first positional argument is used. When it starts with
    ``De <digits>`` the digits become the sense index and the remaining
    text, with surrounding parentheses and spaces stripped, becomes the
    gloss. Otherwise the whole argument is the gloss and the index is 0.

    Returns:
        A ``(sense, sense_index)`` pair; the index is an ``int`` (the
        annotation previously claimed ``str``).
    """
    # https://pt.wiktionary.org/wiki/Predefinição:tradini
    first_arg_str = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    m = re.match(r"De (\d+)", first_arg_str)
    if m is None:
        return first_arg_str, 0
    sense = first_arg_str[m.end() :].strip("() ")
    return sense, int(m.group(1))


def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    sense_index: int,
) -> None:
    """Extract the translations of one list item into ``word_entry``.

    The children of the list item are handled in document order:

    * a link whose cleaned text contains ``/traduções`` is followed as a
      translation subpage;
    * ``trad``, ``trad-``, ``t`` and ``xlatio`` templates produce
      translations (``xlatio`` takes the language name of the translation
      gathered just before it, or ``"unknown"``);
    * plain text containing ``(...)`` is stored as ``roman`` on every
      translation gathered so far in this list item;
    * italic text is added as a raw tag to the latest translation.
    """
    found = []
    for child in list_item.children:
        if isinstance(child, str):
            if re.search(r"\(.+\)", child) is not None:
                roman_text = child.strip("() ")
                for earlier in found:
                    earlier.roman = roman_text
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            link_text = clean_node(wxr, None, child)
            if "/traduções" in link_text:
                extract_translation_subpage(wxr, word_entry, link_text)
        elif isinstance(child, TemplateNode):
            template_name = child.template_name
            if template_name == "trad":
                found.extend(
                    extract_trad_template(wxr, child, sense, sense_index)
                )
            elif template_name == "trad-":
                found.extend(
                    extract_trad_minus_template(
                        wxr, child, sense, sense_index
                    )
                )
            elif template_name == "t":
                found.extend(
                    extract_t_template(wxr, child, sense, sense_index)
                )
            elif template_name == "xlatio":
                # xlatio does not look up a language name itself; reuse
                # the one from the translation just before it.
                previous_lang = found[-1].lang if found else "unknown"
                found.extend(
                    extract_xlatio_template(
                        wxr, child, sense, sense_index, previous_lang
                    )
                )
        elif (
            isinstance(child, WikiNode)
            and child.kind == NodeKind.ITALIC
            and found
        ):
            italic_text = clean_node(wxr, None, child)
            if italic_text != "":
                found[-1].raw_tags.append(italic_text)

    word_entry.translations.extend(found)


def extract_trad_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
) -> list[Translation]:
    """Extract the translations listed in a ``trad`` template.

    Argument 1 is the language code, positional arguments 2 through 11
    are translation terms (reading stops at the first missing one) and
    the named argument ``t`` is stored as ``roman`` on every term. The
    language name is the cleaned text of the first link node found in
    the expanded template.

    Returns:
        One ``Translation`` per non-empty term, in argument order.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:trad
    translations = []
    roman = clean_node(wxr, None, t_node.template_parameters.get("t", ""))
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    # Same fallback as the other template extractors. Without it the name
    # was unbound (UnboundLocalError) when the expansion had no link node.
    lang_name = "unknown"
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for link_node in expanded_node.find_child(NodeKind.LINK):
        lang_name = clean_node(wxr, None, link_node)
        break
    for arg in range(2, 12):
        if arg not in t_node.template_parameters:
            break
        tr_str = clean_node(wxr, None, t_node.template_parameters[arg])
        if tr_str == "":
            # Skip empty terms, as the other extractors in this file do.
            continue
        translations.append(
            Translation(
                word=tr_str,
                lang=lang_name,
                lang_code=lang_code,
                roman=roman,
                sense=sense,
                sense_index=sense_index,
            )
        )
    return translations


def extract_trad_minus_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
) -> list[Translation]:
    """Extract the single translation held by a ``trad-`` template.

    Argument 1 is the language code, argument 2 the term and argument 3
    is stored as ``roman`` after stripping parentheses and spaces. The
    language name is the cleaned text of the first link node found in
    the expanded template, or ``"unknown"`` when there is none.

    Returns:
        A one-element list, or an empty list when the term is empty.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:trad-
    params = t_node.template_parameters
    code = clean_node(wxr, None, params.get(1, ""))
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    first_link = next(iter(expanded.find_child(NodeKind.LINK)), None)
    if first_link is None:
        name = "unknown"
    else:
        name = clean_node(wxr, None, first_link)
    term = clean_node(wxr, None, params.get(2, ""))
    roman_text = clean_node(wxr, None, params.get(3, "")).strip("() ")
    entry = Translation(
        word=term,
        lang=name,
        lang_code=code,
        roman=roman_text,
        sense=sense,
        sense_index=sense_index,
    )
    return [entry] if entry.word != "" else []


# Maps the one-letter gender argument of translation templates to the
# tag stored on the translation.
TRANSLATION_GENDER_TAGS: dict[str, str] = {
    "c": "common",
    "f": "feminine",
    "m": "masculine",
    "n": "neuter",
}


def extract_t_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
) -> list[Translation]:
    """Extract the single translation held by a ``t`` template.

    Argument 1 is the language code, argument 2 the term, argument 3 a
    gender letter looked up in ``TRANSLATION_GENDER_TAGS`` and argument 4
    is stored as ``roman`` after stripping parentheses and spaces. The
    language name is the cleaned text of the first link node found in
    the expanded template, or ``"unknown"`` when there is none.

    Returns:
        A one-element list, or an empty list when the term is empty.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:t
    params = t_node.template_parameters
    code = clean_node(wxr, None, params.get(1, ""))
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    first_link = next(iter(expanded.find_child(NodeKind.LINK)), None)
    if first_link is None:
        name = "unknown"
    else:
        name = clean_node(wxr, None, first_link)
    term = clean_node(wxr, None, params.get(2, ""))
    roman_text = clean_node(wxr, None, params.get(4, "")).strip("() ")
    entry = Translation(
        word=term,
        lang=name,
        lang_code=code,
        roman=roman_text,
        sense=sense,
        sense_index=sense_index,
    )
    gender = clean_node(wxr, None, params.get(3, ""))
    if gender in TRANSLATION_GENDER_TAGS:
        entry.tags.append(TRANSLATION_GENDER_TAGS[gender])
    return [entry] if entry.word != "" else []


def extract_xlatio_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    sense_index: int,
    lang_name: str,
) -> list[Translation]:
    """Extract the single translation held by an ``xlatio`` template.

    Argument 1 is the language code and argument 2 the term. Argument 3
    is a gender tag when, with dots stripped, it is a key of
    ``TRANSLATION_GENDER_TAGS``; otherwise it is stored as ``roman``
    after stripping parentheses and spaces. The language name is not
    looked up here; the caller passes it in as ``lang_name``.

    Returns:
        A one-element list, or an empty list when the term is empty.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:xlatio
    params = t_node.template_parameters
    code = clean_node(wxr, None, params.get(1, ""))
    entry = Translation(
        word=clean_node(wxr, None, params.get(2, "")),
        lang=lang_name,
        lang_code=code,
        sense=sense,
        sense_index=sense_index,
    )
    third = clean_node(wxr, None, params.get(3, ""))
    gender_tag = TRANSLATION_GENDER_TAGS.get(third.strip("."))
    if gender_tag is None:
        entry.roman = third.strip("() ")
    else:
        entry.tags.append(gender_tag)
    return [entry] if entry.word != "" else []


def extract_translation_subpage(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Parse a translation subpage and extract its translations.

    The page is fetched with ``wxr.wtp.get_page(page_title, 0)``. Nothing
    happens when the page or its body is missing; otherwise the parsed
    body is handed to ``extract_translation_section`` so its translations
    are added to ``word_entry``.
    """
    subpage = wxr.wtp.get_page(page_title, 0)
    if subpage is None or subpage.body is None:
        return
    extract_translation_section(wxr, word_entry, wxr.wtp.parse(subpage.body))
Loading

0 comments on commit b07ffde

Please sign in to comment.