diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py
index 72251087..8c50d4d7 100644
--- a/src/wiktextract/extractor/pt/models.py
+++ b/src/wiktextract/extractor/pt/models.py
@@ -25,6 +25,22 @@ class Sense(PortugueseBaseModel):
examples: list[Example] = []
+class Translation(PortugueseBaseModel):
+ # One translation of the entry word; instances are collected in
+ # `WordEntry.translations`.
+ lang_code: str = Field(
+ default="",
+ description="Wiktionary language code of the translation term",
+ )
+ lang: str = Field(default="", description="Translation language name")
+ word: str = Field(default="", description="Translation term")
+ sense: str = Field(default="", description="Translation gloss")
+ # Stays at the default 0 when `extract_tradini_template` finds no
+ # "De N" definition number in the section header template.
+ sense_index: int = Field(
+ default=0, ge=0, description="Number of the definition, start from 1"
+ )
+ # Tags appended by the extractors, e.g. the gender values of
+ # TRANSLATION_GENDER_TAGS in translation.py.
+ tags: list[str] = []
+ # Labels stored as found; the list item extractor appends the italic
+ # text that follows a translation here.
+ raw_tags: list[str] = []
+ # Romanization read from a template argument or from parenthesised text
+ # in the list item.
+ roman: str = ""
+
+
class WordEntry(PortugueseBaseModel):
model_config = ConfigDict(title="Portuguese Wiktionary")
word: str = Field(description="Word string", min_length=1)
@@ -36,3 +52,4 @@ class WordEntry(PortugueseBaseModel):
categories: list[str] = []
tags: list[str] = []
raw_tags: list[str] = []
+ translations: list[Translation] = []
diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py
index a626b08f..87632e31 100644
--- a/src/wiktextract/extractor/pt/page.py
+++ b/src/wiktextract/extractor/pt/page.py
@@ -11,6 +11,7 @@
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA
+from .translation import extract_translation_section
def parse_section(
@@ -30,6 +31,20 @@ def parse_section(
title_text,
cats.get("categories", []),
)
+ elif title_text == "Tradução":
+ extract_translation_section(
+ wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
+ )
+
+ cats = {}
+ for link_node in level_node.find_child(NodeKind.LINK):
+ clean_node(wxr, cats, link_node)
+ for data in page_data:
+ if data.lang_code == page_data[-1].lang_code:
+ data.categories.extend(cats.get("categories", []))
+
+ for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
+ parse_section(wxr, page_data, base_data, next_level)
def parse_page(
@@ -37,6 +52,8 @@ def parse_page(
) -> list[dict[str, Any]]:
# page layout
# https://pt.wiktionary.org/wiki/Wikcionário:Livro_de_estilo
+ if "/traduções" in page_title: # skip translation page
+ return []
wxr.wtp.start_page(page_title)
tree = wxr.wtp.parse(page_text)
page_data: list[WordEntry] = []
diff --git a/src/wiktextract/extractor/pt/translation.py b/src/wiktextract/extractor/pt/translation.py
new file mode 100644
index 00000000..91c563bd
--- /dev/null
+++ b/src/wiktextract/extractor/pt/translation.py
@@ -0,0 +1,232 @@
+import re
+
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Translation, WordEntry
+
+
+def extract_translation_section(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    level_node: LevelNode,
+) -> None:
+    """Collect translations from the direct children of a section node.
+
+    A `tradini` template sets the gloss and definition number that apply to
+    the list items after it. Every item of each child list is passed to
+    `extract_translation_list_item`, which adds what it finds to
+    `word_entry.translations`.
+    """
+    current_sense, current_index = "", 0
+    wanted_kinds = NodeKind.TEMPLATE | NodeKind.LIST
+    for child in level_node.find_child(wanted_kinds):
+        if child.kind == NodeKind.TEMPLATE:
+            # Only `tradini` carries gloss information; other templates at
+            # this level are ignored.
+            if child.template_name == "tradini":
+                current_sense, current_index = extract_tradini_template(
+                    wxr, child
+                )
+            continue
+        if child.kind != NodeKind.LIST:
+            continue
+        for item in child.find_child(NodeKind.LIST_ITEM):
+            extract_translation_list_item(
+                wxr, word_entry, item, current_sense, current_index
+            )
+
+
+def extract_tradini_template(
+    wxr: WiktextractContext, t_node: TemplateNode
+) -> tuple[str, int]:
+    """Split the first `tradini` argument into gloss and definition number.
+
+    When the cleaned argument starts with "De <digits>", the digits become
+    the definition number and the rest, stripped of surrounding parentheses
+    and spaces, becomes the gloss. Otherwise the whole argument is the gloss
+    and the number is 0.
+
+    Returns:
+        A `(sense, sense_index)` pair: gloss text and integer number.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:tradini
+    first_arg_str = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    m = re.match(r"De (\d+)", first_arg_str)
+    if m is None:
+        return first_arg_str, 0
+    return first_arg_str[m.end() :].strip("() "), int(m.group(1))
+
+
+def extract_translation_list_item(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    list_item: WikiNode,
+    sense: str,
+    sense_index: int,
+) -> None:
+    """Extract every translation found in one list item.
+
+    Children are handled by type: a link whose text contains "/traduções"
+    is followed as a subpage; the `trad`, `trad-`, `t` and `xlatio`
+    templates produce `Translation` objects; a string containing
+    parenthesised text sets `roman` on all translations collected so far in
+    this item; italic text is appended to the `raw_tags` of the latest
+    translation. The collected translations are then added to
+    `word_entry.translations`.
+    """
+    # Templates whose extractors share the same four-argument call.
+    simple_extractors = {
+        "trad": extract_trad_template,
+        "trad-": extract_trad_minus_template,
+        "t": extract_t_template,
+    }
+    found = []
+    for child in list_item.children:
+        if isinstance(child, str):
+            if re.search(r"\(.+\)", child) is not None:
+                romanization = child.strip("() ")
+                for tr in found:
+                    tr.roman = romanization
+            continue
+        if isinstance(child, TemplateNode):
+            name = child.template_name
+            if name in simple_extractors:
+                found.extend(
+                    simple_extractors[name](wxr, child, sense, sense_index)
+                )
+            elif name == "xlatio":
+                # `xlatio` is given the language name of the preceding
+                # translation in this item, or "unknown" if there is none.
+                prev_lang = found[-1].lang if len(found) > 0 else "unknown"
+                found.extend(
+                    extract_xlatio_template(
+                        wxr, child, sense, sense_index, prev_lang
+                    )
+                )
+            continue
+        if not isinstance(child, WikiNode):
+            continue
+        if child.kind == NodeKind.LINK:
+            link_text = clean_node(wxr, None, child)
+            if "/traduções" in link_text:
+                extract_translation_subpage(wxr, word_entry, link_text)
+        elif child.kind == NodeKind.ITALIC and len(found) > 0:
+            label = clean_node(wxr, None, child)
+            if label != "":
+                found[-1].raw_tags.append(label)
+
+    word_entry.translations.extend(found)
+
+
+def extract_trad_template(
+    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
+) -> list[Translation]:
+    """Build one translation per word argument of a `trad` template.
+
+    Argument 1 is the language code, the named argument `t` is used as the
+    romanization, and positional arguments 2 to 11 are the translated
+    words. The language name is the text of the first link in the expanded
+    template, or "unknown" when the expansion contains no link.
+
+    Returns:
+        The translations in argument order; empty word arguments are
+        skipped.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:trad
+    translations = []
+    roman = clean_node(wxr, None, t_node.template_parameters.get("t", ""))
+    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    # Bind a fallback first: without it an expansion that yields no link
+    # leaves `lang_name` unbound and the loop below raises.
+    lang_name = "unknown"
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+    for link_node in expanded_node.find_child(NodeKind.LINK):
+        lang_name = clean_node(wxr, None, link_node)
+        break
+    for arg in range(2, 12):
+        if arg not in t_node.template_parameters:
+            break
+        tr_str = clean_node(wxr, None, t_node.template_parameters.get(arg, ""))
+        # Match the other template extractors, which drop empty words.
+        if tr_str == "":
+            continue
+        translations.append(
+            Translation(
+                word=tr_str,
+                lang=lang_name,
+                lang_code=lang_code,
+                roman=roman,
+                sense=sense,
+                sense_index=sense_index,
+            )
+        )
+    return translations
+
+
+def extract_trad_minus_template(
+    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
+) -> list[Translation]:
+    """Build the translation described by one `trad-` template.
+
+    Argument 1 is the language code, argument 2 the translated word and
+    argument 3 the romanization (surrounding parentheses and spaces are
+    removed). The language name is the text of the first link in the
+    expanded template, or "unknown" when there is none.
+
+    Returns:
+        A one-element list, or an empty list when the word is empty.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:trad-
+    params = t_node.template_parameters
+    code = clean_node(wxr, None, params.get(1, ""))
+    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
+    language = next(
+        (
+            clean_node(wxr, None, link)
+            for link in expanded.find_child(NodeKind.LINK)
+        ),
+        "unknown",
+    )
+    term = clean_node(wxr, None, params.get(2, ""))
+    romanization = clean_node(wxr, None, params.get(3, "")).strip("() ")
+    entry = Translation(
+        word=term,
+        lang=language,
+        lang_code=code,
+        roman=romanization,
+        sense=sense,
+        sense_index=sense_index,
+    )
+    return [entry] if entry.word != "" else []
+
+
+# Maps the gender abbreviations read from the third argument of the `t` and
+# `xlatio` templates to the tag strings appended to `Translation.tags`.
+TRANSLATION_GENDER_TAGS = {
+ "c": "common",
+ "f": "feminine",
+ "m": "masculine",
+ "n": "neuter",
+}
+
+
+def extract_t_template(
+    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
+) -> list[Translation]:
+    """Build the translation described by one `t` template.
+
+    Argument 1 is the language code, argument 2 the translated word,
+    argument 3 a gender abbreviation looked up in TRANSLATION_GENDER_TAGS
+    and argument 4 the romanization (surrounding parentheses and spaces are
+    removed). The language name is the text of the first link in the
+    expanded template, or "unknown" when there is none.
+
+    Returns:
+        A one-element list, or an empty list when the word is empty.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:t
+    params = t_node.template_parameters
+    code = clean_node(wxr, None, params.get(1, ""))
+    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
+    language = next(
+        (
+            clean_node(wxr, None, link)
+            for link in expanded.find_child(NodeKind.LINK)
+        ),
+        "unknown",
+    )
+    term = clean_node(wxr, None, params.get(2, ""))
+    romanization = clean_node(wxr, None, params.get(4, "")).strip("() ")
+    entry = Translation(
+        word=term,
+        lang=language,
+        lang_code=code,
+        roman=romanization,
+        sense=sense,
+        sense_index=sense_index,
+    )
+    gender_tag = TRANSLATION_GENDER_TAGS.get(
+        clean_node(wxr, None, params.get(3, ""))
+    )
+    if gender_tag is not None:
+        entry.tags.append(gender_tag)
+    return [entry] if entry.word != "" else []
+
+
+def extract_xlatio_template(
+    wxr: WiktextractContext,
+    t_node: TemplateNode,
+    sense: str,
+    sense_index: int,
+    lang_name: str,
+) -> list[Translation]:
+    """Build the translation described by one `xlatio` template.
+
+    Argument 1 is the language code and argument 2 the translated word.
+    Argument 3 is treated as a gender abbreviation when, with dots removed
+    from its ends, it is a key of TRANSLATION_GENDER_TAGS; otherwise it is
+    stored as the romanization without surrounding parentheses and spaces.
+    The language name is supplied by the caller.
+
+    Returns:
+        A one-element list, or an empty list when the word is empty.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:xlatio
+    params = t_node.template_parameters
+    code = clean_node(wxr, None, params.get(1, ""))
+    term = clean_node(wxr, None, params.get(2, ""))
+    entry = Translation(
+        word=term,
+        lang=lang_name,
+        lang_code=code,
+        sense=sense,
+        sense_index=sense_index,
+    )
+    extra = clean_node(wxr, None, params.get(3, ""))
+    gender_key = extra.strip(".")
+    if gender_key in TRANSLATION_GENDER_TAGS:
+        entry.tags.append(TRANSLATION_GENDER_TAGS[gender_key])
+    else:
+        entry.roman = extra.strip("() ")
+    return [entry] if entry.word != "" else []
+
+
+def extract_translation_subpage(
+    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
+) -> None:
+    """Extract translations from a separate translation page.
+
+    Looks up `page_title` in namespace 0, parses its body and runs
+    `extract_translation_section` on the parsed tree. Does nothing when the
+    page or its body is missing.
+    """
+    subpage = wxr.wtp.get_page(page_title, 0)
+    if subpage is None or subpage.body is None:
+        return
+    extract_translation_section(wxr, word_entry, wxr.wtp.parse(subpage.body))
diff --git a/tests/test_pt_translation.py b/tests/test_pt_translation.py
new file mode 100644
index 00000000..196c6189
--- /dev/null
+++ b/tests/test_pt_translation.py
@@ -0,0 +1,105 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.pt.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestPtTranslation(TestCase):
+    """Translation extraction tests for the Portuguese Wiktionary extractor."""
+
+    maxDiff = None
+
+    def setUp(self) -> None:
+        config = WiktionaryConfig(
+            dump_file_lang_code="pt", capture_language_codes=None
+        )
+        wtp = Wtp(
+            lang_code="pt",
+            parser_function_aliases=config.parser_function_aliases,
+        )
+        self.wxr = WiktextractContext(wtp, config)
+
+    def test_subpage(self):
+        """Translations on a linked "/traduções" subpage are extracted."""
+        # Template pages (namespace 10), registered in this order.
+        template_pages = {
+            "Predefinição:-pt-": "Português",
+            "Predefinição:trad": """[[abenaque|Abenaque]] : [[adia#Abenaque|adia]] , [[alemos#Abenaque|alemos]]""",
+            "Predefinição:t": """[[aino|Aino]]: [[セタ#ain|セタ]] ''(seta)''""",
+            "Predefinição:trad-": """[[búlgaro|Búlgaro]] : [[куче#Búlgaro|куче]] ''(kutche)'' [[:bg:куче|(bg)]]""",
+        }
+        for title, body in template_pages.items():
+            self.wxr.wtp.add_page(title, 10, body)
+
+        subpage_text = """{{tradini|De 1 (mamífero domesticado - ''Canis lupus familiaris'')}}
+* {{trad|abe|adia|alemos}}
+* {{t|ain|セタ||seta}}
+* {{trad-|bg|куче|(kutche)}}; {{xlatio|bg|пес|(pes)}} (''coloquial'')
+{{tradfim}}"""
+        self.wxr.wtp.add_page("cão/traduções 1", 0, subpage_text)
+
+        main_page_text = """={{-pt-}}=
+==Substantivo==
+# animal
+===Tradução===
+Vide traduções nas seguintes páginas:
+* [[cão/traduções 1]]"""
+        entries = parse_page(self.wxr, "cão", main_page_text)
+
+        # Every translation inherits gloss and number from `tradini`.
+        shared = {
+            "sense": "mamífero domesticado - Canis lupus familiaris",
+            "sense_index": 1,
+        }
+        expected = [
+            shared | {"lang": "Abenaque", "lang_code": "abe", "word": "adia"},
+            shared
+            | {"lang": "Abenaque", "lang_code": "abe", "word": "alemos"},
+            shared
+            | {
+                "lang": "Aino",
+                "lang_code": "ain",
+                "roman": "seta",
+                "word": "セタ",
+            },
+            shared
+            | {
+                "lang": "Búlgaro",
+                "lang_code": "bg",
+                "roman": "kutche",
+                "word": "куче",
+            },
+            shared
+            | {
+                "lang": "Búlgaro",
+                "lang_code": "bg",
+                "roman": "pes",
+                "word": "пес",
+                "raw_tags": ["coloquial"],
+            },
+        ]
+        self.assertEqual(entries[0]["translations"], expected)