diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py
index 72251087..8c50d4d7 100644
--- a/src/wiktextract/extractor/pt/models.py
+++ b/src/wiktextract/extractor/pt/models.py
@@ -25,6 +25,22 @@ class Sense(PortugueseBaseModel):
     examples: list[Example] = []
 
 
+class Translation(PortugueseBaseModel):
+    lang_code: str = Field(
+        default="",
+        description="Wiktionary language code of the translation term",
+    )
+    lang: str = Field(default="", description="Translation language name")
+    word: str = Field(default="", description="Translation term")
+    sense: str = Field(default="", description="Translation gloss")
+    sense_index: int = Field(
+        default=0, ge=0, description="Number of the definition, start from 1"
+    )
+    tags: list[str] = []
+    raw_tags: list[str] = []
+    roman: str = ""
+
+
 class WordEntry(PortugueseBaseModel):
     model_config = ConfigDict(title="Portuguese Wiktionary")
     word: str = Field(description="Word string", min_length=1)
@@ -36,3 +52,4 @@ class WordEntry(PortugueseBaseModel):
     categories: list[str] = []
     tags: list[str] = []
     raw_tags: list[str] = []
+    translations: list[Translation] = []
diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py
index a626b08f..87632e31 100644
--- a/src/wiktextract/extractor/pt/page.py
+++ b/src/wiktextract/extractor/pt/page.py
@@ -11,6 +11,7 @@
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
 from .section_titles import POS_DATA
+from .translation import extract_translation_section
 
 
 def parse_section(
@@ -30,6 +31,20 @@ def parse_section(
             title_text,
             cats.get("categories", []),
         )
+    elif title_text == "Tradução":
+        extract_translation_section(
+            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
+        )
+
+    cats = {}
+    for link_node in level_node.find_child(NodeKind.LINK):
+        clean_node(wxr, cats, link_node)
+    for data in page_data:
+        if data.lang_code == page_data[-1].lang_code:
+            data.categories.extend(cats.get("categories", []))
+
+    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
+        parse_section(wxr, page_data, base_data, next_level)
 
 
 def parse_page(
@@ -37,6 +52,8 @@ def parse_page(
 ) -> list[dict[str, Any]]:
     # page layout
     # https://pt.wiktionary.org/wiki/Wikcionário:Livro_de_estilo
+    if "/traduções" in page_title:  # skip translation page
+        return []
     wxr.wtp.start_page(page_title)
     tree = wxr.wtp.parse(page_text)
     page_data: list[WordEntry] = []
diff --git a/src/wiktextract/extractor/pt/translation.py b/src/wiktextract/extractor/pt/translation.py
new file mode 100644
index 00000000..91c563bd
--- /dev/null
+++ b/src/wiktextract/extractor/pt/translation.py
@@ -0,0 +1,276 @@
+import re
+
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Translation, WordEntry
+
+
+def extract_translation_section(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    level_node: LevelNode,
+) -> None:
+    """Add the translations found under `level_node` to `word_entry`.
+
+    A "tradini" template sets the gloss and definition number applied to
+    the translation list items that follow it.
+    """
+    sense = ""
+    sense_index = 0
+    for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
+        match node.kind:
+            case NodeKind.TEMPLATE:
+                if node.template_name == "tradini":
+                    sense, sense_index = extract_tradini_template(wxr, node)
+            case NodeKind.LIST:
+                for list_item in node.find_child(NodeKind.LIST_ITEM):
+                    extract_translation_list_item(
+                        wxr, word_entry, list_item, sense, sense_index
+                    )
+
+
+def extract_tradini_template(
+    wxr: WiktextractContext, t_node: TemplateNode
+) -> tuple[str, int]:
+    """Return the gloss and definition number of a "tradini" template.
+
+    A first argument such as "De 1 (gloss)" yields `("gloss", 1)`; any
+    other text is returned as the gloss with definition number 0.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:tradini
+    sense = ""
+    sense_index = 0
+    first_arg_str = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    m = re.match(r"De (\d+)", first_arg_str)
+    if m is not None:
+        sense_index = int(m.group(1))
+        sense = first_arg_str[m.end() :].strip("() ")
+    else:
+        sense = first_arg_str
+    return sense, sense_index
+
+
+def extract_translation_list_item(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    list_item: WikiNode,
+    sense: str,
+    sense_index: int,
+) -> None:
+    """Add the translations of one list item to `word_entry`.
+
+    Links containing "/traduções" are followed as translation subpages,
+    parenthesized plain text is stored as the romanization of the
+    translations collected so far, and italic text becomes a raw tag of
+    the latest translation.
+    """
+    translations = []
+    for node in list_item.children:
+        if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
+            link_str = clean_node(wxr, None, node)
+            if "/traduções" in link_str:
+                extract_translation_subpage(wxr, word_entry, link_str)
+        elif isinstance(node, TemplateNode):
+            match node.template_name:
+                case "trad":
+                    translations.extend(
+                        extract_trad_template(wxr, node, sense, sense_index)
+                    )
+                case "trad-":
+                    translations.extend(
+                        extract_trad_minus_template(
+                            wxr, node, sense, sense_index
+                        )
+                    )
+                case "t":
+                    translations.extend(
+                        extract_t_template(wxr, node, sense, sense_index)
+                    )
+                case "xlatio":
+                    translations.extend(
+                        extract_xlatio_template(
+                            wxr,
+                            node,
+                            sense,
+                            sense_index,
+                            # "xlatio" is not expanded to find a language name,
+                            # reuse the one of the preceding translation
+                            translations[-1].lang
+                            if len(translations) > 0
+                            else "unknown",
+                        )
+                    )
+        elif isinstance(node, str) and re.search(r"\(.+\)", node) is not None:
+            roman = node.strip("() ")
+            for tr_data in translations:
+                tr_data.roman = roman
+        elif (
+            isinstance(node, WikiNode)
+            and node.kind == NodeKind.ITALIC
+            and len(translations) > 0
+        ):
+            raw_tag = clean_node(wxr, None, node)
+            if raw_tag != "":
+                translations[-1].raw_tags.append(raw_tag)
+
+    word_entry.translations.extend(translations)
+
+
+def extract_template_lang_name(
+    wxr: WiktextractContext, t_node: TemplateNode
+) -> str:
+    """Return the text of the first link in the expanded template.
+
+    Returns "unknown" if the expanded template has no top-level link.
+    """
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+    for link_node in expanded_node.find_child(NodeKind.LINK):
+        return clean_node(wxr, None, link_node)
+    return "unknown"
+
+
+def extract_trad_template(
+    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
+) -> list[Translation]:
+    """Return one translation for each non-empty word of a "trad" template.
+
+    Argument 1 is the language code, arguments 2 to 11 are the translated
+    words and the "t" argument is the romanization shared by all of them.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:trad
+    translations = []
+    roman = clean_node(wxr, None, t_node.template_parameters.get("t", ""))
+    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    lang_name = extract_template_lang_name(wxr, t_node)
+    for arg in range(2, 12):
+        if arg not in t_node.template_parameters:
+            break
+        tr_str = clean_node(wxr, None, t_node.template_parameters.get(arg, ""))
+        if tr_str != "":
+            translations.append(
+                Translation(
+                    word=tr_str,
+                    lang=lang_name,
+                    lang_code=lang_code,
+                    roman=roman,
+                    sense=sense,
+                    sense_index=sense_index,
+                )
+            )
+    return translations
+
+
+def extract_trad_minus_template(
+    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
+) -> list[Translation]:
+    """Return the translation of a "trad-" template.
+
+    Argument 1 is the language code, argument 2 the translated word and
+    argument 3 the romanization. The list is empty if the word is empty.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:trad-
+    translations = []
+    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    lang_name = extract_template_lang_name(wxr, t_node)
+    tr_data = Translation(
+        word=clean_node(wxr, None, t_node.template_parameters.get(2, "")),
+        lang=lang_name,
+        lang_code=lang_code,
+        roman=clean_node(
+            wxr, None, t_node.template_parameters.get(3, "")
+        ).strip("() "),
+        sense=sense,
+        sense_index=sense_index,
+    )
+    if tr_data.word != "":
+        translations.append(tr_data)
+    return translations
+
+
+TRANSLATION_GENDER_TAGS = {
+    "c": "common",
+    "f": "feminine",
+    "m": "masculine",
+    "n": "neuter",
+}
+
+
+def extract_t_template(
+    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
+) -> list[Translation]:
+    """Return the translation of a "t" template.
+
+    Argument 1 is the language code, argument 2 the translated word,
+    argument 3 a gender code and argument 4 the romanization. The list is
+    empty if the word is empty.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:t
+    translations = []
+    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    lang_name = extract_template_lang_name(wxr, t_node)
+    tr_data = Translation(
+        word=clean_node(wxr, None, t_node.template_parameters.get(2, "")),
+        lang=lang_name,
+        lang_code=lang_code,
+        roman=clean_node(
+            wxr, None, t_node.template_parameters.get(4, "")
+        ).strip("() "),
+        sense=sense,
+        sense_index=sense_index,
+    )
+    gender_arg = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
+    if gender_arg in TRANSLATION_GENDER_TAGS:
+        tr_data.tags.append(TRANSLATION_GENDER_TAGS[gender_arg])
+    if tr_data.word != "":
+        translations.append(tr_data)
+    return translations
+
+
+def extract_xlatio_template(
+    wxr: WiktextractContext,
+    t_node: TemplateNode,
+    sense: str,
+    sense_index: int,
+    lang_name: str,
+) -> list[Translation]:
+    """Return the translation of an "xlatio" template.
+
+    Argument 1 is the language code and argument 2 the translated word.
+    Argument 3 is a gender code if it matches one, otherwise it is stored
+    as the romanization. The list is empty if the word is empty.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:xlatio
+    translations = []
+    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    tr_data = Translation(
+        word=clean_node(wxr, None, t_node.template_parameters.get(2, "")),
+        lang=lang_name,
+        lang_code=lang_code,
+        sense=sense,
+        sense_index=sense_index,
+    )
+    third_arg = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
+    if third_arg.strip(".") in TRANSLATION_GENDER_TAGS:
+        tr_data.tags.append(TRANSLATION_GENDER_TAGS[third_arg.strip(".")])
+    else:
+        tr_data.roman = third_arg.strip("() ")
+    if tr_data.word != "":
+        translations.append(tr_data)
+    return translations
+
+
+def extract_translation_subpage(
+    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
+) -> None:
+    """Extract translations from the page `page_title` in namespace 0.
+
+    Nothing is added if the page can't be found or has no body.
+    """
+    page = wxr.wtp.get_page(page_title, 0)
+    if page is not None and page.body is not None:
+        root = wxr.wtp.parse(page.body)
+        extract_translation_section(wxr, word_entry, root)
diff --git a/tests/test_pt_translation.py b/tests/test_pt_translation.py
new file mode 100644
index 00000000..196c6189
--- /dev/null
+++ b/tests/test_pt_translation.py
@@ -0,0 +1,123 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.pt.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestPtTranslation(TestCase):
+    maxDiff = None
+
+    def setUp(self) -> None:
+        conf = WiktionaryConfig(
+            dump_file_lang_code="pt",
+            capture_language_codes=None,
+        )
+        self.wxr = WiktextractContext(
+            Wtp(
+                lang_code="pt",
+                parser_function_aliases=conf.parser_function_aliases,
+            ),
+            conf,
+        )
+
+    def test_subpage(self):
+        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
+        self.wxr.wtp.add_page(
+            "Predefinição:trad",
+            10,
+            """[[abenaque|Abenaque]] : [[adia#Abenaque|adia]] , [[alemos#Abenaque|alemos]]""",
+        )
+        self.wxr.wtp.add_page(
+            "Predefinição:t",
+            10,
+            """[[aino|Aino]]: [[セタ#ain|セタ]] ''(seta)''""",
+        )
+        self.wxr.wtp.add_page(
+            "Predefinição:trad-",
+            10,
+            """[[búlgaro|Búlgaro]] : [[куче#Búlgaro|куче]] ''(kutche)'' [[:bg:куче|(bg)]]""",
+        )
+        self.wxr.wtp.add_page(
+            "cão/traduções 1",
+            0,
+            """{{tradini|De 1 (mamífero domesticado - ''Canis lupus familiaris'')}}
+* {{trad|abe|adia|alemos}}
+* {{t|ain|セタ||seta}}
+* {{trad-|bg|куче|(kutche)}}; {{xlatio|bg|пес|(pes)}} (''coloquial'')
+{{tradfim}}""",
+        )
+        data = parse_page(
+            self.wxr,
+            "cão",
+            """={{-pt-}}=
+==Substantivo==
+# animal
+===Tradução===
+Vide traduções nas seguintes páginas:
+* [[cão/traduções 1]]""",
+        )
+        self.assertEqual(
+            data[0]["translations"],
+            [
+                {
+                    "lang": "Abenaque",
+                    "lang_code": "abe",
+                    "sense": "mamífero domesticado - Canis lupus familiaris",
+                    "sense_index": 1,
+                    "word": "adia",
+                },
+                {
+                    "lang": "Abenaque",
+                    "lang_code": "abe",
+                    "sense": "mamífero domesticado - Canis lupus familiaris",
+                    "sense_index": 1,
+                    "word": "alemos",
+                },
+                {
+                    "lang": "Aino",
+                    "lang_code": "ain",
+                    "sense": "mamífero domesticado - Canis lupus familiaris",
+                    "sense_index": 1,
+                    "roman": "seta",
+                    "word": "セタ",
+                },
+                {
+                    "lang": "Búlgaro",
+                    "lang_code": "bg",
+                    "sense": "mamífero domesticado - Canis lupus familiaris",
+                    "sense_index": 1,
+                    "roman": "kutche",
+                    "word": "куче",
+                },
+                {
+                    "lang": "Búlgaro",
+                    "lang_code": "bg",
+                    "sense": "mamífero domesticado - Canis lupus familiaris",
+                    "sense_index": 1,
+                    "roman": "pes",
+                    "word": "пес",
+                    "raw_tags": ["coloquial"],
+                },
+            ],
+        )
+
+    def test_trad_without_language_link(self):
+        # expanded "trad" template has no link to take the language name from
+        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
+        self.wxr.wtp.add_page("Predefinição:trad", 10, "adia")
+        data = parse_page(
+            self.wxr,
+            "cão",
+            """={{-pt-}}=
+==Substantivo==
+# animal
+===Tradução===
+* {{trad|abe|adia}}""",
+        )
+        self.assertEqual(
+            data[0]["translations"],
+            [{"lang": "unknown", "lang_code": "abe", "word": "adia"}],
+        )