[pt] extract translation section

tatuylonen · Dec 3, 2024 · b07ffde · b07ffde
1 parent b518906
commit b07ffde
Show file tree

Hide file tree

Showing 4 changed files with 371 additions and 0 deletions.
diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py
@@ -25,6 +25,22 @@ class Sense(PortugueseBaseModel):
     examples: list[Example] = []
 
 
+class Translation(PortugueseBaseModel):
+    # One translation of the entry word into another language; instances are
+    # collected in `WordEntry.translations`.
+    lang_code: str = Field(
+        default="",
+        description="Wiktionary language code of the translation term",
+    )
+    lang: str = Field(default="", description="Translation language name")
+    word: str = Field(default="", description="Translation term")
+    sense: str = Field(default="", description="Translation gloss")
+    # 0 is the default, kept when no definition number was parsed from the
+    # section's `tradini` template; `ge=0` rejects negative values.
+    sense_index: int = Field(
+        default=0, ge=0, description="Number of the definition, start from 1"
+    )
+    # Normalized tags; the template extractors append gender tags here.
+    tags: list[str] = []
+    # Unnormalized labels; the list-item parser appends italic text here.
+    raw_tags: list[str] = []
+    # Filled from template arguments or from a parenthesized string in the
+    # list item (presumably a romanization of `word`).
+    roman: str = ""
+
+
 class WordEntry(PortugueseBaseModel):
     model_config = ConfigDict(title="Portuguese Wiktionary")
     word: str = Field(description="Word string", min_length=1)
@@ -36,3 +52,4 @@ class WordEntry(PortugueseBaseModel):
     categories: list[str] = []
     tags: list[str] = []
     raw_tags: list[str] = []
+    translations: list[Translation] = []
diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py
@@ -11,6 +11,7 @@
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
 from .section_titles import POS_DATA
+from .translation import extract_translation_section
 
 
 def parse_section(
@@ -30,13 +31,29 @@ def parse_section(
             title_text,
             cats.get("categories", []),
         )
+    elif title_text == "Tradução":
+        extract_translation_section(
+            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
+        )
+
+    cats = {}
+    for link_node in level_node.find_child(NodeKind.LINK):
+        clean_node(wxr, cats, link_node)
+    for data in page_data:
+        if data.lang_code == page_data[-1].lang_code:
+            data.categories.extend(cats.get("categories", []))
+
+    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
+        parse_section(wxr, page_data, base_data, next_level)
 
 
 def parse_page(
     wxr: WiktextractContext, page_title: str, page_text: str
 ) -> list[dict[str, Any]]:
     # page layout
     # https://pt.wiktionary.org/wiki/Wikcionário:Livro_de_estilo
+    if "/traduções" in page_title:  # skip translation page
+        return []
     wxr.wtp.start_page(page_title)
     tree = wxr.wtp.parse(page_text)
     page_data: list[WordEntry] = []

diff --git a/src/wiktextract/extractor/pt/translation.py b/src/wiktextract/extractor/pt/translation.py
@@ -0,0 +1,232 @@
+import re
+
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Translation, WordEntry
+
+
+def extract_translation_section(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    level_node: LevelNode,
+) -> None:
+    """Walk a translation section and add its translations to `word_entry`.
+
+    A `tradini` template sets the gloss and definition number that apply to
+    the list items that follow it; every list item is then handed to
+    `extract_translation_list_item`.
+    """
+    current_sense = ""
+    current_index = 0
+    wanted_kinds = NodeKind.TEMPLATE | NodeKind.LIST
+    for child in level_node.find_child(wanted_kinds):
+        if child.kind == NodeKind.TEMPLATE:
+            # Only `tradini` carries sense information; other templates at
+            # this level are ignored.
+            if child.template_name == "tradini":
+                current_sense, current_index = extract_tradini_template(
+                    wxr, child
+                )
+        elif child.kind == NodeKind.LIST:
+            for item in child.find_child(NodeKind.LIST_ITEM):
+                extract_translation_list_item(
+                    wxr, word_entry, item, current_sense, current_index
+                )
+
+
+def extract_tradini_template(
+    wxr: WiktextractContext, t_node: TemplateNode
+) -> tuple[str, int]:
+    """Read the gloss and definition number from a `tradini` template.
+
+    The first positional argument is cleaned to plain text.  If it starts
+    with "De <number>", the number is returned as the definition index and
+    the remainder (with surrounding parentheses and spaces removed) as the
+    gloss.  Otherwise the whole argument is the gloss and the index is 0.
+
+    Returns:
+        A `(sense, sense_index)` pair: gloss text and definition number.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:tradini
+    first_arg_str = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    m = re.match(r"De (\d+)", first_arg_str)
+    if m is None:
+        return first_arg_str, 0
+    return first_arg_str[m.end() :].strip("() "), int(m.group(1))
+
+
+def extract_translation_list_item(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    list_item: WikiNode,
+    sense: str,
+    sense_index: int,
+) -> None:
+    """Collect the translations found in one list item of the section.
+
+    Links whose text contains "/traduções" are followed as translation
+    subpages.  `trad`, `trad-`, `t` and `xlatio` templates yield
+    `Translation` objects.  A parenthesized plain-text run is stored as
+    `roman` on every translation gathered so far in this item, and italic
+    text becomes a raw tag of the most recent translation.  The collected
+    translations are appended to `word_entry.translations`.
+    """
+    translations = []
+    for node in list_item.children:
+        if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
+            link_str = clean_node(wxr, None, node)
+            if "/traduções" in link_str:
+                extract_translation_subpage(wxr, word_entry, link_str)
+        elif isinstance(node, TemplateNode):
+            match node.template_name:
+                case "trad":
+                    translations.extend(
+                        extract_trad_template(wxr, node, sense, sense_index)
+                    )
+                case "trad-":
+                    translations.extend(
+                        extract_trad_minus_template(
+                            wxr, node, sense, sense_index
+                        )
+                    )
+                case "t":
+                    translations.extend(
+                        extract_t_template(wxr, node, sense, sense_index)
+                    )
+                case "xlatio":
+                    # `xlatio` carries no language name of its own, so it
+                    # reuses the one from the preceding translation.
+                    translations.extend(
+                        extract_xlatio_template(
+                            wxr,
+                            node,
+                            sense,
+                            sense_index,
+                            translations[-1].lang
+                            if len(translations) > 0
+                            else "unknown",
+                        )
+                    )
+        elif isinstance(node, str):
+            # Capture only the text inside the parentheses: stripping "() "
+            # from the ends of the whole node leaves the closing parenthesis
+            # behind whenever anything (a comma, a newline) follows it.
+            roman_match = re.search(r"\((.+)\)", node)
+            if roman_match is not None:
+                roman = roman_match.group(1).strip("() ")
+                for tr_data in translations:
+                    tr_data.roman = roman
+        elif (
+            isinstance(node, WikiNode)
+            and node.kind == NodeKind.ITALIC
+            and len(translations) > 0
+        ):
+            raw_tag = clean_node(wxr, None, node)
+            if raw_tag != "":
+                translations[-1].raw_tags.append(raw_tag)
+
+    word_entry.translations.extend(translations)
+
+
+def extract_trad_template(
+    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
+) -> list[Translation]:
+    """Build translations from a `trad` template.
+
+    Argument 1 is the language code, arguments 2 to 11 are translation
+    terms (reading stops at the first missing position) and the named
+    argument `t` is copied to `roman` on every term.  The language name is
+    the text of the first link in the expanded template, or "unknown" when
+    the expansion contains no link.
+
+    Returns:
+        One `Translation` per non-empty term, in argument order.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:trad
+    translations = []
+    roman = clean_node(wxr, None, t_node.template_parameters.get("t", ""))
+    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    # Same fallback as the other template extractors in this module; without
+    # it a template expanding to no link raised UnboundLocalError below.
+    lang_name = "unknown"
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+    for link_node in expanded_node.find_child(NodeKind.LINK):
+        lang_name = clean_node(wxr, None, link_node)
+        break
+    for arg in range(2, 12):
+        if arg not in t_node.template_parameters:
+            break
+        tr_str = clean_node(wxr, None, t_node.template_parameters.get(arg, ""))
+        if tr_str == "":
+            # An empty argument is not a translation; skip it but keep
+            # reading the later positions.
+            continue
+        translations.append(
+            Translation(
+                word=tr_str,
+                lang=lang_name,
+                lang_code=lang_code,
+                roman=roman,
+                sense=sense,
+                sense_index=sense_index,
+            )
+        )
+    return translations
+
+
+def extract_trad_minus_template(
+    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
+) -> list[Translation]:
+    """Build at most one translation from a `trad-` template.
+
+    Argument 1 is the language code, argument 2 the term and argument 3 the
+    value stored in `roman` (surrounding parentheses and spaces removed).
+    The language name is the text of the first link in the expanded
+    template, defaulting to "unknown".  An empty term yields an empty list.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:trad-
+    params = t_node.template_parameters
+    code = clean_node(wxr, None, params.get(1, ""))
+    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
+    # Only the first link of the expansion is consulted.
+    language = next(
+        (
+            clean_node(wxr, None, link)
+            for link in expanded.find_child(NodeKind.LINK)
+        ),
+        "unknown",
+    )
+    term = clean_node(wxr, None, params.get(2, ""))
+    romanization = clean_node(wxr, None, params.get(3, "")).strip("() ")
+    entry = Translation(
+        word=term,
+        lang=language,
+        lang_code=code,
+        roman=romanization,
+        sense=sense,
+        sense_index=sense_index,
+    )
+    return [entry] if entry.word != "" else []
+
+
+# Maps the gender abbreviations accepted as translation template arguments
+# to the tag strings that the extractors append to `Translation.tags`.
+TRANSLATION_GENDER_TAGS = {
+    "c": "common",
+    "f": "feminine",
+    "m": "masculine",
+    "n": "neuter",
+}
+
+
+def extract_t_template(
+    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
+) -> list[Translation]:
+    """Build at most one translation from a `t` template.
+
+    Argument 1 is the language code, argument 2 the term, argument 3 a
+    gender abbreviation looked up in `TRANSLATION_GENDER_TAGS` and argument
+    4 the value stored in `roman` (surrounding parentheses and spaces
+    removed).  The language name is the text of the first link in the
+    expanded template, defaulting to "unknown".  An empty term yields an
+    empty list.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:t
+    params = t_node.template_parameters
+    code = clean_node(wxr, None, params.get(1, ""))
+    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
+    # Only the first link of the expansion is consulted.
+    language = next(
+        (
+            clean_node(wxr, None, link)
+            for link in expanded.find_child(NodeKind.LINK)
+        ),
+        "unknown",
+    )
+    term = clean_node(wxr, None, params.get(2, ""))
+    romanization = clean_node(wxr, None, params.get(4, "")).strip("() ")
+    entry = Translation(
+        word=term,
+        lang=language,
+        lang_code=code,
+        roman=romanization,
+        sense=sense,
+        sense_index=sense_index,
+    )
+    gender_tag = TRANSLATION_GENDER_TAGS.get(
+        clean_node(wxr, None, params.get(3, ""))
+    )
+    if gender_tag is not None:
+        entry.tags.append(gender_tag)
+    return [entry] if entry.word != "" else []
+
+
+def extract_xlatio_template(
+    wxr: WiktextractContext,
+    t_node: TemplateNode,
+    sense: str,
+    sense_index: int,
+    lang_name: str,
+) -> list[Translation]:
+    """Build at most one translation from an `xlatio` template.
+
+    Argument 1 is the language code and argument 2 the term; the language
+    name is supplied by the caller.  Argument 3 is treated as a gender
+    abbreviation when, with dots removed from its ends, it is a key of
+    `TRANSLATION_GENDER_TAGS`; otherwise it is stored in `roman` with
+    surrounding parentheses and spaces removed.  An empty term yields an
+    empty list.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:xlatio
+    params = t_node.template_parameters
+    code = clean_node(wxr, None, params.get(1, ""))
+    entry = Translation(
+        word=clean_node(wxr, None, params.get(2, "")),
+        lang=lang_name,
+        lang_code=code,
+        sense=sense,
+        sense_index=sense_index,
+    )
+    extra = clean_node(wxr, None, params.get(3, ""))
+    gender_tag = TRANSLATION_GENDER_TAGS.get(extra.strip("."))
+    if gender_tag is None:
+        entry.roman = extra.strip("() ")
+    else:
+        entry.tags.append(gender_tag)
+    return [entry] if entry.word != "" else []
+
+
+def extract_translation_subpage(
+    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
+) -> None:
+    """Parse the page `page_title` and extract its translations.
+
+    The page is looked up with `wxr.wtp.get_page(page_title, 0)`.  Nothing
+    happens when the page or its body is missing; otherwise the parsed body
+    is processed like a translation section and its translations are added
+    to `word_entry`.
+    """
+    subpage = wxr.wtp.get_page(page_title, 0)
+    if subpage is None or subpage.body is None:
+        return
+    subpage_root = wxr.wtp.parse(subpage.body)
+    extract_translation_section(wxr, word_entry, subpage_root)