Merge pull request #960 from xxyzz/pt

[pt] extract "conj.en*" templates and two new sections
tatuylonen · Dec 27, 2024 · b169b2b · b169b2b
2 parents 5ac3627 + 3479756
commit b169b2b
Show file tree

Hide file tree

Showing 6 changed files with 156 additions and 9 deletions.
diff --git a/src/wiktextract/extractor/pt/inflection.py b/src/wiktextract/extractor/pt/inflection.py
@@ -81,6 +81,8 @@ def extract_conjugation_section(
     for t_node in level_node.find_child(NodeKind.TEMPLATE):
         if t_node.template_name.startswith(("conj.pt", "conj/pt")):
             extract_conj_pt_template(wxr, word_entry, t_node)
+        elif t_node.template_name.startswith("conj.en"):
+            extract_conj_en_template(wxr, word_entry, t_node)
 
 
 def extract_conj_pt_template(
@@ -214,3 +216,53 @@ def add_conj_pt_form(
             form.raw_tags.append(row_header.text)
     translate_raw_tags(form)
     word_entry.forms.append(form)
+
+
+def extract_conj_en_template(
+    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
+) -> None:
+    """Collect forms from an expanded "conj.en*" conjugation template.
+
+    The template is expanded and parsed, then every table cell is scanned:
+    the text of a ``<sup>`` element in the cell (the last one, if there are
+    several) becomes the raw tag, and each bold node found directly inside
+    a list item of the cell becomes a form. Empty strings and the page
+    title are not added as forms.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:conj.en
+    template_text = wxr.wtp.node_to_wikitext(t_node)
+    expanded = wxr.wtp.parse(template_text, expand_all=True)
+    for table_node in expanded.find_child(NodeKind.TABLE):
+        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
+            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
+                # The label applies to every form in the same cell.
+                cell_tag = ""
+                for sup_node in cell_node.find_html("sup"):
+                    sup_text = clean_node(wxr, None, sup_node.children)
+                    cell_tag = sup_text.strip(": ")
+                for list_node in cell_node.find_child(NodeKind.LIST):
+                    for item_node in list_node.find_child(NodeKind.LIST_ITEM):
+                        for bold_node in item_node.find_child(NodeKind.BOLD):
+                            word = clean_node(wxr, None, bold_node)
+                            if word == "" or word == wxr.wtp.title:
+                                continue
+                            new_form = Form(form=word)
+                            if cell_tag:
+                                new_form.raw_tags.append(cell_tag)
+                            translate_raw_tags(new_form)
+                            word_entry.forms.append(new_form)
+
+
+def extract_degree_section(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    level_node: LevelNode,
+) -> None:
+    """Extract degree forms from the lists directly under ``level_node``.
+
+    For each list item, the first direct-child bold node is cleaned into a
+    raw tag, and the cleaned text of the children after it is split on
+    commas into forms. Every form that is non-empty and differs from the
+    page title is appended to ``word_entry.forms``.
+    """
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            for index, bold_node in list_item.find_child(NodeKind.BOLD, True):
+                bold_str = clean_node(wxr, None, bold_node)
+                forms_str = clean_node(
+                    wxr, None, list_item.children[index + 1 :]
+                ).strip(": ")
+                for form_str in forms_str.split(","):
+                    form_str = form_str.strip()
+                    if form_str in ("", wxr.wtp.title):
+                        continue
+                    form = Form(form=form_str)
+                    # Guard on the label, not the form: the form is already
+                    # known to be non-empty here, whereas the bold label
+                    # may clean to an empty string.
+                    if bold_str != "":
+                        form.raw_tags.append(bold_str)
+                    translate_raw_tags(form)
+                    word_entry.forms.append(form)
+                # Only the first bold node of a list item is used as label.
+                break
diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py
@@ -102,3 +102,5 @@ class WordEntry(PortugueseBaseModel):
     sounds: list[Sound] = []
     forms: list[Form] = []
     notes: list[str] = []
+    cognates: list[Translation] = []
+    descendants: list[Translation] = []
diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py
@@ -9,7 +9,7 @@
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
 from .etymology import extract_etymology_section
-from .inflection import extract_conjugation_section
+from .inflection import extract_conjugation_section, extract_degree_section
 from .linkage import (
     extract_expression_section,
     extract_linkage_section,
@@ -41,9 +41,12 @@ def parse_section(
             title_text,
             cats.get("categories", []),
         )
-    elif title_text in ["Tradução", "Traduções", "Cognatos"]:
+    elif title_text in ["Tradução", "Traduções", "Cognatos", "Descendentes"]:
         extract_translation_section(
-            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
+            wxr,
+            page_data[-1] if len(page_data) > 0 else base_data,
+            level_node,
+            title_text,
         )
     elif title_text == "Expressões":
         extract_expression_section(
@@ -68,12 +71,16 @@ def parse_section(
         extract_phraseology_section(
             wxr, page_data[-1] if len(page_data) else base_data, level_node
         )
-    elif title_text.startswith("Nota"):
+    elif title_text.startswith(("Nota", "Uso")):
         extract_note_section(wxr, page_data, level_node)
     elif title_text == "Conjugação":
         extract_conjugation_section(
             wxr, page_data[-1] if len(page_data) else base_data, level_node
         )
+    elif title_text == "Graus":
+        extract_degree_section(
+            wxr, page_data[-1] if len(page_data) else base_data, level_node
+        )
     elif title_text.lower() not in [
         "ver também",
         "ligação externa",

diff --git a/src/wiktextract/extractor/pt/tags.py b/src/wiktextract/extractor/pt/tags.py
@@ -131,6 +131,9 @@
     "Afirmativo": "affirmative",
     "Negativo": "negative",
     "Infinitivo pessoal": ["personal", "infinitive"],
+    # Predefinição:conj.en
+    "Infinitivo": "infinitive",
+    "Passado simples": "past",
 }
 
 # https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo
@@ -209,7 +212,17 @@
     "plural": "plural",
 }
 
-TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS, **GLOSS_TAGS}
+# Comparative and superlative degree labels mapped to lists of tags.
+OTHER_TAGS = {
+    "comparativo de superioridade": ["comparative", "superior"],
+    "superlativo absoluto sintético": ["absolute", "superlative"],
+    "superlativo relativo de superioridade": [
+        "relative",
+        "superlative",
+        "superior",
+    ],
+}
+
+TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS, **GLOSS_TAGS, **OTHER_TAGS}
 
 # https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo
 TOPICS = {

diff --git a/src/wiktextract/extractor/pt/translation.py b/src/wiktextract/extractor/pt/translation.py
@@ -11,9 +11,17 @@ def extract_translation_section(
     wxr: WiktextractContext,
     word_entry: WordEntry,
     level_node: LevelNode,
+    title_text: str,
 ) -> None:
     sense = ""
     sense_index = 0
+    target_field = "translations"
+    match title_text:
+        case "Cognatos":
+            target_field = "cognates"
+        case "Descendentes":
+            target_field = "descendants"
+
     for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
         match node.kind:
             case NodeKind.TEMPLATE:
@@ -22,7 +30,12 @@ def extract_translation_section(
             case NodeKind.LIST:
                 for list_item in node.find_child(NodeKind.LIST_ITEM):
                     extract_translation_list_item(
-                        wxr, word_entry, list_item, sense, sense_index
+                        wxr,
+                        word_entry,
+                        list_item,
+                        sense,
+                        sense_index,
+                        target_field,
                     )
 
 
@@ -48,6 +61,7 @@ def extract_translation_list_item(
     list_item: WikiNode,
     sense: str,
     sense_index: int,
+    target_field: str,
 ) -> None:
     translations = []
     lang_name = "unknown"
@@ -101,10 +115,15 @@ def extract_translation_list_item(
         elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
             for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                 extract_translation_list_item(
-                    wxr, word_entry, next_list_item, sense, sense_index
+                    wxr,
+                    word_entry,
+                    next_list_item,
+                    sense,
+                    sense_index,
+                    target_field,
                 )
 
-    word_entry.translations.extend(translations)
+    getattr(word_entry, target_field).extend(translations)
 
 
 def extract_trad_template(
@@ -239,4 +258,4 @@ def extract_translation_subpage(
     page = wxr.wtp.get_page(page_title, 0)
     if page is not None and page.body is not None:
         root = wxr.wtp.parse(page.body)
-        extract_translation_section(wxr, word_entry, root)
+        extract_translation_section(wxr, word_entry, root, "Tradução")
diff --git a/tests/test_pt_form.py b/tests/test_pt_form.py
@@ -219,3 +219,57 @@ def test_conj_pt(self):
                 },
             ],
         )
+
+    def test_conj_en(self):
+        """Forms from a "conj.en.2" table cell get the cell's label as tag.
+
+        The stub template renders one cell labelled "Passado simples" with
+        two bold forms; both are expected with the "past" tag.
+        """
+        self.wxr.wtp.add_page("Predefinição:-en-", 10, "Inglês")
+        self.wxr.wtp.add_page(
+            "Predefinição:conj.en.2",
+            10,
+            """{|
+|-
+| <sup>Passado simples:</sup>
+: '''[[red]]''' / '''[[redd]]'''
+|}""",
+        )
+        data = parse_page(
+            self.wxr,
+            "rede",
+            """={{-en-}}=
+==Verbo==
+# {{escopo|en|Arcaísmo}} [[governar]], [[proteger]]
+===Conjugação===
+{{conj.en.2|rede|redes|red|redd|red|redd|reding}}""",
+        )
+        self.assertEqual(
+            data[0]["forms"],
+            [
+                {"form": "red", "tags": ["past"]},
+                {"form": "redd", "tags": ["past"]},
+            ],
+        )
+
+    def test_degree_section(self):
+        """A "Graus" section yields one form per comma-separated entry.
+
+        Each list item starts with a bold degree label; the expected forms
+        carry the tags that label translates to.
+        """
+        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
+        data = parse_page(
+            self.wxr,
+            "bom",
+            """={{-pt-}}=
+==Adjetivo==
+# que
+===Graus===
+* '''comparativo de superioridade''': [[melhor]] do que
+* '''superlativo absoluto sintético''': [[boníssimo]], [[ótimo]]
+* '''superlativo relativo de superioridade''': melhor""",
+        )
+        self.assertEqual(
+            data[0]["forms"],
+            [
+                {"form": "melhor do que", "tags": ["comparative", "superior"]},
+                {"form": "boníssimo", "tags": ["absolute", "superlative"]},
+                {"form": "ótimo", "tags": ["absolute", "superlative"]},
+                {
+                    "form": "melhor",
+                    "tags": ["relative", "superlative", "superior"],
+                },
+            ],
+        )