Merge pull request #955 from xxyzz/pt

[pt] extract phraseology and note sections, nested gloss lists
tatuylonen · Dec 24, 2024 · 41e285e · 41e285e
2 parents 7efe08e + f5952ba
commit 41e285e
Show file tree

Hide file tree

Showing 7 changed files with 232 additions and 12 deletions.
diff --git a/src/wiktextract/extractor/pt/linkage.py b/src/wiktextract/extractor/pt/linkage.py
@@ -97,10 +97,13 @@ def extract_fraseini_template(
     sense = ""
     sense_index = 0
     first_arg = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
-    m = re.search(r"(\d+)$", first_arg)
+    m = re.search(r"\((\d+)\)$", first_arg)
     if m is not None:
         sense_index = int(m.group(1))
         sense = first_arg[: m.start()].strip()
+    elif (m := re.match(r"De (\d+)", first_arg)) is not None:
+        sense_index = int(m.group(1))
+        sense = first_arg[m.end() :].strip("() \n")
     else:
         sense = first_arg
     return sense, sense_index
@@ -230,3 +233,61 @@ def extract_wikisaurus_page(
                     page_title,
                     tags,
                 )
+
+
+def extract_phraseology_section(  # Walk a phraseology section and hand each list item to extract_phraseology_list_item.
+    wxr: WiktextractContext,
+    word_entry: WordEntry,  # passed through unchanged to extract_phraseology_list_item
+    level_node: LevelNode,  # section node; only the LIST/TEMPLATE nodes yielded by find_child are examined
+) -> None:
+    sense = ""  # sense text from the most recent "fraseini" template; "" until one is seen
+    sense_index = 0  # sense number from the most recent "fraseini" template; 0 until one is seen
+    for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
+        if isinstance(node, TemplateNode) and node.template_name == "fraseini":
+            sense, sense_index = extract_fraseini_template(wxr, node)  # applies to every list that follows, until the next "fraseini"
+        elif node.kind == NodeKind.LIST:  # templates other than "fraseini" match neither branch and are ignored
+            for list_item in node.find_child(NodeKind.LIST_ITEM):
+                extract_phraseology_list_item(
+                    wxr, word_entry, list_item, sense, sense_index
+                )
+
+
+def extract_phraseology_list_item(  # Build one Linkage from a list item, append it to word_entry.phraseology, then recurse into nested lists.
+    wxr: WiktextractContext,
+    word_entry: WordEntry,  # entry whose `phraseology` list is appended to
+    list_item: WikiNode,
+    sense: str,  # default sense text; replaced on this item if its own text supplies one
+    sense_index: int,  # sense number stored on the Linkage as-is
+) -> None:
+    l_data = Linkage(word="", sense=sense, sense_index=sense_index)  # word stays "" until a BOLD/LINK child is found
+    for index, node in enumerate(list_item.children):
+        if (  # only the first BOLD or LINK child becomes the word; later ones fall through and are ignored
+            isinstance(node, WikiNode)
+            and node.kind in NodeKind.BOLD | NodeKind.LINK
+            and l_data.word == ""
+        ):
+            l_data.word = clean_node(wxr, None, node)
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
+            l_data.roman = clean_node(wxr, None, node)  # a later ITALIC child overwrites an earlier one
+        elif isinstance(node, str) and ("=" in node or ":" in node):  # first text child containing a separator starts the sense text
+            sense_start = node.index("=" if "=" in node else ":") + 1  # "=" wins over ":" when both occur; +1 skips the separator itself
+            l_data.sense = clean_node(  # sense = rest of this text node plus all following siblings except nested lists
+                wxr,
+                None,
+                [node[sense_start:]]
+                + [
+                    n
+                    for n in list_item.children[index + 1 :]
+                    if not (isinstance(n, WikiNode) and n.kind == NodeKind.LIST)  # nested lists are handled by the recursion below
+                ],
+            )
+            break  # remaining children were already folded into the sense text
+
+    if l_data.word != "":  # items without any BOLD/LINK word are dropped
+        word_entry.phraseology.append(l_data)
+
+    for child_list in list_item.find_child(NodeKind.LIST):
+        for next_list_item in child_list.find_child(NodeKind.LIST_ITEM):
+            extract_phraseology_list_item(  # nested items get the caller-supplied sense, not l_data.sense computed above
+                wxr, word_entry, next_list_item, sense, sense_index
+            )
diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py
@@ -51,6 +51,7 @@ class Linkage(PortugueseBaseModel):
         default=0, ge=0, description="Number of the definition, start from 1"
     )
     source: str = ""
+    roman: str = ""
 
 
 class Sound(PortugueseBaseModel):
@@ -92,6 +93,11 @@ class WordEntry(PortugueseBaseModel):
     hypernyms: list[Linkage] = []
     related: list[Linkage] = []
     hyponyms: list[Linkage] = []
+    homophones: list[Linkage] = []
+    homonyms: list[Linkage] = []
+    paronyms: list[Linkage] = []
+    phraseology: list[Linkage] = []
     etymology_texts: list[str] = []
     sounds: list[Sound] = []
     forms: list[Form] = []
+    notes: list[str] = []
diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py
@@ -9,7 +9,11 @@
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
 from .etymology import extract_etymology_section
-from .linkage import extract_expression_section, extract_linkage_section
+from .linkage import (
+    extract_expression_section,
+    extract_linkage_section,
+    extract_phraseology_section,
+)
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
 from .pronunciation import extract_pronunciation_section
@@ -25,7 +29,7 @@ def parse_section(
 ) -> None:
     cats = {}
     title_text = clean_node(wxr, cats, level_node.largs).strip(
-        "⁰¹²³⁴⁵⁶⁷⁸⁹0123456789"
+        "⁰¹²³⁴⁵⁶⁷⁸⁹0123456789:"
     )
     if title_text.lower() in POS_DATA:
         extract_pos_section(
@@ -59,11 +63,17 @@ def parse_section(
         extract_etymology_section(wxr, page_data, level_node)
     elif title_text == "Pronúncia":
         extract_pronunciation_section(wxr, page_data, level_node)
-    elif title_text in ["Nota", "Notas", "Nota de uso"]:
-        pass
+    elif title_text == "Fraseologia":
+        extract_phraseology_section(
+            wxr, page_data[-1] if len(page_data) else base_data, level_node
+        )
+    elif title_text.startswith("Nota"):
+        extract_note_section(wxr, page_data, level_node)
     elif title_text.lower() not in [
         "ver também",
+        "ligação externa",
         "ligações externas",
+        "ligação extena",
         "referências",
         "referência",
         "no wikcionário",
@@ -73,7 +83,9 @@ def parse_section(
         "no wikisaurus",
         "no commons",
         "no wikimedia commons",
+        "na internet",
         "galeria",
+        "galeria de imagens",
     ]:
         wxr.wtp.debug(f"unknown section: {title_text}")
 
@@ -86,7 +98,7 @@ def parse_section(
         clean_node(wxr, cats, link_node)
     save_section_cats(cats.get("categories", []), page_data, level_node, False)
 
-    if title_text != "Pronúncia":
+    if title_text.lower() not in ["pronúncia", "ver também"]:
         for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
             parse_section(wxr, page_data, base_data, next_level)
 
@@ -147,3 +159,20 @@ def parse_page(
         if len(data.senses) == 0:
             data.senses.append(Sense(tags=["no-gloss"]))
     return [m.model_dump(exclude_defaults=True) for m in page_data]
+
+
+def extract_note_section(  # Collect the text of each list item in the section and append it to matching entries' `notes`.
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],  # entries extracted so far; mutated in place
+    level_node: LevelNode,
+) -> None:
+    notes = []  # cleaned, non-empty note strings in the order the list items are yielded
+    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
+        note = clean_node(  # presumably invert_find_child(LIST) yields the item's children that are not LIST nodes -- TODO confirm
+            wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
+        )
+        if note != "":  # skip items whose cleaned text is empty
+            notes.append(note)
+    for data in page_data:  # if page_data is empty this loop never runs and the collected notes are discarded
+        if data.lang_code == page_data[-1].lang_code:  # every entry sharing the last entry's lang_code gets the notes
+            data.notes.extend(notes)
diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py
@@ -49,11 +49,11 @@ def extract_gloss_list_item(
     wxr: WiktextractContext,
     word_entry: WordEntry | Linkage,
     list_item: WikiNode,
+    parent_gloss: list[str] = [],
 ) -> None:
     gloss_nodes = []
-    sense = Sense()
-    first_gloss_index = len(list_item.children)
-    for index, node in enumerate(list_item.children):
+    sense = Sense(glosses=parent_gloss)
+    for node in list_item.children:
         if isinstance(node, TemplateNode):
             if node.template_name == "escopo":
                 extract_escopo_template(wxr, sense, node)
@@ -65,8 +65,6 @@ def extract_gloss_list_item(
             if node.sarg.endswith(("*", ":")):
                 for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                     extract_example_list_item(wxr, sense, next_list_item)
-                if index < first_gloss_index:
-                    first_gloss_index = index
         else:
             gloss_nodes.append(node)
 
@@ -75,6 +73,13 @@ def extract_gloss_list_item(
         sense.glosses.append(gloss_str)
         word_entry.senses.append(sense)
 
+    for child_list in list_item.find_child(NodeKind.LIST):
+        if child_list.sarg.startswith("#") and child_list.sarg.endswith("#"):
+            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
+                extract_gloss_list_item(
+                    wxr, word_entry, child_list_item, sense.glosses
+                )
+
 
 def extract_escopo_template(
     wxr: WiktextractContext,

diff --git a/src/wiktextract/extractor/pt/section_titles.py b/src/wiktextract/extractor/pt/section_titles.py
@@ -10,7 +10,7 @@
     "posposição": {"pos": "postp"},
     "pronome": {"pos": "pron"},
     "substantivo": {"pos": "noun"},
-    "berbo": {"pos": "verb"},
+    "verbo": {"pos": "verb"},
     "forma de substantivo": {"pos": "noun", "tags": ["form-of"]},
     "forma verbal": {"pos": "verb", "tags": ["form-of"]},
     "locução substantiva": {"pos": "phrase", "tags": ["substantive"]},
@@ -19,6 +19,7 @@
     "locução prepositiva": {"pos": "phrase", "tags": ["prepositional"]},
     "expressão": {"pos": "phrase"},
     "abreviatura": {"pos": "abbrev", "tags": ["abbreviation"]},
+    "abreviação": {"pos": "abbrev", "tags": ["abbreviation"]},
     "contração": {"pos": "contraction", "tags": ["contraction"]},
     "prefixo": {"pos": "prefix", "tags": ["morpheme"]},
     "sufixo": {"pos": "suffix", "tags": ["morpheme"]},
@@ -39,19 +40,27 @@
     },
     "forma de pronome": {"pos": "pron", "tags": ["form-of"]},
     "advérbio numeral": {"pos": "adv", "tags": ["numeral"]},
+    "verbo preposicionado": {"pos": "verb", "tags": ["prepositional"]},
+    "caractere han": {"pos": "character", "tags": ["han"]},
+    "hanja": {"pos": "character", "tags": ["Hanja"]},
+    "kanji": {"pos": "character", "tags": ["kanji"]},
+    "pronome pessoal": {"pos": "pron", "tags": ["person"]},
+    "pronome possessivo": {"pos": "det", "tags": ["possessive"]},
 }
 
 
 LINKAGE_SECTIONS = {
     "antônimos": "antonyms",
     "antônimo": "antonyms",
     "antónimo": "antonyms",
+    "antónimos": "antonyms",
     "antónimos/antônimos": "antonyms",
     "sinônimos": "synonyms",
     "sinônimo": "synonyms",
     "sinónimos/sinônimos": "synonyms",
     "sinónimos": "synonyms",
     "sinónimo": "synonyms",
+    "sinônimos e variantes": "synonyms",
     "verbetes derivados": "derived",
     "verbete derivado": "derived",
     "formas alternativas": "synonyms",
@@ -61,6 +70,7 @@
     "hiperônimos": "hypernyms",
     "hiperónimos": "hypernyms",
     "termos derivados": "derived",
+    "termos derivadoss": "derived",
     "grafia antiga": "synonyms",
     "diminutivo": "synonyms",
     "diminutivos": "synonyms",
@@ -70,11 +80,25 @@
     "entradas relacionadas": "related",
     "hipônimos": "hyponyms",
     "hiponímias": "hyponyms",
+    "hipónimos": "hyponyms",
     "ortografias obsoletas": "synonyms",
     "superlativo": "synonyms",
     "outros verbetes": "related",
     "cardinal equivalente": "synonyms",
+    "cardinais equivalentes": "synonyms",
     "aumentativo": "synonyms",
+    "advérbios derivados": "derived",
+    "derivações": "derived",
+    "homófonos": "homophones",
+    "homófono": "homophones",
+    "homónimos/homônimos": "homonyms",
+    "homônimos": "homonyms",
+    "parônimos": "paronyms",
+    "caracteres derivados": "derived",
+    "caracteres relacionados": "related",
+    "palavras com o kanji": "related",
+    "compostos": "derived",
+    "vermos derivados": "derived",
 }
 
 LINKAGE_TAGS = {
@@ -84,4 +108,5 @@
     "ortografias obsoletas": ["obsolete"],
     "superlativo": ["superlative"],
     "aumentativo": ["augmentative"],
+    "advérbios derivados": ["adverb"],
 }
diff --git a/tests/test_pt_gloss.py b/tests/test_pt_gloss.py
@@ -74,3 +74,21 @@ def test_escopo(self):
                 }
             ],
         )
+
+    def test_nested_list(self):  # A "##" item nested under a "#" gloss must yield its own sense that starts with the parent gloss.
+        self.wxr.wtp.add_page("Predefinição:-en-", 10, "Inglês")  # registers the page with body "Inglês"; presumably backs the {{-en-}} heading below
+        data = parse_page(
+            self.wxr,
+            "average",
+            """={{-en-}}=
+==Adjetivo==
+# [[médio]]
+## [[relativo à]] [[média]];''""",
+        )
+        self.assertEqual(
+            data[0]["senses"],
+            [
+                {"glosses": ["médio"]},  # the parent "#" item on its own
+                {"glosses": ["médio", "relativo à média;"]},  # the nested "##" item: parent gloss first, then its own text
+            ],
+        )