From 8302770048aa9db84f9fccca4085bdbc1ded3c77 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Wed, 25 Dec 2024 09:05:59 +0800
Subject: [PATCH 1/5] [pt] handle nested example lists

---
 src/wiktextract/extractor/pt/pos.py |  9 ++++++++-
 tests/test_pt_example.py            | 24 ++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)
diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py
index 290baf91..01f04818 100644
--- a/src/wiktextract/extractor/pt/pos.py
+++ b/src/wiktextract/extractor/pt/pos.py
@@ -118,7 +118,8 @@ def extract_example_list_item(
 ) -> None:
     example = Example()
     ref_nodes = []
-    for node in list_item.children:
+
+    for index, node in enumerate(list_item.children):
         if (
             isinstance(node, WikiNode)
             and node.kind == NodeKind.ITALIC
@@ -147,6 +148,12 @@ def extract_example_list_item(
                     example.text = clean_node(
                         wxr, sense, node.template_parameters.get(1, "")
                     )
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+            ref_nodes.clear()
+            example.ref = clean_node(wxr, None, list_item.children[:index])
+            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
+                example.text = clean_node(wxr, None, child_list_item.children)
+                break
         else:
             ref_nodes.append(node)
 
diff --git a/tests/test_pt_example.py b/tests/test_pt_example.py
index 1def5fae..3472ab8f 100644
--- a/tests/test_pt_example.py
+++ b/tests/test_pt_example.py
@@ -127,3 +127,27 @@ def test_double_italic_nodes(self):
                 ],
             },
         )
+
+    def test_nested_example_list(self):
+        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
+        data = parse_page(
+            self.wxr,
+            "amor",
+            """={{-pt-}}=
+==Substantivo==
+# [[sentimento]]
+#* '''1595''', [[w:Luís de Camões|Luís de Camões]], ''Rimas'':
+#*: "'''''Amor''' é fogo que arde sem se ver<br>é ferida que dói, e não se sente,<br>é um contentamento descontente,<br>é dor que desatina sem doer.''\"""",
+        )
+        self.assertEqual(
+            data[0]["senses"][0],
+            {
+                "glosses": ["sentimento"],
+                "examples": [
+                    {
+                        "text": '"Amor é fogo que arde sem se ver\né ferida que dói, e não se sente,\né um contentamento descontente,\né dor que desatina sem doer."',
+                        "ref": "1595, Luís de Camões, Rimas:",
+                    }
+                ],
+            },
+        )

From 8828c6d66fed995ba38a4eaafa070064fbafa961 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Wed, 25 Dec 2024 10:10:01 +0800
Subject: [PATCH 2/5] [pt] handle examples whose text is above a source child list

---
 src/wiktextract/extractor/pt/pos.py | 23 ++++++++++++++++++++---
 tests/test_pt_example.py            | 26 +++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py
index 01f04818..44283a61 100644
--- a/src/wiktextract/extractor/pt/pos.py
+++ b/src/wiktextract/extractor/pt/pos.py
@@ -1,3 +1,5 @@
+import re
+
 from wikitextprocessor import (
     HTMLNode,
     LevelNode,
@@ -148,12 +150,27 @@ def extract_example_list_item(
                     example.text = clean_node(
                         wxr, sense, node.template_parameters.get(1, "")
                     )
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
+            bold_str = clean_node(wxr, None, node)
+            if re.fullmatch(r"\d+", bold_str) is not None:
+                list_item_str = clean_node(
+                    wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
+                )
+                if list_item_str.endswith(":"):
+                    ref_nodes.clear()
+                    example.ref = list_item_str
+                    for child_list in list_item.find_child(NodeKind.LIST):
+                        for child_list_item in child_list.find_child(
+                            NodeKind.LIST_ITEM
+                        ):
+                            example.text = clean_node(
+                                wxr, None, child_list_item.children
+                            )
+                    break
         elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
             ref_nodes.clear()
-            example.ref = clean_node(wxr, None, list_item.children[:index])
             for child_list_item in node.find_child(NodeKind.LIST_ITEM):
-                example.text = clean_node(wxr, None, child_list_item.children)
-                break
+                ref_nodes.append(child_list_item.children)
         else:
             ref_nodes.append(node)
 
diff --git a/tests/test_pt_example.py b/tests/test_pt_example.py
index 3472ab8f..bedd59e9 100644
--- a/tests/test_pt_example.py
+++ b/tests/test_pt_example.py
@@ -128,7 +128,7 @@ def test_double_italic_nodes(self):
             },
         )
 
-    def test_nested_example_list(self):
+    def test_source_above_text_child_list(self):
         self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
         data = parse_page(
             self.wxr,
@@ -151,3 +151,27 @@ def test_nested_example_list(self):
                 ],
             },
         )
+
+    def test_text_above_ref_child_list(self):
+        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
+        data = parse_page(
+            self.wxr,
+            "mar",
+            """={{-pt-}}=
+==Substantivo==
+# grande quantidade de
+#:"''Ó '''mar''' salgado, quanto do teu sal<br />São lágrimas de Portugal!<br />Por te cruzarmos, quantas mães choraram,<br />Quantos filhos em vão rezaram!<br />Quantas noivas ficaram por casar<br />Para que fosses nosso, ó '''mar'''!''"
+#:: ''-Mensagem, de Fernando Pessoa''""",
+        )
+        self.assertEqual(
+            data[0]["senses"][0],
+            {
+                "glosses": ["grande quantidade de"],
+                "examples": [
+                    {
+                        "text": "Ó mar salgado, quanto do teu sal\nSão lágrimas de Portugal!\nPor te cruzarmos, quantas mães choraram,\nQuantos filhos em vão rezaram!\nQuantas noivas ficaram por casar\nPara que fosses nosso, ó mar!",
+                        "ref": "-Mensagem, de Fernando Pessoa",
+                    }
+                ],
+            },
+        )

From 5ce0c6a0445b4bf89becb9c0a50c5b2eb3429dd3 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Wed, 25 Dec 2024 11:07:12 +0800
Subject: [PATCH 3/5] [pt] handle forms separated by slash in "flex.*"
 templates

---
 src/wiktextract/extractor/pt/inflection.py | 32 ++++++-----
 tests/test_pt_form.py                      | 65 ++++++++++++++++++++++
 2 files changed, 82 insertions(+), 15 deletions(-)

diff --git a/src/wiktextract/extractor/pt/inflection.py b/src/wiktextract/extractor/pt/inflection.py
index 17d56d2f..f33ae8d1 100644
--- a/src/wiktextract/extractor/pt/inflection.py
+++ b/src/wiktextract/extractor/pt/inflection.py
@@ -54,20 +54,22 @@ def extract_flex_template(
                 elif cell_node.attrs.get("style") == "background:#f4f4f4;":
                     row_header = cell_text
                     col_header_index += col_span
-                elif cell_text in ["–", wxr.wtp.title]:
-                    col_cell_index += col_span
-                    continue
                 else:
-                    form = Form(form=cell_text)
-                    if row_header != "":
-                        form.raw_tags.append(row_header)
-                    for col_header in col_headers:
-                        if (
-                            col_cell_index >= col_header.col_index
-                            and col_cell_index
-                            < col_header.col_index + col_header.colspan
-                        ):
-                            form.raw_tags.append(col_header.text)
-                    translate_raw_tags(form)
-                    word_entry.forms.append(form)
+                    for link_node in cell_node.find_child(NodeKind.LINK):
+                        form_str = clean_node(wxr, None, link_node)
+                        if form_str in ["", "–", "-", wxr.wtp.title]:
+                            continue
+                        form_data = Form(form=form_str)
+                        if row_header != "":
+                            form_data.raw_tags.append(row_header)
+                        for col_header in col_headers:
+                            if (
+                                col_cell_index >= col_header.col_index
+                                and col_cell_index
+                                < col_header.col_index + col_header.colspan
+                            ):
+                                form_data.raw_tags.append(col_header.text)
+                        translate_raw_tags(form_data)
+                        word_entry.forms.append(form_data)
+
                     col_cell_index += col_span
diff --git a/tests/test_pt_form.py b/tests/test_pt_form.py
index e6716386..aa3635bb 100644
--- a/tests/test_pt_form.py
+++ b/tests/test_pt_form.py
@@ -73,3 +73,68 @@ def test_flex_pt_subst_completa(self):
                 {"form": "matilha", "tags": ["standard", "collective"]},
             ],
         )
+
+    def test_slash_cell(self):
+        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
+        self.wxr.wtp.add_page(
+            "Predefinição:flex.pt.subst.completa",
+            10,
+            """{|
+|-
+! style="background:#f4f4f4;" rowspan="2" |
+! style="background:#ffffe0;" colspan="2" | [[masculino|Masculino]]
+! style="background:#ffffe0;" colspan="2" | [[feminino|Feminino]]
+! style="background:#ffffe0;" rowspan="2" | [[coletivo|Coletivo]]
+|-
+! style="background:#ffffe0;" | [[singular|Singular]]
+! style="background:#ffffe0;" | [[plural|Plural]]
+! style="background:#ffffe0;" | [[singular|Singular]]
+! style="background:#ffffe0;" | [[plural|Plural]]
+|-
+! style="background:#f4f4f4;" | [[normal|Normal]]
+| style="background:#ffffff; text-align:center;" | [[parvo]]
+| style="background:#ffffff; text-align:center;" | [[parvos#Português|<span style="color:black">parvos</span>]]
+| style="background:#ffffff; text-align:center;" | [[parva#Português|parva]] / [[párvoa#Português|<span style="color:black">párvoa</span>]]
+| style="background:#ffffff; text-align:center;" | [[parvas#Português|<span style="color:black">parvas</span>]] / [[párvoas#Português|<span style="color:black">párvoas</span>]]
+| style="background:#ffffff; text-align:center;" rowspan="3" | [[-#Português|-]]
+|}""",
+        )
+        data = parse_page(
+            self.wxr,
+            "parvo",
+            """={{-pt-}}=
+==Substantivo==
+{{flex.pt.subst.completa
+|alinhamento=left
+|ms=parvo
+|msa=parvalhão|msa2=parvoalho|msa3=parvoeirão
+|msd=parvinho
+|mp=parvos
+|mpa=parvalhões|mpa2=parvoalhos|mpa3=parvoeirões
+|mpd=parvinhos
+|fs=parva|fs2=párvoa
+|fsa=parvalhona|fsa2=parvoalha|fsa3=parvoeirona
+|fsd=parvinha
+|fp=parvas|fp2=párvoas
+|fpa=parvalhonas|fpa2=parvoalhas|fpa3=pavoeironas
+|fpd=parvinhas
+|col=-
+}}
+# [[pessoa]]""",
+        )
+        self.assertEqual(
+            data[0]["forms"],
+            [
+                {"form": "parvos", "tags": ["standard", "masculine", "plural"]},
+                {
+                    "form": "parva",
+                    "tags": ["standard", "feminine", "singular"],
+                },
+                {
+                    "form": "párvoa",
+                    "tags": ["standard", "feminine", "singular"],
+                },
+                {"form": "parvas", "tags": ["standard", "feminine", "plural"]},
+                {"form": "párvoas", "tags": ["standard", "feminine", "plural"]},
+            ],
+        )

From 7a5173dcfae09601e28dc456df214f9513ac2420 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Wed, 25 Dec 2024 11:42:35 +0800
Subject: [PATCH 4/5] [pt] relax gloss child list check condition

---
 src/wiktextract/extractor/pt/pos.py |  2 +-
 tests/test_pt_linkage.py            | 31 +++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py
index 44283a61..9d56617e 100644
--- a/src/wiktextract/extractor/pt/pos.py
+++ b/src/wiktextract/extractor/pt/pos.py
@@ -76,7 +76,7 @@ def extract_gloss_list_item(
         word_entry.senses.append(sense)
 
     for child_list in list_item.find_child(NodeKind.LIST):
-        if child_list.sarg.startswith("#") and child_list.sarg.endswith("#"):
+        if child_list.sarg.endswith("#"):
             for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                 extract_gloss_list_item(
                     wxr, word_entry, child_list_item, sense.glosses
diff --git a/tests/test_pt_linkage.py b/tests/test_pt_linkage.py
index 822157ec..5539ce34 100644
--- a/tests/test_pt_linkage.py
+++ b/tests/test_pt_linkage.py
@@ -221,3 +221,34 @@ def test_phraseology_nested_list(self):
                 },
             ],
         )
+
+    def test_expression_gloss_child_list(self):
+        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
+        data = parse_page(
+            self.wxr,
+            "testa",
+            """={{-pt-}}=
+==Substantivo==
+# [[parte]]
+
+===Expressões===
+* '''[[testa de boi]]''': (Portugal, Douro)
+*# indivíduo com a testa avantajada;""",
+        )
+        self.assertEqual(
+            data[0]["expressions"],
+            [
+                {
+                    "word": "testa de boi",
+                    "senses": [
+                        {"glosses": ["(Portugal, Douro)"]},
+                        {
+                            "glosses": [
+                                "(Portugal, Douro)",
+                                "indivíduo com a testa avantajada;",
+                            ]
+                        },
+                    ],
+                }
+            ],
+        )

From f6c106f307b8a02073be80f2b67e9bab9a038205 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Wed, 25 Dec 2024 16:47:38 +0800
Subject: [PATCH 5/5] [pt] translate some tags and topics in template "escopo"

---
 src/wiktextract/extractor/pt/pos.py  | 11 ++---
 src/wiktextract/extractor/pt/tags.py | 69 +++++++++++++++++++++++++++-
 2 files changed, 72 insertions(+), 8 deletions(-)

diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py
index 9d56617e..1a4bd3b8 100644
--- a/src/wiktextract/extractor/pt/pos.py
+++ b/src/wiktextract/extractor/pt/pos.py
@@ -89,13 +89,10 @@ def extract_escopo_template(
     t_node: TemplateNode,
 ) -> None:
     # https://pt.wiktionary.org/wiki/Predefinição:escopo
-    for arg in range(2, 9):
-        if arg not in t_node.template_parameters:
-            break
-        raw_tag = clean_node(wxr, None, t_node.template_parameters[arg])
-        if raw_tag != "":
-            sense.raw_tags.append(raw_tag)
-    clean_node(wxr, sense, t_node)
+    expanded_str = clean_node(wxr, sense, t_node).strip("()")
+    for raw_tag in re.split(r", | e ", expanded_str):
+        if raw_tag.strip() != "":
+            sense.raw_tags.append(raw_tag.strip())
 
 
 def extract_escopo2_template(
diff --git a/src/wiktextract/extractor/pt/tags.py b/src/wiktextract/extractor/pt/tags.py
index e48ae907..ab7987a8 100644
--- a/src/wiktextract/extractor/pt/tags.py
+++ b/src/wiktextract/extractor/pt/tags.py
@@ -113,7 +113,72 @@
     "Diminutivo": "diminutive",
 }
 
-TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS}
+# https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo
+GLOSS_TAGS = {
+    "Grafia portuguesa": "Portugal",
+    "Grafia brasileira": "Brazil",
+    "histórico": "historical",
+}
+
+TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS, **GLOSS_TAGS}
+
+# https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo
+TOPICS = {
+    "botânica": "botany",
+    "ciência da computação": "computing",
+    # "ciência dos materiais": "",
+    "engenharia": "engineering",
+    # "pedagogia": "pedagogy",
+    # "cronônimo": "chrononym",
+    "basquetebol": "basketball",
+    "beisebol": "baseball",
+    "críquete": "cricket",
+    "desporto": "sports",
+    "esporte": "sports",
+    "ténis": "tennis",
+    "tênis": "tennis",
+    "esgrima": "fencing",
+    "geografia": "geography",
+    # "toponímia": "",
+    # "territory": "",
+    "zoologia": "zoology",
+    "ornitologia": "ornithology",
+    # "artrópodes": "",
+    "entomologia": "entomology",
+    "ictiologia": "ichthyology",
+    "veterinária": "veterinary",
+    # "antropónimo": "",
+    "alimentação": "food",
+    "arte": "arts",
+    "aeronáutica": "aeronautics",
+    "aritmética": "arithmetic",
+    "Meteorologia": "meteorology",
+    "design": "design",
+    "patologia": "pathology",
+    "etnologia": "ethnology",
+    "farmacologia": "pharmacology",
+    "transporte": "transport",
+    "Ginecologia": "gynecology",
+    "linguística": "linguistics",
+    "indústria têxtil": "textiles",
+    "mídia": "media",
+    "ciência da informação": "information-science",
+    "ludologia": "ludology",
+    "náutica": "nautical",
+    "mitologia": "mythology",
+    "mineralogia": "mineralogy",
+    "mobiliário": "furniture",
+    "numismática": "numismatics",
+    # "Esoterismo": "",
+    "profissão": "profession",
+    # "parapsiquismo": "",
+    "vestuário": "clothing",
+    "direito": "law",
+    "química": "chemistry",
+    "videojogo": "video-games",
+    "vídeo game": "video-games",
+    "viticultura": "viticulture",
+}
 
 
 def translate_raw_tags(data: WordEntry) -> None:
@@ -125,6 +190,8 @@ def translate_raw_tags(data: WordEntry) -> None:
                 data.tags.append(tr_tag)
             elif isinstance(tr_tag, list):
                 data.tags.extend(tr_tag)
+        elif raw_tag in TOPICS and hasattr(data, "topics"):
+            data.topics.append(TOPICS[raw_tag])
         else:
             raw_tags.append(raw_tag)
     data.raw_tags = raw_tags