From e52328866c4ba682208ab01c9804c2f357b90c9e Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 27 Dec 2024 15:02:29 +0800 Subject: [PATCH 1/3] [pt] extract "conj.en*" table templates --- src/wiktextract/extractor/pt/inflection.py | 29 ++++++++++++++++++++++ src/wiktextract/extractor/pt/tags.py | 3 +++ tests/test_pt_form.py | 28 +++++++++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/src/wiktextract/extractor/pt/inflection.py b/src/wiktextract/extractor/pt/inflection.py index 7bd72ace..dd84c0eb 100644 --- a/src/wiktextract/extractor/pt/inflection.py +++ b/src/wiktextract/extractor/pt/inflection.py @@ -81,6 +81,8 @@ def extract_conjugation_section( for t_node in level_node.find_child(NodeKind.TEMPLATE): if t_node.template_name.startswith(("conj.pt", "conj/pt")): extract_conj_pt_template(wxr, word_entry, t_node) + elif t_node.template_name.startswith("conj.en"): + extract_conj_en_template(wxr, word_entry, t_node) def extract_conj_pt_template( @@ -214,3 +216,30 @@ def add_conj_pt_form( form.raw_tags.append(row_header.text) translate_raw_tags(form) word_entry.forms.append(form) + + +def extract_conj_en_template( + wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode +) -> None: + # https://pt.wiktionary.org/wiki/Predefinição:conj.en + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(t_node), expand_all=True + ) + for table in expanded_node.find_child(NodeKind.TABLE): + for row in table.find_child(NodeKind.TABLE_ROW): + for cell in row.find_child(NodeKind.TABLE_CELL): + raw_tag = "" + for sup_tag in cell.find_html("sup"): + raw_tag = clean_node(wxr, None, sup_tag.children).strip( + ": " + ) + for list_node in cell.find_child(NodeKind.LIST): + for list_item in list_node.find_child(NodeKind.LIST_ITEM): + for bold_node in list_item.find_child(NodeKind.BOLD): + form_str = clean_node(wxr, None, bold_node) + if form_str not in ["", wxr.wtp.title]: + form = Form(form=form_str) + if raw_tag != "": + form.raw_tags.append(raw_tag) + 
translate_raw_tags(form) + word_entry.forms.append(form) diff --git a/src/wiktextract/extractor/pt/tags.py b/src/wiktextract/extractor/pt/tags.py index fae32ff6..20fc8cba 100644 --- a/src/wiktextract/extractor/pt/tags.py +++ b/src/wiktextract/extractor/pt/tags.py @@ -131,6 +131,9 @@ "Afirmativo": "affirmative", "Negativo": "negative", "Infinitivo pessoal": ["personal", "infinitive"], + # Predefinição:conj.en + "Infinitivo": "infinitive", + "Passado simples": "past", } # https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo diff --git a/tests/test_pt_form.py b/tests/test_pt_form.py index 97d07517..5a3a0bca 100644 --- a/tests/test_pt_form.py +++ b/tests/test_pt_form.py @@ -219,3 +219,31 @@ def test_conj_pt(self): }, ], ) + + def test_conj_en(self): + self.wxr.wtp.add_page("Predefinição:-en-", 10, "Inglês") + self.wxr.wtp.add_page( + "Predefinição:conj.en.2", + 10, + """{| +|- +| Passado simples: +: '''[[red]]''' / '''[[redd]]''' +|}""", + ) + data = parse_page( + self.wxr, + "rede", + """={{-en-}}= +==Verbo== +# {{escopo|en|Arcaísmo}} [[governar]], [[proteger]] +===Conjugação=== +{{conj.en.2|rede|redes|red|redd|red|redd|reding}}""", + ) + self.assertEqual( + data[0]["forms"], + [ + {"form": "red", "tags": ["past"]}, + {"form": "redd", "tags": ["past"]}, + ], + ) From 4ff9d998530c651413803b2e1d187e11c814465f Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 27 Dec 2024 15:37:51 +0800 Subject: [PATCH 2/3] [pt] extract degree section --- src/wiktextract/extractor/pt/inflection.py | 23 +++++++++++++++++++ src/wiktextract/extractor/pt/page.py | 8 +++++-- src/wiktextract/extractor/pt/tags.py | 12 +++++++++- tests/test_pt_form.py | 26 ++++++++++++++++++++++ 4 files changed, 66 insertions(+), 3 deletions(-) diff --git a/src/wiktextract/extractor/pt/inflection.py b/src/wiktextract/extractor/pt/inflection.py index dd84c0eb..86b0d631 100644 --- a/src/wiktextract/extractor/pt/inflection.py +++ b/src/wiktextract/extractor/pt/inflection.py @@ -243,3 +243,26 @@ def 
extract_conj_en_template( form.raw_tags.append(raw_tag) translate_raw_tags(form) word_entry.forms.append(form) + + +def extract_degree_section( + wxr: WiktextractContext, + word_entry: WordEntry, + level_node: LevelNode, +) -> None: + for list_node in level_node.find_child(NodeKind.LIST): + for list_item in list_node.find_child(NodeKind.LIST_ITEM): + for index, bold_node in list_item.find_child(NodeKind.BOLD, True): + bold_str = clean_node(wxr, None, bold_node) + forms_str = clean_node( + wxr, None, list_item.children[index + 1 :] + ).strip(": ") + for form_str in forms_str.split(","): + form_str = form_str.strip() + if form_str not in ["", wxr.wtp.title]: + form = Form(form=form_str) + if bold_str != "": + form.raw_tags.append(bold_str) + translate_raw_tags(form) + word_entry.forms.append(form) + break diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py index 82d8f31d..4c4329af 100644 --- a/src/wiktextract/extractor/pt/page.py +++ b/src/wiktextract/extractor/pt/page.py @@ -9,7 +9,7 @@ from ...page import clean_node from ...wxr_context import WiktextractContext from .etymology import extract_etymology_section -from .inflection import extract_conjugation_section +from .inflection import extract_conjugation_section, extract_degree_section from .linkage import ( extract_expression_section, extract_linkage_section, @@ -68,12 +68,16 @@ def parse_section( extract_phraseology_section( wxr, page_data[-1] if len(page_data) else base_data, level_node ) - elif title_text.startswith("Nota"): + elif title_text.startswith(("Nota", "Uso")): extract_note_section(wxr, page_data, level_node) elif title_text == "Conjugação": extract_conjugation_section( wxr, page_data[-1] if len(page_data) else base_data, level_node ) + elif title_text == "Graus": + extract_degree_section( + wxr, page_data[-1] if len(page_data) else base_data, level_node + ) elif title_text.lower() not in [ "ver também", "ligação externa", diff --git
a/src/wiktextract/extractor/pt/tags.py b/src/wiktextract/extractor/pt/tags.py index 20fc8cba..40860fd9 100644 --- a/src/wiktextract/extractor/pt/tags.py +++ b/src/wiktextract/extractor/pt/tags.py @@ -212,7 +212,17 @@ "plural": "plural", } -TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS, **GLOSS_TAGS} +OTHER_TAGS = { + "comparativo de superioridade": ["comparative", "superior"], + "superlativo absoluto sintético": ["absolute", "superlative"], + "superlativo relativo de superioridade": [ + "relative", + "superlative", + "superior", + ], +} + +TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS, **GLOSS_TAGS, **OTHER_TAGS} # https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo TOPICS = { diff --git a/tests/test_pt_form.py b/tests/test_pt_form.py index 5a3a0bca..2b3fce00 100644 --- a/tests/test_pt_form.py +++ b/tests/test_pt_form.py @@ -247,3 +247,29 @@ def test_conj_en(self): {"form": "redd", "tags": ["past"]}, ], ) + + def test_degree_section(self): + self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") + data = parse_page( + self.wxr, + "bom", + """={{-pt-}}= +==Adjetivo== +# que +===Graus=== +* '''comparativo de superioridade''': [[melhor]] do que +* '''superlativo absoluto sintético''': [[boníssimo]], [[ótimo]] +* '''superlativo relativo de superioridade''': melhor""", + ) + self.assertEqual( + data[0]["forms"], + [ + {"form": "melhor do que", "tags": ["comparative", "superior"]}, + {"form": "boníssimo", "tags": ["absolute", "superlative"]}, + {"form": "ótimo", "tags": ["absolute", "superlative"]}, + { + "form": "melhor", + "tags": ["relative", "superlative", "superior"], + }, + ], + ) From 3479756d47d276e1191686dbab68442a5d7bd0db Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 27 Dec 2024 15:59:50 +0800 Subject: [PATCH 3/3] [pt] extract "Descendentes" section --- src/wiktextract/extractor/pt/models.py | 2 ++ src/wiktextract/extractor/pt/page.py | 7 ++++-- src/wiktextract/extractor/pt/translation.py | 27 ++++++++++++++++++--- 3 files changed, 30 insertions(+), 6 
deletions(-) diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py index 8fdc6750..01ff3aaa 100644 --- a/src/wiktextract/extractor/pt/models.py +++ b/src/wiktextract/extractor/pt/models.py @@ -102,3 +102,5 @@ class WordEntry(PortugueseBaseModel): sounds: list[Sound] = [] forms: list[Form] = [] notes: list[str] = [] + cognates: list[Translation] = [] + descendants: list[Translation] = [] diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py index 4c4329af..70925974 100644 --- a/src/wiktextract/extractor/pt/page.py +++ b/src/wiktextract/extractor/pt/page.py @@ -41,9 +41,12 @@ def parse_section( title_text, cats.get("categories", []), ) - elif title_text in ["Tradução", "Traduções", "Cognatos"]: + elif title_text in ["Tradução", "Traduções", "Cognatos", "Descendentes"]: extract_translation_section( - wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node + wxr, + page_data[-1] if len(page_data) > 0 else base_data, + level_node, + title_text, ) elif title_text == "Expressões": extract_expression_section( diff --git a/src/wiktextract/extractor/pt/translation.py b/src/wiktextract/extractor/pt/translation.py index c2251c92..b138e29c 100644 --- a/src/wiktextract/extractor/pt/translation.py +++ b/src/wiktextract/extractor/pt/translation.py @@ -11,9 +11,17 @@ def extract_translation_section( wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode, + title_text: str, ) -> None: sense = "" sense_index = 0 + target_field = "translations" + match title_text: + case "Cognatos": + target_field = "cognates" + case "Descendentes": + target_field = "descendants" + for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST): match node.kind: case NodeKind.TEMPLATE: @@ -22,7 +30,12 @@ def extract_translation_section( case NodeKind.LIST: for list_item in node.find_child(NodeKind.LIST_ITEM): extract_translation_list_item( - wxr, word_entry, list_item, sense, sense_index + wxr, + 
word_entry, + list_item, + sense, + sense_index, + target_field, ) @@ -48,6 +61,7 @@ def extract_translation_list_item( list_item: WikiNode, sense: str, sense_index: int, + target_field: str, ) -> None: translations = [] lang_name = "unknown" @@ -101,10 +115,15 @@ def extract_translation_list_item( elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: for next_list_item in node.find_child(NodeKind.LIST_ITEM): extract_translation_list_item( - wxr, word_entry, next_list_item, sense, sense_index + wxr, + word_entry, + next_list_item, + sense, + sense_index, + target_field, ) - word_entry.translations.extend(translations) + getattr(word_entry, target_field).extend(translations) def extract_trad_template( @@ -239,4 +258,4 @@ def extract_translation_subpage( page = wxr.wtp.get_page(page_title, 0) if page is not None and page.body is not None: root = wxr.wtp.parse(page.body) - extract_translation_section(wxr, word_entry, root) + extract_translation_section(wxr, word_entry, root, "Tradução")