From 92f11b4cf083922aa9b92955a4dac9270ca88128 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 26 Dec 2024 16:28:36 +0800 Subject: [PATCH 1/2] =?UTF-8?q?[pt]=20translate=20more=20tags=20data=20in?= =?UTF-8?q?=20"Predefini=C3=A7=C3=A3o:escopo/n=C3=BAcleo"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/wiktextract/extractor/pt/models.py | 1 + src/wiktextract/extractor/pt/pos.py | 2 + src/wiktextract/extractor/pt/tags.py | 82 ++++++++++++++++++++- src/wiktextract/extractor/zh/translation.py | 2 +- tests/test_pt_gloss.py | 3 +- tests/test_pt_linkage.py | 5 +- tests/test_pt_sound.py | 7 +- 7 files changed, 91 insertions(+), 11 deletions(-) diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py index d665da1d..8fdc6750 100644 --- a/src/wiktextract/extractor/pt/models.py +++ b/src/wiktextract/extractor/pt/models.py @@ -84,6 +84,7 @@ class WordEntry(PortugueseBaseModel): categories: list[str] = [] tags: list[str] = [] raw_tags: list[str] = [] + topics: list[str] = [] translations: list[Translation] = [] expressions: list[Linkage] = [] antonyms: list[Linkage] = [] diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py index 1a4bd3b8..50350838 100644 --- a/src/wiktextract/extractor/pt/pos.py +++ b/src/wiktextract/extractor/pt/pos.py @@ -14,6 +14,7 @@ from .inflection import extract_flex_template from .models import Example, Linkage, Sense, WordEntry from .section_titles import POS_DATA +from .tags import translate_raw_tags def extract_pos_section( @@ -73,6 +74,7 @@ def extract_gloss_list_item( gloss_str = clean_node(wxr, sense, gloss_nodes) if len(gloss_str) > 0: sense.glosses.append(gloss_str) + translate_raw_tags(sense) word_entry.senses.append(sense) for child_list in list_item.find_child(NodeKind.LIST): diff --git a/src/wiktextract/extractor/pt/tags.py b/src/wiktextract/extractor/pt/tags.py index ab7987a8..a6b67310 100644 --- a/src/wiktextract/extractor/pt/tags.py +++ b/src/wiktextract/extractor/pt/tags.py @@ -118,14 +118,86 @@ "Grafia portuguesa": "Portugal", "Grafia brasileira": "Brazil", "histórico": "historical", + "antigo": "archaic", + "arcaico": "archaic", + "em desuso": "obsolete", + "obsoleto": "obsolete", + "pouco comum": "uncommon", + "raro": "rare", + "obsceno": "vulgar", + "coloquial": "colloquial", + "familiar": "familiar", + "informal": "informal", + # "popular": "", + "figurado": "figuratively", + "depreciativo": "derogatory", + "pejorativo": "pejorative", + "poético": "poetic", + "internetês": ["Internet", "slang"], + "ironia": "ironic", + # "alemanismo": "", + # "italianismo": "Italianism", + # "germanismo": "Germanism", + # "francesismo": "", + # "galicismo": "Gallicism", + "anglicismo": "Anglicism", + # "portuguesismo": "Portuguesism", + # "estrangeirismo": "loanword", + "regionalism": "regional", + "Angola": "Angola", + "Brasil": "Brazil", + # "Amazônia": "Amazonia", + "Nordeste do Brasil": "Northeast-Brazil", + "Norte do Brasil": "North-Brazil", + "Centro-Oeste do Brasil": "Central-West-Brazil", + "Sudeste do Brasil": "Southeast-Brazil", + "Sul do Brasil": "Southern-Brazil", + "Acre": "Acre", + "Alagoas": "Alagoas", + "Amapá": "Amapá", + "Amazonas": "Amazonas", + "Bahia": "Bahia", + "dialeto caipira": "dialectal", + "Ceará": "Ceará", + # "Distrito Federal": "Federal District", + "Espírito Santo": "Espírito Santo", + "Goiás": "Goias", + "Maranhão": "Maranhão", + "Mato Grosso": "Mato Grosso", + "Mato Grosso do Sul": "Mato Grosso do Sul", + "Minas Gerais": "Minas Gerais", + "Pará": "Pará", + "Paraíba": "Paraíba", + "Paraná": "Paraná", + "Pernambuco": "Pernambuco", + "Piauí": "Piauí", + "Rio de Janeiro": "Rio de Janeiro", + "Rio Grande do Norte": "Rio Grande do Norte", + "Rio Grande do Sul": "Rio Grande do Sul", + "Rondônia": "Rondônia", + "Roraima": "Roraima", + # "baralhete": "", + # "canteiros": "", + # "alvanéis": "", + # "telheiros": "", + # "músicos": "", + # "cesteiros": "", + "transitivo": "transitive", + "intransitivo": "intransitive", + "reflexivo": "reflexive", + "pronominal": "pronominal", + "plural": "plural", } TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS, **GLOSS_TAGS} # https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo TOPICS = { + "anatomia": "anatomy", + "arquitetura": "architecture", "botânica": "botany", "ciência da computação": "computing", + "comunicação": "communications", # "ciência dos materiais": "", "engenharia": "engineering", # "pedagogia": "pedagogy", @@ -184,14 +256,16 @@ def translate_raw_tags(data: WordEntry) -> None: raw_tags = [] for raw_tag in data.raw_tags: - if raw_tag in TAGS: - tr_tag = TAGS[raw_tag] + if raw_tag in TAGS or raw_tag.lower() in TAGS: + tr_tag = TAGS.get(raw_tag, TAGS.get(raw_tag.lower())) if isinstance(tr_tag, str): data.tags.append(tr_tag) elif isinstance(tr_tag, list): data.tags.extend(tr_tag) - elif raw_tag in TOPICS and hasattr(data, "topics"): - data.topics.append(TOPICS[raw_tag]) + elif (raw_tag in TOPICS or raw_tag.lower() in TOPICS) and hasattr( + data, "topics" + ): + data.topics.append(TOPICS.get(raw_tag, TOPICS.get(raw_tag.lower()))) else: raw_tags.append(raw_tag) data.raw_tags = raw_tags diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py index 710b16ca..fe456cac 100644 --- a/src/wiktextract/extractor/zh/translation.py +++ b/src/wiktextract/extractor/zh/translation.py @@ -162,7 +162,7 @@ def translation_subpage( wxr, None, template_node.template_parameters.get(2, wxr.wtp.title) ) if "#" in page_title: - page_title = page_title[:page_title.index("#")] + page_title = page_title[: page_title.index("#")] translation_subpage_title = page_title if page_title == wxr.wtp.title: diff --git a/tests/test_pt_gloss.py b/tests/test_pt_gloss.py index 85be2de8..50e2313c 100644 --- a/tests/test_pt_gloss.py +++ b/tests/test_pt_gloss.py @@ -66,7 +66,8 @@ def test_escopo(self): "Coloquialismo (Português)", ], "glosses": ['gênio do mal em geral ("capeta")'], - "raw_tags": ["Brasil", "popular"], + "raw_tags": ["popular"], + "tags": ["Brazil"], "examples": [{"text": "O cão em forma de gente."}], } ], diff --git a/tests/test_pt_linkage.py b/tests/test_pt_linkage.py index 5539ce34..f260b4cd 100644 --- a/tests/test_pt_linkage.py +++ b/tests/test_pt_linkage.py @@ -56,7 +56,7 @@ def test_expression(self): "senses": [ { "glosses": ["perceber a verdade"], - "raw_tags": ["intransitivo"], + "tags": ["intransitive"], "examples": [ { "text": "Entre amores, trapaças e muitas confusões, Katie terá que lutar para conquistar seu espaço e abrir os olhos para a realidade da cidade grande." @@ -141,7 +141,8 @@ def test_nested_list(self): "word": "cusco", "sense": "animal mamífero, carnívoro e quadrúpede", "sense_index": 1, - "raw_tags": ["Brasil", "RS"], + "tags": ["Brazil"], + "raw_tags": ["RS"], }, ], ) diff --git a/tests/test_pt_sound.py b/tests/test_pt_sound.py index a384690f..2078c8df 100644 --- a/tests/test_pt_sound.py +++ b/tests/test_pt_sound.py @@ -48,12 +48,13 @@ def test_subsection(self): [ { "ipa": "/ˈɔ.ʎʊ/", - "raw_tags": ["Brasil", "Forma verbal"], + "tags": ["Brazil"], + "raw_tags": ["Forma verbal"], }, { "ipa": '/"O.LU/', - "raw_tags": ["Brasil", "Forma verbal"], - "tags": ["X-SAMPA"], + "raw_tags": ["Forma verbal"], + "tags": ["X-SAMPA", "Brazil"], }, ], ) From 4c29a27c801f9c024ea482444e624779ba839792 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 26 Dec 2024 17:57:52 +0800 Subject: [PATCH 2/2] [pt] extract "conj.pt" table template --- src/wiktextract/extractor/pt/inflection.py | 143 ++++++++++++++++++++- src/wiktextract/extractor/pt/page.py | 5 + src/wiktextract/extractor/pt/tags.py | 20 +++ tests/test_pt_form.py | 81 ++++++++++++ 4 files changed, 248 insertions(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/pt/inflection.py b/src/wiktextract/extractor/pt/inflection.py index f33ae8d1..7bd72ace 100644 --- a/src/wiktextract/extractor/pt/inflection.py +++ b/src/wiktextract/extractor/pt/inflection.py @@ -1,7 +1,7 @@ import re from dataclasses import dataclass -from wikitextprocessor import NodeKind, TemplateNode +from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode from ...page import clean_node from ...wxr_context import WiktextractContext @@ -73,3 +73,144 @@ def extract_flex_template( word_entry.forms.append(form_data) col_cell_index += col_span + + +def extract_conjugation_section( + wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode +) -> None: + for t_node in level_node.find_child(NodeKind.TEMPLATE): + if t_node.template_name.startswith(("conj.pt", "conj/pt")): + extract_conj_pt_template(wxr, word_entry, t_node) + + +def extract_conj_pt_template( + wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode +) -> None: + # https://pt.wiktionary.org/wiki/Predefinição:conj.pt + # https://pt.wiktionary.org/wiki/Predefinição:conj/pt + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(t_node), expand_all=True + ) + for index, table_node in enumerate( + expanded_node.find_child_recursively(NodeKind.TABLE) + ): + match index: + case 0: + extract_conj_pt_template_first_table( + wxr, word_entry, table_node + ) + case 1: + extract_conj_pt_template_second_table( + wxr, word_entry, table_node + ) + + +def extract_conj_pt_template_first_table( + wxr: WiktextractContext, word_entry: WordEntry, table_node: WikiNode +) -> None: + for row in table_node.find_child(NodeKind.TABLE_ROW): + row_header = "" + for cell in row.find_child( + NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL + ): + match cell.kind: + case NodeKind.TABLE_HEADER_CELL: + row_header = clean_node(wxr, None, cell) + case NodeKind.TABLE_CELL: + form_str = clean_node(wxr, None, cell) + if form_str not in ["", wxr.wtp.title]: + form = Form(form=form_str) + if row_header != "": + form.raw_tags.append(row_header) + translate_raw_tags(form) + word_entry.forms.append(form) + + +def extract_conj_pt_template_second_table( + wxr: WiktextractContext, word_entry: WordEntry, table_node: WikiNode +) -> None: + col_headers = [] + row_headers = [] + row_index = 0 + for row in table_node.find_child(NodeKind.TABLE_ROW): + col_index = 0 + for cell in row.find_child( + NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL + ): + match cell.kind: + case NodeKind.TABLE_HEADER_CELL: + colspan = 1 + colspan_str = cell.attrs.get("colspan", "1") + if re.fullmatch(r"\d+", colspan_str): + colspan = int(colspan_str) + rowspan = 1 + rowspan_str = cell.attrs.get("rowspan", "1") + if re.fullmatch(r"\d+", rowspan_str): + rowspan = int(rowspan_str) + header_str = clean_node(wxr, None, cell) + if header_str == "": + continue + if rowspan > 1: + row_index = 0 + row_headers.clear() + header = TableHeader( + header_str, col_index, colspan, row_index, rowspan + ) + if not row.contain_node(NodeKind.TABLE_CELL): + col_headers.append(header) + col_index += colspan + else: + row_headers.append(header) + case NodeKind.TABLE_CELL: + has_link = False + for link_node in cell.find_child(NodeKind.LINK): + link_str = clean_node(wxr, None, link_node) + if link_str not in ["", wxr.wtp.title]: + add_conj_pt_form( + word_entry, + link_str, + col_index, + row_index, + col_headers, + row_headers, + ) + has_link = True + if not has_link: + cell_str = clean_node(wxr, None, cell) + if cell_str not in ["", wxr.wtp.title]: + add_conj_pt_form( + word_entry, + cell_str, + col_index, + row_index, + col_headers, + row_headers, + ) + col_index += 1 + + row_index += 1 + + +def add_conj_pt_form( + word_entry: WordEntry, + form_str: str, + col_index: int, + row_index: int, + col_headers: list[TableHeader], + row_headers: list[TableHeader], +) -> None: + form = Form(form=form_str) + for col_header in col_headers: + if ( + col_index >= col_header.col_index + and col_index < col_header.col_index + col_header.colspan + ): + form.raw_tags.append(col_header.text) + for row_header in row_headers: + if ( + row_index >= row_header.row_index + and row_index < row_header.row_index + row_header.rowspan + ): + form.raw_tags.append(row_header.text) + translate_raw_tags(form) + word_entry.forms.append(form) diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py index 4fa16d4e..82d8f31d 100644 --- a/src/wiktextract/extractor/pt/page.py +++ b/src/wiktextract/extractor/pt/page.py @@ -9,6 +9,7 @@ from ...page import clean_node from ...wxr_context import WiktextractContext from .etymology import extract_etymology_section +from .inflection import extract_conjugation_section from .linkage import ( extract_expression_section, extract_linkage_section, @@ -69,6 +70,10 @@ def parse_section( ) elif title_text.startswith("Nota"): extract_note_section(wxr, page_data, level_node) + elif title_text == "Conjugação": + extract_conjugation_section( + wxr, page_data[-1] if len(page_data) else base_data, level_node + ) elif title_text.lower() not in [ "ver também", "ligação externa", diff --git a/src/wiktextract/extractor/pt/tags.py b/src/wiktextract/extractor/pt/tags.py index a6b67310..fae32ff6 100644 --- a/src/wiktextract/extractor/pt/tags.py +++ b/src/wiktextract/extractor/pt/tags.py @@ -111,6 +111,26 @@ "Normal": "standard", "Aumentativo": "augmentative", "Diminutivo": "diminutive", + # Predefinição:conj.pt + "Infinitivo impessoal": ["impersonal", "infinitive"], + "Gerúndio": "gerund", + "Particípio": "participle", + "primeira": "first-person", + "segunda": "second-person", + "terceira": "third-person", + "Modo\nIndicativo": "indicative", + "Presente": "present", + "Pretérito imperfeito": ["past", "continuative"], + "Pretérito perfeito": "past", + "Pretérito mais-que-perfeito": "pluperfect", + "Futuro do presente": ["future", "present"], + "Futuro do pretérito": ["future", "past"], + "Modo\nSubjuntivo\n(Conjuntivo)": ["subjunctive", "conjunctive"], + "Futuro": "future", + "Modo\nImperativo": "imperative", + "Afirmativo": "affirmative", + "Negativo": "negative", + "Infinitivo pessoal": ["personal", "infinitive"], } # https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo diff --git a/tests/test_pt_form.py b/tests/test_pt_form.py index aa3635bb..97d07517 100644 --- a/tests/test_pt_form.py +++ b/tests/test_pt_form.py @@ -138,3 +138,84 @@ def test_slash_cell(self): {"form": "párvoas", "tags": ["standard", "feminine", "plural"]}, ], ) + + def test_conj_pt(self): + self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") + self.wxr.wtp.add_page( + "Predefinição:conj/pt", + 10, + """
+
    Verbo regular da 1.ª conjugação (-ar)    
+
""", + ) + data = parse_page( + self.wxr, + "ababalhar", + """={{-pt-}}= +==Verbo== +# {{escopo|pt|Popular}} [[babar]]; [[conspurcar]] +===Conjugação=== +{{conj/pt|ababalh|ar}}""", + ) + self.assertEqual( + data[0]["forms"], + [ + {"form": "ababalhando", "tags": ["gerund"]}, + {"form": "ababalhado", "tags": ["participle"]}, + { + "form": "ababalhei", + "tags": ["singular", "first-person", "indicative", "past"], + }, + { + "form": "ababalhaste", + "tags": ["singular", "second-person", "indicative", "past"], + }, + { + "form": "ababalhou", + "tags": ["singular", "third-person", "indicative", "past"], + }, + { + "form": "ababalhamos", + "tags": ["plural", "first-person", "indicative", "past"], + }, + { + "form": "ababalhámos", + "tags": ["plural", "first-person", "indicative", "past"], + }, + ], + )