From 8302770048aa9db84f9fccca4085bdbc1ded3c77 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 25 Dec 2024 09:05:59 +0800 Subject: [PATCH 1/5] [pt] handle nested example lists --- src/wiktextract/extractor/pt/pos.py | 9 ++++++++- tests/test_pt_example.py | 24 ++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py index 290baf91..01f04818 100644 --- a/src/wiktextract/extractor/pt/pos.py +++ b/src/wiktextract/extractor/pt/pos.py @@ -118,7 +118,8 @@ def extract_example_list_item( ) -> None: example = Example() ref_nodes = [] - for node in list_item.children: + + for index, node in enumerate(list_item.children): if ( isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC @@ -147,6 +148,12 @@ def extract_example_list_item( example.text = clean_node( wxr, sense, node.template_parameters.get(1, "") ) + elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: + ref_nodes.clear() + example.ref = clean_node(wxr, None, list_item.children[:index]) + for child_list_item in node.find_child(NodeKind.LIST_ITEM): + example.text = clean_node(wxr, None, child_list_item.children) + break else: ref_nodes.append(node) diff --git a/tests/test_pt_example.py b/tests/test_pt_example.py index 1def5fae..3472ab8f 100644 --- a/tests/test_pt_example.py +++ b/tests/test_pt_example.py @@ -127,3 +127,27 @@ def test_double_italic_nodes(self): ], }, ) + + def test_nested_example_list(self): + self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") + data = parse_page( + self.wxr, + "amor", + """={{-pt-}}= +==Substantivo== +# [[sentimento]] +#* '''1595''', [[w:Luís de Camões|Luís de Camões]], ''Rimas'': +#*: "'''''Amor''' é fogo que arde sem se ver
é ferida que dói, e não se sente,
é um contentamento descontente,
é dor que desatina sem doer.''\"""", + ) + self.assertEqual( + data[0]["senses"][0], + { + "glosses": ["sentimento"], + "examples": [ + { + "text": '"Amor é fogo que arde sem se ver\né ferida que dói, e não se sente,\né um contentamento descontente,\né dor que desatina sem doer."', + "ref": "1595, Luís de Camões, Rimas:", + } + ], + }, + ) From 8828c6d66fed995ba38a4eaafa070064fbafa961 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 25 Dec 2024 10:10:01 +0800 Subject: [PATCH 2/5] [pt] handle example text above source child list layout --- src/wiktextract/extractor/pt/pos.py | 23 ++++++++++++++++++++--- tests/test_pt_example.py | 26 +++++++++++++++++++++++++- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py index 01f04818..44283a61 100644 --- a/src/wiktextract/extractor/pt/pos.py +++ b/src/wiktextract/extractor/pt/pos.py @@ -1,3 +1,5 @@ +import re + from wikitextprocessor import ( HTMLNode, LevelNode, @@ -148,12 +150,27 @@ def extract_example_list_item( example.text = clean_node( wxr, sense, node.template_parameters.get(1, "") ) + elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD: + bold_str = clean_node(wxr, None, node) + if re.fullmatch(r"\d+", bold_str) is not None: + list_item_str = clean_node( + wxr, None, list(list_item.invert_find_child(NodeKind.LIST)) + ) + if list_item_str.endswith(":"): + ref_nodes.clear() + example.ref = list_item_str + for child_list in list_item.find_child(NodeKind.LIST): + for child_list_item in child_list.find_child( + NodeKind.LIST_ITEM + ): + example.text = clean_node( + wxr, None, child_list_item.children + ) + break elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: ref_nodes.clear() - example.ref = clean_node(wxr, None, list_item.children[:index]) for child_list_item in node.find_child(NodeKind.LIST_ITEM): - example.text = clean_node(wxr, None, child_list_item.children) - break + ref_nodes.append(child_list_item.children) else: ref_nodes.append(node) diff --git a/tests/test_pt_example.py b/tests/test_pt_example.py index 3472ab8f..bedd59e9 100644 --- a/tests/test_pt_example.py +++ b/tests/test_pt_example.py @@ -128,7 +128,7 @@ def test_double_italic_nodes(self): }, ) - def test_nested_example_list(self): + def test_source_above_text_child_list(self): self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") data = parse_page( self.wxr, @@ -151,3 +151,27 @@ def test_nested_example_list(self): ], }, ) + + def test_text_above_ref_child_list(self): + self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") + data = parse_page( + self.wxr, + "mar", + """={{-pt-}}= +==Substantivo== +# grande quantidade de +#:"''Ó '''mar''' salgado, quanto do teu sal
São lágrimas de Portugal!
Por te cruzarmos, quantas mães choraram,
Quantos filhos em vão rezaram!
Quantas noivas ficaram por casar
Para que fosses nosso, ó '''mar'''!''" +#:: ''-Mensagem, de Fernando Pessoa''""", + ) + self.assertEqual( + data[0]["senses"][0], + { + "glosses": ["grande quantidade de"], + "examples": [ + { + "text": "Ó mar salgado, quanto do teu sal\nSão lágrimas de Portugal!\nPor te cruzarmos, quantas mães choraram,\nQuantos filhos em vão rezaram!\nQuantas noivas ficaram por casar\nPara que fosses nosso, ó mar!", + "ref": "-Mensagem, de Fernando Pessoa", + } + ], + }, + ) From 5ce0c6a0445b4bf89becb9c0a50c5b2eb3429dd3 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 25 Dec 2024 11:07:12 +0800 Subject: [PATCH 3/5] [pt] handle forms separated by slash in "flex.*" templates --- src/wiktextract/extractor/pt/inflection.py | 32 ++++++----- tests/test_pt_form.py | 65 ++++++++++++++++++++++ 2 files changed, 82 insertions(+), 15 deletions(-) diff --git a/src/wiktextract/extractor/pt/inflection.py b/src/wiktextract/extractor/pt/inflection.py index 17d56d2f..f33ae8d1 100644 --- a/src/wiktextract/extractor/pt/inflection.py +++ b/src/wiktextract/extractor/pt/inflection.py @@ -54,20 +54,22 @@ def extract_flex_template( elif cell_node.attrs.get("style") == "background:#f4f4f4;": row_header = cell_text col_header_index += col_span - elif cell_text in ["–", wxr.wtp.title]: - col_cell_index += col_span - continue else: - form = Form(form=cell_text) - if row_header != "": - form.raw_tags.append(row_header) - for col_header in col_headers: - if ( - col_cell_index >= col_header.col_index - and col_cell_index - < col_header.col_index + col_header.colspan - ): - form.raw_tags.append(col_header.text) - translate_raw_tags(form) - word_entry.forms.append(form) + for link_node in cell_node.find_child(NodeKind.LINK): + form_str = clean_node(wxr, None, link_node) + if form_str in ["", "–", "-", wxr.wtp.title]: + continue + form_data = Form(form=form_str) + if row_header != "": + form_data.raw_tags.append(row_header) + for col_header in col_headers: + if ( + col_cell_index >= col_header.col_index + and col_cell_index + < col_header.col_index + col_header.colspan + ): + form_data.raw_tags.append(col_header.text) + translate_raw_tags(form_data) + word_entry.forms.append(form_data) + col_cell_index += col_span diff --git a/tests/test_pt_form.py b/tests/test_pt_form.py index e6716386..aa3635bb 100644 --- a/tests/test_pt_form.py +++ b/tests/test_pt_form.py @@ -73,3 +73,68 @@ def test_flex_pt_subst_completa(self): {"form": "matilha", "tags": ["standard", "collective"]}, ], ) + + def test_slash_cell(self): + self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") + self.wxr.wtp.add_page( + "Predefinição:flex.pt.subst.completa", + 10, + """{| +|- +! style="background:#f4f4f4;" rowspan="2" | +! style="background:#ffffe0;" colspan="2" | [[masculino|Masculino]] +! style="background:#ffffe0;" colspan="2" | [[feminino|Feminino]] +! style="background:#ffffe0;" rowspan="2" | [[coletivo|Coletivo]] +|- +! style="background:#ffffe0;" | [[singular|Singular]] +! style="background:#ffffe0;" | [[plural|Plural]] +! style="background:#ffffe0;" | [[singular|Singular]] +! style="background:#ffffe0;" | [[plural|Plural]] +|- +! style="background:#f4f4f4;" | [[normal|Normal]] +| style="background:#ffffff; text-align:center;" | [[parvo]] +| style="background:#ffffff; text-align:center;" | [[parvos#Português|parvos]] +| style="background:#ffffff; text-align:center;" | [[parva#Português|parva]] / [[párvoa#Português|párvoa]] +| style="background:#ffffff; text-align:center;" | [[parvas#Português|parvas]] / [[párvoas#Português|párvoas]] +| style="background:#ffffff; text-align:center;" rowspan="3" | [[-#Português|-]] +|}""", + ) + data = parse_page( + self.wxr, + "parvo", + """={{-pt-}}= +==Substantivo== +{{flex.pt.subst.completa +|alinhamento=left +|ms=parvo +|msa=parvalhão|msa2=parvoalho|msa3=parvoeirão +|msd=parvinho +|mp=parvos +|mpa=parvalhões|mpa2=parvoalhos|mpa3=parvoeirões +|mpd=parvinhos +|fs=parva|fs2=párvoa +|fsa=parvalhona|fsa2=parvoalha|fsa3=parvoeirona +|fsd=parvinha +|fp=parvas|fp2=párvoas +|fpa=parvalhonas|fpa2=parvoalhas|fpa3=pavoeironas +|fpd=parvinhas +|col=- +}} +# [[pessoa]]""", + ) + self.assertEqual( + data[0]["forms"], + [ + {"form": "parvos", "tags": ["standard", "masculine", "plural"]}, + { + "form": "parva", + "tags": ["standard", "feminine", "singular"], + }, + { + "form": "párvoa", + "tags": ["standard", "feminine", "singular"], + }, + {"form": "parvas", "tags": ["standard", "feminine", "plural"]}, + {"form": "párvoas", "tags": ["standard", "feminine", "plural"]}, + ], + ) From 7a5173dcfae09601e28dc456df214f9513ac2420 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 25 Dec 2024 11:42:35 +0800 Subject: [PATCH 4/5] [pt] relax gloss child list check condition --- src/wiktextract/extractor/pt/pos.py | 2 +- tests/test_pt_linkage.py | 31 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py index 44283a61..9d56617e 100644 --- a/src/wiktextract/extractor/pt/pos.py +++ b/src/wiktextract/extractor/pt/pos.py @@ -76,7 +76,7 @@ def extract_gloss_list_item( word_entry.senses.append(sense) for child_list in list_item.find_child(NodeKind.LIST): - if child_list.sarg.startswith("#") and child_list.sarg.endswith("#"): + if child_list.sarg.endswith("#"): for child_list_item in child_list.find_child(NodeKind.LIST_ITEM): extract_gloss_list_item( wxr, word_entry, child_list_item, sense.glosses diff --git a/tests/test_pt_linkage.py b/tests/test_pt_linkage.py index 822157ec..5539ce34 100644 --- a/tests/test_pt_linkage.py +++ b/tests/test_pt_linkage.py @@ -221,3 +221,34 @@ def test_phraseology_nested_list(self): }, ], ) + + def test_expression_gloss_child_list(self): + self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") + data = parse_page( + self.wxr, + "testa", + """={{-pt-}}= +==Substantivo== +# [[parte]] + +===Expressões=== +* '''[[testa de boi]]''': (Portugal, Douro) +*# indivíduo com a testa avantajada;""", + ) + self.assertEqual( + data[0]["expressions"], + [ + { + "word": "testa de boi", + "senses": [ + {"glosses": ["(Portugal, Douro)"]}, + { + "glosses": [ + "(Portugal, Douro)", + "indivíduo com a testa avantajada;", + ] + }, + ], + } + ], + ) From f6c106f307b8a02073be80f2b67e9bab9a038205 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 25 Dec 2024 16:47:38 +0800 Subject: [PATCH 5/5] [pt] translate some tags and topics in template "escopo" --- src/wiktextract/extractor/pt/pos.py | 11 ++--- src/wiktextract/extractor/pt/tags.py | 69 +++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 8 deletions(-) diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py index 9d56617e..1a4bd3b8 100644 --- a/src/wiktextract/extractor/pt/pos.py +++ b/src/wiktextract/extractor/pt/pos.py @@ -89,13 +89,10 @@ def extract_escopo_template( t_node: TemplateNode, ) -> None: # https://pt.wiktionary.org/wiki/Predefinição:escopo - for arg in range(2, 9): - if arg not in t_node.template_parameters: - break - raw_tag = clean_node(wxr, None, t_node.template_parameters[arg]) - if raw_tag != "": - sense.raw_tags.append(raw_tag) - clean_node(wxr, sense, t_node) + expanded_str = clean_node(wxr, sense, t_node).strip("()") + for raw_tag in re.split(r", | e ", expanded_str): + if raw_tag.strip() != "": + sense.raw_tags.append(raw_tag.strip()) def extract_escopo2_template( diff --git a/src/wiktextract/extractor/pt/tags.py b/src/wiktextract/extractor/pt/tags.py index e48ae907..ab7987a8 100644 --- a/src/wiktextract/extractor/pt/tags.py +++ b/src/wiktextract/extractor/pt/tags.py @@ -113,7 +113,72 @@ "Diminutivo": "diminutive", } -TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS} +# https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo +GLOSS_TAGS = { + "Grafia portuguesa": "Portugal", + "Grafia brasileira": "Brazil", + "histórico": "historical", +} + +TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS, **GLOSS_TAGS} + +# https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo +TOPICS = { + "botânica": "botany", + "ciência da computação": "computing", + # "ciência dos materiais": "", + "engenharia": "engineering", + # "pedagogia": "pedagogy", + # "cronônimo": "chrononym", + "basquetebol": "basketball", + "beisebol": "baseball", + "críquete": "cricket", + "desporto": "sports", + "esporte": "sports", + "ténis": "tennis", + "tênis": "tennis", + "esgrima": "fencing", + "geografia": "geography", + # "toponímia": "", + # "territory": "", + "zoologia": "zoology", + "ornitologia": "ornithology", + # "artrópodes": "", + "entomologia": "entomology", + "ictiologia": "ichthyology", + "veterinária": "veterinary", + # "antropónimo": "", + "alimentação": "food", + "arte": "arts", + "aeronáutica": "aeronautics", + "aritmética": "arithmetic", + "Meteorologia": "meteorology", + "design": "design", + "patologia": "pathology", + "etnologia": "ethnology", + "farmacologia": "pharmacology", + "transporte": "transport", + "Ginecologia": "gynecology", + "linguística": "linguistics", + "indústria têxtil": "textiles", + "mídia": "media", + "ciência da informação": "information-science", + "ludologia": "ludology", + "náutica": "nautical", + "mitologia": "mythology", + "mineralogia": "mineralogy", + "mobiliário": "furniture", + "numismática": "numismatics", + # "Esoterismo": "", + "profissão": "profession", + # "parapsiquismo": "", + "vestuário": "clothing", + "direito": "law", + "química": "chemistry", + "videojogo": "video-games", + "vídeo game": "video-games", + "viticultura": "viticulture", +} def translate_raw_tags(data: WordEntry) -> None: @@ -125,6 +190,8 @@ def translate_raw_tags(data: WordEntry) -> None: data.tags.append(tr_tag) elif isinstance(tr_tag, list): data.tags.extend(tr_tag) + elif raw_tag in TOPICS and hasattr(data, "topics"): + data.topics.append(TOPICS[raw_tag]) else: raw_tags.append(raw_tag) data.raw_tags = raw_tags