diff --git a/src/wiktextract/extractor/pt/linkage.py b/src/wiktextract/extractor/pt/linkage.py
index 1e61c712..11eb49bf 100644
--- a/src/wiktextract/extractor/pt/linkage.py
+++ b/src/wiktextract/extractor/pt/linkage.py
@@ -97,10 +97,13 @@ def extract_fraseini_template(
     sense = ""
     sense_index = 0
     first_arg = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
-    m = re.search(r"(\d+)$", first_arg)
+    m = re.search(r"\((\d+)\)$", first_arg)
     if m is not None:
         sense_index = int(m.group(1))
         sense = first_arg[: m.start()].strip()
+    elif (m := re.match(r"De (\d+)", first_arg)) is not None:
+        sense_index = int(m.group(1))
+        sense = first_arg[m.end() :].strip("() \n")
     else:
         sense = first_arg
     return sense, sense_index
@@ -230,3 +233,61 @@ def extract_wikisaurus_page(
         page_title,
         tags,
     )
+
+
+def extract_phraseology_section(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    level_node: LevelNode,
+) -> None:
+    sense = ""
+    sense_index = 0
+    for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
+        if isinstance(node, TemplateNode) and node.template_name == "fraseini":
+            sense, sense_index = extract_fraseini_template(wxr, node)
+        elif node.kind == NodeKind.LIST:
+            for list_item in node.find_child(NodeKind.LIST_ITEM):
+                extract_phraseology_list_item(
+                    wxr, word_entry, list_item, sense, sense_index
+                )
+
+
+def extract_phraseology_list_item(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    list_item: WikiNode,
+    sense: str,
+    sense_index: int,
+) -> None:
+    l_data = Linkage(word="", sense=sense, sense_index=sense_index)
+    for index, node in enumerate(list_item.children):
+        if (
+            isinstance(node, WikiNode)
+            and node.kind in NodeKind.BOLD | NodeKind.LINK
+            and l_data.word == ""
+        ):
+            l_data.word = clean_node(wxr, None, node)
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
+            l_data.roman = clean_node(wxr, None, node)
+        elif isinstance(node, str) and ("=" in node or ":" in node):
+            sense_start = node.index("=" if "=" in node else ":") + 1
+            l_data.sense = clean_node(
+                wxr,
+                None,
+                [node[sense_start:]]
+                + [
+                    n
+                    for n in list_item.children[index + 1 :]
+                    if not (isinstance(n, WikiNode) and n.kind == NodeKind.LIST)
+                ],
+            )
+            break
+
+    if l_data.word != "":
+        word_entry.phraseology.append(l_data)
+
+    for child_list in list_item.find_child(NodeKind.LIST):
+        for next_list_item in child_list.find_child(NodeKind.LIST_ITEM):
+            extract_phraseology_list_item(
+                wxr, word_entry, next_list_item, sense, sense_index
+            )
diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py
index 9b97caea..d665da1d 100644
--- a/src/wiktextract/extractor/pt/models.py
+++ b/src/wiktextract/extractor/pt/models.py
@@ -51,6 +51,7 @@ class Linkage(PortugueseBaseModel):
         default=0, ge=0, description="Number of the definition, start from 1"
     )
     source: str = ""
+    roman: str = ""
 
 
 class Sound(PortugueseBaseModel):
@@ -92,6 +93,11 @@ class WordEntry(PortugueseBaseModel):
     hypernyms: list[Linkage] = []
     related: list[Linkage] = []
     hyponyms: list[Linkage] = []
+    homophones: list[Linkage] = []
+    homonyms: list[Linkage] = []
+    paronyms: list[Linkage] = []
+    phraseology: list[Linkage] = []
     etymology_texts: list[str] = []
     sounds: list[Sound] = []
     forms: list[Form] = []
+    notes: list[str] = []
diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py
index afe0f8c4..4fa16d4e 100644
--- a/src/wiktextract/extractor/pt/page.py
+++ b/src/wiktextract/extractor/pt/page.py
@@ -9,7 +9,11 @@
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
 from .etymology import extract_etymology_section
-from .linkage import extract_expression_section, extract_linkage_section
+from .linkage import (
+    extract_expression_section,
+    extract_linkage_section,
+    extract_phraseology_section,
+)
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
 from .pronunciation import extract_pronunciation_section
@@ -25,7 +29,7 @@ def parse_section(
 ) -> None:
     cats = {}
     title_text = clean_node(wxr, cats, level_node.largs).strip(
-        "⁰¹²³⁴⁵⁶⁷⁸⁹0123456789"
+        "⁰¹²³⁴⁵⁶⁷⁸⁹0123456789:"
     )
     if title_text.lower() in POS_DATA:
         extract_pos_section(
@@ -59,11 +63,17 @@ def parse_section(
         extract_etymology_section(wxr, page_data, level_node)
     elif title_text == "Pronúncia":
        extract_pronunciation_section(wxr, page_data, level_node)
-    elif title_text in ["Nota", "Notas", "Nota de uso"]:
-        pass
+    elif title_text == "Fraseologia":
+        extract_phraseology_section(
+            wxr, page_data[-1] if len(page_data) else base_data, level_node
+        )
+    elif title_text.startswith("Nota"):
+        extract_note_section(wxr, page_data, level_node)
     elif title_text.lower() not in [
         "ver também",
+        "ligação externa",
         "ligações externas",
+        "ligação extena",
         "referências",
         "referência",
         "no wikcionário",
@@ -73,7 +83,9 @@ def parse_section(
         "no wikisaurus",
         "no commons",
         "no wikimedia commons",
+        "na internet",
         "galeria",
+        "galeria de imagens",
     ]:
         wxr.wtp.debug(f"unknown section: {title_text}")
 
@@ -86,7 +98,7 @@ def parse_section(
             clean_node(wxr, cats, link_node)
     save_section_cats(cats.get("categories", []), page_data, level_node, False)
 
-    if title_text != "Pronúncia":
+    if title_text.lower() not in ["pronúncia", "ver também"]:
         for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
             parse_section(wxr, page_data, base_data, next_level)
 
@@ -147,3 +159,20 @@ def parse_page(
         if len(data.senses) == 0:
             data.senses.append(Sense(tags=["no-gloss"]))
     return [m.model_dump(exclude_defaults=True) for m in page_data]
+
+
+def extract_note_section(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    level_node: LevelNode,
+) -> None:
+    notes = []
+    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
+        note = clean_node(
+            wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
+        )
+        if note != "":
+            notes.append(note)
+    for data in page_data:
+        if data.lang_code == page_data[-1].lang_code:
+            data.notes.extend(notes)
diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py
index f7cff382..290baf91 100644
--- a/src/wiktextract/extractor/pt/pos.py
+++ b/src/wiktextract/extractor/pt/pos.py
@@ -49,11 +49,11 @@ def extract_gloss_list_item(
     wxr: WiktextractContext,
     word_entry: WordEntry | Linkage,
     list_item: WikiNode,
+    parent_gloss: list[str] = [],
 ) -> None:
     gloss_nodes = []
-    sense = Sense()
-    first_gloss_index = len(list_item.children)
-    for index, node in enumerate(list_item.children):
+    sense = Sense(glosses=parent_gloss)
+    for node in list_item.children:
         if isinstance(node, TemplateNode):
             if node.template_name == "escopo":
                 extract_escopo_template(wxr, sense, node)
@@ -65,8 +65,6 @@ def extract_gloss_list_item(
             if node.sarg.endswith(("*", ":")):
                 for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                     extract_example_list_item(wxr, sense, next_list_item)
-            if index < first_gloss_index:
-                first_gloss_index = index
         else:
             gloss_nodes.append(node)
 
@@ -75,6 +73,13 @@
         sense.glosses.append(gloss_str)
         word_entry.senses.append(sense)
 
+    for child_list in list_item.find_child(NodeKind.LIST):
+        if child_list.sarg.startswith("#") and child_list.sarg.endswith("#"):
+            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
+                extract_gloss_list_item(
+                    wxr, word_entry, child_list_item, sense.glosses
+                )
+
 
 def extract_escopo_template(
     wxr: WiktextractContext,
diff --git a/src/wiktextract/extractor/pt/section_titles.py b/src/wiktextract/extractor/pt/section_titles.py
index b3763991..cd61dacc 100644
--- a/src/wiktextract/extractor/pt/section_titles.py
+++ b/src/wiktextract/extractor/pt/section_titles.py
@@ -10,7 +10,7 @@
     "posposição": {"pos": "postp"},
     "pronome": {"pos": "pron"},
     "substantivo": {"pos": "noun"},
-    "berbo": {"pos": "verb"},
+    "verbo": {"pos": "verb"},
     "forma de substantivo": {"pos": "noun", "tags": ["form-of"]},
     "forma verbal": {"pos": "verb", "tags": ["form-of"]},
     "locução substantiva": {"pos": "phrase", "tags": ["substantive"]},
@@ -19,6 +19,7 @@
     "locução prepositiva": {"pos": "phrase", "tags": ["prepositional"]},
     "expressão": {"pos": "phrase"},
     "abreviatura": {"pos": "abbrev", "tags": ["abbreviation"]},
+    "abreviação": {"pos": "abbrev", "tags": ["abbreviation"]},
     "contração": {"pos": "contraction", "tags": ["contraction"]},
     "prefixo": {"pos": "prefix", "tags": ["morpheme"]},
     "sufixo": {"pos": "suffix", "tags": ["morpheme"]},
@@ -39,6 +40,12 @@
     },
     "forma de pronome": {"pos": "pron", "tags": ["form-of"]},
     "advérbio numeral": {"pos": "adv", "tags": ["numeral"]},
+    "verbo preposicionado": {"pos": "verb", "tags": ["prepositional"]},
+    "caractere han": {"pos": "character", "tags": ["han"]},
+    "hanja": {"pos": "character", "tags": ["Hanja"]},
+    "kanji": {"pos": "character", "tags": ["kanji"]},
+    "pronome pessoal": {"pos": "pron", "tags": ["person"]},
+    "pronome possessivo": {"pos": "det", "tags": ["possessive"]},
 }
 
 
@@ -46,12 +53,14 @@
     "antônimos": "antonyms",
     "antônimo": "antonyms",
     "antónimo": "antonyms",
+    "antónimos": "antonyms",
     "antónimos/antônimos": "antonyms",
     "sinônimos": "synonyms",
     "sinônimo": "synonyms",
     "sinónimos/sinônimos": "synonyms",
     "sinónimos": "synonyms",
     "sinónimo": "synonyms",
+    "sinônimos e variantes": "synonyms",
     "verbetes derivados": "derived",
     "verbete derivado": "derived",
     "formas alternativas": "synonyms",
@@ -61,6 +70,7 @@
     "hiperônimos": "hypernyms",
     "hiperónimos": "hypernyms",
     "termos derivados": "derived",
+    "termos derivadoss": "derived",
     "grafia antiga": "synonyms",
     "diminutivo": "synonyms",
     "diminutivos": "synonyms",
@@ -70,11 +80,25 @@
     "entradas relacionadas": "related",
     "hipônimos": "hyponyms",
     "hiponímias": "hyponyms",
+    "hipónimos": "hyponyms",
     "ortografias obsoletas": "synonyms",
     "superlativo": "synonyms",
     "outros verbetes": "related",
     "cardinal equivalente": "synonyms",
+    "cardinais equivalentes": "synonyms",
     "aumentativo": "synonyms",
+    "advérbios derivados": "derived",
+    "derivações": "derived",
+    "homófonos": "homophones",
+    "homófono": "homophones",
+    "homónimos/homônimos": "homonyms",
+    "homônimos": "homonyms",
+    "parônimos": "paronyms",
+    "caracteres derivados": "derived",
+    "caracteres relacionados": "related",
+    "palavras com o kanji": "related",
+    "compostos": "derived",
+    "vermos derivados": "derived",
 }
 
 LINKAGE_TAGS = {
@@ -84,4 +108,5 @@
     "ortografias obsoletas": ["obsolete"],
     "superlativo": ["superlative"],
     "aumentativo": ["augmentative"],
+    "advérbios derivados": ["adverb"],
 }
diff --git a/tests/test_pt_gloss.py b/tests/test_pt_gloss.py
index 8e5d0e4a..85be2de8 100644
--- a/tests/test_pt_gloss.py
+++ b/tests/test_pt_gloss.py
@@ -74,3 +74,21 @@ def test_escopo(self):
                 }
             ],
         )
+
+    def test_nested_list(self):
+        self.wxr.wtp.add_page("Predefinição:-en-", 10, "Inglês")
+        data = parse_page(
+            self.wxr,
+            "average",
+            """={{-en-}}=
+==Adjetivo==
+# [[médio]]
+## [[relativo à]] [[média]];''""",
+        )
+        self.assertEqual(
+            data[0]["senses"],
+            [
+                {"glosses": ["médio"]},
+                {"glosses": ["médio", "relativo à média;"]},
+            ],
+        )
diff --git a/tests/test_pt_linkage.py b/tests/test_pt_linkage.py
index a4cc1a7b..822157ec 100644
--- a/tests/test_pt_linkage.py
+++ b/tests/test_pt_linkage.py
@@ -145,3 +145,79 @@ def test_nested_list(self):
                },
            ],
        )
+
+    def test_phraseology_equal(self):
+        self.wxr.wtp.add_page("Predefinição:-en-", 10, "Inglês")
+        data = parse_page(
+            self.wxr,
+            "aboard",
+            """={{-en-}}=
+== Advérbio ==
+'''a.board'''
+#[[a bordo]]
+
+===Fraseologia===
+* '''aboard the train''' (''locução adverbial'') = a bordo do trem""",
+        )
+        self.assertEqual(
+            data[0]["phraseology"],
+            [
+                {
+                    "word": "aboard the train",
+                    "roman": "locução adverbial",
+                    "sense": "a bordo do trem",
+                }
+            ],
+        )
+
+    def test_phraseology_colon(self):
+        self.wxr.wtp.add_page("Predefinição:-la-", 10, "Latim")
+        data = parse_page(
+            self.wxr,
+            "secundus",
+            """={{-la-}}=
+==Adjetivo==
+'''se.cun.dus'''
+# que [[seguir|segue]]
+
+==Fraseologia==
+* '''secundae [[res]]''': ''[[felicidade]]''
+* [[secunda mensa]]: [[sobremesa]]""",
+        )
+        self.assertEqual(
+            data[0]["phraseology"],
+            [
+                {"word": "secundae res", "sense": "felicidade"},
+                {"word": "secunda mensa", "sense": "sobremesa"},
+            ],
+        )
+
+    def test_phraseology_nested_list(self):
+        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
+        data = parse_page(
+            self.wxr,
+            "gota",
+            """={{-pt-}}=
+==Substantivo==
+# [[fragmento]]
+
+===Fraseologia===
+{{fraseini|De 1 (gota: pingo)}}
+# ''' [[até a última gota]] ''' ([[locução]]): [[até]] [[ser]] [[usado]] ou [[bebido]] [[totalmente]] (um [[líquido]])
+#* ''' [[este|Este]] [[café]] é [[bom]] até a [[última]] gota ''' (frase comum)""",
+        )
+        self.assertEqual(
+            data[0]["phraseology"],
+            [
+                {
+                    "word": "até a última gota",
+                    "sense": "até ser usado ou bebido totalmente (um líquido)",
+                    "sense_index": 1,
+                },
+                {
+                    "word": "Este café é bom até a última gota",
+                    "sense": "gota: pingo",
+                    "sense_index": 1,
+                },
+            ],
+        )