diff --git a/src/wiktextract/extractor/pt/linkage.py b/src/wiktextract/extractor/pt/linkage.py index cebd5a84..1e61c712 100644 --- a/src/wiktextract/extractor/pt/linkage.py +++ b/src/wiktextract/extractor/pt/linkage.py @@ -72,6 +72,7 @@ def extract_linkage_section( sense: str, sense_index: int, source: str, + tags: list[str], ) -> None: for node in level_node.children: if isinstance(node, TemplateNode) and node.template_name == "fraseini": @@ -86,6 +87,7 @@ def extract_linkage_section( sense, sense_index, source, + tags, ) @@ -112,6 +114,7 @@ def extract_linkage_list_item( sense: str, sense_index: int, source: str, + tags: list[str], ) -> None: linkage_words = [] raw_tags = [] @@ -140,6 +143,7 @@ def extract_linkage_list_item( linkage_type, sense, sense_index, + tags, ) elif word != "": linkage_words.append(word) @@ -157,6 +161,7 @@ def extract_linkage_list_item( linkage_type, sense, sense_index, + tags, ) elif raw_tag != "": raw_tags.append(raw_tag) @@ -170,6 +175,7 @@ def extract_linkage_list_item( sense, sense_index, source, + tags, ) elif isinstance(node, str): m = re.search(r"\((.+)\)", node) @@ -183,6 +189,7 @@ def extract_linkage_list_item( sense_index=sense_index, raw_tags=raw_tags, source=source, + tags=tags, ) translate_raw_tags(linkage) getattr(word_entry, linkage_type).append(linkage) @@ -195,6 +202,7 @@ def extract_wikisaurus_page( linkage_type: str, sense: str, sense_index: int, + tags: list[str], ) -> None: page = wxr.wtp.get_page(page_title, 0) if page is None or page.body is None: @@ -220,4 +228,5 @@ def extract_wikisaurus_page( sense, sense_index, page_title, + tags, ) diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py index bb98a924..9b97caea 100644 --- a/src/wiktextract/extractor/pt/models.py +++ b/src/wiktextract/extractor/pt/models.py @@ -88,6 +88,10 @@ class WordEntry(PortugueseBaseModel): antonyms: list[Linkage] = [] synonyms: list[Linkage] = [] derived: list[Linkage] = [] + anagrams: list[Linkage] = 
[] + hypernyms: list[Linkage] = [] + related: list[Linkage] = [] + hyponyms: list[Linkage] = [] etymology_texts: list[str] = [] sounds: list[Sound] = [] forms: list[Form] = [] diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py index b6432e04..afe0f8c4 100644 --- a/src/wiktextract/extractor/pt/page.py +++ b/src/wiktextract/extractor/pt/page.py @@ -13,7 +13,7 @@ from .models import Sense, WordEntry from .pos import extract_pos_section from .pronunciation import extract_pronunciation_section -from .section_titles import LINKAGE_SECTIONS, POS_DATA +from .section_titles import LINKAGE_SECTIONS, LINKAGE_TAGS, POS_DATA from .translation import extract_translation_section @@ -24,8 +24,10 @@ def parse_section( level_node: LevelNode, ) -> None: cats = {} - title_text = clean_node(wxr, cats, level_node.largs).strip("⁰¹²³⁴⁵⁶⁷⁸⁹") - if title_text in POS_DATA: + title_text = clean_node(wxr, cats, level_node.largs).strip( + "⁰¹²³⁴⁵⁶⁷⁸⁹0123456789" + ) + if title_text.lower() in POS_DATA: extract_pos_section( wxr, page_data, @@ -34,7 +36,7 @@ def parse_section( title_text, cats.get("categories", []), ) - elif title_text in ["Tradução", "Cognatos"]: + elif title_text in ["Tradução", "Traduções", "Cognatos"]: extract_translation_section( wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node ) @@ -42,22 +44,40 @@ def parse_section( extract_expression_section( wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node ) - elif title_text in LINKAGE_SECTIONS: + elif title_text.lower() in LINKAGE_SECTIONS: extract_linkage_section( wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node, - LINKAGE_SECTIONS[title_text], + LINKAGE_SECTIONS[title_text.lower()], "", 0, "", + LINKAGE_TAGS.get(title_text.lower(), []), ) elif title_text == "Etimologia": extract_etymology_section(wxr, page_data, level_node) elif title_text == "Pronúncia": extract_pronunciation_section(wxr, page_data, level_node) + elif title_text in ["Nota", 
"Notas", "Nota de uso"]: + pass + elif title_text.lower() not in [ + "ver também", + "ligações externas", + "referências", + "referência", + "no wikcionário", + "na wikipédia", + "no wikiquote", + "no wikispecies", + "no wikisaurus", + "no commons", + "no wikimedia commons", + "galeria", + ]: + wxr.wtp.debug(f"unknown section: {title_text}") - if title_text not in POS_DATA: + if title_text.lower() not in POS_DATA: save_section_cats( cats.get("categories", []), page_data, level_node, True ) diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py index 05292ef4..f7cff382 100644 --- a/src/wiktextract/extractor/pt/pos.py +++ b/src/wiktextract/extractor/pt/pos.py @@ -24,7 +24,7 @@ def extract_pos_section( ) -> None: page_data.append(base_data.model_copy(deep=True)) page_data[-1].pos_title = pos_title - pos_data = POS_DATA[pos_title] + pos_data = POS_DATA[pos_title.lower()] page_data[-1].pos = pos_data["pos"] page_data[-1].tags.extend(pos_data.get("tags", [])) page_data[-1].categories.extend(categories) diff --git a/src/wiktextract/extractor/pt/section_titles.py b/src/wiktextract/extractor/pt/section_titles.py index f65b817b..b3763991 100644 --- a/src/wiktextract/extractor/pt/section_titles.py +++ b/src/wiktextract/extractor/pt/section_titles.py @@ -1,36 +1,87 @@ POS_DATA = { - "Artigo": {"pos": "article"}, - "Adjetivo": {"pos": "adj"}, - "Advérbio": {"pos": "adv"}, - "Conjunção": {"pos": "conj"}, - "Interjeição": {"pos": "intj"}, - "Numeral": {"pos": "num"}, - "Partícula": {"pos": "particle"}, - "Preposição": {"pos": "prep"}, - "Posposição": {"pos": "postp"}, - "Pronome": {"pos": "pron"}, - "Substantivo": {"pos": "noun"}, - "Verbo": {"pos": "verb"}, - "Forma de substantivo": {"pos": "noun", "tags": ["form-of"]}, - "Forma verbal": {"pos": "verb", "tags": ["form-of"]}, - "Locução substantiva": {"pos": "phrase", "tags": ["substantive"]}, - "Locução adjetiva": {"pos": "phrase", "tags": ["adjectival"]}, - "Locução adverbial": {"pos": "phrase", 
"tags": ["adverbial"]}, - "Locução prepositiva": {"pos": "phrase", "tags": ["prepositional"]}, - "Expressão": {"pos": "phrase"}, - "Abreviatura": {"pos": "abbrev", "tags": ["abbreviation"]}, - "Contração": {"pos": "contraction", "tags": ["contraction"]}, - "Prefixo": {"pos": "prefix", "tags": ["morpheme"]}, - "Sufixo": {"pos": "suffix", "tags": ["morpheme"]}, - "Sigla": {"pos": "abbrev", "tags": ["abbreviation"]}, - "Símbolo": {"pos": "symbol"}, + "artigo": {"pos": "article"}, + "adjetivo": {"pos": "adj"}, + "advérbio": {"pos": "adv"}, + "conjunção": {"pos": "conj"}, + "interjeição": {"pos": "intj"}, + "numeral": {"pos": "num"}, + "partícula": {"pos": "particle"}, + "preposição": {"pos": "prep"}, + "posposição": {"pos": "postp"}, + "pronome": {"pos": "pron"}, + "substantivo": {"pos": "noun"}, + "verbo": {"pos": "verb"}, + "forma de substantivo": {"pos": "noun", "tags": ["form-of"]}, + "forma verbal": {"pos": "verb", "tags": ["form-of"]}, + "locução substantiva": {"pos": "phrase", "tags": ["substantive"]}, + "locução adjetiva": {"pos": "phrase", "tags": ["adjectival"]}, + "locução adverbial": {"pos": "phrase", "tags": ["adverbial"]}, + "locução prepositiva": {"pos": "phrase", "tags": ["prepositional"]}, + "expressão": {"pos": "phrase"}, + "abreviatura": {"pos": "abbrev", "tags": ["abbreviation"]}, + "contração": {"pos": "contraction", "tags": ["contraction"]}, + "prefixo": {"pos": "prefix", "tags": ["morpheme"]}, + "sufixo": {"pos": "suffix", "tags": ["morpheme"]}, + "sigla": {"pos": "abbrev", "tags": ["abbreviation"]}, + "símbolo": {"pos": "symbol"}, + "substantivo próprio": {"pos": "name"}, + "adjetivo próprio": {"pos": "adj", "tags": ["name"]}, + "forma de adjetivo": {"pos": "adj", "tags": ["form-of"]}, + "letra": {"pos": "character", "tags": ["letter"]}, + "transliteração": {"pos": "romanization"}, + "numeral ordinal": {"pos": "adj"}, + "numeral cardinal": {"pos": "adj"}, + "ordinal equivalente": {"pos": "adj"}, + "locução interjetiva": {"pos": "phrase", "tags": 
["interjection"]}, + "adjetivo numeral distributivo": { + "pos": "adj", + "tags": ["distributive", "numeral"], + }, + "forma de pronome": {"pos": "pron", "tags": ["form-of"]}, + "advérbio numeral": {"pos": "adv", "tags": ["numeral"]}, } LINKAGE_SECTIONS = { - "Antônimos": "antonyms", - "Sinônimos": "synonyms", - "Sinónimos/Sinônimos": "synonyms", - "Sinónimos": "synonyms", - "Verbetes derivados": "derived", + "antônimos": "antonyms", + "antônimo": "antonyms", + "antónimo": "antonyms", + "antónimos/antônimos": "antonyms", + "sinônimos": "synonyms", + "sinônimo": "synonyms", + "sinónimos/sinônimos": "synonyms", + "sinónimos": "synonyms", + "sinónimo": "synonyms", + "verbetes derivados": "derived", + "verbete derivado": "derived", + "formas alternativas": "synonyms", + "anagramas": "anagrams", + "anagrama": "anagrams", + "hiperônimo": "hypernyms", + "hiperônimos": "hypernyms", + "hiperónimos": "hypernyms", + "termos derivados": "derived", + "grafia antiga": "synonyms", + "diminutivo": "synonyms", + "diminutivos": "synonyms", + "termos relacionados": "related", + "variante ortográfica": "synonyms", + "verbetes relacionados": "related", + "entradas relacionadas": "related", + "hipônimos": "hyponyms", + "hiponímias": "hyponyms", + "ortografias obsoletas": "synonyms", + "superlativo": "synonyms", + "outros verbetes": "related", + "cardinal equivalente": "synonyms", + "aumentativo": "synonyms", +} + +LINKAGE_TAGS = { + "grafia antiga": ["obsolete"], + "diminutivo": ["diminutive"], + "diminutivos": ["diminutive"], + "ortografias obsoletas": ["obsolete"], + "superlativo": ["superlative"], + "aumentativo": ["augmentative"], }