Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[pt] add more section title data #954

Merged
merged 2 commits into from
Dec 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions src/wiktextract/extractor/pt/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def extract_linkage_section(
sense: str,
sense_index: int,
source: str,
tags: list[str],
) -> None:
for node in level_node.children:
if isinstance(node, TemplateNode) and node.template_name == "fraseini":
Expand All @@ -86,6 +87,7 @@ def extract_linkage_section(
sense,
sense_index,
source,
tags,
)


Expand All @@ -112,6 +114,7 @@ def extract_linkage_list_item(
sense: str,
sense_index: int,
source: str,
tags: list[str],
) -> None:
linkage_words = []
raw_tags = []
Expand Down Expand Up @@ -140,6 +143,7 @@ def extract_linkage_list_item(
linkage_type,
sense,
sense_index,
tags,
)
elif word != "":
linkage_words.append(word)
Expand All @@ -157,6 +161,7 @@ def extract_linkage_list_item(
linkage_type,
sense,
sense_index,
tags,
)
elif raw_tag != "":
raw_tags.append(raw_tag)
Expand All @@ -170,6 +175,7 @@ def extract_linkage_list_item(
sense,
sense_index,
source,
tags,
)
elif isinstance(node, str):
m = re.search(r"\((.+)\)", node)
Expand All @@ -183,6 +189,7 @@ def extract_linkage_list_item(
sense_index=sense_index,
raw_tags=raw_tags,
source=source,
tags=tags,
)
translate_raw_tags(linkage)
getattr(word_entry, linkage_type).append(linkage)
Expand All @@ -195,6 +202,7 @@ def extract_wikisaurus_page(
linkage_type: str,
sense: str,
sense_index: int,
tags: list[str],
) -> None:
page = wxr.wtp.get_page(page_title, 0)
if page is None or page.body is None:
Expand All @@ -220,4 +228,5 @@ def extract_wikisaurus_page(
sense,
sense_index,
page_title,
tags,
)
4 changes: 4 additions & 0 deletions src/wiktextract/extractor/pt/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ class WordEntry(PortugueseBaseModel):
antonyms: list[Linkage] = []
synonyms: list[Linkage] = []
derived: list[Linkage] = []
anagrams: list[Linkage] = []
hypernyms: list[Linkage] = []
related: list[Linkage] = []
hyponyms: list[Linkage] = []
etymology_texts: list[str] = []
sounds: list[Sound] = []
forms: list[Form] = []
34 changes: 27 additions & 7 deletions src/wiktextract/extractor/pt/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .pronunciation import extract_pronunciation_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .section_titles import LINKAGE_SECTIONS, LINKAGE_TAGS, POS_DATA
from .translation import extract_translation_section


Expand All @@ -24,8 +24,10 @@ def parse_section(
level_node: LevelNode,
) -> None:
cats = {}
title_text = clean_node(wxr, cats, level_node.largs).strip("⁰¹²³⁴⁵⁶⁷⁸⁹")
if title_text in POS_DATA:
title_text = clean_node(wxr, cats, level_node.largs).strip(
"⁰¹²³⁴⁵⁶⁷⁸⁹0123456789"
)
if title_text.lower() in POS_DATA:
extract_pos_section(
wxr,
page_data,
Expand All @@ -34,30 +36,48 @@ def parse_section(
title_text,
cats.get("categories", []),
)
elif title_text in ["Tradução", "Cognatos"]:
elif title_text in ["Tradução", "Traduções", "Cognatos"]:
extract_translation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text == "Expressões":
extract_expression_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text in LINKAGE_SECTIONS:
elif title_text.lower() in LINKAGE_SECTIONS:
extract_linkage_section(
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node,
LINKAGE_SECTIONS[title_text],
LINKAGE_SECTIONS[title_text.lower()],
"",
0,
"",
LINKAGE_TAGS.get(title_text.lower(), []),
)
elif title_text == "Etimologia":
extract_etymology_section(wxr, page_data, level_node)
elif title_text == "Pronúncia":
extract_pronunciation_section(wxr, page_data, level_node)
elif title_text in ["Nota", "Notas", "Nota de uso"]:
pass
elif title_text.lower() not in [
"ver também",
"ligações externas",
"referências",
"referência",
"no wikcionário",
"na wikipédia",
"no wikiquote",
"no wikispecies",
"no wikisaurus",
"no commons",
"no wikimedia commons",
"galeria",
]:
wxr.wtp.debug(f"unknown section: {title_text}")

if title_text not in POS_DATA:
if title_text.lower() not in POS_DATA:
save_section_cats(
cats.get("categories", []), page_data, level_node, True
)
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/pt/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def extract_pos_section(
) -> None:
page_data.append(base_data.model_copy(deep=True))
page_data[-1].pos_title = pos_title
pos_data = POS_DATA[pos_title]
pos_data = POS_DATA[pos_title.lower()]
page_data[-1].pos = pos_data["pos"]
page_data[-1].tags.extend(pos_data.get("tags", []))
page_data[-1].categories.extend(categories)
Expand Down
111 changes: 81 additions & 30 deletions src/wiktextract/extractor/pt/section_titles.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,87 @@
# Part-of-speech section titles of the Portuguese Wiktionary edition.
#
# Each key is a section title; each value is a dict with a mandatory "pos"
# entry and an optional "tags" list (read via `pos_data["pos"]` and
# `pos_data.get("tags", [])` in `extract_pos_section`).
#
# The callers shown alongside this table look titles up with
# `title.lower()`, so the lower-cased keys are the ones that are actually
# hit. The capitalised keys are the earlier spelling of the same titles and
# are kept unchanged.
POS_DATA = {
    "Artigo": {"pos": "article"},
    "Adjetivo": {"pos": "adj"},
    "Advérbio": {"pos": "adv"},
    "Conjunção": {"pos": "conj"},
    "Interjeição": {"pos": "intj"},
    "Numeral": {"pos": "num"},
    "Partícula": {"pos": "particle"},
    "Preposição": {"pos": "prep"},
    "Posposição": {"pos": "postp"},
    "Pronome": {"pos": "pron"},
    "Substantivo": {"pos": "noun"},
    "Verbo": {"pos": "verb"},
    "Forma de substantivo": {"pos": "noun", "tags": ["form-of"]},
    "Forma verbal": {"pos": "verb", "tags": ["form-of"]},
    "Locução substantiva": {"pos": "phrase", "tags": ["substantive"]},
    "Locução adjetiva": {"pos": "phrase", "tags": ["adjectival"]},
    "Locução adverbial": {"pos": "phrase", "tags": ["adverbial"]},
    "Locução prepositiva": {"pos": "phrase", "tags": ["prepositional"]},
    "Expressão": {"pos": "phrase"},
    "Abreviatura": {"pos": "abbrev", "tags": ["abbreviation"]},
    "Contração": {"pos": "contraction", "tags": ["contraction"]},
    "Prefixo": {"pos": "prefix", "tags": ["morpheme"]},
    "Sufixo": {"pos": "suffix", "tags": ["morpheme"]},
    "Sigla": {"pos": "abbrev", "tags": ["abbreviation"]},
    "Símbolo": {"pos": "symbol"},
    "artigo": {"pos": "article"},
    "adjetivo": {"pos": "adj"},
    "advérbio": {"pos": "adv"},
    "conjunção": {"pos": "conj"},
    "interjeição": {"pos": "intj"},
    "numeral": {"pos": "num"},
    "partícula": {"pos": "particle"},
    "preposição": {"pos": "prep"},
    "posposição": {"pos": "postp"},
    "pronome": {"pos": "pron"},
    "substantivo": {"pos": "noun"},
    # Was misspelled "berbo", which made every "Verbo" section miss the
    # table once lookups switched to the lower-cased title.
    "verbo": {"pos": "verb"},
    "forma de substantivo": {"pos": "noun", "tags": ["form-of"]},
    "forma verbal": {"pos": "verb", "tags": ["form-of"]},
    "locução substantiva": {"pos": "phrase", "tags": ["substantive"]},
    "locução adjetiva": {"pos": "phrase", "tags": ["adjectival"]},
    "locução adverbial": {"pos": "phrase", "tags": ["adverbial"]},
    "locução prepositiva": {"pos": "phrase", "tags": ["prepositional"]},
    "expressão": {"pos": "phrase"},
    "abreviatura": {"pos": "abbrev", "tags": ["abbreviation"]},
    "contração": {"pos": "contraction", "tags": ["contraction"]},
    "prefixo": {"pos": "prefix", "tags": ["morpheme"]},
    "sufixo": {"pos": "suffix", "tags": ["morpheme"]},
    "sigla": {"pos": "abbrev", "tags": ["abbreviation"]},
    "símbolo": {"pos": "symbol"},
    "substantivo próprio": {"pos": "name"},
    "adjetivo próprio": {"pos": "adj", "tags": ["name"]},
    "forma de adjetivo": {"pos": "adj", "tags": ["form-of"]},
    "letra": {"pos": "character", "tags": ["letter"]},
    "transliteração": {"pos": "romanization"},
    "numeral ordinal": {"pos": "adj"},
    "numeral cardinal": {"pos": "adj"},
    "ordinal equivalente": {"pos": "adj"},
    "locução interjetiva": {"pos": "phrase", "tags": ["interjection"]},
    "adjetivo numeral distributivo": {
        "pos": "adj",
        "tags": ["distributive", "numeral"],
    },
    "forma de pronome": {"pos": "pron", "tags": ["form-of"]},
    "advérbio numeral": {"pos": "adv", "tags": ["numeral"]},
}


# Linkage section titles mapped to the name of the `WordEntry` list field
# that receives the extracted words ("antonyms", "synonyms", "derived",
# "anagrams", "hypernyms", "related", "hyponyms").
#
# `parse_section` indexes this table with `title_text.lower()`, so the
# lower-cased keys are the ones matched; both the pt-PT ("ó") and pt-BR
# ("ô") spellings are listed where they were added. The capitalised keys
# are the earlier spelling of the same titles.
LINKAGE_SECTIONS = {
    "Antônimos": "antonyms",
    "Sinônimos": "synonyms",
    "Sinónimos/Sinônimos": "synonyms",
    "Sinónimos": "synonyms",
    "Verbetes derivados": "derived",
    "antônimos": "antonyms",
    "antônimo": "antonyms",
    "antónimo": "antonyms",
    "antónimos/antônimos": "antonyms",
    "sinônimos": "synonyms",
    "sinônimo": "synonyms",
    "sinónimos/sinônimos": "synonyms",
    "sinónimos": "synonyms",
    "sinónimo": "synonyms",
    "verbetes derivados": "derived",
    "verbete derivado": "derived",
    "formas alternativas": "synonyms",
    "anagramas": "anagrams",
    "anagrama": "anagrams",
    "hiperônimo": "hypernyms",
    "hiperônimos": "hypernyms",
    "hiperónimos": "hypernyms",
    "termos derivados": "derived",
    # Spelling/grade variants below are stored as synonyms; the ones that
    # also appear in LINKAGE_TAGS get an extra tag attached.
    "grafia antiga": "synonyms",
    "diminutivo": "synonyms",
    "diminutivos": "synonyms",
    "termos relacionados": "related",
    "variante ortográfica": "synonyms",
    "verbetes relacionados": "related",
    "entradas relacionadas": "related",
    "hipônimos": "hyponyms",
    "hiponímias": "hyponyms",
    "ortografias obsoletas": "synonyms",
    "superlativo": "synonyms",
    "outros verbetes": "related",
    "cardinal equivalente": "synonyms",
    "aumentativo": "synonyms",
}

# Extra tags for linkage sections whose title itself carries meaning.
#
# Keyed by the lower-cased section title; `parse_section` reads it with
# `LINKAGE_TAGS.get(title_text.lower(), [])`, so titles not listed here
# contribute no tags. Every key here is also a key of LINKAGE_SECTIONS,
# where it maps to "synonyms".
LINKAGE_TAGS = {
    "grafia antiga": ["obsolete"],
    "diminutivo": ["diminutive"],
    "diminutivos": ["diminutive"],
    "ortografias obsoletas": ["obsolete"],
    "superlativo": ["superlative"],
    "aumentativo": ["augmentative"],
}
Loading