Skip to content

Commit

Permalink
Merge pull request #954 from xxyzz/pt
Browse files Browse the repository at this point in the history
[pt] add more section title data
  • Loading branch information
xxyzz authored Dec 23, 2024
2 parents bb1d1ea + 268cef0 commit 7efe08e
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 38 deletions.
9 changes: 9 additions & 0 deletions src/wiktextract/extractor/pt/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def extract_linkage_section(
sense: str,
sense_index: int,
source: str,
tags: list[str],
) -> None:
for node in level_node.children:
if isinstance(node, TemplateNode) and node.template_name == "fraseini":
Expand All @@ -86,6 +87,7 @@ def extract_linkage_section(
sense,
sense_index,
source,
tags,
)


Expand All @@ -112,6 +114,7 @@ def extract_linkage_list_item(
sense: str,
sense_index: int,
source: str,
tags: list[str],
) -> None:
linkage_words = []
raw_tags = []
Expand Down Expand Up @@ -140,6 +143,7 @@ def extract_linkage_list_item(
linkage_type,
sense,
sense_index,
tags,
)
elif word != "":
linkage_words.append(word)
Expand All @@ -157,6 +161,7 @@ def extract_linkage_list_item(
linkage_type,
sense,
sense_index,
tags,
)
elif raw_tag != "":
raw_tags.append(raw_tag)
Expand All @@ -170,6 +175,7 @@ def extract_linkage_list_item(
sense,
sense_index,
source,
tags,
)
elif isinstance(node, str):
m = re.search(r"\((.+)\)", node)
Expand All @@ -183,6 +189,7 @@ def extract_linkage_list_item(
sense_index=sense_index,
raw_tags=raw_tags,
source=source,
tags=tags,
)
translate_raw_tags(linkage)
getattr(word_entry, linkage_type).append(linkage)
Expand All @@ -195,6 +202,7 @@ def extract_wikisaurus_page(
linkage_type: str,
sense: str,
sense_index: int,
tags: list[str],
) -> None:
page = wxr.wtp.get_page(page_title, 0)
if page is None or page.body is None:
Expand All @@ -220,4 +228,5 @@ def extract_wikisaurus_page(
sense,
sense_index,
page_title,
tags,
)
4 changes: 4 additions & 0 deletions src/wiktextract/extractor/pt/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ class WordEntry(PortugueseBaseModel):
antonyms: list[Linkage] = []
synonyms: list[Linkage] = []
derived: list[Linkage] = []
anagrams: list[Linkage] = []
hypernyms: list[Linkage] = []
related: list[Linkage] = []
hyponyms: list[Linkage] = []
etymology_texts: list[str] = []
sounds: list[Sound] = []
forms: list[Form] = []
34 changes: 27 additions & 7 deletions src/wiktextract/extractor/pt/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .pronunciation import extract_pronunciation_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .section_titles import LINKAGE_SECTIONS, LINKAGE_TAGS, POS_DATA
from .translation import extract_translation_section


Expand All @@ -24,8 +24,10 @@ def parse_section(
level_node: LevelNode,
) -> None:
cats = {}
title_text = clean_node(wxr, cats, level_node.largs).strip("⁰¹²³⁴⁵⁶⁷⁸⁹")
if title_text in POS_DATA:
title_text = clean_node(wxr, cats, level_node.largs).strip(
"⁰¹²³⁴⁵⁶⁷⁸⁹0123456789"
)
if title_text.lower() in POS_DATA:
extract_pos_section(
wxr,
page_data,
Expand All @@ -34,30 +36,48 @@ def parse_section(
title_text,
cats.get("categories", []),
)
elif title_text in ["Tradução", "Cognatos"]:
elif title_text in ["Tradução", "Traduções", "Cognatos"]:
extract_translation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text == "Expressões":
extract_expression_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text in LINKAGE_SECTIONS:
elif title_text.lower() in LINKAGE_SECTIONS:
extract_linkage_section(
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node,
LINKAGE_SECTIONS[title_text],
LINKAGE_SECTIONS[title_text.lower()],
"",
0,
"",
LINKAGE_TAGS.get(title_text.lower(), []),
)
elif title_text == "Etimologia":
extract_etymology_section(wxr, page_data, level_node)
elif title_text == "Pronúncia":
extract_pronunciation_section(wxr, page_data, level_node)
elif title_text in ["Nota", "Notas", "Nota de uso"]:
pass
elif title_text.lower() not in [
"ver também",
"ligações externas",
"referências",
"referência",
"no wikcionário",
"na wikipédia",
"no wikiquote",
"no wikispecies",
"no wikisaurus",
"no commons",
"no wikimedia commons",
"galeria",
]:
wxr.wtp.debug(f"unknown section: {title_text}")

if title_text not in POS_DATA:
if title_text.lower() not in POS_DATA:
save_section_cats(
cats.get("categories", []), page_data, level_node, True
)
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/pt/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def extract_pos_section(
) -> None:
page_data.append(base_data.model_copy(deep=True))
page_data[-1].pos_title = pos_title
pos_data = POS_DATA[pos_title]
pos_data = POS_DATA[pos_title.lower()]
page_data[-1].pos = pos_data["pos"]
page_data[-1].tags.extend(pos_data.get("tags", []))
page_data[-1].categories.extend(categories)
Expand Down
111 changes: 81 additions & 30 deletions src/wiktextract/extractor/pt/section_titles.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,87 @@
# Maps a part-of-speech section title to its extraction data.
# Each value holds the normalized "pos" code and, optionally, extra "tags"
# that are appended to the word entry's tags.
POS_DATA = {
    # Capitalized section titles.
    "Artigo": {"pos": "article"},
    "Adjetivo": {"pos": "adj"},
    "Advérbio": {"pos": "adv"},
    "Conjunção": {"pos": "conj"},
    "Interjeição": {"pos": "intj"},
    "Numeral": {"pos": "num"},
    "Partícula": {"pos": "particle"},
    "Preposição": {"pos": "prep"},
    "Posposição": {"pos": "postp"},
    "Pronome": {"pos": "pron"},
    "Substantivo": {"pos": "noun"},
    "Verbo": {"pos": "verb"},
    "Forma de substantivo": {"pos": "noun", "tags": ["form-of"]},
    "Forma verbal": {"pos": "verb", "tags": ["form-of"]},
    "Locução substantiva": {"pos": "phrase", "tags": ["substantive"]},
    "Locução adjetiva": {"pos": "phrase", "tags": ["adjectival"]},
    "Locução adverbial": {"pos": "phrase", "tags": ["adverbial"]},
    "Locução prepositiva": {"pos": "phrase", "tags": ["prepositional"]},
    "Expressão": {"pos": "phrase"},
    "Abreviatura": {"pos": "abbrev", "tags": ["abbreviation"]},
    "Contração": {"pos": "contraction", "tags": ["contraction"]},
    "Prefixo": {"pos": "prefix", "tags": ["morpheme"]},
    "Sufixo": {"pos": "suffix", "tags": ["morpheme"]},
    "Sigla": {"pos": "abbrev", "tags": ["abbreviation"]},
    "Símbolo": {"pos": "symbol"},
    # Lower-case section titles: lookups that lower-case the heading first
    # (`title.lower()`) only ever hit these keys, so every key below must be
    # spelled in lower case exactly as the heading appears.
    "artigo": {"pos": "article"},
    "adjetivo": {"pos": "adj"},
    "advérbio": {"pos": "adv"},
    "conjunção": {"pos": "conj"},
    "interjeição": {"pos": "intj"},
    "numeral": {"pos": "num"},
    "partícula": {"pos": "particle"},
    "preposição": {"pos": "prep"},
    "posposição": {"pos": "postp"},
    "pronome": {"pos": "pron"},
    "substantivo": {"pos": "noun"},
    # Was misspelled "berbo", which made every "Verbo" heading unmatched.
    "verbo": {"pos": "verb"},
    "forma de substantivo": {"pos": "noun", "tags": ["form-of"]},
    "forma verbal": {"pos": "verb", "tags": ["form-of"]},
    "locução substantiva": {"pos": "phrase", "tags": ["substantive"]},
    "locução adjetiva": {"pos": "phrase", "tags": ["adjectival"]},
    "locução adverbial": {"pos": "phrase", "tags": ["adverbial"]},
    "locução prepositiva": {"pos": "phrase", "tags": ["prepositional"]},
    "expressão": {"pos": "phrase"},
    "abreviatura": {"pos": "abbrev", "tags": ["abbreviation"]},
    "contração": {"pos": "contraction", "tags": ["contraction"]},
    "prefixo": {"pos": "prefix", "tags": ["morpheme"]},
    "sufixo": {"pos": "suffix", "tags": ["morpheme"]},
    "sigla": {"pos": "abbrev", "tags": ["abbreviation"]},
    "símbolo": {"pos": "symbol"},
    "substantivo próprio": {"pos": "name"},
    "adjetivo próprio": {"pos": "adj", "tags": ["name"]},
    "forma de adjetivo": {"pos": "adj", "tags": ["form-of"]},
    "letra": {"pos": "character", "tags": ["letter"]},
    "transliteração": {"pos": "romanization"},
    "numeral ordinal": {"pos": "adj"},
    "numeral cardinal": {"pos": "adj"},
    "ordinal equivalente": {"pos": "adj"},
    "locução interjetiva": {"pos": "phrase", "tags": ["interjection"]},
    "adjetivo numeral distributivo": {
        "pos": "adj",
        "tags": ["distributive", "numeral"],
    },
    "forma de pronome": {"pos": "pron", "tags": ["form-of"]},
    "advérbio numeral": {"pos": "adv", "tags": ["numeral"]},
}


# Maps a linkage section title to the name of the `WordEntry` list attribute
# (e.g. "synonyms", "derived") that receives the extracted linkage items.
# The value is used as an attribute name, so it must match a field declared
# on the word entry model.
LINKAGE_SECTIONS = {
    # Capitalized section titles.
    "Antônimos": "antonyms",
    "Sinônimos": "synonyms",
    "Sinónimos/Sinônimos": "synonyms",
    "Sinónimos": "synonyms",
    "Verbetes derivados": "derived",
    # Lower-case section titles, matched against the lower-cased heading.
    "antônimos": "antonyms",
    "antônimo": "antonyms",
    "antónimo": "antonyms",
    "antónimos/antônimos": "antonyms",
    "sinônimos": "synonyms",
    "sinônimo": "synonyms",
    "sinónimos/sinônimos": "synonyms",
    "sinónimos": "synonyms",
    "sinónimo": "synonyms",
    "verbetes derivados": "derived",
    "verbete derivado": "derived",
    "formas alternativas": "synonyms",
    "anagramas": "anagrams",
    "anagrama": "anagrams",
    "hiperônimo": "hypernyms",
    "hiperônimos": "hypernyms",
    "hiperónimos": "hypernyms",
    "termos derivados": "derived",
    # Several form-like sections below are stored as synonyms; some of them
    # also get an extra tag through LINKAGE_TAGS.
    "grafia antiga": "synonyms",
    "diminutivo": "synonyms",
    "diminutivos": "synonyms",
    "termos relacionados": "related",
    "variante ortográfica": "synonyms",
    "verbetes relacionados": "related",
    "entradas relacionadas": "related",
    "hipônimos": "hyponyms",
    "hiponímias": "hyponyms",
    "ortografias obsoletas": "synonyms",
    "superlativo": "synonyms",
    "outros verbetes": "related",
    "cardinal equivalente": "synonyms",
    "aumentativo": "synonyms",
}

# Extra tags attached to every linkage item extracted from the given
# (lower-cased) section title. Titles without an entry get no extra tags:
# the caller falls back to an empty list via `.get(title, [])`.
LINKAGE_TAGS = {
    "grafia antiga": ["obsolete"],
    "diminutivo": ["diminutive"],
    "diminutivos": ["diminutive"],
    "ortografias obsoletas": ["obsolete"],
    "superlativo": ["superlative"],
    "aumentativo": ["augmentative"],
}

0 comments on commit 7efe08e

Please sign in to comment.