Skip to content

Commit

Permalink
Merge pull request #955 from xxyzz/pt
Browse files Browse the repository at this point in the history
[pt] extract phraseology and note sections, nested gloss lists
  • Loading branch information
xxyzz authored Dec 24, 2024
2 parents 7efe08e + f5952ba commit 41e285e
Show file tree
Hide file tree
Showing 7 changed files with 232 additions and 12 deletions.
63 changes: 62 additions & 1 deletion src/wiktextract/extractor/pt/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,13 @@ def extract_fraseini_template(
sense = ""
sense_index = 0
first_arg = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
m = re.search(r"(\d+)$", first_arg)
m = re.search(r"\((\d+)\)$", first_arg)
if m is not None:
sense_index = int(m.group(1))
sense = first_arg[: m.start()].strip()
elif (m := re.match(r"De (\d+)", first_arg)) is not None:
sense_index = int(m.group(1))
sense = first_arg[m.end() :].strip("() \n")
else:
sense = first_arg
return sense, sense_index
Expand Down Expand Up @@ -230,3 +233,61 @@ def extract_wikisaurus_page(
page_title,
tags,
)


def extract_phraseology_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect phraseology entries from a section into ``word_entry``.

    The direct list and template children of ``level_node`` are visited in
    order.  A ``fraseini`` template replaces the current sense text and
    sense index with the values returned by ``extract_fraseini_template``;
    every item of a list is then handed to
    ``extract_phraseology_list_item`` together with the sense values that
    are in effect at that point (``""`` and ``0`` until a ``fraseini``
    template has been seen).
    """
    current_sense = ""
    current_sense_index = 0
    wanted_kinds = NodeKind.LIST | NodeKind.TEMPLATE
    for child in level_node.find_child(wanted_kinds):
        if isinstance(child, TemplateNode):
            # Only "fraseini" carries sense information; any other
            # template is ignored.
            if child.template_name == "fraseini":
                current_sense, current_sense_index = (
                    extract_fraseini_template(wxr, child)
                )
            continue
        if child.kind != NodeKind.LIST:
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            extract_phraseology_list_item(
                wxr, word_entry, item, current_sense, current_sense_index
            )


def extract_phraseology_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    sense_index: int,
) -> None:
    """Turn one phraseology list item into a ``Linkage`` entry.

    The first bold or link child supplies the word and any italic child
    supplies ``roman``.  The first plain-text child containing ``=`` (or,
    failing that, ``:``) starts the sense text: everything after that
    separator, plus all following children except nested lists, replaces
    the ``sense`` passed in, and scanning stops there.  The entry is
    appended to ``word_entry.phraseology`` only when a word was found.
    Items of nested lists are processed recursively with the same
    ``sense`` and ``sense_index`` arguments.
    """
    entry = Linkage(word="", sense=sense, sense_index=sense_index)
    for position, child in enumerate(list_item.children):
        if isinstance(child, WikiNode):
            is_title = child.kind in NodeKind.BOLD | NodeKind.LINK
            if is_title and entry.word == "":
                entry.word = clean_node(wxr, None, child)
            elif child.kind == NodeKind.ITALIC:
                entry.roman = clean_node(wxr, None, child)
            continue
        if not isinstance(child, str):
            continue

        # "=" takes priority over ":" as the word/sense separator.
        if "=" in child:
            separator = "="
        elif ":" in child:
            separator = ":"
        else:
            continue

        sense_nodes = [child[child.index(separator) + 1 :]]
        for later in list_item.children[position + 1 :]:
            # Nested lists are separate entries, not part of this sense.
            if not isinstance(later, WikiNode) or later.kind != NodeKind.LIST:
                sense_nodes.append(later)
        entry.sense = clean_node(wxr, None, sense_nodes)
        break

    if entry.word != "":
        word_entry.phraseology.append(entry)

    for nested_list in list_item.find_child(NodeKind.LIST):
        for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
            extract_phraseology_list_item(
                wxr, word_entry, nested_item, sense, sense_index
            )
6 changes: 6 additions & 0 deletions src/wiktextract/extractor/pt/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class Linkage(PortugueseBaseModel):
default=0, ge=0, description="Number of the definition, start from 1"
)
source: str = ""
roman: str = ""


class Sound(PortugueseBaseModel):
Expand Down Expand Up @@ -92,6 +93,11 @@ class WordEntry(PortugueseBaseModel):
hypernyms: list[Linkage] = []
related: list[Linkage] = []
hyponyms: list[Linkage] = []
homophones: list[Linkage] = []
homonyms: list[Linkage] = []
paronyms: list[Linkage] = []
phraseology: list[Linkage] = []
etymology_texts: list[str] = []
sounds: list[Sound] = []
forms: list[Form] = []
notes: list[str] = []
39 changes: 34 additions & 5 deletions src/wiktextract/extractor/pt/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_etymology_section
from .linkage import extract_expression_section, extract_linkage_section
from .linkage import (
extract_expression_section,
extract_linkage_section,
extract_phraseology_section,
)
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .pronunciation import extract_pronunciation_section
Expand All @@ -25,7 +29,7 @@ def parse_section(
) -> None:
cats = {}
title_text = clean_node(wxr, cats, level_node.largs).strip(
"⁰¹²³⁴⁵⁶⁷⁸⁹0123456789"
"⁰¹²³⁴⁵⁶⁷⁸⁹0123456789:"
)
if title_text.lower() in POS_DATA:
extract_pos_section(
Expand Down Expand Up @@ -59,11 +63,17 @@ def parse_section(
extract_etymology_section(wxr, page_data, level_node)
elif title_text == "Pronúncia":
extract_pronunciation_section(wxr, page_data, level_node)
elif title_text in ["Nota", "Notas", "Nota de uso"]:
pass
elif title_text == "Fraseologia":
extract_phraseology_section(
wxr, page_data[-1] if len(page_data) else base_data, level_node
)
elif title_text.startswith("Nota"):
extract_note_section(wxr, page_data, level_node)
elif title_text.lower() not in [
"ver também",
"ligação externa",
"ligações externas",
"ligação extena",
"referências",
"referência",
"no wikcionário",
Expand All @@ -73,7 +83,9 @@ def parse_section(
"no wikisaurus",
"no commons",
"no wikimedia commons",
"na internet",
"galeria",
"galeria de imagens",
]:
wxr.wtp.debug(f"unknown section: {title_text}")

Expand All @@ -86,7 +98,7 @@ def parse_section(
clean_node(wxr, cats, link_node)
save_section_cats(cats.get("categories", []), page_data, level_node, False)

if title_text != "Pronúncia":
if title_text.lower() not in ["pronúncia", "ver também"]:
for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)

Expand Down Expand Up @@ -147,3 +159,20 @@ def parse_page(
if len(data.senses) == 0:
data.senses.append(Sense(tags=["no-gloss"]))
return [m.model_dump(exclude_defaults=True) for m in page_data]


def extract_note_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
) -> None:
    """Attach the notes found in a section to the matching word entries.

    Every list item found recursively under ``level_node`` is cleaned to
    text with its nested lists left out; non-empty results are kept as
    notes.  The notes are then appended to each entry in ``page_data``
    whose ``lang_code`` equals that of the last entry.
    """
    collected = [
        text
        for item in level_node.find_child_recursively(NodeKind.LIST_ITEM)
        if (
            text := clean_node(
                wxr, None, list(item.invert_find_child(NodeKind.LIST))
            )
        )
        != ""
    ]
    if len(page_data) == 0:
        # Nothing to attach the notes to.
        return
    current_lang_code = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang_code:
            entry.notes.extend(collected)
15 changes: 10 additions & 5 deletions src/wiktextract/extractor/pt/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,11 @@ def extract_gloss_list_item(
wxr: WiktextractContext,
word_entry: WordEntry | Linkage,
list_item: WikiNode,
parent_gloss: list[str] = [],
) -> None:
gloss_nodes = []
sense = Sense()
first_gloss_index = len(list_item.children)
for index, node in enumerate(list_item.children):
sense = Sense(glosses=parent_gloss)
for node in list_item.children:
if isinstance(node, TemplateNode):
if node.template_name == "escopo":
extract_escopo_template(wxr, sense, node)
Expand All @@ -65,8 +65,6 @@ def extract_gloss_list_item(
if node.sarg.endswith(("*", ":")):
for next_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, next_list_item)
if index < first_gloss_index:
first_gloss_index = index
else:
gloss_nodes.append(node)

Expand All @@ -75,6 +73,13 @@ def extract_gloss_list_item(
sense.glosses.append(gloss_str)
word_entry.senses.append(sense)

for child_list in list_item.find_child(NodeKind.LIST):
if child_list.sarg.startswith("#") and child_list.sarg.endswith("#"):
for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
extract_gloss_list_item(
wxr, word_entry, child_list_item, sense.glosses
)


def extract_escopo_template(
wxr: WiktextractContext,
Expand Down
27 changes: 26 additions & 1 deletion src/wiktextract/extractor/pt/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"posposição": {"pos": "postp"},
"pronome": {"pos": "pron"},
"substantivo": {"pos": "noun"},
"berbo": {"pos": "verb"},
"verbo": {"pos": "verb"},
"forma de substantivo": {"pos": "noun", "tags": ["form-of"]},
"forma verbal": {"pos": "verb", "tags": ["form-of"]},
"locução substantiva": {"pos": "phrase", "tags": ["substantive"]},
Expand All @@ -19,6 +19,7 @@
"locução prepositiva": {"pos": "phrase", "tags": ["prepositional"]},
"expressão": {"pos": "phrase"},
"abreviatura": {"pos": "abbrev", "tags": ["abbreviation"]},
"abreviação": {"pos": "abbrev", "tags": ["abbreviation"]},
"contração": {"pos": "contraction", "tags": ["contraction"]},
"prefixo": {"pos": "prefix", "tags": ["morpheme"]},
"sufixo": {"pos": "suffix", "tags": ["morpheme"]},
Expand All @@ -39,19 +40,27 @@
},
"forma de pronome": {"pos": "pron", "tags": ["form-of"]},
"advérbio numeral": {"pos": "adv", "tags": ["numeral"]},
"verbo preposicionado": {"pos": "verb", "tags": ["prepositional"]},
"caractere han": {"pos": "character", "tags": ["han"]},
"hanja": {"pos": "character", "tags": ["Hanja"]},
"kanji": {"pos": "character", "tags": ["kanji"]},
"pronome pessoal": {"pos": "pron", "tags": ["person"]},
"pronome possessivo": {"pos": "det", "tags": ["possessive"]},
}


LINKAGE_SECTIONS = {
"antônimos": "antonyms",
"antônimo": "antonyms",
"antónimo": "antonyms",
"antónimos": "antonyms",
"antónimos/antônimos": "antonyms",
"sinônimos": "synonyms",
"sinônimo": "synonyms",
"sinónimos/sinônimos": "synonyms",
"sinónimos": "synonyms",
"sinónimo": "synonyms",
"sinônimos e variantes": "synonyms",
"verbetes derivados": "derived",
"verbete derivado": "derived",
"formas alternativas": "synonyms",
Expand All @@ -61,6 +70,7 @@
"hiperônimos": "hypernyms",
"hiperónimos": "hypernyms",
"termos derivados": "derived",
"termos derivadoss": "derived",
"grafia antiga": "synonyms",
"diminutivo": "synonyms",
"diminutivos": "synonyms",
Expand All @@ -70,11 +80,25 @@
"entradas relacionadas": "related",
"hipônimos": "hyponyms",
"hiponímias": "hyponyms",
"hipónimos": "hyponyms",
"ortografias obsoletas": "synonyms",
"superlativo": "synonyms",
"outros verbetes": "related",
"cardinal equivalente": "synonyms",
"cardinais equivalentes": "synonyms",
"aumentativo": "synonyms",
"advérbios derivados": "derived",
"derivações": "derived",
"homófonos": "homophones",
"homófono": "homophones",
"homónimos/homônimos": "homonyms",
"homônimos": "homonyms",
"parônimos": "paronyms",
"caracteres derivados": "derived",
"caracteres relacionados": "related",
"palavras com o kanji": "related",
"compostos": "derived",
"vermos derivados": "derived",
}

LINKAGE_TAGS = {
Expand All @@ -84,4 +108,5 @@
"ortografias obsoletas": ["obsolete"],
"superlativo": ["superlative"],
"aumentativo": ["augmentative"],
"advérbios derivados": ["adverb"],
}
18 changes: 18 additions & 0 deletions tests/test_pt_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,21 @@ def test_escopo(self):
}
],
)

def test_nested_list(self):
    """A nested ``##`` gloss yields a sense that also carries its parent
    gloss."""
    self.wxr.wtp.add_page("Predefinição:-en-", 10, "Inglês")
    wikitext = """={{-en-}}=
==Adjetivo==
# [[médio]]
## [[relativo à]] [[média]];''"""
    expected_senses = [
        {"glosses": ["médio"]},
        {"glosses": ["médio", "relativo à média;"]},
    ]
    page_data = parse_page(self.wxr, "average", wikitext)
    self.assertEqual(page_data[0]["senses"], expected_senses)
Loading

0 comments on commit 41e285e

Please sign in to comment.