Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[pt] improve pos and linkage section code #956

Merged
merged 5 commits into from
Dec 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 17 additions & 15 deletions src/wiktextract/extractor/pt/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,20 +54,22 @@ def extract_flex_template(
elif cell_node.attrs.get("style") == "background:#f4f4f4;":
row_header = cell_text
col_header_index += col_span
elif cell_text in ["–", wxr.wtp.title]:
col_cell_index += col_span
continue
else:
form = Form(form=cell_text)
if row_header != "":
form.raw_tags.append(row_header)
for col_header in col_headers:
if (
col_cell_index >= col_header.col_index
and col_cell_index
< col_header.col_index + col_header.colspan
):
form.raw_tags.append(col_header.text)
translate_raw_tags(form)
word_entry.forms.append(form)
for link_node in cell_node.find_child(NodeKind.LINK):
form_str = clean_node(wxr, None, link_node)
if form_str in ["", "–", "-", wxr.wtp.title]:
continue
form_data = Form(form=form_str)
if row_header != "":
form_data.raw_tags.append(row_header)
for col_header in col_headers:
if (
col_cell_index >= col_header.col_index
and col_cell_index
< col_header.col_index + col_header.colspan
):
form_data.raw_tags.append(col_header.text)
translate_raw_tags(form_data)
word_entry.forms.append(form_data)

col_cell_index += col_span
39 changes: 30 additions & 9 deletions src/wiktextract/extractor/pt/pos.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from wikitextprocessor import (
HTMLNode,
LevelNode,
Expand Down Expand Up @@ -74,7 +76,7 @@ def extract_gloss_list_item(
word_entry.senses.append(sense)

for child_list in list_item.find_child(NodeKind.LIST):
if child_list.sarg.startswith("#") and child_list.sarg.endswith("#"):
if child_list.sarg.endswith("#"):
for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
extract_gloss_list_item(
wxr, word_entry, child_list_item, sense.glosses
Expand All @@ -87,13 +89,10 @@ def extract_escopo_template(
t_node: TemplateNode,
) -> None:
# https://pt.wiktionary.org/wiki/Predefinição:escopo
for arg in range(2, 9):
if arg not in t_node.template_parameters:
break
raw_tag = clean_node(wxr, None, t_node.template_parameters[arg])
if raw_tag != "":
sense.raw_tags.append(raw_tag)
clean_node(wxr, sense, t_node)
expanded_str = clean_node(wxr, sense, t_node).strip("()")
for raw_tag in re.split(r", | e ", expanded_str):
if raw_tag.strip() != "":
sense.raw_tags.append(raw_tag.strip())


def extract_escopo2_template(
Expand All @@ -118,7 +117,8 @@ def extract_example_list_item(
) -> None:
example = Example()
ref_nodes = []
for node in list_item.children:

for index, node in enumerate(list_item.children):
if (
isinstance(node, WikiNode)
and node.kind == NodeKind.ITALIC
Expand Down Expand Up @@ -147,6 +147,27 @@ def extract_example_list_item(
example.text = clean_node(
wxr, sense, node.template_parameters.get(1, "")
)
elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
bold_str = clean_node(wxr, None, node)
if re.fullmatch(r"\d+", bold_str) is not None:
list_item_str = clean_node(
wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
)
if list_item_str.endswith(":"):
ref_nodes.clear()
example.ref = list_item_str
for child_list in list_item.find_child(NodeKind.LIST):
for child_list_item in child_list.find_child(
NodeKind.LIST_ITEM
):
example.text = clean_node(
wxr, None, child_list_item.children
)
break
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
ref_nodes.clear()
for child_list_item in node.find_child(NodeKind.LIST_ITEM):
ref_nodes.append(child_list_item.children)
else:
ref_nodes.append(node)

Expand Down
69 changes: 68 additions & 1 deletion src/wiktextract/extractor/pt/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,72 @@
"Diminutivo": "diminutive",
}

TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS}
# https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo
GLOSS_TAGS = {
    # Portuguese label text -> tag string used in its place.
    "Grafia portuguesa": "Portugal",
    "Grafia brasileira": "Brazil",
    "histórico": "historical",
}

TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS, **GLOSS_TAGS}

# https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo
TOPICS = {
    # Portuguese label text -> topic string.
    # Entries left as comments below are currently disabled and are
    # therefore not part of the mapping.
    "botânica": "botany",
    "ciência da computação": "computing",
    # "ciência dos materiais": "",
    "engenharia": "engineering",
    # "pedagogia": "pedagogy",
    # "cronônimo": "chrononym",
    "basquetebol": "basketball",
    "beisebol": "baseball",
    "críquete": "cricket",
    "desporto": "sports",
    "esporte": "sports",
    "ténis": "tennis",
    "tênis": "tennis",
    "esgrima": "fencing",
    "geografia": "geography",
    # "toponímia": "",
    # "territory": "",
    "zoologia": "zoology",
    "ornitologia": "ornithology",
    # "artrópodes": "",
    "entomologia": "entomology",
    "ictiologia": "ichthyology",
    "veterinária": "veterinary",
    # "antropónimo": "",
    "alimentação": "food",
    "arte": "arts",
    "aeronáutica": "aeronautics",
    "aritmética": "arithmetic",
    "Meteorologia": "meteorology",
    "design": "design",
    "patologia": "pathology",
    "etnologia": "ethnology",
    "farmacologia": "pharmacology",
    "transporte": "transport",
    "Ginecologia": "gynecology",
    "linguística": "linguistics",
    "indústria têxtil": "textiles",
    "mídia": "media",
    "ciência da informação": "information-science",
    "ludologia": "ludology",
    "náutica": "nautical",
    "mitologia": "mythology",
    "mineralogia": "mineralogy",
    "mobiliário": "furniture",
    "numismática": "numismatics",
    # "Esoterismo": "",
    "profissão": "profession",
    # "parapsiquismo": "",
    "vestuário": "clothing",
    "direito": "law",
    "química": "chemistry",
    "videojogo": "video-games",
    "vídeo game": "video-games",
    "viticultura": "viticulture",
}


def translate_raw_tags(data: WordEntry) -> None:
Expand All @@ -125,6 +190,8 @@ def translate_raw_tags(data: WordEntry) -> None:
data.tags.append(tr_tag)
elif isinstance(tr_tag, list):
data.tags.extend(tr_tag)
elif raw_tag in TOPICS and hasattr(data, "topics"):
data.topics.append(TOPICS[raw_tag])
else:
raw_tags.append(raw_tag)
data.raw_tags = raw_tags
48 changes: 48 additions & 0 deletions tests/test_pt_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,51 @@ def test_double_italic_nodes(self):
],
},
)

def test_source_above_text_child_list(self):
    """Example whose reference line (ending in ':') precedes the quoted
    text placed in a child list item."""
    self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
    wikitext = """={{-pt-}}=
==Substantivo==
# [[sentimento]]
#* '''1595''', [[w:Luís de Camões|Luís de Camões]], ''Rimas'':
#*: "'''''Amor''' é fogo que arde sem se ver<br>é ferida que dói, e não se sente,<br>é um contentamento descontente,<br>é dor que desatina sem doer.''\""""
    page_data = parse_page(self.wxr, "amor", wikitext)
    expected_sense = {
        "glosses": ["sentimento"],
        "examples": [
            {
                "text": '"Amor é fogo que arde sem se ver\né ferida que dói, e não se sente,\né um contentamento descontente,\né dor que desatina sem doer."',
                "ref": "1595, Luís de Camões, Rimas:",
            }
        ],
    }
    self.assertEqual(page_data[0]["senses"][0], expected_sense)

def test_text_above_ref_child_list(self):
    """Example whose quoted text comes first and whose reference is in a
    nested child list item below it."""
    self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
    wikitext = """={{-pt-}}=
==Substantivo==
# grande quantidade de
#:"''Ó '''mar''' salgado, quanto do teu sal<br />São lágrimas de Portugal!<br />Por te cruzarmos, quantas mães choraram,<br />Quantos filhos em vão rezaram!<br />Quantas noivas ficaram por casar<br />Para que fosses nosso, ó '''mar'''!''"
#:: ''-Mensagem, de Fernando Pessoa''"""
    page_data = parse_page(self.wxr, "mar", wikitext)
    expected_sense = {
        "glosses": ["grande quantidade de"],
        "examples": [
            {
                "text": "Ó mar salgado, quanto do teu sal\nSão lágrimas de Portugal!\nPor te cruzarmos, quantas mães choraram,\nQuantos filhos em vão rezaram!\nQuantas noivas ficaram por casar\nPara que fosses nosso, ó mar!",
                "ref": "-Mensagem, de Fernando Pessoa",
            }
        ],
    }
    self.assertEqual(page_data[0]["senses"][0], expected_sense)
65 changes: 65 additions & 0 deletions tests/test_pt_form.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,68 @@ def test_flex_pt_subst_completa(self):
{"form": "matilha", "tags": ["standard", "collective"]},
],
)

def test_slash_cell(self):
    """Table cells holding two links separated by ' / ' yield one form per
    link; the page title itself and the '-' cell yield no form."""
    flex_template_body = """{|
|-
! style="background:#f4f4f4;" rowspan="2" |
! style="background:#ffffe0;" colspan="2" | [[masculino|Masculino]]
! style="background:#ffffe0;" colspan="2" | [[feminino|Feminino]]
! style="background:#ffffe0;" rowspan="2" | [[coletivo|Coletivo]]
|-
! style="background:#ffffe0;" | [[singular|Singular]]
! style="background:#ffffe0;" | [[plural|Plural]]
! style="background:#ffffe0;" | [[singular|Singular]]
! style="background:#ffffe0;" | [[plural|Plural]]
|-
! style="background:#f4f4f4;" | [[normal|Normal]]
| style="background:#ffffff; text-align:center;" | [[parvo]]
| style="background:#ffffff; text-align:center;" | [[parvos#Português|<span style="color:black">parvos</span>]]
| style="background:#ffffff; text-align:center;" | [[parva#Português|parva]] / [[párvoa#Português|<span style="color:black">párvoa</span>]]
| style="background:#ffffff; text-align:center;" | [[parvas#Português|<span style="color:black">parvas</span>]] / [[párvoas#Português|<span style="color:black">párvoas</span>]]
| style="background:#ffffff; text-align:center;" rowspan="3" | [[-#Português|-]]
|}"""
    page_wikitext = """={{-pt-}}=
==Substantivo==
{{flex.pt.subst.completa
|alinhamento=left
|ms=parvo
|msa=parvalhão|msa2=parvoalho|msa3=parvoeirão
|msd=parvinho
|mp=parvos
|mpa=parvalhões|mpa2=parvoalhos|mpa3=parvoeirões
|mpd=parvinhos
|fs=parva|fs2=párvoa
|fsa=parvalhona|fsa2=parvoalha|fsa3=parvoeirona
|fsd=parvinha
|fp=parvas|fp2=párvoas
|fpa=parvalhonas|fpa2=parvoalhas|fpa3=pavoeironas
|fpd=parvinhas
|col=-
}}
# [[pessoa]]"""

    self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
    self.wxr.wtp.add_page(
        "Predefinição:flex.pt.subst.completa", 10, flex_template_body
    )
    page_data = parse_page(self.wxr, "parvo", page_wikitext)

    masculine_plural = ["standard", "masculine", "plural"]
    feminine_singular = ["standard", "feminine", "singular"]
    feminine_plural = ["standard", "feminine", "plural"]
    expected_forms = [
        {"form": "parvos", "tags": masculine_plural},
        {"form": "parva", "tags": feminine_singular},
        {"form": "párvoa", "tags": list(feminine_singular)},
        {"form": "parvas", "tags": feminine_plural},
        {"form": "párvoas", "tags": list(feminine_plural)},
    ]
    self.assertEqual(page_data[0]["forms"], expected_forms)
31 changes: 31 additions & 0 deletions tests/test_pt_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,3 +221,34 @@ def test_phraseology_nested_list(self):
},
],
)

def test_expression_gloss_child_list(self):
    """Expression entry whose gloss continues in a numbered child list
    under the bolded expression link."""
    self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
    wikitext = """={{-pt-}}=
==Substantivo==
# [[parte]]

===Expressões===
* '''[[testa de boi]]''': (Portugal, Douro)
*# indivíduo com a testa avantajada;"""
    page_data = parse_page(self.wxr, "testa", wikitext)
    parent_sense = {"glosses": ["(Portugal, Douro)"]}
    child_sense = {
        "glosses": [
            "(Portugal, Douro)",
            "indivíduo com a testa avantajada;",
        ]
    }
    expected_expressions = [
        {
            "word": "testa de boi",
            "senses": [parent_sense, child_sense],
        }
    ]
    self.assertEqual(page_data[0]["expressions"], expected_expressions)
Loading