Skip to content

Commit

Permalink
Merge pull request #956 from xxyzz/pt
Browse files Browse the repository at this point in the history
[pt] improve pos and linkage section code
  • Loading branch information
xxyzz authored Dec 25, 2024
2 parents 41e285e + f6c106f commit 27ac2c0
Show file tree
Hide file tree
Showing 6 changed files with 259 additions and 25 deletions.
32 changes: 17 additions & 15 deletions src/wiktextract/extractor/pt/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,20 +54,22 @@ def extract_flex_template(
elif cell_node.attrs.get("style") == "background:#f4f4f4;":
row_header = cell_text
col_header_index += col_span
elif cell_text in ["–", wxr.wtp.title]:
col_cell_index += col_span
continue
else:
form = Form(form=cell_text)
if row_header != "":
form.raw_tags.append(row_header)
for col_header in col_headers:
if (
col_cell_index >= col_header.col_index
and col_cell_index
< col_header.col_index + col_header.colspan
):
form.raw_tags.append(col_header.text)
translate_raw_tags(form)
word_entry.forms.append(form)
for link_node in cell_node.find_child(NodeKind.LINK):
form_str = clean_node(wxr, None, link_node)
if form_str in ["", "–", "-", wxr.wtp.title]:
continue
form_data = Form(form=form_str)
if row_header != "":
form_data.raw_tags.append(row_header)
for col_header in col_headers:
if (
col_cell_index >= col_header.col_index
and col_cell_index
< col_header.col_index + col_header.colspan
):
form_data.raw_tags.append(col_header.text)
translate_raw_tags(form_data)
word_entry.forms.append(form_data)

col_cell_index += col_span
39 changes: 30 additions & 9 deletions src/wiktextract/extractor/pt/pos.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from wikitextprocessor import (
HTMLNode,
LevelNode,
Expand Down Expand Up @@ -74,7 +76,7 @@ def extract_gloss_list_item(
word_entry.senses.append(sense)

for child_list in list_item.find_child(NodeKind.LIST):
if child_list.sarg.startswith("#") and child_list.sarg.endswith("#"):
if child_list.sarg.endswith("#"):
for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
extract_gloss_list_item(
wxr, word_entry, child_list_item, sense.glosses
Expand All @@ -87,13 +89,10 @@ def extract_escopo_template(
t_node: TemplateNode,
) -> None:
# https://pt.wiktionary.org/wiki/Predefinição:escopo
for arg in range(2, 9):
if arg not in t_node.template_parameters:
break
raw_tag = clean_node(wxr, None, t_node.template_parameters[arg])
if raw_tag != "":
sense.raw_tags.append(raw_tag)
clean_node(wxr, sense, t_node)
expanded_str = clean_node(wxr, sense, t_node).strip("()")
for raw_tag in re.split(r", | e ", expanded_str):
if raw_tag.strip() != "":
sense.raw_tags.append(raw_tag.strip())


def extract_escopo2_template(
Expand All @@ -118,7 +117,8 @@ def extract_example_list_item(
) -> None:
example = Example()
ref_nodes = []
for node in list_item.children:

for index, node in enumerate(list_item.children):
if (
isinstance(node, WikiNode)
and node.kind == NodeKind.ITALIC
Expand Down Expand Up @@ -147,6 +147,27 @@ def extract_example_list_item(
example.text = clean_node(
wxr, sense, node.template_parameters.get(1, "")
)
elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
bold_str = clean_node(wxr, None, node)
if re.fullmatch(r"\d+", bold_str) is not None:
list_item_str = clean_node(
wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
)
if list_item_str.endswith(":"):
ref_nodes.clear()
example.ref = list_item_str
for child_list in list_item.find_child(NodeKind.LIST):
for child_list_item in child_list.find_child(
NodeKind.LIST_ITEM
):
example.text = clean_node(
wxr, None, child_list_item.children
)
break
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
ref_nodes.clear()
for child_list_item in node.find_child(NodeKind.LIST_ITEM):
ref_nodes.append(child_list_item.children)
else:
ref_nodes.append(node)

Expand Down
69 changes: 68 additions & 1 deletion src/wiktextract/extractor/pt/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,72 @@
"Diminutivo": "diminutive",
}

TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS}
# https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo
# Portuguese label text -> English tag. Presumably these are labels produced
# by the "escopo" template documented at the URL above (verify against the
# template source). The dict is unpacked into TAGS alongside HEAD_LINE_TAGS
# and TABLE_TAGS.
GLOSS_TAGS = {
"Grafia portuguesa": "Portugal",
"Grafia brasileira": "Brazil",
"histórico": "historical",
}

TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS, **GLOSS_TAGS}

# https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo
# Portuguese topic label -> English topic name. translate_raw_tags() appends
# TOPICS[raw_tag] to data.topics when the raw tag is a key of this dict and
# the data object has a "topics" attribute. Keys are matched exactly, so the
# capitalisation of each key matters (e.g. "Meteorologia", "Ginecologia").
# Entries that are commented out are not translated; presumably they are
# still waiting for a suitable English topic name (confirm before enabling).
TOPICS = {
"botânica": "botany",
"ciência da computação": "computing",
# "ciência dos materiais": "",
"engenharia": "engineering",
# "pedagogia": "pedagogy",
# "cronônimo": "chrononym",
"basquetebol": "basketball",
"beisebol": "baseball",
"críquete": "cricket",
"desporto": "sports",
"esporte": "sports",
"ténis": "tennis",
"tênis": "tennis",
"esgrima": "fencing",
"geografia": "geography",
# "toponímia": "",
# "territory": "",
"zoologia": "zoology",
"ornitologia": "ornithology",
# "artrópodes": "",
"entomologia": "entomology",
"ictiologia": "ichthyology",
"veterinária": "veterinary",
# "antropónimo": "",
"alimentação": "food",
"arte": "arts",
"aeronáutica": "aeronautics",
"aritmética": "arithmetic",
"Meteorologia": "meteorology",
"design": "design",
"patologia": "pathology",
"etnologia": "ethnology",
"farmacologia": "pharmacology",
"transporte": "transport",
"Ginecologia": "gynecology",
"linguística": "linguistics",
"indústria têxtil": "textiles",
"mídia": "media",
"ciência da informação": "information-science",
"ludologia": "ludology",
"náutica": "nautical",
"mitologia": "mythology",
"mineralogia": "mineralogy",
"mobiliário": "furniture",
"numismática": "numismatics",
# "Esoterismo": "",
"profissão": "profession",
# "parapsiquismo": "",
"vestuário": "clothing",
"direito": "law",
"química": "chemistry",
"videojogo": "video-games",
"vídeo game": "video-games",
"viticultura": "viticulture",
}


def translate_raw_tags(data: WordEntry) -> None:
Expand All @@ -125,6 +190,8 @@ def translate_raw_tags(data: WordEntry) -> None:
data.tags.append(tr_tag)
elif isinstance(tr_tag, list):
data.tags.extend(tr_tag)
elif raw_tag in TOPICS and hasattr(data, "topics"):
data.topics.append(TOPICS[raw_tag])
else:
raw_tags.append(raw_tag)
data.raw_tags = raw_tags
48 changes: 48 additions & 0 deletions tests/test_pt_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,51 @@ def test_double_italic_nodes(self):
],
},
)

def test_source_above_text_child_list(self):
    # Layout under test: the "#*" item holds the source line (bold year,
    # author, work, trailing colon) and the nested "#*:" item holds the
    # quoted example text.
    self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
    page_text = """={{-pt-}}=
==Substantivo==
# [[sentimento]]
#* '''1595''', [[w:Luís de Camões|Luís de Camões]], ''Rimas'':
#*: "'''''Amor''' é fogo que arde sem se ver<br>é ferida que dói, e não se sente,<br>é um contentamento descontente,<br>é dor que desatina sem doer.''\""""
    page_data = parse_page(self.wxr, "amor", page_text)
    expected_sense = {
        "glosses": ["sentimento"],
        "examples": [
            {
                "text": '"Amor é fogo que arde sem se ver\né ferida que dói, e não se sente,\né um contentamento descontente,\né dor que desatina sem doer."',
                "ref": "1595, Luís de Camões, Rimas:",
            }
        ],
    }
    first_sense = page_data[0]["senses"][0]
    self.assertEqual(first_sense, expected_sense)

def test_text_above_ref_child_list(self):
    # Layout under test: the "#:" item holds the example text and the
    # nested "#::" item beneath it holds the reference.
    self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
    page_text = """={{-pt-}}=
==Substantivo==
# grande quantidade de
#:"''Ó '''mar''' salgado, quanto do teu sal<br />São lágrimas de Portugal!<br />Por te cruzarmos, quantas mães choraram,<br />Quantos filhos em vão rezaram!<br />Quantas noivas ficaram por casar<br />Para que fosses nosso, ó '''mar'''!''"
#:: ''-Mensagem, de Fernando Pessoa''"""
    page_data = parse_page(self.wxr, "mar", page_text)
    expected_sense = {
        "glosses": ["grande quantidade de"],
        "examples": [
            {
                "text": "Ó mar salgado, quanto do teu sal\nSão lágrimas de Portugal!\nPor te cruzarmos, quantas mães choraram,\nQuantos filhos em vão rezaram!\nQuantas noivas ficaram por casar\nPara que fosses nosso, ó mar!",
                "ref": "-Mensagem, de Fernando Pessoa",
            }
        ],
    }
    first_sense = page_data[0]["senses"][0]
    self.assertEqual(first_sense, expected_sense)
65 changes: 65 additions & 0 deletions tests/test_pt_form.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,68 @@ def test_flex_pt_subst_completa(self):
{"form": "matilha", "tags": ["standard", "collective"]},
],
)

def test_slash_cell(self):
    # Table cells may hold several linked forms separated by " / "; each
    # link is expected to become its own form entry. The expected list
    # below contains neither the page title ("parvo") nor the "-" cell.
    template_text = """{|
|-
! style="background:#f4f4f4;" rowspan="2" |
! style="background:#ffffe0;" colspan="2" | [[masculino|Masculino]]
! style="background:#ffffe0;" colspan="2" | [[feminino|Feminino]]
! style="background:#ffffe0;" rowspan="2" | [[coletivo|Coletivo]]
|-
! style="background:#ffffe0;" | [[singular|Singular]]
! style="background:#ffffe0;" | [[plural|Plural]]
! style="background:#ffffe0;" | [[singular|Singular]]
! style="background:#ffffe0;" | [[plural|Plural]]
|-
! style="background:#f4f4f4;" | [[normal|Normal]]
| style="background:#ffffff; text-align:center;" | [[parvo]]
| style="background:#ffffff; text-align:center;" | [[parvos#Português|<span style="color:black">parvos</span>]]
| style="background:#ffffff; text-align:center;" | [[parva#Português|parva]] / [[párvoa#Português|<span style="color:black">párvoa</span>]]
| style="background:#ffffff; text-align:center;" | [[parvas#Português|<span style="color:black">parvas</span>]] / [[párvoas#Português|<span style="color:black">párvoas</span>]]
| style="background:#ffffff; text-align:center;" rowspan="3" | [[-#Português|-]]
|}"""
    page_text = """={{-pt-}}=
==Substantivo==
{{flex.pt.subst.completa
|alinhamento=left
|ms=parvo
|msa=parvalhão|msa2=parvoalho|msa3=parvoeirão
|msd=parvinho
|mp=parvos
|mpa=parvalhões|mpa2=parvoalhos|mpa3=parvoeirões
|mpd=parvinhos
|fs=parva|fs2=párvoa
|fsa=parvalhona|fsa2=parvoalha|fsa3=parvoeirona
|fsd=parvinha
|fp=parvas|fp2=párvoas
|fpa=parvalhonas|fpa2=parvoalhas|fpa3=pavoeironas
|fpd=parvinhas
|col=-
}}
# [[pessoa]]"""
    expected_forms = [
        {"form": "parvos", "tags": ["standard", "masculine", "plural"]},
        {
            "form": "parva",
            "tags": ["standard", "feminine", "singular"],
        },
        {
            "form": "párvoa",
            "tags": ["standard", "feminine", "singular"],
        },
        {"form": "parvas", "tags": ["standard", "feminine", "plural"]},
        {"form": "párvoas", "tags": ["standard", "feminine", "plural"]},
    ]

    self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
    self.wxr.wtp.add_page(
        "Predefinição:flex.pt.subst.completa", 10, template_text
    )
    page_data = parse_page(self.wxr, "parvo", page_text)
    self.assertEqual(page_data[0]["forms"], expected_forms)
31 changes: 31 additions & 0 deletions tests/test_pt_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,3 +221,34 @@ def test_phraseology_nested_list(self):
},
],
)

def test_expression_gloss_child_list(self):
    # An expression item ("* '''[[...]]''': ...") followed by a nested
    # "*#" list: the text after the colon is one sense, and each nested
    # item is a further sense that carries the parent text as its first
    # gloss.
    self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
    page_text = """={{-pt-}}=
==Substantivo==
# [[parte]]
===Expressões===
* '''[[testa de boi]]''': (Portugal, Douro)
*# indivíduo com a testa avantajada;"""
    page_data = parse_page(self.wxr, "testa", page_text)
    parent_sense = {"glosses": ["(Portugal, Douro)"]}
    child_sense = {
        "glosses": [
            "(Portugal, Douro)",
            "indivíduo com a testa avantajada;",
        ]
    }
    expected_expressions = [
        {
            "word": "testa de boi",
            "senses": [parent_sense, child_sense],
        }
    ]
    self.assertEqual(page_data[0]["expressions"], expected_expressions)

0 comments on commit 27ac2c0

Please sign in to comment.