Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[pt] translate tags and extract "conj.pt" template #957

Merged
merged 2 commits into from
Dec 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 142 additions & 1 deletion src/wiktextract/extractor/pt/inflection.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import re
from dataclasses import dataclass

from wikitextprocessor import NodeKind, TemplateNode
from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
Expand Down Expand Up @@ -73,3 +73,144 @@ def extract_flex_template(
word_entry.forms.append(form_data)

col_cell_index += col_span


def extract_conjugation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract verb forms from the conjugation templates of a section.

    Every template child of ``level_node`` whose name starts with
    "conj.pt" or "conj/pt" is passed to ``extract_conj_pt_template``;
    all other templates are ignored.
    """
    conj_template_prefixes = ("conj.pt", "conj/pt")
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if not template.template_name.startswith(conj_template_prefixes):
            continue
        extract_conj_pt_template(wxr, word_entry, template)


def extract_conj_pt_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Expand a conjugation template and extract forms from its tables.

    The template node is converted back to wikitext and re-parsed with all
    templates expanded. The first table found in the expanded tree goes to
    ``extract_conj_pt_template_first_table`` and the second one to
    ``extract_conj_pt_template_second_table``; further tables are ignored.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:conj.pt
    # https://pt.wiktionary.org/wiki/Predefinição:conj/pt
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    tables = expanded_root.find_child_recursively(NodeKind.TABLE)
    for table_position, table in enumerate(tables):
        if table_position == 0:
            extract_conj_pt_template_first_table(wxr, word_entry, table)
        elif table_position == 1:
            extract_conj_pt_template_second_table(wxr, word_entry, table)


def extract_conj_pt_template_first_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_node: WikiNode
) -> None:
    """Extract forms from the first table of an expanded conjugation template.

    Within each row, the text of the most recent header cell is attached as
    a raw tag to every following data cell. Data cells whose text is empty
    or equal to the page title produce no form.
    """
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        # The header does not carry over from one row to the next.
        current_header = ""
        for table_cell in table_row.find_child(cell_kinds):
            if table_cell.kind == NodeKind.TABLE_HEADER_CELL:
                current_header = clean_node(wxr, None, table_cell)
                continue
            if table_cell.kind != NodeKind.TABLE_CELL:
                continue
            cell_text = clean_node(wxr, None, table_cell)
            if cell_text in ("", wxr.wtp.title):
                continue
            form = Form(form=cell_text)
            if current_header != "":
                form.raw_tags.append(current_header)
            translate_raw_tags(form)
            word_entry.forms.append(form)


def extract_conj_pt_template_second_table(
    wxr: WiktextractContext, word_entry: WordEntry, table_node: WikiNode
) -> None:
    """Extract forms from the second table of an expanded conjugation template.

    Header cells with non-empty text are recorded as ``TableHeader`` objects:
    as column headers (advancing the column counter by their colspan) when
    ``row.contain_node(NodeKind.TABLE_CELL)`` is false, otherwise as row
    headers. A header whose rowspan is greater than 1 resets the row counter
    to 0 and discards the row headers collected so far.

    For a data cell, every link whose text is neither empty nor the page
    title yields a form; if no link yields a form, the text of the whole
    cell is used instead, under the same condition. Each form is added via
    ``add_conj_pt_form`` with the current column and row counters.
    """
    col_headers = []
    row_headers = []

    def add_form_text(text: str, col: int, row: int) -> bool:
        # Empty text and the page title itself are never recorded as forms.
        if text in ("", wxr.wtp.title):
            return False
        add_conj_pt_form(word_entry, text, col, row, col_headers, row_headers)
        return True

    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    row_index = 0
    for row in table_node.find_child(NodeKind.TABLE_ROW):
        col_index = 0
        for cell in row.find_child(cell_kinds):
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                # Span attributes that are not plain digits fall back to 1.
                colspan_str = cell.attrs.get("colspan", "1")
                colspan = (
                    int(colspan_str)
                    if re.fullmatch(r"\d+", colspan_str)
                    else 1
                )
                rowspan_str = cell.attrs.get("rowspan", "1")
                rowspan = (
                    int(rowspan_str)
                    if re.fullmatch(r"\d+", rowspan_str)
                    else 1
                )
                header_text = clean_node(wxr, None, cell)
                if header_text == "":
                    # Empty header cells do not advance the column counter.
                    continue
                if rowspan > 1:
                    row_index = 0
                    row_headers.clear()
                header = TableHeader(
                    header_text, col_index, colspan, row_index, rowspan
                )
                if row.contain_node(NodeKind.TABLE_CELL):
                    row_headers.append(header)
                else:
                    col_headers.append(header)
                    col_index += colspan
            elif cell.kind == NodeKind.TABLE_CELL:
                linked_form_added = False
                for link in cell.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link)
                    if add_form_text(link_text, col_index, row_index):
                        linked_form_added = True
                if not linked_form_added:
                    add_form_text(
                        clean_node(wxr, None, cell), col_index, row_index
                    )
                col_index += 1

        row_index += 1


def add_conj_pt_form(
    word_entry: WordEntry,
    form_str: str,
    col_index: int,
    row_index: int,
    col_headers: list[TableHeader],
    row_headers: list[TableHeader],
) -> None:
    """Append a form tagged with the headers that cover its table position.

    The text of every column header whose column span contains
    ``col_index`` is added as a raw tag, followed by the text of every row
    header whose row span contains ``row_index``. The raw tags are then
    translated and the form is appended to ``word_entry.forms``.
    """
    form = Form(form=form_str)
    form.raw_tags.extend(
        header.text
        for header in col_headers
        if header.col_index <= col_index < header.col_index + header.colspan
    )
    form.raw_tags.extend(
        header.text
        for header in row_headers
        if header.row_index <= row_index < header.row_index + header.rowspan
    )
    translate_raw_tags(form)
    word_entry.forms.append(form)
1 change: 1 addition & 0 deletions src/wiktextract/extractor/pt/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ class WordEntry(PortugueseBaseModel):
categories: list[str] = []
tags: list[str] = []
raw_tags: list[str] = []
topics: list[str] = []
translations: list[Translation] = []
expressions: list[Linkage] = []
antonyms: list[Linkage] = []
Expand Down
5 changes: 5 additions & 0 deletions src/wiktextract/extractor/pt/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_etymology_section
from .inflection import extract_conjugation_section
from .linkage import (
extract_expression_section,
extract_linkage_section,
Expand Down Expand Up @@ -69,6 +70,10 @@ def parse_section(
)
elif title_text.startswith("Nota"):
extract_note_section(wxr, page_data, level_node)
elif title_text == "Conjugação":
extract_conjugation_section(
wxr, page_data[-1] if len(page_data) else base_data, level_node
)
elif title_text.lower() not in [
"ver também",
"ligação externa",
Expand Down
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/pt/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .inflection import extract_flex_template
from .models import Example, Linkage, Sense, WordEntry
from .section_titles import POS_DATA
from .tags import translate_raw_tags


def extract_pos_section(
Expand Down Expand Up @@ -73,6 +74,7 @@ def extract_gloss_list_item(
gloss_str = clean_node(wxr, sense, gloss_nodes)
if len(gloss_str) > 0:
sense.glosses.append(gloss_str)
translate_raw_tags(sense)
word_entry.senses.append(sense)

for child_list in list_item.find_child(NodeKind.LIST):
Expand Down
102 changes: 98 additions & 4 deletions src/wiktextract/extractor/pt/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,21 +111,113 @@
"Normal": "standard",
"Aumentativo": "augmentative",
"Diminutivo": "diminutive",
# Predefinição:conj.pt
"Infinitivo impessoal": ["impersonal", "infinitive"],
"Gerúndio": "gerund",
"Particípio": "participle",
"primeira": "first-person",
"segunda": "second-person",
"terceira": "third-person",
"Modo\nIndicativo": "indicative",
"Presente": "present",
"Pretérito imperfeito": ["past", "continuative"],
"Pretérito perfeito": "past",
"Pretérito mais-que-perfeito": "pluperfect",
"Futuro do presente": ["future", "present"],
"Futuro do pretérito": ["future", "past"],
"Modo\nSubjuntivo\n(Conjuntivo)": ["subjunctive", "conjunctive"],
"Futuro": "future",
"Modo\nImperativo": "imperative",
"Afirmativo": "affirmative",
"Negativo": "negative",
"Infinitivo pessoal": ["personal", "infinitive"],
}

# Maps Portuguese gloss labels to English tags; a list value yields
# several tags for one label.
# https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo
GLOSS_TAGS = {
    "Grafia portuguesa": "Portugal",
    "Grafia brasileira": "Brazil",
    "histórico": "historical",
    "antigo": "archaic",
    "arcaico": "archaic",
    "em desuso": "obsolete",
    "obsoleto": "obsolete",
    "pouco comum": "uncommon",
    "raro": "rare",
    "obsceno": "vulgar",
    "coloquial": "colloquial",
    "familiar": "familiar",
    "informal": "informal",
    # "popular": "",
    "figurado": "figuratively",
    "depreciativo": "derogatory",
    "pejorativo": "pejorative",
    "poético": "poetic",
    "internetês": ["Internet", "slang"],
    "ironia": "ironic",
    # "alemanismo": "",
    # "italianismo": "Italianism",
    # "germanismo": "Germanism",
    # "francesismo": "",
    # "galicismo": "Gallicism",
    "anglicismo": "Anglicism",
    # "portuguesismo": "Portuguesism",
    # "estrangeirismo": "loanword",
    # The Portuguese label is "regionalismo"; the English-spelled key is
    # kept so that any input relying on it still translates.
    "regionalismo": "regional",
    "regionalism": "regional",
    "Angola": "Angola",
    "Brasil": "Brazil",
    # "Amazônia": "Amazonia",
    "Nordeste do Brasil": "Northeast-Brazil",
    "Norte do Brasil": "North-Brazil",
    "Centro-Oeste do Brasil": "Central-West-Brazil",
    "Sudeste do Brasil": "Southeast-Brazil",
    "Sul do Brasil": "Southern-Brazil",
    "Acre": "Acre",
    "Alagoas": "Alagoas",
    "Amapá": "Amapá",
    "Amazonas": "Amazonas",
    "Bahia": "Bahia",
    "dialeto caipira": "dialectal",
    "Ceará": "Ceará",
    # "Distrito Federal": "Federal District",
    "Espírito Santo": "Espírito Santo",
    # NOTE(review): unlike the other state names, the value drops the
    # diacritic ("Goias") - confirm whether this is intentional.
    "Goiás": "Goias",
    "Maranhão": "Maranhão",
    "Mato Grosso": "Mato Grosso",
    "Mato Grosso do Sul": "Mato Grosso do Sul",
    "Minas Gerais": "Minas Gerais",
    "Pará": "Pará",
    "Paraíba": "Paraíba",
    "Paraná": "Paraná",
    "Pernambuco": "Pernambuco",
    "Piauí": "Piauí",
    "Rio de Janeiro": "Rio de Janeiro",
    "Rio Grande do Norte": "Rio Grande do Norte",
    "Rio Grande do Sul": "Rio Grande do Sul",
    "Rondônia": "Rondônia",
    "Roraima": "Roraima",
    # "baralhete": "",
    # "canteiros": "",
    # "alvanéis": "",
    # "telheiros": "",
    # "músicos": "",
    # "cesteiros": "",
    "transitivo": "transitive",
    "intransitivo": "intransitive",
    "reflexivo": "reflexive",
    "pronominal": "pronominal",
    "plural": "plural",
}

# Combined lookup table used by translate_raw_tags. When the same key occurs
# in more than one source dict, the value from the later dict wins
# (GLOSS_TAGS over TABLE_TAGS over HEAD_LINE_TAGS).
TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS, **GLOSS_TAGS}

# https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo
TOPICS = {
"anatomia": "anatomy",
"arquitetura": "architecture",
"botânica": "botany",
"ciência da computação": "computing",
"comunicação": "communications",
# "ciência dos materiais": "",
"engenharia": "engineering",
# "pedagogia": "pedagogy",
Expand Down Expand Up @@ -184,14 +276,16 @@
def translate_raw_tags(data: WordEntry) -> None:
    """Move recognised raw tags of ``data`` into ``tags`` or ``topics``.

    Each raw tag is looked up in ``TAGS`` and, failing that, in ``TOPICS``.
    Both lookups try the exact spelling first and the lower-cased spelling
    second. A ``TAGS`` hit appends its string value, or extends with its
    list value, to ``data.tags``. A ``TOPICS`` hit is appended to
    ``data.topics``, but only when ``data`` has a ``topics`` attribute.
    Raw tags that match nothing stay in ``data.raw_tags``, in their
    original order.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        # Prefer the exact spelling; fall back to the lower-cased one.
        tag_key = raw_tag if raw_tag in TAGS else raw_tag.lower()
        if tag_key in TAGS:
            tr_tag = TAGS[tag_key]
            if isinstance(tr_tag, str):
                data.tags.append(tr_tag)
            elif isinstance(tr_tag, list):
                data.tags.extend(tr_tag)
            continue

        topic_key = raw_tag if raw_tag in TOPICS else raw_tag.lower()
        if topic_key in TOPICS and hasattr(data, "topics"):
            data.topics.append(TOPICS[topic_key])
        else:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/zh/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def translation_subpage(
wxr, None, template_node.template_parameters.get(2, wxr.wtp.title)
)
if "#" in page_title:
page_title = page_title[:page_title.index("#")]
page_title = page_title[: page_title.index("#")]

translation_subpage_title = page_title
if page_title == wxr.wtp.title:
Expand Down
Loading
Loading