Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[pt] extract "conj.en*" templates and two new sections #960

Merged
merged 3 commits into from
Dec 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions src/wiktextract/extractor/pt/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ def extract_conjugation_section(
for t_node in level_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name.startswith(("conj.pt", "conj/pt")):
extract_conj_pt_template(wxr, word_entry, t_node)
elif t_node.template_name.startswith("conj.en"):
extract_conj_en_template(wxr, word_entry, t_node)


def extract_conj_pt_template(
Expand Down Expand Up @@ -214,3 +216,53 @@ def add_conj_pt_form(
form.raw_tags.append(row_header.text)
translate_raw_tags(form)
word_entry.forms.append(form)


def extract_conj_en_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms from the table produced by a "conj.en*" template.

    The template node is converted back to wikitext and re-parsed with full
    expansion. Every table cell is scanned for bold nodes that are direct
    children of list items; each bold text that is non-empty and differs
    from the page title is appended to ``word_entry.forms``. If the cell
    holds a ``<sup>`` label, its text (with surrounding ":" and spaces
    stripped) is attached to the form as a raw tag and then translated.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:conj.en
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    for table_node in root.find_child(NodeKind.TABLE):
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                # When a cell has several <sup> tags, the last one wins.
                label = ""
                for sup_node in cell_node.find_html("sup"):
                    label = clean_node(wxr, None, sup_node.children)
                    label = label.strip(": ")
                for list_node in cell_node.find_child(NodeKind.LIST):
                    for item in list_node.find_child(NodeKind.LIST_ITEM):
                        for bold in item.find_child(NodeKind.BOLD):
                            text = clean_node(wxr, None, bold)
                            if text in ("", wxr.wtp.title):
                                # Skip blanks and the lemma itself.
                                continue
                            new_form = Form(form=text)
                            if label != "":
                                new_form.raw_tags.append(label)
                                translate_raw_tags(new_form)
                            word_entry.forms.append(new_form)


def extract_degree_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract forms from the list items of a "Graus" section.

    For each list item, the first bold child is taken as the label and the
    cleaned text of the nodes following it (with leading/trailing ":" and
    spaces removed) is split on commas. Every resulting piece that is
    non-empty and differs from the page title is appended to
    ``word_entry.forms``; a non-empty label is attached as a raw tag and
    translated.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for index, bold_node in list_item.find_child(NodeKind.BOLD, True):
                bold_str = clean_node(wxr, None, bold_node)
                # Everything after the bold label holds the form list.
                forms_str = clean_node(
                    wxr, None, list_item.children[index + 1 :]
                ).strip(": ")
                for form_str in forms_str.split(","):
                    form_str = form_str.strip()
                    if form_str not in ["", wxr.wtp.title]:
                        form = Form(form=form_str)
                        # Guard on the label, not the form: the form is
                        # already known to be non-empty here, and an empty
                        # label must not become an empty raw tag.
                        if bold_str != "":
                            form.raw_tags.append(bold_str)
                            translate_raw_tags(form)
                        word_entry.forms.append(form)
                # Only the first bold node of a list item is used.
                break
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/pt/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,5 @@ class WordEntry(PortugueseBaseModel):
sounds: list[Sound] = []
forms: list[Form] = []
notes: list[str] = []
cognates: list[Translation] = []
descendants: list[Translation] = []
15 changes: 11 additions & 4 deletions src/wiktextract/extractor/pt/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_etymology_section
from .inflection import extract_conjugation_section
from .inflection import extract_conjugation_section, extract_degree_section
from .linkage import (
extract_expression_section,
extract_linkage_section,
Expand Down Expand Up @@ -41,9 +41,12 @@ def parse_section(
title_text,
cats.get("categories", []),
)
elif title_text in ["Tradução", "Traduções", "Cognatos"]:
elif title_text in ["Tradução", "Traduções", "Cognatos", "Descendentes"]:
extract_translation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node,
title_text,
)
elif title_text == "Expressões":
extract_expression_section(
Expand All @@ -68,12 +71,16 @@ def parse_section(
extract_phraseology_section(
wxr, page_data[-1] if len(page_data) else base_data, level_node
)
elif title_text.startswith("Nota"):
elif title_text.startswith(("Nota", "Uso")):
extract_note_section(wxr, page_data, level_node)
elif title_text == "Conjugação":
extract_conjugation_section(
wxr, page_data[-1] if len(page_data) else base_data, level_node
)
elif title_text == "Graus":
extract_degree_section(
wxr, page_data[-1] if len(page_data) else base_data, level_node
)
elif title_text.lower() not in [
"ver também",
"ligação externa",
Expand Down
15 changes: 14 additions & 1 deletion src/wiktextract/extractor/pt/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@
"Afirmativo": "affirmative",
"Negativo": "negative",
"Infinitivo pessoal": ["personal", "infinitive"],
# Predefinição:conj.en
"Infinitivo": "infinitive",
"Passado simples": "past",
}

# https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo
Expand Down Expand Up @@ -209,7 +212,17 @@
"plural": "plural",
}

TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS, **GLOSS_TAGS}
# Raw labels that map to more than one tag. The keys match the bold labels
# used in list items of "Graus" sections.
# NOTE(review): presumably looked up by translate_raw_tags() via TAGS — confirm.
OTHER_TAGS = {
"comparativo de superioridade": ["comparative", "superior"],
"superlativo absoluto sintético": ["absolute", "superlative"],
"superlativo relativo de superioridade": [
"relative",
"superlative",
"superior",
],
}

TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS, **GLOSS_TAGS, **OTHER_TAGS}

# https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo
TOPICS = {
Expand Down
27 changes: 23 additions & 4 deletions src/wiktextract/extractor/pt/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,17 @@ def extract_translation_section(
wxr: WiktextractContext,
word_entry: WordEntry,
level_node: LevelNode,
title_text: str,
) -> None:
sense = ""
sense_index = 0
target_field = "translations"
match title_text:
case "Cognatos":
target_field = "cognates"
case "Descendentes":
target_field = "descendants"

for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
match node.kind:
case NodeKind.TEMPLATE:
Expand All @@ -22,7 +30,12 @@ def extract_translation_section(
case NodeKind.LIST:
for list_item in node.find_child(NodeKind.LIST_ITEM):
extract_translation_list_item(
wxr, word_entry, list_item, sense, sense_index
wxr,
word_entry,
list_item,
sense,
sense_index,
target_field,
)


Expand All @@ -48,6 +61,7 @@ def extract_translation_list_item(
list_item: WikiNode,
sense: str,
sense_index: int,
target_field: str,
) -> None:
translations = []
lang_name = "unknown"
Expand Down Expand Up @@ -101,10 +115,15 @@ def extract_translation_list_item(
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
for next_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_translation_list_item(
wxr, word_entry, next_list_item, sense, sense_index
wxr,
word_entry,
next_list_item,
sense,
sense_index,
target_field,
)

word_entry.translations.extend(translations)
getattr(word_entry, target_field).extend(translations)


def extract_trad_template(
Expand Down Expand Up @@ -239,4 +258,4 @@ def extract_translation_subpage(
page = wxr.wtp.get_page(page_title, 0)
if page is not None and page.body is not None:
root = wxr.wtp.parse(page.body)
extract_translation_section(wxr, word_entry, root)
extract_translation_section(wxr, word_entry, root, "Tradução")
54 changes: 54 additions & 0 deletions tests/test_pt_form.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,3 +219,57 @@ def test_conj_pt(self):
},
],
)

def test_conj_en(self):
    """Bold forms in a "conj.en.2" table get the tag of their <sup> label."""
    self.wxr.wtp.add_page("Predefinição:-en-", 10, "Inglês")
    template_body = """{|
|-
| <sup>Passado simples:</sup>
: '''[[red]]''' / '''[[redd]]'''
|}"""
    self.wxr.wtp.add_page("Predefinição:conj.en.2", 10, template_body)
    page_text = """={{-en-}}=
==Verbo==
# {{escopo|en|Arcaísmo}} [[governar]], [[proteger]]
===Conjugação===
{{conj.en.2|rede|redes|red|redd|red|redd|reding}}"""
    data = parse_page(self.wxr, "rede", page_text)
    expected_forms = [
        {"form": "red", "tags": ["past"]},
        {"form": "redd", "tags": ["past"]},
    ]
    self.assertEqual(data[0]["forms"], expected_forms)

def test_degree_section(self):
    """Each "Graus" list item yields forms tagged from its bold label."""
    self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
    page_text = """={{-pt-}}=
==Adjetivo==
# que
===Graus===
* '''comparativo de superioridade''': [[melhor]] do que
* '''superlativo absoluto sintético''': [[boníssimo]], [[ótimo]]
* '''superlativo relativo de superioridade''': melhor"""
    data = parse_page(self.wxr, "bom", page_text)
    comparative_tags = ["comparative", "superior"]
    absolute_tags = ["absolute", "superlative"]
    relative_tags = ["relative", "superlative", "superior"]
    expected_forms = [
        {"form": "melhor do que", "tags": comparative_tags},
        {"form": "boníssimo", "tags": absolute_tags},
        {"form": "ótimo", "tags": list(absolute_tags)},
        {"form": "melhor", "tags": relative_tags},
    ]
    self.assertEqual(data[0]["forms"], expected_forms)
Loading