Skip to content

Commit

Permalink
Merge pull request #960 from xxyzz/pt
Browse files Browse the repository at this point in the history
[pt] extract "conj.en*" templates and two new sections
  • Loading branch information
xxyzz authored Dec 27, 2024
2 parents 5ac3627 + 3479756 commit b169b2b
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 9 deletions.
52 changes: 52 additions & 0 deletions src/wiktextract/extractor/pt/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ def extract_conjugation_section(
for t_node in level_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name.startswith(("conj.pt", "conj/pt")):
extract_conj_pt_template(wxr, word_entry, t_node)
elif t_node.template_name.startswith("conj.en"):
extract_conj_en_template(wxr, word_entry, t_node)


def extract_conj_pt_template(
Expand Down Expand Up @@ -214,3 +216,53 @@ def add_conj_pt_form(
form.raw_tags.append(row_header.text)
translate_raw_tags(form)
word_entry.forms.append(form)


def extract_conj_en_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms from the table produced by a "conj.en*" template.

    The template is expanded and re-parsed.  In every table cell the text
    of the last ``<sup>`` element (with surrounding ``":"``/spaces removed)
    is used as the raw tag, and each bold node found in the cell's list
    items becomes a form, unless its text is empty or equals the page
    title.  Forms are appended to ``word_entry.forms``.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:conj.en
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(template_wikitext, expand_all=True)
    for table_node in root.find_child(NodeKind.TABLE):
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                # When a cell has several <sup> elements the last one wins.
                label = ""
                for sup_node in cell_node.find_html("sup"):
                    sup_text = clean_node(wxr, None, sup_node.children)
                    label = sup_text.strip(": ")
                for list_node in cell_node.find_child(NodeKind.LIST):
                    for item in list_node.find_child(NodeKind.LIST_ITEM):
                        for bold in item.find_child(NodeKind.BOLD):
                            text = clean_node(wxr, None, bold)
                            if text in ["", wxr.wtp.title]:
                                continue
                            new_form = Form(form=text)
                            if label != "":
                                new_form.raw_tags.append(label)
                                translate_raw_tags(new_form)
                            word_entry.forms.append(new_form)


def extract_degree_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract degree forms from a "Graus" section into ``word_entry.forms``.

    Each list item is read as ``'''label''': form1, form2``: the first bold
    node is the degree label and the text after it is a comma-separated
    list of forms.  Every form that is non-empty and differs from the page
    title is appended to ``word_entry.forms``.  A non-empty label is stored
    as a raw tag and passed through ``translate_raw_tags``.

    Args:
        wxr: Context used to clean nodes and to read the page title.
        word_entry: Entry whose ``forms`` list is extended in place.
        level_node: Section node whose child lists are scanned.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            # Passing True makes find_child yield (index, node) pairs; the
            # index locates the content that follows the bold label.
            for index, bold_node in list_item.find_child(NodeKind.BOLD, True):
                bold_str = clean_node(wxr, None, bold_node)
                forms_str = clean_node(
                    wxr, None, list_item.children[index + 1 :]
                ).strip(": ")
                for form_str in forms_str.split(","):
                    form_str = form_str.strip()
                    if form_str in ["", wxr.wtp.title]:
                        continue
                    form = Form(form=form_str)
                    # Guard on the label, not the form: form_str is already
                    # known to be non-empty here, while an empty label
                    # must not be recorded as a raw tag.
                    if bold_str != "":
                        form.raw_tags.append(bold_str)
                        translate_raw_tags(form)
                    word_entry.forms.append(form)
                # Only the first bold node of a list item is the label.
                break
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/pt/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,5 @@ class WordEntry(PortugueseBaseModel):
sounds: list[Sound] = []
forms: list[Form] = []
notes: list[str] = []
cognates: list[Translation] = []
descendants: list[Translation] = []
15 changes: 11 additions & 4 deletions src/wiktextract/extractor/pt/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_etymology_section
from .inflection import extract_conjugation_section
from .inflection import extract_conjugation_section, extract_degree_section
from .linkage import (
extract_expression_section,
extract_linkage_section,
Expand Down Expand Up @@ -41,9 +41,12 @@ def parse_section(
title_text,
cats.get("categories", []),
)
elif title_text in ["Tradução", "Traduções", "Cognatos"]:
elif title_text in ["Tradução", "Traduções", "Cognatos", "Descendentes"]:
extract_translation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node,
title_text,
)
elif title_text == "Expressões":
extract_expression_section(
Expand All @@ -68,12 +71,16 @@ def parse_section(
extract_phraseology_section(
wxr, page_data[-1] if len(page_data) else base_data, level_node
)
elif title_text.startswith("Nota"):
elif title_text.startswith(("Nota", "Uso")):
extract_note_section(wxr, page_data, level_node)
elif title_text == "Conjugação":
extract_conjugation_section(
wxr, page_data[-1] if len(page_data) else base_data, level_node
)
elif title_text == "Graus":
extract_degree_section(
wxr, page_data[-1] if len(page_data) else base_data, level_node
)
elif title_text.lower() not in [
"ver também",
"ligação externa",
Expand Down
15 changes: 14 additions & 1 deletion src/wiktextract/extractor/pt/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@
"Afirmativo": "affirmative",
"Negativo": "negative",
"Infinitivo pessoal": ["personal", "infinitive"],
# Predefinição:conj.en
"Infinitivo": "infinitive",
"Passado simples": "past",
}

# https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo
Expand Down Expand Up @@ -209,7 +212,17 @@
"plural": "plural",
}

TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS, **GLOSS_TAGS}
# Degree labels (the bold text of list items in a "Graus" section) mapped
# to the tags they translate to.
OTHER_TAGS = {
"comparativo de superioridade": ["comparative", "superior"],
"superlativo absoluto sintético": ["absolute", "superlative"],
"superlativo relativo de superioridade": [
"relative",
"superlative",
"superior",
],
}

TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS, **GLOSS_TAGS, **OTHER_TAGS}

# https://pt.wiktionary.org/wiki/Predefinição:escopo/núcleo
TOPICS = {
Expand Down
27 changes: 23 additions & 4 deletions src/wiktextract/extractor/pt/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,17 @@ def extract_translation_section(
wxr: WiktextractContext,
word_entry: WordEntry,
level_node: LevelNode,
title_text: str,
) -> None:
sense = ""
sense_index = 0
target_field = "translations"
match title_text:
case "Cognatos":
target_field = "cognates"
case "Descendentes":
target_field = "descendants"

for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
match node.kind:
case NodeKind.TEMPLATE:
Expand All @@ -22,7 +30,12 @@ def extract_translation_section(
case NodeKind.LIST:
for list_item in node.find_child(NodeKind.LIST_ITEM):
extract_translation_list_item(
wxr, word_entry, list_item, sense, sense_index
wxr,
word_entry,
list_item,
sense,
sense_index,
target_field,
)


Expand All @@ -48,6 +61,7 @@ def extract_translation_list_item(
list_item: WikiNode,
sense: str,
sense_index: int,
target_field: str,
) -> None:
translations = []
lang_name = "unknown"
Expand Down Expand Up @@ -101,10 +115,15 @@ def extract_translation_list_item(
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
for next_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_translation_list_item(
wxr, word_entry, next_list_item, sense, sense_index
wxr,
word_entry,
next_list_item,
sense,
sense_index,
target_field,
)

word_entry.translations.extend(translations)
getattr(word_entry, target_field).extend(translations)


def extract_trad_template(
Expand Down Expand Up @@ -239,4 +258,4 @@ def extract_translation_subpage(
page = wxr.wtp.get_page(page_title, 0)
if page is not None and page.body is not None:
root = wxr.wtp.parse(page.body)
extract_translation_section(wxr, word_entry, root)
extract_translation_section(wxr, word_entry, root, "Tradução")
54 changes: 54 additions & 0 deletions tests/test_pt_form.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,3 +219,57 @@ def test_conj_pt(self):
},
],
)

def test_conj_en(self):
    """Forms in a "conj.en.2" table cell get the cell's <sup> label tag."""
    self.wxr.wtp.add_page("Predefinição:-en-", 10, "Inglês")
    conj_template_body = """{|
|-
| <sup>Passado simples:</sup>
: '''[[red]]''' / '''[[redd]]'''
|}"""
    self.wxr.wtp.add_page("Predefinição:conj.en.2", 10, conj_template_body)
    page_wikitext = """={{-en-}}=
==Verbo==
# {{escopo|en|Arcaísmo}} [[governar]], [[proteger]]
===Conjugação===
{{conj.en.2|rede|redes|red|redd|red|redd|reding}}"""
    data = parse_page(self.wxr, "rede", page_wikitext)
    # Both bold forms share the "Passado simples" label of their cell.
    expected_forms = [
        {"form": "red", "tags": ["past"]},
        {"form": "redd", "tags": ["past"]},
    ]
    self.assertEqual(data[0]["forms"], expected_forms)

def test_degree_section(self):
    """Each "Graus" list item yields forms tagged with its bold label."""
    self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
    page_wikitext = """={{-pt-}}=
==Adjetivo==
# que
===Graus===
* '''comparativo de superioridade''': [[melhor]] do que
* '''superlativo absoluto sintético''': [[boníssimo]], [[ótimo]]
* '''superlativo relativo de superioridade''': melhor"""
    data = parse_page(self.wxr, "bom", page_wikitext)
    comparative_tags = ["comparative", "superior"]
    absolute_tags = ["absolute", "superlative"]
    relative_tags = ["relative", "superlative", "superior"]
    # Comma-separated forms of one list item share that item's tags.
    expected_forms = [
        {"form": "melhor do que", "tags": comparative_tags},
        {"form": "boníssimo", "tags": absolute_tags},
        {"form": "ótimo", "tags": absolute_tags},
        {"form": "melhor", "tags": relative_tags},
    ]
    self.assertEqual(data[0]["forms"], expected_forms)

0 comments on commit b169b2b

Please sign in to comment.