Skip to content

Commit

Permalink
Merge pull request #592 from xxyzz/es
Browse files Browse the repository at this point in the history
Fix some check JSON errors and pydantic errors in es edition
  • Loading branch information
xxyzz authored Apr 17, 2024
2 parents 6e7adb9 + 72bf857 commit 95353f4
Show file tree
Hide file tree
Showing 8 changed files with 171 additions and 47 deletions.
9 changes: 3 additions & 6 deletions src/wiktextract/extractor/es/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ def extract_gloss(
definition.append(node)

gloss = clean_node(wxr, gloss_data, definition)
gloss_data.glosses.append(gloss)
if len(gloss) > 0:
gloss_data.glosses.append(gloss)

gloss_note = clean_node(wxr, gloss_data, list_item.children)
match = re.match(r"^(\d+)", gloss_note)
Expand All @@ -58,11 +59,7 @@ def extract_gloss(
if len(other) > 0:
for node in other:
if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
process_sense_data_list(
wxr,
page_data[-1].senses[-1],
node,
)
process_sense_data_list(wxr, page_data[-1], node)
else:
wxr.wtp.debug(
f"Found nodes that are not part of definition: {node}",
Expand Down
10 changes: 8 additions & 2 deletions src/wiktextract/extractor/es/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,10 @@ def process_linkage_template(
for key, value_raw in template_node.template_parameters.items():
value = clean_node(wxr, None, value_raw)
if isinstance(key, int):
getattr(word_entry, linkage_type).append(Linkage(word=value))
linkage_data = Linkage(word=value)
if len(word_entry.senses) > 0:
linkage_data.senseid = word_entry.senses[-1].get("senseid")
getattr(word_entry, linkage_type).append(linkage_data)
elif isinstance(key, str):
if key.startswith("nota"):
idx = int(key[4:]) - 1 if len(key) > 4 else 0
Expand Down Expand Up @@ -79,4 +82,7 @@ def process_linkage_list_children(
if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
word = clean_node(wxr, None, node)
if len(word) > 0:
getattr(word_entry, linkage_type).append(Linkage(word=word))
linkage_data = Linkage(word=word)
if len(word_entry.senses) > 0:
linkage_data.senseid = word_entry.senses[-1].get("senseid")
getattr(word_entry, linkage_type).append(linkage_data)
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/es/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class Linkage(BaseModelWrap):
alternative_spelling: str = Field(
default="", description="Alternative spelling of the word"
)
senseid: str = ""


class Translation(BaseModelWrap):
Expand Down Expand Up @@ -150,5 +151,6 @@ class WordEntry(BaseModelWrap):
meronyms: list[Linkage] = []
related: list[Linkage] = []
synonyms: list[Linkage] = []
proverbs: list[Linkage] = []
tags: list[str] = []
extra_sounds: dict[str, str] = {}
20 changes: 9 additions & 11 deletions src/wiktextract/extractor/es/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,13 @@ def parse_section(
process_etymology_block(wxr, base_data, level_node)
for nested_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, nested_level_node)
elif section_title in TRANSLATIONS_TITLES:
if wxr.config.capture_translations:
for template_node in level_node.find_child_recursively(
NodeKind.TEMPLATE
):
if template_node.template_name == "t+" and len(page_data) > 0:
extract_translation(wxr, page_data[-1], template_node)
elif (
section_title in TRANSLATIONS_TITLES and wxr.config.capture_translations
):
if len(page_data) == 0:
page_data.append(base_data.model_copy(deep=True))
for template_node in level_node.find_child(NodeKind.TEMPLATE):
extract_translation(wxr, page_data[-1], template_node)

elif section_title in LINKAGE_TITLES:
if len(page_data) == 0:
Expand Down Expand Up @@ -262,9 +262,7 @@ def process_group(
if template_name == "clear":
return
elif template_name.removesuffix("s") in LINKAGE_TITLES:
process_linkage_template(
wxr, page_data[-1].senses[-1], group[0]
)
process_linkage_template(wxr, page_data[-1], group[0])
elif template_name == "ejemplo":
extract_example(wxr, page_data[-1].senses[-1], group)
elif template_name == "uso":
Expand All @@ -282,7 +280,7 @@ def process_group(
list_node = group[0]
# List groups seem to not be followed by string nodes.
# We, therefore, only process the list_node.
process_sense_data_list(wxr, page_data[-1].senses[-1], list_node)
process_sense_data_list(wxr, page_data[-1], list_node)

elif (
isinstance(child, WikiNode)
Expand Down
17 changes: 6 additions & 11 deletions src/wiktextract/extractor/es/sense_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,13 @@

from .example import process_example_list
from .linkage import process_linkage_list_children
from .models import Sense
from .models import WordEntry
from .section_titles import LINKAGE_TITLES


def process_sense_data_list(
wxr: WiktextractContext,
sense_data: Sense,
list_node: WikiNode,
):
wxr: WiktextractContext, word_entry: list[WordEntry], list_node: WikiNode
) -> None:
list_marker = list_node.sarg

if list_marker == ":;":
Expand All @@ -31,13 +29,10 @@ def process_sense_data_list(
)

if list_type == "ejemplo":
process_example_list(wxr, sense_data, list_item)
process_example_list(wxr, word_entry.senses[-1], list_item)
elif list_type in LINKAGE_TITLES:
process_linkage_list_children(
wxr,
sense_data,
children[1:],
LINKAGE_TITLES[list_type],
wxr, word_entry, children[1:], LINKAGE_TITLES[list_type]
)
elif list_type == "ámbito":
# XXX: Extract scope tag
Expand All @@ -54,7 +49,7 @@ def process_sense_data_list(
elif list_marker in ["::", ":::"]:
# E.g. https://es.wiktionary.org/wiki/silepsis
for list_item in list_node.find_child_recursively(NodeKind.LIST_ITEM):
process_example_list(wxr, sense_data, list_item)
process_example_list(wxr, word_entry.senses[-1], list_item)

else:
wxr.wtp.debug(
Expand Down
91 changes: 78 additions & 13 deletions src/wiktextract/extractor/es/translation.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import itertools
from typing import Optional

from mediawiki_langcodes import code_to_name
from wikitextprocessor import WikiNode
from wikitextprocessor.parser import TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

Expand All @@ -10,11 +11,75 @@


def extract_translation(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Dispatch a translation template node to the matching handler.

    ``{{t}}`` is the current Spanish Wiktionary translation template
    (https://es.wiktionary.org/wiki/Plantilla:t); ``{{t+}}`` is the
    obsolete variant (https://es.wiktionary.org/wiki/Plantilla:t+).
    Any other template name is ignored.
    """
    # NOTE: the scraped diff interleaved the old and new signatures here;
    # this is the reconstructed post-commit dispatcher.
    if template_node.template_name == "t":
        process_t_template(wxr, word_entry, template_node)
    elif template_node.template_name == "t+":
        process_t_plus_template(wxr, word_entry, template_node)


# Maps the value of a numbered "g" (gender) parameter of {{t}} to
# canonical English tag name(s); "mf" expands to both tags, which the
# consumer extends into the tags list rather than appending.
T_GENDERS = {
    "m": "masculine",
    "f": "feminine",
    "mf": ["masculine", "feminine"],
    "n": "neuter",
}
# Maps the value of a numbered "n" (grammatical number) parameter of
# {{t}} to its canonical English tag name.
T_NUMBERS = {
    "s": "singular",
    "p": "plural",
    "d": "dual",
}


def process_t_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract translations from a ``{{t}}`` template.

    https://es.wiktionary.org/wiki/Plantilla:t

    The template carries numbered parameter groups (``t1``/``a1``/``g1``…,
    ``t2``/…), one group per translation. A group is only kept when its
    ``t<i>`` word parameter cleans to a non-empty string.
    """
    params = template_node.template_parameters
    lang_code = params.get(1, "")
    lang_name = code_to_name(lang_code, "es")

    index = 1
    while "t" + str(index) in params:
        tr_data = Translation(lang_code=lang_code, lang=lang_name, word="")
        # Parameter prefix -> Translation field it feeds.
        for prefix, field in (
            ("t", "word"),
            ("a", "senseids"),
            ("tl", "roman"),
            ("nota", "raw_tags"),
            ("g", "tags"),
            ("n", "tags"),
        ):
            param_name = prefix + str(index)
            if param_name not in params:
                continue
            value = clean_node(wxr, None, params[param_name])
            # Gender/number codes are translated to canonical tags;
            # unknown codes map to None and are dropped.
            if prefix == "g":
                value = T_GENDERS.get(value)
            elif prefix == "n":
                value = T_NUMBERS.get(value)
            if value is None:
                continue

            current = getattr(tr_data, field)
            if isinstance(current, list):
                # List-valued fields accumulate; "mf" yields a list of
                # two tags, which is flattened in.
                if isinstance(value, list):
                    current.extend(value)
                else:
                    current.append(value)
            else:
                setattr(tr_data, field, value)

        if len(tr_data.word) > 0:
            word_entry.translations.append(tr_data)
        index += 1


def process_t_plus_template(
wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
# obsolete template: https://es.wiktionary.org/wiki/Plantilla:t+

lang_code = template_node.template_parameters.get(1) # Language code
lang = code_to_name(lang_code, "es")
Expand All @@ -29,17 +94,17 @@ def extract_translation(
if key == 1:
continue # Skip language code

value = clean_node(
wxr, {}, template_node.template_parameters[key]
).strip()

value = clean_node(wxr, None, template_node.template_parameters[key])
if isinstance(key, int):
if value == ",":
if current_translation:
if (
current_translation is not None
and len(current_translation.word) > 0
):
word_entry.translations.append(current_translation)

current_translation = None
senseids = []
current_translation = None
senseids = []
elif (
value.isdigit()
or (value != "," and "," in value)
Expand Down Expand Up @@ -98,5 +163,5 @@ def extract_translation(
current_translation.roman = value

# Add the last translation if it exists
if current_translation:
if current_translation is not None and len(current_translation.word) > 0:
word_entry.translations.append(current_translation)
5 changes: 1 addition & 4 deletions tests/test_es_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,7 @@ def test_es_process_linkage_template(self):
"input": "{{sinónimo|automóvil|coche|nota2=España|carro|nota3=Colombia, Estados Unidos, México, Venezuela}}",
"expected": [
{"word": "automóvil"},
{
"word": "coche",
"note": "España",
},
{"word": "coche", "note": "España"},
{
"word": "carro",
"note": "Colombia, Estados Unidos, México, Venezuela",
Expand Down
64 changes: 64 additions & 0 deletions tests/test_es_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ def get_default_page_data(self) -> list[WordEntry]:
def test_es_extract_translation(self):
# Test cases from https://es.wiktionary.org/wiki/Plantilla:t+
test_cases = [
{
# https://es.wiktionary.org/wiki/calderón
"input": "{{t+|ar|}}",
"expected": [],
},
{
"input": "{{t+|af|1|kat}}",
"expected": [
Expand Down Expand Up @@ -142,3 +147,62 @@ def test_es_extract_translation(self):
translations,
case["expected"],
)

def test_t_roman(self):
    # {{t}} with numbered "a" (sense id -> senseids), "tl"
    # (transliteration -> roman) and "nota" (free-form note -> raw_tags)
    # parameters; two translation groups in one template.
    self.wxr.wtp.start_page("hola")
    word_entry = WordEntry(word="hola", lang_code="es", lang="Español")
    root = self.wxr.wtp.parse(
        "{{t|zh|a1=1|t1=你好|tl1=nĭ hăo|t2=您好|tl2=nín hăo|nota2=formal}}"
    )
    extract_translation(self.wxr, word_entry, root.children[0])
    self.assertEqual(
        [
            t.model_dump(exclude_defaults=True)
            for t in word_entry.translations
        ],
        [
            {
                "lang": "chino",
                "lang_code": "zh",
                "word": "你好",
                "senseids": ["1"],
                "roman": "nĭ hăo",
            },
            {
                "lang": "chino",
                "lang_code": "zh",
                "word": "您好",
                "roman": "nín hăo",
                "raw_tags": ["formal"],
            },
        ],
    )

def test_t_gender(self):
    # {{t}} with numbered "g" (gender) parameters: "m"/"f" are mapped
    # through T_GENDERS to "masculine"/"feminine" tags.
    self.wxr.wtp.start_page("hola")
    word_entry = WordEntry(word="hola", lang_code="es", lang="Español")
    root = self.wxr.wtp.parse(
        "{{t|th|a1=1|t1=สวัสดีครับ|g1=m|t2=สวัสดีค่ะ|g2=f}}"
    )
    extract_translation(self.wxr, word_entry, root.children[0])
    self.assertEqual(
        [
            t.model_dump(exclude_defaults=True)
            for t in word_entry.translations
        ],
        [
            {
                "lang": "tailandés",
                "lang_code": "th",
                "word": "สวัสดีครับ",
                "senseids": ["1"],
                "tags": ["masculine"],
            },
            {
                "lang": "tailandés",
                "lang_code": "th",
                "word": "สวัสดีค่ะ",
                "tags": ["feminine"],
            },
        ],
    )

0 comments on commit 95353f4

Please sign in to comment.