Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More other-edition field stuff #482

Merged
merged 4 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 9 additions & 16 deletions src/wiktextract/extractor/de/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,6 @@ class Translation(BaseModelWrap):
description="Tags specifying the translated term, usually gender information",
)
notes: list[str] = Field(default=[], description="A list of notes")
roman: str = Field(
default="", description="Transliteration in roman characters"
)


class Example(BaseModelWrap):
Expand Down Expand Up @@ -123,22 +120,18 @@ class Sense(BaseModelWrap):


class Sound(BaseModelWrap):
ipa: list[str] = Field(
default=[], description="International Phonetic Alphabet"
)
ipa: str = Field(default="", description="International Phonetic Alphabet")
# phonetic_transcription: list[str] = Field(
# default=[], description="Phonetic transcription, less exact than IPA."
# )
audio: list[str] = Field(default=[], description="Audio file name")
wav_url: list[str] = Field(default=[])
ogg_url: list[str] = Field(default=[])
mp3_url: list[str] = Field(default=[])
oga_url: list[str] = Field(default=[])
flac_url: list[str] = Field(default=[])
lang_code: list[str] = Field(
default=[], description="Wiktionary language code"
)
lang: list[str] = Field(default=[], description="Localized language name")
audio: str = Field(default="", description="Audio file name")
wav_url: str = Field(default="")
ogg_url: str = Field(default="")
mp3_url: str = Field(default="")
oga_url: str = Field(default="")
flac_url: str = Field(default="")
lang_code: str = Field(default="", description="Wiktionary language code")
lang: str = Field(default="", description="Localized language name")
# roman: list[str] = Field(
# default=[], description="Transliteration to Roman characters"
# )
Expand Down
42 changes: 25 additions & 17 deletions src/wiktextract/extractor/de/pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def extract_pronunciation(
NodeKind.LIST_ITEM
):
wxr.wtp.debug(
f"Found unexpected non-list-item node in pronunciation section: {not_list_item_node}",
f"Found unexpected non-list-item node in pronunciation "
f"section: {not_list_item_node}",
sortid="extractor/de/pronunciation/extract_pronunciation/28",
)

Expand All @@ -37,8 +38,10 @@ def extract_pronunciation(
or not rest
):
wxr.wtp.debug(
f"Found unexpected non-template node in pronunciation section: {head_template}",
sortid="extractor/de/pronunciation/extract_pronunciation/37",
f"Found unexpected non-template node in pronunciation "
f"section: {head_template}",
sortid="extractor/de/pronunciation/"
"extract_pronunciation/37",
)
continue
if head_template.template_name == "IPA":
Expand Down Expand Up @@ -91,29 +94,34 @@ def process_ipa(

def process_lautschrift_template(
wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
):
) -> None:
template_parameters = node.template_parameters

ipa = template_parameters.get(1)

lang_code = template_parameters.get("spr")
if lang_code:
lang = code_to_name(lang_code, "de")
add_sound_data_without_appending_to_existing_properties(
wxr,
sound_data,
{
"ipa": [ipa],
"lang_code": lang_code,
"lang": lang,
},
)
new_data = {
"lang_code": lang_code,
"lang": lang,
}
else:
sound_data[-1].ipa.append(ipa)
new_data = dict()

new_data["ipa"] = ipa

add_sound_data_without_appending_to_existing_properties(
wxr,
sound_data,
new_data,
)


def process_hoerbeispiele(
wxr: WiktextractContext, sound_data: list[Sound], nodes: list[WikiNode]
wxr: WiktextractContext,
sound_data: list[Sound],
nodes: list[Union[str, WikiNode]],
):
for node in nodes:
if is_template_node_with_name(node, "Audio"):
Expand Down Expand Up @@ -157,7 +165,7 @@ def is_template_node_with_name(node: Union[WikiNode, str], template_name: str):
def add_sound_data_without_appending_to_existing_properties(
wxr: WiktextractContext,
sound_data: list[Sound],
new_sound_data: list[dict],
new_sound_data: dict,
):
"""Creates a new IPA data entry if properties exist in previous entry."""
if any(
Expand All @@ -171,7 +179,7 @@ def add_sound_data_without_appending_to_existing_properties(
for key, value in new_sound_data.items():
if key in sound_data[-1].model_fields:
if isinstance(value, str):
getattr(sound_data[-1], key).append(value)
setattr(sound_data[-1], key, value)
else:
getattr(sound_data[-1], key).extend(value)
else:
Expand Down
24 changes: 23 additions & 1 deletion src/wiktextract/extractor/es/etymology.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from typing import cast
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wiktextract.extractor.es.models import EtymologyTemplate, WordEntry
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand All @@ -23,23 +25,43 @@ def process_etymology_block(
"""

has_etymology_info = False

for template_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
# no-op type-annotation cast; we softly assert template_node is a
# TemplateNode, which has .template_name, to quiet the type-checker.
template_node = cast(TemplateNode, template_node)
if "etim" not in template_node.template_name:
# We don't want to keep any template data other than
# the main etymology templates (and maybe Plantilla:etim)
continue

entry.etymology_templates = entry.etymology_templates or []

etymology_template = EtymologyTemplate(
name=template_node.template_name,
expansion=clean_node(wxr, None, template_node),
)

if etymology_template.expansion in (
# "Please fill in this etymology, thank you..."
"Si puedes, incorpórala: ver cómo.",
"Préstamo no adaptado.",
"Este lema en este idioma es ampliable. "
"Retira este aviso si la mayor parte de las acepciones ya están incluidas.",
):
continue

args = {}
for index, param in template_node.template_parameters.items():
args[str(index)] = (
param
if isinstance(param, str)
else clean_node(wxr, None, param)
)
# If any index other than "leng" is encountered,
# has_etymology_info becomes True.
has_etymology_info = has_etymology_info or index != "leng"
if args:
if args and not (len(args) == 1 and "leng" in args):
etymology_template.args = args

entry.etymology_templates.append(etymology_template)
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/es/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class Translation(BaseModelWrap):
lang_code: str = Field(
description="Wiktionary language code of the translation term"
)
lang: str = Field(description="Name of the language of translation")
senseids: list[str] = Field(
default=[],
description="List of senseids where this translation applies",
Expand Down
9 changes: 8 additions & 1 deletion src/wiktextract/extractor/es/translation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Optional

from mediawiki_langcodes import code_to_name
from wikitextprocessor import WikiNode
from wiktextract.extractor.es.models import Translation, WordEntry
from wiktextract.extractor.share import split_senseids
Expand All @@ -15,6 +16,9 @@ def extract_translation(
# Documentation: https://es.wiktionary.org/wiki/Plantilla:t+

lang_code = template_node.template_parameters.get(1) # Language code
lang = code_to_name(lang_code, "es")
if not lang:
lang = f"Unknown({lang_code})"

# Initialize variables
current_translation: Optional[Translation] = None
Expand Down Expand Up @@ -82,7 +86,10 @@ def extract_translation(

else:
current_translation = Translation(
word=value, lang_code=lang_code, senseids=list(senseids)
word=value,
lang_code=lang_code,
lang=lang,
senseids=list(senseids),
)
elif isinstance(key, str):
if key == "tr":
Expand Down
31 changes: 16 additions & 15 deletions tests/test_de_pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,36 +27,37 @@ def test_de_process_ipa(self):
"input": "{{Lautschrift|ipa1}}",
"expected": [
{
"ipa": ["ipa1"],
"ipa": "ipa1",
}
],
},
{
"input": "{{Lautschrift|ipa1|spr=de}}",
"expected": [
{
"ipa": ["ipa1"],
"lang": ["Deutsch"],
"lang_code": ["de"],
"ipa": "ipa1",
"lang": "Deutsch",
"lang_code": "de",
}
],
},
{
"input": "{{Lautschrift|ipa1}} {{Lautschrift|ipa2}}{{Lautschrift|ipa3|spr=de}}",
"expected": [
{"ipa": ["ipa1", "ipa2"]},
{"ipa": "ipa1"},
{"ipa": "ipa2"},
{
"ipa": ["ipa3"],
"lang": ["Deutsch"],
"lang_code": ["de"],
"ipa": "ipa3",
"lang": "Deutsch",
"lang_code": "de",
},
],
},
{
"input": "{{Lautschrift|ipa1}}, ''tag1'' {{Lautschrift|ipa2}}",
"expected": [
{"ipa": ["ipa1"]},
{"ipa": ["ipa2"], "tags": ["tag1"]},
{"ipa": "ipa1"},
{"ipa": "ipa2", "tags": ["tag1"]},
],
},
]
Expand Down Expand Up @@ -90,7 +91,7 @@ def test_de_process_hoerbeispiele(self):
"input": "{{Audio|" + filename1 + "}}",
"expected": [
{
"audio": [filename1],
"audio": filename1,
"mp3_url": None, # None indicates we don't care about the exact value
"ogg_url": None,
}
Expand All @@ -104,12 +105,12 @@ def test_de_process_hoerbeispiele(self):
+ "}}",
"expected": [
{
"audio": [filename1],
"audio": filename1,
"mp3_url": None,
"ogg_url": None,
},
{
"audio": [filename2],
"audio": filename2,
"ogg_url": None,
"mp3_url": None,
"wav_url": None,
Expand All @@ -124,13 +125,13 @@ def test_de_process_hoerbeispiele(self):
+ "}}",
"expected": [
{
"audio": [filename1],
"audio": filename1,
"mp3_url": None,
"ogg_url": None,
"tags": ["tag1"],
},
{
"audio": [filename2],
"audio": filename2,
"mp3_url": None,
"ogg_url": None,
"wav_url": None,
Expand Down
12 changes: 1 addition & 11 deletions tests/test_es_etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,7 @@ def test_es_extract_etymology(self):
{
# https://es.wiktionary.org/wiki/Schreck
"input": "{{etimología|leng=de}}",
"expected": {
"etymology_templates": [
{
"args": {
"leng": "de",
},
"name": "etimología",
"expansion": "Si puedes, incorpórala: ver cómo.",
},
],
},
"expected": dict(),
},
{
# https://es.wiktionary.org/wiki/bagre
Expand Down
Loading