Skip to content

Commit

Permalink
Merge pull request #482 from tatuylonen/de-ipa
Browse files Browse the repository at this point in the history
More other-edition field stuff
  • Loading branch information
kristian-clausal authored Jan 31, 2024
2 parents b38b8f6 + 372b117 commit b03fb53
Show file tree
Hide file tree
Showing 8 changed files with 113 additions and 65 deletions.
25 changes: 9 additions & 16 deletions src/wiktextract/extractor/de/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,6 @@ class Translation(BaseModelWrap):
description="Tags specifying the translated term, usually gender information",
)
notes: list[str] = Field(default=[], description="A list of notes")
roman: str = Field(
default="", description="Transliteration in roman characters"
)


class Example(BaseModelWrap):
Expand Down Expand Up @@ -123,22 +120,18 @@ class Sense(BaseModelWrap):


class Sound(BaseModelWrap):
ipa: list[str] = Field(
default=[], description="International Phonetic Alphabet"
)
ipa: str = Field(default="", description="International Phonetic Alphabet")
# phonetic_transcription: list[str] = Field(
# default=[], description="Phonetic transcription, less exact than IPA."
# )
audio: list[str] = Field(default=[], description="Audio file name")
wav_url: list[str] = Field(default=[])
ogg_url: list[str] = Field(default=[])
mp3_url: list[str] = Field(default=[])
oga_url: list[str] = Field(default=[])
flac_url: list[str] = Field(default=[])
lang_code: list[str] = Field(
default=[], description="Wiktionary language code"
)
lang: list[str] = Field(default=[], description="Localized language name")
audio: str = Field(default="", description="Audio file name")
wav_url: str = Field(default="")
ogg_url: str = Field(default="")
mp3_url: str = Field(default="")
oga_url: str = Field(default="")
flac_url: str = Field(default="")
lang_code: str = Field(default="", description="Wiktionary language code")
lang: str = Field(default="", description="Localized language name")
# roman: list[str] = Field(
# default=[], description="Transliteration to Roman characters"
# )
Expand Down
42 changes: 25 additions & 17 deletions src/wiktextract/extractor/de/pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def extract_pronunciation(
NodeKind.LIST_ITEM
):
wxr.wtp.debug(
f"Found unexpected non-list-item node in pronunciation section: {not_list_item_node}",
f"Found unexpected non-list-item node in pronunciation "
f"section: {not_list_item_node}",
sortid="extractor/de/pronunciation/extract_pronunciation/28",
)

Expand All @@ -37,8 +38,10 @@ def extract_pronunciation(
or not rest
):
wxr.wtp.debug(
f"Found unexpected non-template node in pronunciation section: {head_template}",
sortid="extractor/de/pronunciation/extract_pronunciation/37",
f"Found unexpected non-template node in pronunciation "
f"section: {head_template}",
sortid="extractor/de/pronunciation/"
"extract_pronunciation/37",
)
continue
if head_template.template_name == "IPA":
Expand Down Expand Up @@ -91,29 +94,34 @@ def process_ipa(

def process_lautschrift_template(
wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
):
) -> None:
template_parameters = node.template_parameters

ipa = template_parameters.get(1)

lang_code = template_parameters.get("spr")
if lang_code:
lang = code_to_name(lang_code, "de")
add_sound_data_without_appending_to_existing_properties(
wxr,
sound_data,
{
"ipa": [ipa],
"lang_code": lang_code,
"lang": lang,
},
)
new_data = {
"lang_code": lang_code,
"lang": lang,
}
else:
sound_data[-1].ipa.append(ipa)
new_data = dict()

new_data["ipa"] = ipa

add_sound_data_without_appending_to_existing_properties(
wxr,
sound_data,
new_data,
)


def process_hoerbeispiele(
wxr: WiktextractContext, sound_data: list[Sound], nodes: list[WikiNode]
wxr: WiktextractContext,
sound_data: list[Sound],
nodes: list[Union[str, WikiNode]],
):
for node in nodes:
if is_template_node_with_name(node, "Audio"):
Expand Down Expand Up @@ -157,7 +165,7 @@ def is_template_node_with_name(node: Union[WikiNode, str], template_name: str):
def add_sound_data_without_appending_to_existing_properties(
wxr: WiktextractContext,
sound_data: list[Sound],
new_sound_data: list[dict],
new_sound_data: dict,
):
"""Creates a new IPA data entry if properties exist in previous entry."""
if any(
Expand All @@ -171,7 +179,7 @@ def add_sound_data_without_appending_to_existing_properties(
for key, value in new_sound_data.items():
if key in sound_data[-1].model_fields:
if isinstance(value, str):
getattr(sound_data[-1], key).append(value)
setattr(sound_data[-1], key, value)
else:
getattr(sound_data[-1], key).extend(value)
else:
Expand Down
24 changes: 23 additions & 1 deletion src/wiktextract/extractor/es/etymology.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from typing import cast
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wiktextract.extractor.es.models import EtymologyTemplate, WordEntry
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand All @@ -23,23 +25,43 @@ def process_etymology_block(
"""

has_etymology_info = False

for template_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
# no-op type-annotation cast; we softly assert template_node is a
# TemplateNode, which has .template_name, to quiet the type-checker.
template_node = cast(TemplateNode, template_node)
if "etim" not in template_node.template_name:
# We only want to keep data from the main etymology templates
# (and maybe Plantilla:etim); every other template is skipped.
continue

entry.etymology_templates = entry.etymology_templates or []

etymology_template = EtymologyTemplate(
name=template_node.template_name,
expansion=clean_node(wxr, None, template_node),
)

if etymology_template.expansion in (
# "Please fill in this etymology, thank you..."
"Si puedes, incorpórala: ver cómo.",
"Préstamo no adaptado.",
"Este lema en este idioma es ampliable. "
"Retira este aviso si la mayor parte de las acepciones ya están incluidas.",
):
continue

args = {}
for index, param in template_node.template_parameters.items():
args[str(index)] = (
param
if isinstance(param, str)
else clean_node(wxr, None, param)
)
# if any index other than "leng" is encountered,
# has_etymology_info becomes True
has_etymology_info = has_etymology_info or index != "leng"
if args:
if args and not (len(args) == 1 and "leng" in args):
etymology_template.args = args

entry.etymology_templates.append(etymology_template)
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/es/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class Translation(BaseModelWrap):
lang_code: str = Field(
description="Wiktionary language code of the translation term"
)
lang: str = Field(description="Name of the language of translation")
senseids: list[str] = Field(
default=[],
description="List of senseids where this translation applies",
Expand Down
9 changes: 8 additions & 1 deletion src/wiktextract/extractor/es/translation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Optional

from mediawiki_langcodes import code_to_name
from wikitextprocessor import WikiNode
from wiktextract.extractor.es.models import Translation, WordEntry
from wiktextract.extractor.share import split_senseids
Expand All @@ -15,6 +16,9 @@ def extract_translation(
# Documentation: https://es.wiktionary.org/wiki/Plantilla:t+

lang_code = template_node.template_parameters.get(1) # Language code
lang = code_to_name(lang_code, "es")
if not lang:
lang = f"Unknown({lang_code})"

# Initialize variables
current_translation: Optional[Translation] = None
Expand Down Expand Up @@ -82,7 +86,10 @@ def extract_translation(

else:
current_translation = Translation(
word=value, lang_code=lang_code, senseids=list(senseids)
word=value,
lang_code=lang_code,
lang=lang,
senseids=list(senseids),
)
elif isinstance(key, str):
if key == "tr":
Expand Down
31 changes: 16 additions & 15 deletions tests/test_de_pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,36 +27,37 @@ def test_de_process_ipa(self):
"input": "{{Lautschrift|ipa1}}",
"expected": [
{
"ipa": ["ipa1"],
"ipa": "ipa1",
}
],
},
{
"input": "{{Lautschrift|ipa1|spr=de}}",
"expected": [
{
"ipa": ["ipa1"],
"lang": ["Deutsch"],
"lang_code": ["de"],
"ipa": "ipa1",
"lang": "Deutsch",
"lang_code": "de",
}
],
},
{
"input": "{{Lautschrift|ipa1}} {{Lautschrift|ipa2}}{{Lautschrift|ipa3|spr=de}}",
"expected": [
{"ipa": ["ipa1", "ipa2"]},
{"ipa": "ipa1"},
{"ipa": "ipa2"},
{
"ipa": ["ipa3"],
"lang": ["Deutsch"],
"lang_code": ["de"],
"ipa": "ipa3",
"lang": "Deutsch",
"lang_code": "de",
},
],
},
{
"input": "{{Lautschrift|ipa1}}, ''tag1'' {{Lautschrift|ipa2}}",
"expected": [
{"ipa": ["ipa1"]},
{"ipa": ["ipa2"], "tags": ["tag1"]},
{"ipa": "ipa1"},
{"ipa": "ipa2", "tags": ["tag1"]},
],
},
]
Expand Down Expand Up @@ -90,7 +91,7 @@ def test_de_process_hoerbeispiele(self):
"input": "{{Audio|" + filename1 + "}}",
"expected": [
{
"audio": [filename1],
"audio": filename1,
"mp3_url": None, # None indicates we don't care about the exact value
"ogg_url": None,
}
Expand All @@ -104,12 +105,12 @@ def test_de_process_hoerbeispiele(self):
+ "}}",
"expected": [
{
"audio": [filename1],
"audio": filename1,
"mp3_url": None,
"ogg_url": None,
},
{
"audio": [filename2],
"audio": filename2,
"ogg_url": None,
"mp3_url": None,
"wav_url": None,
Expand All @@ -124,13 +125,13 @@ def test_de_process_hoerbeispiele(self):
+ "}}",
"expected": [
{
"audio": [filename1],
"audio": filename1,
"mp3_url": None,
"ogg_url": None,
"tags": ["tag1"],
},
{
"audio": [filename2],
"audio": filename2,
"mp3_url": None,
"ogg_url": None,
"wav_url": None,
Expand Down
12 changes: 1 addition & 11 deletions tests/test_es_etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,7 @@ def test_es_extract_etymology(self):
{
# https://es.wiktionary.org/wiki/Schreck
"input": "{{etimología|leng=de}}",
"expected": {
"etymology_templates": [
{
"args": {
"leng": "de",
},
"name": "etimología",
"expansion": "Si puedes, incorpórala: ver cómo.",
},
],
},
"expected": dict(),
},
{
# https://es.wiktionary.org/wiki/bagre
Expand Down
Loading

0 comments on commit b03fb53

Please sign in to comment.