Skip to content

Commit

Permalink
Merge pull request #523 from xxyzz/es
Browse files Browse the repository at this point in the history
Add `raw_tags` fields to es edition pydantic models
  • Loading branch information
xxyzz authored Feb 27, 2024
2 parents e1bea9f + 5872565 commit ccf5e84
Show file tree
Hide file tree
Showing 7 changed files with 25 additions and 21 deletions.
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/es/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def extract_gloss(
.removeprefix("Main")
)
if tag:
gloss_data.tags.append(tag)
gloss_data.raw_tags.append(tag)

page_data[-1].senses.append(gloss_data)

Expand Down
10 changes: 7 additions & 3 deletions src/wiktextract/extractor/es/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@ class Translation(BaseModelWrap):
default=[],
description="List of senseids where this translation applies",
)
tags: list[str] = Field(
raw_tags: list[str] = Field(
default=[],
description="Tags specifying the translated term, usually gender information",
)
tags: list[str] = []
notes: list[str] = Field(default=[], description="A list of notes")
roman: str = Field(
default="", description="Transliteration in roman characters"
Expand Down Expand Up @@ -72,10 +73,11 @@ class Sense(BaseModelWrap):
default=[],
description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
)
tags: list[str] = Field(
raw_tags: list[str] = Field(
default=[],
description="list of raw tag strings attached to the word sense, stored as extracted from the page",
)
tags: list[str] = []
categories: list[str] = Field(
default=[],
description="list of sense-disambiguated category names extracted from (a subset) of the Category links on the page",
Expand Down Expand Up @@ -118,9 +120,10 @@ class Sound(BaseModelWrap):
default="", description="Transliteration to Roman characters"
)
syllabic: str = Field(default="", description="Syllabic transcription")
tags: list[str] = Field(
raw_tags: list[str] = Field(
default=[], description="Specifying the variant of the pronunciation"
)
tags: list[str] = []


class WordEntry(BaseModelWrap):
Expand Down Expand Up @@ -167,3 +170,4 @@ class WordEntry(BaseModelWrap):
meronyms: list[Linkage] = []
related: list[Linkage] = []
synonyms: list[Linkage] = []
tags: list[str] = []
8 changes: 4 additions & 4 deletions src/wiktextract/extractor/es/pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def process_pron_graf_template(
sound.syllabic = value
elif key_plain == "pron":
if value != "no":
sound.tags.append(value)
sound.raw_tags.append(value)
elif key_plain == "audio":
audio_url_dict = create_audio_url_dict(value)
for dict_key, dict_value in audio_url_dict.items():
Expand Down Expand Up @@ -133,12 +133,12 @@ def process_pron_graf_template(
continue
if key not in other_variation.model_fields_set:
setattr(other_variation, key, getattr(main_sound, key))
if main_sound.tags:
if len(main_sound.raw_tags) > 0:
for i, other_variaton in variations.items():
if i == 0:
continue
if not other_variation.tags:
other_variation.tags = main_sound.tags.copy()
if len(other_variation.raw_tags) == 0:
other_variation.raw_tags = main_sound.raw_tags.copy()

for sound in variations.values():
if len(sound.model_dump(exclude_defaults=True)) > 0:
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/es/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def extract_translation(
"adj y sust",
]:
if current_translation:
current_translation.tags.append(value)
current_translation.raw_tags.append(value)
elif value in ["nota", "tr", "nl"]:
continue
elif (
Expand Down
2 changes: 1 addition & 1 deletion tests/test_es_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def test_es_extract_gloss_categories(self):
"Sentimiento afectivo de atracción, unión y afinidad que se experimenta hacia una persona, animal o cosa"
],
"senseid": "1",
"tags": ["Humanidades"],
"raw_tags": ["Humanidades"],
"categories": ["ES:Sentimientos"],
}
],
Expand Down
12 changes: 6 additions & 6 deletions tests/test_es_pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,10 @@ def test_es_extract_pronunciation(self):
{
"input": "{{pron-graf|leng=en|pron=Reino Unido|fone=ˈɒ.pə.zɪt|fone2=ˈɒ.pə.sɪt|2pron=EE.UU.|2fone=ˈɑ.pə.sɪt|2fone2=ˈɑ.pə.sət}}",
"sounds": [
{"ipa": "ˈɒ.pə.zɪt", "tags": ["Reino Unido"]},
{"ipa": "ˈɒ.pə.sɪt", "tags": ["Reino Unido"]},
{"ipa": "ˈɑ.pə.sɪt", "tags": ["EE.UU."]},
{"ipa": "ˈɑ.pə.sət", "tags": ["EE.UU."]},
{"ipa": "ˈɒ.pə.zɪt", "raw_tags": ["Reino Unido"]},
{"ipa": "ˈɒ.pə.sɪt", "raw_tags": ["Reino Unido"]},
{"ipa": "ˈɑ.pə.sɪt", "raw_tags": ["EE.UU."]},
{"ipa": "ˈɑ.pə.sət", "raw_tags": ["EE.UU."]},
],
"spellings": [],
},
Expand All @@ -97,11 +97,11 @@ def test_es_extract_pronunciation(self):
"sounds": [
{
"audio": "En-uk-direction.ogg",
"tags": ["británico"],
"raw_tags": ["británico"],
},
{
"audio": "En-us-direction.ogg",
"tags": ["americano"],
"raw_tags": ["americano"],
},
],
"spellings": [],
Expand Down
10 changes: 5 additions & 5 deletions tests/test_es_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,14 @@ def test_es_extract_translation(self):
"lang_code": "de",
"word": "Katze",
"senseids": ["1", "2"],
"tags": ["f"],
"raw_tags": ["f"],
},
{
"lang_code": "de",
"lang": "alemán",
"word": "Kater",
"senseids": ["1"],
"tags": ["m"],
"raw_tags": ["m"],
"notes": ["gato macho"],
},
{
Expand Down Expand Up @@ -108,19 +108,19 @@ def test_es_extract_translation(self):
"lang": "alemán",
"lang_code": "de",
"word": "amphibisch",
"tags": ["adj"],
"raw_tags": ["adj"],
},
{
"lang": "alemán",
"lang_code": "de",
"word": "Amphibie",
"tags": ["sust"],
"raw_tags": ["sust"],
},
{
"lang": "alemán",
"lang_code": "de",
"word": "Amphibium",
"tags": ["sust"],
"raw_tags": ["sust"],
},
],
},
Expand Down

0 comments on commit ccf5e84

Please sign in to comment.