Skip to content

Commit

Permalink
Merge pull request #522 from xxyzz/de
Browse files: browse the repository at this point in the history
Add `raw_tags` fields to de edition pydantic models
  • Loading branch information
xxyzz authored Feb 27, 2024
2 parents 2d503f6 + 11df7c6 commit e1bea9f
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 37 deletions.
12 changes: 6 additions & 6 deletions src/wiktextract/extractor/de/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,9 @@ def handle_sense_modifier(
f"Found more than one child in sense modifier: {list_item_node.children}",
sortid="extractor/de/gloss/handle_sense_modifier/114",
)
modifier = clean_node(wxr, {}, list_item_node.children)
if modifier:
sense.tags = [modifier]
modifier = clean_node(wxr, None, list_item_node.children)
if modifier != "":
sense.raw_tags = [modifier]


def process_K_template(
Expand All @@ -128,7 +128,7 @@ def process_K_template(
text = clean_node(wxr, categories, template_node).removesuffix(":")
sense_data.categories.extend(categories["categories"])
tags = re.split(r";|,", text)
sense_data.tags.extend([t.strip() for t in tags])
sense_data.raw_tags.extend([t.strip() for t in tags])

# Prepositional and case information is sometimes only expanded to
# category links and not present in cleaned node. We still want it
Expand All @@ -137,7 +137,7 @@ def process_K_template(
case = template_node.template_parameters.get("Kas")
category = (prep if prep else "") + (" + " + case if case else "")
if category:
sense_data.tags.append(category)
sense_data.raw_tags.append(category)

# XXX: Investigate better ways to handle free text in K template
ft = template_node.template_parameters.get("ft")
Expand All @@ -160,7 +160,7 @@ def extract_tags_from_gloss_text(sense_data: Sense, gloss_text: str) -> None:

categories = [c.strip() for c in re.split(",", tags_part)]
if all(c.isalnum() for c in categories):
sense_data.tags.extend(categories)
sense_data.raw_tags.extend(categories)
return parts[1].strip()

return gloss_text
10 changes: 7 additions & 3 deletions src/wiktextract/extractor/de/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,11 @@ class Translation(BaseModelWrap):
default="", description="Transliteration to Roman characters"
)
sense_id: str = ""
tags: list[str] = Field(
raw_tags: list[str] = Field(
default=[],
description="Tags specifying the translated term, usually gender information",
)
tags: list[str] = []
notes: list[str] = Field(default=[], description="A list of notes")


Expand Down Expand Up @@ -88,10 +89,11 @@ class Sense(BaseModelWrap):
default=[],
description="list of uncleaned raw glosses for the word sense (usually only one).",
)
tags: list[str] = Field(
raw_tags: list[str] = Field(
default=[],
description="list of raw tag strings for the word sense, collected from sense modifiers, the K template and the gloss text.",
)
tags: list[str] = []
categories: list[str] = Field(
default=[],
description="list of sense-disambiguated category names extracted from (a subset) of the Category links on the page",
Expand Down Expand Up @@ -126,9 +128,10 @@ class Sound(BaseModelWrap):
# syllabic: list[str] = Field(
# default=[], description="Syllabic transcription"
# )
tags: list[str] = Field(
raw_tags: list[str] = Field(
default=[], description="Specifying the variant of the pronunciation"
)
tags: list[str] = []


class WordEntry(BaseModelWrap):
Expand Down Expand Up @@ -164,3 +167,4 @@ class WordEntry(BaseModelWrap):
coordinate_terms: list[Linkage] = []
proverbs: list[Linkage] = []
synonyms: list[Linkage] = []
tags: list[str] = []
6 changes: 3 additions & 3 deletions src/wiktextract/extractor/de/pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,9 +197,9 @@ def is_tag_node(node: Union[WikiNode, str]):


def append_tag(wxr: WiktextractContext, sound_data: Sound, node: WikiNode):
tag = clean_node(wxr, {}, node).strip()
if tag:
sound_data.tags.append(tag)
tag = clean_node(wxr, None, node)
if tag != "":
sound_data.raw_tags.append(tag)


def is_new_sound_data_entry_sep(node: Union[WikiNode, str]):
Expand Down
6 changes: 3 additions & 3 deletions src/wiktextract/extractor/de/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,11 +184,11 @@ def process_modifiers(
if clean_text:
tags = re.split(r";|,|\(|\)|:", clean_text)
tags = [tag.strip() for tag in tags if tag.strip()]
if tags:
if len(tags) > 0:
if clean_text.endswith(":"):
base_translation_data.tags.extend(tags)
base_translation_data.raw_tags.extend(tags)
elif sense_translations:
sense_translations[-1].tags.extend(tags)
sense_translations[-1].raw_tags.extend(tags)

# Reset modifiers
modifiers.clear()
Expand Down
22 changes: 11 additions & 11 deletions tests/test_de_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,13 @@ def test_de_extract_glosses_with_only_subglosses(self):
senses,
[
{
"tags": ["tag"],
"raw_tags": ["tag"],
"glosses": ["subglossA"],
"raw_glosses": ["[a] subglossA"],
"senseid": "1a",
},
{
"tags": ["tag"],
"raw_tags": ["tag"],
"glosses": ["subglossB"],
"raw_glosses": ["[1b] subglossB"],
"senseid": "1b",
Expand All @@ -138,7 +138,7 @@ def test_process_K_template_removes_K_template_nodes(self):
self.assertEqual(
sense_data.model_dump(exclude_defaults=True),
{
"tags": ["tag1", "tag2"],
"raw_tags": ["tag1", "tag2"],
},
)

Expand Down Expand Up @@ -240,7 +240,7 @@ def test_process_K_template(self):
):
process_K_template(self.wxr, sense_data, root)
self.assertEqual(
sense_data.tags,
sense_data.raw_tags,
case["expected_tags"],
)

Expand Down Expand Up @@ -280,14 +280,14 @@ def test_de_extract_tags_from_gloss_text(self):
)
else:
self.assertEqual(
sense_data.tags,
sense_data.raw_tags,
case["expected_tags"],
)
self.assertEqual(gloss_text, case["expected_gloss"])

def test_handle_sense_modifier(self):
# https://de.wiktionary.org/wiki/habitare
input = """
wikitext = """
* {{trans.}}
:[1] etwas [[oft]] [[haben]], zu haben [[pflegen]]
:[2] ''Stadt/Dorf:''
Expand All @@ -302,15 +302,15 @@ def test_handle_sense_modifier(self):
self.wxr.wtp.add_page("Vorlage:trans.", 10, "transitiv")
self.wxr.wtp.add_page("Vorlage:intrans.", 10, "intransitiv")

root = self.wxr.wtp.parse(input)
root = self.wxr.wtp.parse(wikitext)

word_entry = self.get_default_word_entry()

extract_glosses(self.wxr, word_entry, root)

for i in range(2):
self.assertEqual(word_entry.senses[i].tags, ["transitiv"])
self.assertEqual(word_entry.senses[2].tags, ["transitiv", "aktiv"])
self.assertEqual(word_entry.senses[3].tags, ["transitiv", "passiv"])
self.assertEqual(word_entry.senses[i].raw_tags, ["transitiv"])
self.assertEqual(word_entry.senses[2].raw_tags, ["transitiv", "aktiv"])
self.assertEqual(word_entry.senses[3].raw_tags, ["transitiv", "passiv"])
for i in range(4, 6):
self.assertEqual(word_entry.senses[i].tags, ["intransitiv"])
self.assertEqual(word_entry.senses[i].raw_tags, ["intransitiv"])
6 changes: 3 additions & 3 deletions tests/test_de_pronunciation.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def test_de_process_ipa(self):
"input": "{{Lautschrift|ipa1}}, ''tag1'' {{Lautschrift|ipa2}}",
"expected": [
{"ipa": "ipa1"},
{"ipa": "ipa2", "tags": ["tag1"]},
{"ipa": "ipa2", "raw_tags": ["tag1"]},
],
},
]
Expand Down Expand Up @@ -129,14 +129,14 @@ def test_de_process_hoerbeispiele(self):
"audio": filename1,
"mp3_url": None,
"ogg_url": None,
"tags": ["tag1"],
"raw_tags": ["tag1"],
},
{
"audio": filename2,
"mp3_url": None,
"ogg_url": None,
"wav_url": None,
"tags": ["tag2"],
"raw_tags": ["tag2"],
},
],
},
Expand Down
16 changes: 8 additions & 8 deletions tests/test_de_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def test_de_process_translation_list_with_modifiers(self):
"lang_code": "en",
"lang": "Englisch",
"word": "model",
"tags": ["Vorbild"],
"raw_tags": ["Vorbild"],
},
],
},
Expand All @@ -203,7 +203,7 @@ def test_de_process_translation_list_with_modifiers(self):
"lang_code": "fr",
"lang": "Französisch",
"word": "exemple",
"tags": ["m"],
"raw_tags": ["m"],
}
],
},
Expand All @@ -217,19 +217,19 @@ def test_de_process_translation_list_with_modifiers(self):
"lang_code": "la",
"lang": "Latein",
"word": "crus",
"tags": ["f"],
"raw_tags": ["f"],
},
{
"lang_code": "la",
"lang": "Latein",
"word": "camba",
"tags": ["vulgärlateinisch", "f"],
"raw_tags": ["vulgärlateinisch", "f"],
},
{
"lang_code": "la",
"lang": "Latein",
"word": "gamba",
"tags": ["vulgärlateinisch", "f"],
"raw_tags": ["vulgärlateinisch", "f"],
},
],
},
Expand All @@ -245,7 +245,7 @@ def test_de_process_translation_list_with_modifiers(self):
"lang_code": "en",
"lang": "Englisch",
"word": "subscription",
"tags": ["[1a]"],
"raw_tags": ["[1a]"],
},
{
"lang_code": "en",
Expand All @@ -256,13 +256,13 @@ def test_de_process_translation_list_with_modifiers(self):
"lang_code": "en",
"lang": "Englisch",
"word": "membership fee",
"tags": ["[1", "2]"],
"raw_tags": ["[1", "2]"],
},
{
"lang_code": "en",
"lang": "Englisch",
"word": "contribution",
"tags": ["[3]"],
"raw_tags": ["[3]"],
},
{
"lang_code": "en",
Expand Down

0 comments on commit e1bea9f

Please sign in to comment.