Merge pull request #522 from xxyzz/de

Add `raw_tags` fields to de edition pydantic models
tatuylonen · Feb 27, 2024
commit e1bea9f · 2 parents: 2d503f6 + 11df7c6

Showing 7 changed files with 41 additions and 37 deletions.
diff --git a/src/wiktextract/extractor/de/gloss.py b/src/wiktextract/extractor/de/gloss.py
@@ -112,9 +112,9 @@ def handle_sense_modifier(
             f"Found more than one child in sense modifier: {list_item_node.children}",
             sortid="extractor/de/gloss/handle_sense_modifier/114",
         )
-    modifier = clean_node(wxr, {}, list_item_node.children)
-    if modifier:
-        sense.tags = [modifier]
+    modifier = clean_node(wxr, None, list_item_node.children)
+    if modifier != "":
+        sense.raw_tags = [modifier]
 
 
 def process_K_template(
@@ -128,7 +128,7 @@ def process_K_template(
             text = clean_node(wxr, categories, template_node).removesuffix(":")
             sense_data.categories.extend(categories["categories"])
             tags = re.split(r";|,", text)
-            sense_data.tags.extend([t.strip() for t in tags])
+            sense_data.raw_tags.extend([t.strip() for t in tags])
 
             # Prepositional and case information is sometimes only expanded to
             # category links and not present in cleaned node. We still want it
@@ -137,7 +137,7 @@ def process_K_template(
             case = template_node.template_parameters.get("Kas")
             category = (prep if prep else "") + (" + " + case if case else "")
             if category:
-                sense_data.tags.append(category)
+                sense_data.raw_tags.append(category)
 
             # XXX: Investigate better ways to handle free text in K template
             ft = template_node.template_parameters.get("ft")
@@ -160,7 +160,7 @@ def extract_tags_from_gloss_text(sense_data: Sense, gloss_text: str) -> None:
 
         categories = [c.strip() for c in re.split(",", tags_part)]
         if all(c.isalnum() for c in categories):
-            sense_data.tags.extend(categories)
+            sense_data.raw_tags.extend(categories)
             return parts[1].strip()
 
     return gloss_text
diff --git a/src/wiktextract/extractor/de/models.py b/src/wiktextract/extractor/de/models.py
@@ -32,10 +32,11 @@ class Translation(BaseModelWrap):
         default="", description="Transliteration to Roman characters"
     )
     sense_id: str = ""
-    tags: list[str] = Field(
+    raw_tags: list[str] = Field(
         default=[],
         description="Tags specifying the translated term, usually gender information",
     )
+    tags: list[str] = []
     notes: list[str] = Field(default=[], description="A list of notes")
 
 
@@ -88,10 +89,11 @@ class Sense(BaseModelWrap):
         default=[],
         description="list of uncleaned raw glosses for the word sense (usually only one).",
     )
-    tags: list[str] = Field(
+    raw_tags: list[str] = Field(
         default=[],
         description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
     )
+    tags: list[str] = []
     categories: list[str] = Field(
         default=[],
         description="list of sense-disambiguated category names extracted from (a subset) of the Category links on the page",
@@ -126,9 +128,10 @@ class Sound(BaseModelWrap):
     # syllabic: list[str] = Field(
     #     default=[], description="Syllabic transcription"
     # )
-    tags: list[str] = Field(
+    raw_tags: list[str] = Field(
         default=[], description="Specifying the variant of the pronunciation"
     )
+    tags: list[str] = []
 
 
 class WordEntry(BaseModelWrap):
@@ -164,3 +167,4 @@ class WordEntry(BaseModelWrap):
     coordinate_terms: list[Linkage] = []
     proverbs: list[Linkage] = []
     synonyms: list[Linkage] = []
+    tags: list[str] = []
diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py
@@ -197,9 +197,9 @@ def is_tag_node(node: Union[WikiNode, str]):
 
 
 def append_tag(wxr: WiktextractContext, sound_data: Sound, node: WikiNode):
-    tag = clean_node(wxr, {}, node).strip()
-    if tag:
-        sound_data.tags.append(tag)
+    tag = clean_node(wxr, None, node)
+    if tag != "":
+        sound_data.raw_tags.append(tag)
 
 
 def is_new_sound_data_entry_sep(node: Union[WikiNode, str]):

diff --git a/src/wiktextract/extractor/de/translation.py b/src/wiktextract/extractor/de/translation.py
@@ -184,11 +184,11 @@ def process_modifiers(
     if clean_text:
         tags = re.split(r";|,|\(|\)|:", clean_text)
         tags = [tag.strip() for tag in tags if tag.strip()]
-        if tags:
+        if len(tags) > 0:
             if clean_text.endswith(":"):
-                base_translation_data.tags.extend(tags)
+                base_translation_data.raw_tags.extend(tags)
             elif sense_translations:
-                sense_translations[-1].tags.extend(tags)
+                sense_translations[-1].raw_tags.extend(tags)
 
     # Reset modifiers
     modifiers.clear()

diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py
@@ -110,13 +110,13 @@ def test_de_extract_glosses_with_only_subglosses(self):
             senses,
             [
                 {
-                    "tags": ["tag"],
+                    "raw_tags": ["tag"],
                     "glosses": ["subglossA"],
                     "raw_glosses": ["[a] subglossA"],
                     "senseid": "1a",
                 },
                 {
-                    "tags": ["tag"],
+                    "raw_tags": ["tag"],
                     "glosses": ["subglossB"],
                     "raw_glosses": ["[1b] subglossB"],
                     "senseid": "1b",
@@ -138,7 +138,7 @@ def test_process_K_template_removes_K_template_nodes(self):
         self.assertEqual(
             sense_data.model_dump(exclude_defaults=True),
             {
-                "tags": ["tag1", "tag2"],
+                "raw_tags": ["tag1", "tag2"],
             },
         )
 
@@ -240,7 +240,7 @@ def test_process_K_template(self):
                 ):
                     process_K_template(self.wxr, sense_data, root)
                     self.assertEqual(
-                        sense_data.tags,
+                        sense_data.raw_tags,
                         case["expected_tags"],
                     )
 
@@ -280,14 +280,14 @@ def test_de_extract_tags_from_gloss_text(self):
                     )
                 else:
                     self.assertEqual(
-                        sense_data.tags,
+                        sense_data.raw_tags,
                         case["expected_tags"],
                     )
                 self.assertEqual(gloss_text, case["expected_gloss"])
 
     def test_handle_sense_modifier(self):
         # https://de.wiktionary.org/wiki/habitare
-        input = """
+        wikitext = """
 * {{trans.}}
 :[1] etwas [[oft]] [[haben]], zu haben [[pflegen]]
 :[2] ''Stadt/Dorf:''
@@ -302,15 +302,15 @@ def test_handle_sense_modifier(self):
         self.wxr.wtp.add_page("Vorlage:trans.", 10, "transitiv")
         self.wxr.wtp.add_page("Vorlage:intrans.", 10, "intransitiv")
 
-        root = self.wxr.wtp.parse(input)
+        root = self.wxr.wtp.parse(wikitext)
 
         word_entry = self.get_default_word_entry()
 
         extract_glosses(self.wxr, word_entry, root)
 
         for i in range(2):
-            self.assertEqual(word_entry.senses[i].tags, ["transitiv"])
-        self.assertEqual(word_entry.senses[2].tags, ["transitiv", "aktiv"])
-        self.assertEqual(word_entry.senses[3].tags, ["transitiv", "passiv"])
+            self.assertEqual(word_entry.senses[i].raw_tags, ["transitiv"])
+        self.assertEqual(word_entry.senses[2].raw_tags, ["transitiv", "aktiv"])
+        self.assertEqual(word_entry.senses[3].raw_tags, ["transitiv", "passiv"])
         for i in range(4, 6):
-            self.assertEqual(word_entry.senses[i].tags, ["intransitiv"])
+            self.assertEqual(word_entry.senses[i].raw_tags, ["intransitiv"])
diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py
@@ -58,7 +58,7 @@ def test_de_process_ipa(self):
                 "input": "{{Lautschrift|ipa1}}, ''tag1'' {{Lautschrift|ipa2}}",
                 "expected": [
                     {"ipa": "ipa1"},
-                    {"ipa": "ipa2", "tags": ["tag1"]},
+                    {"ipa": "ipa2", "raw_tags": ["tag1"]},
                 ],
             },
         ]
@@ -129,14 +129,14 @@ def test_de_process_hoerbeispiele(self):
                         "audio": filename1,
                         "mp3_url": None,
                         "ogg_url": None,
-                        "tags": ["tag1"],
+                        "raw_tags": ["tag1"],
                     },
                     {
                         "audio": filename2,
                         "mp3_url": None,
                         "ogg_url": None,
                         "wav_url": None,
-                        "tags": ["tag2"],
+                        "raw_tags": ["tag2"],
                     },
                 ],
             },

diff --git a/tests/test_de_translation.py b/tests/test_de_translation.py
@@ -189,7 +189,7 @@ def test_de_process_translation_list_with_modifiers(self):
                         "lang_code": "en",
                         "lang": "Englisch",
                         "word": "model",
-                        "tags": ["Vorbild"],
+                        "raw_tags": ["Vorbild"],
                     },
                 ],
             },
@@ -203,7 +203,7 @@ def test_de_process_translation_list_with_modifiers(self):
                         "lang_code": "fr",
                         "lang": "Französisch",
                         "word": "exemple",
-                        "tags": ["m"],
+                        "raw_tags": ["m"],
                     }
                 ],
             },
@@ -217,19 +217,19 @@ def test_de_process_translation_list_with_modifiers(self):
                         "lang_code": "la",
                         "lang": "Latein",
                         "word": "crus",
-                        "tags": ["f"],
+                        "raw_tags": ["f"],
                     },
                     {
                         "lang_code": "la",
                         "lang": "Latein",
                         "word": "camba",
-                        "tags": ["vulgärlateinisch", "f"],
+                        "raw_tags": ["vulgärlateinisch", "f"],
                     },
                     {
                         "lang_code": "la",
                         "lang": "Latein",
                         "word": "gamba",
-                        "tags": ["vulgärlateinisch", "f"],
+                        "raw_tags": ["vulgärlateinisch", "f"],
                     },
                 ],
             },
@@ -245,7 +245,7 @@ def test_de_process_translation_list_with_modifiers(self):
                         "lang_code": "en",
                         "lang": "Englisch",
                         "word": "subscription",
-                        "tags": ["[1a]"],
+                        "raw_tags": ["[1a]"],
                     },
                     {
                         "lang_code": "en",
@@ -256,13 +256,13 @@ def test_de_process_translation_list_with_modifiers(self):
                         "lang_code": "en",
                         "lang": "Englisch",
                         "word": "membership fee",
-                        "tags": ["[1", "2]"],
+                        "raw_tags": ["[1", "2]"],
                     },
                     {
                         "lang_code": "en",
                         "lang": "Englisch",
                         "word": "contribution",
-                        "tags": ["[3]"],
+                        "raw_tags": ["[3]"],
                     },
                     {
                         "lang_code": "en",