Merge pull request #401 from xxyzz/datautil

Check data attribute in `data_append()` and `clean_node()`
tatuylonen · Nov 28, 2023 · 1ccc547 · 1ccc547
2 parents 3821035 + 73c0aec
commit 1ccc547
Show file tree

Hide file tree

Showing 9 changed files with 183 additions and 209 deletions.
diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py
@@ -48,9 +48,6 @@ class WiktionaryConfig:
         "POS_TYPES",
         "OTHER_SUBTITLES",
         "ZH_PRON_TAGS",
-        "FR_FORM_TABLES",
-        "DE_FORM_TABLES",
-        "FORM_OF_TEMPLATES",
         "analyze_templates",
         "extract_thesaurus_pages",
     )
@@ -112,10 +109,6 @@ def __init__(
         self.data_folder = files("wiktextract") / "data" / dump_file_lang_code
         self.init_subtitles()
         self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json")
-        if dump_file_lang_code == "zh":
-            self.set_attr_from_json(
-                "FORM_OF_TEMPLATES", "form_of_templates.json"
-            )
         self.analyze_templates = True  # find templates that need pre-expand
         self.extract_thesaurus_pages = True
         self.load_edition_settings()

diff --git a/src/wiktextract/datautils.py b/src/wiktextract/datautils.py
@@ -4,54 +4,51 @@
 import copy
 import re
 from collections import defaultdict
-from functools import lru_cache, partial
+from functools import partial
 from typing import Any, Dict, Iterable, List, Tuple
 
 from wiktextract.wxr_context import WiktextractContext
 
 # Keys in ``data`` that can only have string values (a list of them)
-str_keys = ("tags", "glosses")
+STR_KEYS = frozenset({"tags", "glosses"})
 # Keys in ``data`` that can only have dict values (a list of them)
-dict_keys = {
-    "pronunciations",
-    "senses",
-    "synonyms",
-    "related",
-    "antonyms",
-    "hypernyms",
-    "holonyms",
-    "forms",
-}
+DICT_KEYS = frozenset(
+    {
+        "pronunciations",
+        "senses",
+        "synonyms",
+        "related",
+        "antonyms",
+        "hypernyms",
+        "holonyms",
+        "forms",
+    }
+)
 
 
-def data_append(
-    wxr: WiktextractContext, data: Dict, key: str, value: Any
-) -> None:
+def data_append(data: Dict, key: str, value: Any) -> None:
     """Appends ``value`` under ``key`` in the dictionary ``data``.  The key
     is created if it does not exist."""
-    assert isinstance(wxr, WiktextractContext)
-    assert isinstance(data, dict)
     assert isinstance(key, str)
 
-    if key in str_keys:
+    if key in STR_KEYS:
         assert isinstance(value, str)
-    elif key in dict_keys:
+    elif key in DICT_KEYS:
         assert isinstance(value, dict)
-    if key == "tags":
-        if value == "":
-            return
-    lst = data.get(key)
-    if lst is None:
-        lst = []
-        data[key] = lst
-    lst.append(value)
+    if key == "tags" and value == "":
+        return
+    list_value = (
+        getattr(data, key, []) if hasattr(data, key) else data.get(key, [])
+    )
+    list_value.append(value)
+    if hasattr(data, key):
+        setattr(data, key, list_value)
+    elif isinstance(data, dict):
+        data[key] = list_value
 
 
-def data_extend(
-    wxr: WiktextractContext, data: Dict, key: str, values: Iterable
-) -> None:
+def data_extend(data: Dict, key: str, values: Iterable) -> None:
     """Appends all values in a list under ``key`` in the dictionary ``data``."""
-    assert isinstance(wxr, WiktextractContext)
     assert isinstance(data, dict)
     assert isinstance(key, str)
     assert isinstance(values, (list, tuple))
@@ -61,12 +58,7 @@ def data_extend(
     # out of memory.  Other ways of avoiding the sharing may be more
     # complex.
     for x in tuple(values):
-        data_append(wxr, data, key, x)
-
-
-@lru_cache(maxsize=20)
-def make_split_re(seps):
-    """Cached helper function for split_at_comma_semi."""
+        data_append(data, key, x)
 
 
 def split_at_comma_semi(text: str, separators=(",", ";", "，", "،"), extra=()