Skip to content

Commit

Permalink
Merge pull request #401 from xxyzz/datautil
Browse files Browse the repository at this point in the history
Check data attribute in `data_append()` and `clean_node()`
  • Loading branch information
xxyzz authored Nov 28, 2023
2 parents 3821035 + 73c0aec commit 1ccc547
Show file tree
Hide file tree
Showing 9 changed files with 183 additions and 209 deletions.
7 changes: 0 additions & 7 deletions src/wiktextract/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,6 @@ class WiktionaryConfig:
"POS_TYPES",
"OTHER_SUBTITLES",
"ZH_PRON_TAGS",
"FR_FORM_TABLES",
"DE_FORM_TABLES",
"FORM_OF_TEMPLATES",
"analyze_templates",
"extract_thesaurus_pages",
)
Expand Down Expand Up @@ -112,10 +109,6 @@ def __init__(
self.data_folder = files("wiktextract") / "data" / dump_file_lang_code
self.init_subtitles()
self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json")
if dump_file_lang_code == "zh":
self.set_attr_from_json(
"FORM_OF_TEMPLATES", "form_of_templates.json"
)
self.analyze_templates = True # find templates that need pre-expand
self.extract_thesaurus_pages = True
self.load_edition_settings()
Expand Down
66 changes: 29 additions & 37 deletions src/wiktextract/datautils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,54 +4,51 @@
import copy
import re
from collections import defaultdict
from functools import lru_cache, partial
from functools import partial
from typing import Any, Dict, Iterable, List, Tuple

from wiktextract.wxr_context import WiktextractContext

# Keys in ``data`` that can only have string values (a list of them)
str_keys = ("tags", "glosses")
STR_KEYS = frozenset({"tags", "glosses"})
# Keys in ``data`` that can only have dict values (a list of them)
dict_keys = {
"pronunciations",
"senses",
"synonyms",
"related",
"antonyms",
"hypernyms",
"holonyms",
"forms",
}
DICT_KEYS = frozenset(
{
"pronunciations",
"senses",
"synonyms",
"related",
"antonyms",
"hypernyms",
"holonyms",
"forms",
}
)


def data_append(
wxr: WiktextractContext, data: Dict, key: str, value: Any
) -> None:
def data_append(data: Dict, key: str, value: Any) -> None:
"""Appends ``value`` under ``key`` in the dictionary ``data``. The key
is created if it does not exist."""
assert isinstance(wxr, WiktextractContext)
assert isinstance(data, dict)
assert isinstance(key, str)

if key in str_keys:
if key in STR_KEYS:
assert isinstance(value, str)
elif key in dict_keys:
elif key in DICT_KEYS:
assert isinstance(value, dict)
if key == "tags":
if value == "":
return
lst = data.get(key)
if lst is None:
lst = []
data[key] = lst
lst.append(value)
if key == "tags" and value == "":
return
list_value = (
getattr(data, key, []) if hasattr(data, key) else data.get(key, [])
)
list_value.append(value)
if hasattr(data, key):
setattr(data, key, list_value)
elif isinstance(data, dict):
data[key] = list_value


def data_extend(
wxr: WiktextractContext, data: Dict, key: str, values: Iterable
) -> None:
def data_extend(data: Dict, key: str, values: Iterable) -> None:
"""Appends all values in a list under ``key`` in the dictionary ``data``."""
assert isinstance(wxr, WiktextractContext)
assert isinstance(data, dict)
assert isinstance(key, str)
assert isinstance(values, (list, tuple))
Expand All @@ -61,12 +58,7 @@ def data_extend(
# out of memory. Other ways of avoiding the sharing may be more
# complex.
for x in tuple(values):
data_append(wxr, data, key, x)


@lru_cache(maxsize=20)
def make_split_re(seps):
"""Cached helper function for split_at_comma_semi."""
data_append(data, key, x)


def split_at_comma_semi(text: str, separators=(",", ";", ",", "،"), extra=()
Expand Down
Loading

0 comments on commit 1ccc547

Please sign in to comment.