Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check data attribute in data_append() and clean_node() #401

Merged
merged 3 commits into from
Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions src/wiktextract/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,6 @@ class WiktionaryConfig:
"POS_TYPES",
"OTHER_SUBTITLES",
"ZH_PRON_TAGS",
"FR_FORM_TABLES",
"DE_FORM_TABLES",
"FORM_OF_TEMPLATES",
"analyze_templates",
"extract_thesaurus_pages",
)
Expand Down Expand Up @@ -112,10 +109,6 @@ def __init__(
self.data_folder = files("wiktextract") / "data" / dump_file_lang_code
self.init_subtitles()
self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json")
if dump_file_lang_code == "zh":
self.set_attr_from_json(
"FORM_OF_TEMPLATES", "form_of_templates.json"
)
self.analyze_templates = True # find templates that need pre-expand
self.extract_thesaurus_pages = True
self.load_edition_settings()
Expand Down
66 changes: 29 additions & 37 deletions src/wiktextract/datautils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,54 +4,51 @@
import copy
import re
from collections import defaultdict
from functools import lru_cache, partial
from functools import partial
from typing import Any, Dict, Iterable, List, Tuple

from wiktextract.wxr_context import WiktextractContext

# Keys in ``data`` that can only have string values (a list of them)
str_keys = ("tags", "glosses")
STR_KEYS = frozenset({"tags", "glosses"})
# Keys in ``data`` that can only have dict values (a list of them)
dict_keys = {
"pronunciations",
"senses",
"synonyms",
"related",
"antonyms",
"hypernyms",
"holonyms",
"forms",
}
DICT_KEYS = frozenset(
{
"pronunciations",
"senses",
"synonyms",
"related",
"antonyms",
"hypernyms",
"holonyms",
"forms",
}
)


def data_append(
wxr: WiktextractContext, data: Dict, key: str, value: Any
) -> None:
def data_append(data: Dict, key: str, value: Any) -> None:
"""Appends ``value`` under ``key`` in the dictionary ``data``. The key
is created if it does not exist."""
assert isinstance(wxr, WiktextractContext)
assert isinstance(data, dict)
assert isinstance(key, str)

if key in str_keys:
if key in STR_KEYS:
assert isinstance(value, str)
elif key in dict_keys:
elif key in DICT_KEYS:
assert isinstance(value, dict)
if key == "tags":
if value == "":
return
lst = data.get(key)
if lst is None:
lst = []
data[key] = lst
lst.append(value)
if key == "tags" and value == "":
return
list_value = (
getattr(data, key, []) if hasattr(data, key) else data.get(key, [])
)
list_value.append(value)
if hasattr(data, key):
setattr(data, key, list_value)
elif isinstance(data, dict):
data[key] = list_value


def data_extend(
wxr: WiktextractContext, data: Dict, key: str, values: Iterable
) -> None:
def data_extend(data: Dict, key: str, values: Iterable) -> None:
"""Appends all values in a list under ``key`` in the dictionary ``data``."""
assert isinstance(wxr, WiktextractContext)
assert isinstance(data, dict)
assert isinstance(key, str)
assert isinstance(values, (list, tuple))
Expand All @@ -61,12 +58,7 @@ def data_extend(
# out of memory. Other ways of avoiding the sharing may be more
# complex.
for x in tuple(values):
data_append(wxr, data, key, x)


@lru_cache(maxsize=20)
def make_split_re(seps):
"""Cached helper function for split_at_comma_semi."""
data_append(data, key, x)


def split_at_comma_semi(text: str, separators=(",", ";", ",", "،"), extra=()
Expand Down
Loading