Skip to content

Commit

Permalink
Remove unused wxr argument in data_append() and data_extend()
Browse files — browse the repository at this point in the history
  • Loading branch information
xxyzz committed Nov 28, 2023
1 parent 79ab825 commit ba2492e
Show file tree
Hide file tree
Showing 8 changed files with 150 additions and 161 deletions.
19 changes: 4 additions & 15 deletions src/wiktextract/datautils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import copy
import re
from collections import defaultdict
from functools import lru_cache, partial
from functools import partial
from typing import Any, Dict, Iterable, List, Tuple

from wiktextract.wxr_context import WiktextractContext
Expand All @@ -26,12 +26,9 @@
)


def data_append(
wxr: WiktextractContext, data: Dict, key: str, value: Any
) -> None:
def data_append(data: Dict, key: str, value: Any) -> None:
"""Appends ``value`` under ``key`` in the dictionary ``data``. The key
is created if it does not exist."""
assert isinstance(wxr, WiktextractContext)
assert isinstance(key, str)

if key in STR_KEYS:
Expand All @@ -50,11 +47,8 @@ def data_append(
data[key] = list_value


def data_extend(
wxr: WiktextractContext, data: Dict, key: str, values: Iterable
) -> None:
def data_extend(data: Dict, key: str, values: Iterable) -> None:
"""Appends all values in a list under ``key`` in the dictionary ``data``."""
assert isinstance(wxr, WiktextractContext)
assert isinstance(data, dict)
assert isinstance(key, str)
assert isinstance(values, (list, tuple))
Expand All @@ -64,12 +58,7 @@ def data_extend(
# out of memory. Other ways of avoiding the sharing may be more
# complex.
for x in tuple(values):
data_append(wxr, data, key, x)


@lru_cache(maxsize=20)
def make_split_re(seps):
"""Cached helper function for split_at_comma_semi."""
data_append(data, key, x)


def split_at_comma_semi(text: str, separators=(",", ";", ",", "،"), extra=()
Expand Down
132 changes: 66 additions & 66 deletions src/wiktextract/extractor/en/page.py

Large diffs are not rendered by default.

82 changes: 41 additions & 41 deletions src/wiktextract/form_descriptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1535,25 +1535,25 @@ def check_related(related):
dt["ruby"] = ruby
if "alt-of" in tags2:
check_related(related)
data_extend(wxr, data, "tags", tags1)
data_extend(wxr, data, "tags", tags2)
data_extend(wxr, data, "topics", topics1)
data_extend(wxr, data, "topics", topics2)
data_append(wxr, data, "alt_of", dt)
data_extend(data, "tags", tags1)
data_extend(data, "tags", tags2)
data_extend(data, "topics", topics1)
data_extend(data, "topics", topics2)
data_append(data, "alt_of", dt)
elif "form-of" in tags2:
check_related(related)
data_extend(wxr, data, "tags", tags1)
data_extend(wxr, data, "tags", tags2)
data_extend(wxr, data, "topics", topics1)
data_extend(wxr, data, "topics", topics2)
data_append(wxr, data, "form_of", dt)
data_extend(data, "tags", tags1)
data_extend(data, "tags", tags2)
data_extend(data, "topics", topics1)
data_extend(data, "topics", topics2)
data_append(data, "form_of", dt)
elif "compound-of" in tags2:
check_related(related)
data_extend(wxr, data, "tags", tags1)
data_extend(wxr, data, "tags", tags2)
data_extend(wxr, data, "topics", topics1)
data_extend(wxr, data, "topics", topics2)
data_append(wxr, data, "compound", related)
data_extend(data, "tags", tags1)
data_extend(data, "tags", tags2)
data_extend(data, "topics", topics1)
data_extend(data, "topics", topics2)
data_append(data, "compound", related)
else:
lang = wxr.wtp.section
related, final_tags = parse_head_final_tags(wxr, lang,
Expand All @@ -1570,8 +1570,8 @@ def check_related(related):
form["roman"] = roman
if ruby:
form["ruby"] = ruby
data_extend(wxr, form, "topics", topics1)
data_extend(wxr, form, "topics", topics2)
data_extend(form, "topics", topics1)
data_extend(form, "topics", topics2)
if topics1 or topics2:
wxr.wtp.debug("word head form has topics: {}".format(form),
sortid="form_descriptions/1233")
Expand All @@ -1586,22 +1586,22 @@ def check_related(related):
continue
if (related != titleword or add_all_canonicals or
topics1 or topics2 or ruby):
data_extend(wxr, form, "tags",
data_extend(form, "tags",
list(sorted(set(tags))))
else:
# We won't add canonical form here
filtered_tags = list(x for x in tags
if x != "canonical")
data_extend(wxr, data, "tags", filtered_tags)
data_extend(data, "tags", filtered_tags)
continue
else:
data_extend(wxr, form, "tags", list(sorted(set(tags))))
data_extend(form, "tags", list(sorted(set(tags))))
# Only insert if the form is not already there
for old in data.get("forms", ()):
if form == old:
break
else:
data_append(wxr, data, "forms", form)
data_append(data, "forms", form)

# If this form had pre-tags that started with "both" or "all", add those
# tags also to following related forms that don't have their own tags
Expand Down Expand Up @@ -1670,7 +1670,7 @@ def parse_word_head(wxr, pos, text, data, is_reconstruction,
m = re.search(head_end_re, base)
if m:
tags = head_end_map[m.group(1).lower()].split()
data_extend(wxr, data, "tags", tags)
data_extend(data, "tags", tags)
base = base[:m.start()]

# Special case: handle Hán Nôm readings for Vietnamese characters
Expand Down Expand Up @@ -1767,9 +1767,9 @@ def parse_word_head(wxr, pos, text, data, is_reconstruction,
if alt_i > 0:
tagsets, topics = decode_tags(" ".join(baseparts))
if not any("error-unknown-tag" in x for x in tagsets):
data_extend(wxr, data, "topics", topics)
data_extend(data, "topics", topics)
for tags in tagsets:
data_extend(wxr, data, "tags", tags)
data_extend(data, "tags", tags)
continue

alt, tags = parse_head_final_tags(wxr, language, alt)
Expand Down Expand Up @@ -2143,7 +2143,7 @@ def strokes_repl(m):
prev_tags = new_prev_tags
continue
for tags in tagsets:
data_extend(wxr, data, "tags", tags)
data_extend(data, "tags", tags)
prev_tags = tagsets
following_tags = None

Expand Down Expand Up @@ -2186,7 +2186,7 @@ def parse_sense_qualifier(wxr, text, data):
# .format(semi, cls))
if cls == "tags":
tagsets, topics = decode_tags(semi)
data_extend(wxr, data, "topics", topics)
data_extend(data, "topics", topics)
# XXX should think how to handle distinct options better,
# e.g., "singular and plural genitive"; that can't really be
# done with changing the calling convention of this function.
Expand All @@ -2208,7 +2208,7 @@ def parse_sense_qualifier(wxr, text, data):
.format(text),
sortid="form_descriptions/1831")
sense_tags = list(sorted(set(sense_tags)))
data_extend(wxr, data, "tags", sense_tags)
data_extend(data, "tags", sense_tags)


def parse_pronunciation_tags(wxr, text, data):
Expand All @@ -2222,13 +2222,13 @@ def parse_pronunciation_tags(wxr, text, data):
notes = []
if cls == "tags":
tagsets, topics = decode_tags(text)
data_extend(wxr, data, "topics", topics)
data_extend(data, "topics", topics)
for tagset in tagsets:
for t in tagset:
if " " in t:
notes.append(t)
else:
data_append(wxr, data, "tags", t)
data_append(data, "tags", t)
else:
notes.append(text)
if notes:
Expand Down Expand Up @@ -2322,16 +2322,16 @@ def parse_translation_desc(wxr, lang, text, tr):
if cls == "tags":
tagsets, topics = decode_tags(lst[0])
for t in tagsets:
data_extend(wxr, tr, "tags", t)
data_extend(wxr, tr, "topics", topics)
data_extend(tr, "tags", t)
data_extend(tr, "topics", topics)
lst = lst[1:]
continue
cls = classify_desc(lst[-1])
if cls == "tags":
tagsets, topics = decode_tags(lst[-1])
for t in tagsets:
data_extend(wxr, tr, "tags", t)
data_extend(wxr, tr, "topics", topics)
data_extend(tr, "tags", t)
data_extend(tr, "topics", topics)
lst = lst[:-1]
continue
break
Expand All @@ -2351,14 +2351,14 @@ def parse_translation_desc(wxr, lang, text, tr):
if par == text:
pass
if par == "f":
data_append(wxr, tr, "tags", "feminine")
data_append(tr, "tags", "feminine")
elif par == "m":
data_append(wxr, tr, "tags", "masculine")
data_append(tr, "tags", "masculine")
elif cls == "tags":
tagsets, topics = decode_tags(par)
for tags in tagsets:
data_extend(wxr, tr, "tags", tags)
data_extend(wxr, tr, "topics", topics)
data_extend(tr, "tags", tags)
data_extend(tr, "topics", topics)
elif cls == "english":
# If the text contains any of certain grammatical words, treat it
# as a "note" instead of "english"
Expand Down Expand Up @@ -2398,7 +2398,7 @@ def parse_translation_desc(wxr, lang, text, tr):
.format(tr["taxonomic"], par),
sortid="form_descriptions/2019")
if re.match(r"×[A-Z]", par):
data_append(wxr, tr, "tags", "extinct")
data_append(tr, "tags", "extinct")
par = par[1:]
tr["taxonomic"] = par
elif cls == "other":
Expand All @@ -2414,7 +2414,7 @@ def parse_translation_desc(wxr, lang, text, tr):

# Check for gender indications in suffix
text, final_tags = parse_head_final_tags(wxr, lang, text)
data_extend(wxr, tr, "tags", final_tags)
data_extend(tr, "tags", final_tags)

# Restore those parts that we did not want to remove (they are often
# optional words or words that are always used with the given translation)
Expand All @@ -2430,10 +2430,10 @@ def parse_translation_desc(wxr, lang, text, tr):
roman = tr.get("roman")
if roman:
if roman.endswith(" f"):
data_append(wxr, tr, "tags", "feminine")
data_append(tr, "tags", "feminine")
tr["roman"] = roman[:-2].strip()
elif roman.endswith(" m"):
data_append(wxr, tr, "tags", "masculine")
data_append(tr, "tags", "masculine")
tr["roman"] = roman[:-2].strip()

# If the word now has "english" field but no "roman" field, and
Expand Down
4 changes: 2 additions & 2 deletions src/wiktextract/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -2356,7 +2356,7 @@ def merge_row_and_column_tags(form, some_has_covered_text):
dt["tags"] = list(dt.get("tags", []))
# This strange copy-assigning shuffle is preventative black
# magic; do not touch lest you invoke deep bugs.
data_append(wxr, dt, "tags", "multiword-construction")
data_append(dt, "tags", "multiword-construction")
new_ret.append(dt)
ret = new_ret

Expand Down Expand Up @@ -2431,7 +2431,7 @@ def handle_generic_table(wxr, tablecontext, data,
else:
if "table-tags" not in tags:
have_forms.add(fdt)
data_append(wxr, data, "forms", dt)
data_append(data, "forms", dt)

def determine_header(wxr, tablecontext, lang, word, pos,
table_kind, kind, style,
Expand Down
8 changes: 4 additions & 4 deletions src/wiktextract/linkages.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,14 +835,14 @@ def add(w, r):
dt["english"] = english.strip()
if taxonomic:
if re.match(r"×[A-Z]", taxonomic):
data_append(wxr, dt, "tags", "extinct")
data_append(dt, "tags", "extinct")
taxonomic = taxonomic[1:]
dt["taxonomic"] = taxonomic
if re.match(r"×[A-Z]", w):
data_append(wxr, dt, "tags", "extinct")
data_append(dt, "tags", "extinct")
w = w[1:] # Remove × before dead species names
if alt and re.match(r"×[A-Z]", alt):
data_append(wxr, dt, "tags", "extinct")
data_append(dt, "tags", "extinct")
alt = alt[1:] # Remove × before dead species names
if alt and alt.strip() != w:
dt["alt"] = alt.strip()
Expand All @@ -854,7 +854,7 @@ def add(w, r):
if dt == old:
break
else:
data_append(wxr, data, field, dt)
data_append(data, field, dt)

# Handle exceptional linkage splits and other linkage
# conversions (including expanding to variant forms)
Expand Down
18 changes: 9 additions & 9 deletions src/wiktextract/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def inject_linkages(wxr: WiktextractContext, page_data: List[Dict]) -> None:
dt["roman"] = term.roman
if term.language_variant is not None:
dt["language_variant"] = term.language_variant
data_append(wxr, data, term.linkage, dt)
data_append(data, term.linkage, dt)


def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None:
Expand Down Expand Up @@ -210,7 +210,7 @@ def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None:
assert data.get(field) is not vals
if data.get("alt_of") or data.get("form_of"):
continue # Don't add to alt-of/form-of entries
data_extend(wxr, data, field, vals)
data_extend(data, field, vals)
continue
if len(lst) != 1:
continue
Expand All @@ -224,7 +224,7 @@ def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None:
if field in data:
v = data[field]
del data[field]
data_extend(wxr, senses[0], field, v)
data_extend(senses[0], field, v)

# If the last part-of-speech of the last language (i.e., last item in "ret")
# has categories or topics not bound to a sense, propagate those
Expand All @@ -240,7 +240,7 @@ def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None:
for data in page_data[:-1]:
if data.get("form_of") or data.get("alt_of"):
continue # Don't add to form_of or alt_of entries
data_extend(wxr, data, field, lst)
data_extend(data, field, lst)

# Regexp for matching category tags that start with a language name.
# Group 2 will be the language name. The category tag should be without
Expand Down Expand Up @@ -379,9 +379,9 @@ def clean_node_handler_fn(node):
if sense_data is not None:
# Check for Lua execution error
if '<strong class="error">Lua execution error' in v:
data_append(wxr, sense_data, "tags", "error-lua-exec")
data_append(sense_data, "tags", "error-lua-exec")
if '<strong class="error">Lua timeout error' in v:
data_append(wxr, sense_data, "tags", "error-lua-timeout")
data_append(sense_data, "tags", "error-lua-timeout")
# Capture Category tags
if not collect_links:
for m in re.finditer(
Expand All @@ -394,7 +394,7 @@ def clean_node_handler_fn(node):
if not cat:
continue
if not sense_data_has_value(sense_data, "categories", cat):
data_append(wxr, sense_data, "categories", cat)
data_append(sense_data, "categories", cat)
else:
for m in re.finditer(
r"(?is)\[\[:?(\s*([^][|:]+):)?\s*([^]|]+)(\|([^]|]+))?\]\]",
Expand All @@ -410,7 +410,7 @@ def clean_node_handler_fn(node):
if not cat:
continue
if not sense_data_has_value(sense_data, "categories", cat):
data_append(wxr, sense_data, "categories", cat)
data_append(sense_data, "categories", cat)
elif not m.group(1):
if m.group(5):
ltext = clean_value(wxr, m.group(5))
Expand All @@ -431,7 +431,7 @@ def clean_node_handler_fn(node):
ltext = ltarget
ltuple = (ltext, ltarget)
if not sense_data_has_value(sense_data, "links", ltuple):
data_append(wxr, sense_data, "links", ltuple)
data_append(sense_data, "links", ltuple)

v = clean_value(wxr, v)
# print("After clean_value:", repr(v))
Expand Down
Loading

0 comments on commit ba2492e

Please sign in to comment.