From 79ab8259b3c5b548af1f8280e96909d4be3432c1 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Mon, 27 Nov 2023 17:25:39 +0800
Subject: [PATCH 1/3] Check data attribute in `data_append()` and
 `clean_node()`

Attributes are used in pydantic models. The `assert isinstance(data, dict)`
check is removed because a pydantic model could now be passed, and we don't
want to import pydantic in code that doesn't use it.

The `wxr` argument should also be removed from `data_append()` and
`data_extend()`; that is done in the next commit.
---
 src/wiktextract/datautils.py | 47 +++++++++++++++++++-----------------
 src/wiktextract/page.py      | 19 ++++++++++++---
 2 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/src/wiktextract/datautils.py b/src/wiktextract/datautils.py
index 313f82fc..3d7d5094 100644
--- a/src/wiktextract/datautils.py
+++ b/src/wiktextract/datautils.py
@@ -10,18 +10,20 @@ from wiktextract.wxr_context import WiktextractContext
 
 # Keys in ``data`` that can only have string values (a list of them)
-str_keys = ("tags", "glosses")
+STR_KEYS = frozenset({"tags", "glosses"})
 # Keys in ``data`` that can only have dict values (a list of them)
-dict_keys = {
-    "pronunciations",
-    "senses",
-    "synonyms",
-    "related",
-    "antonyms",
-    "hypernyms",
-    "holonyms",
-    "forms",
-}
+DICT_KEYS = frozenset(
+    {
+        "pronunciations",
+        "senses",
+        "synonyms",
+        "related",
+        "antonyms",
+        "hypernyms",
+        "holonyms",
+        "forms",
+    }
+)
 
 
 def data_append(
@@ -30,21 +32,22 @@ def data_append(
     """Appends ``value`` under ``key`` in the dictionary ``data``.  The key is
     created if it does not exist."""
     assert isinstance(wxr, WiktextractContext)
-    assert isinstance(data, dict)
     assert isinstance(key, str)
-    if key in str_keys:
+    if key in STR_KEYS:
         assert isinstance(value, str)
-    elif key in dict_keys:
+    elif key in DICT_KEYS:
         assert isinstance(value, dict)
-    if key == "tags":
-        if value == "":
-            return
-    lst = data.get(key)
-    if lst is None:
-        lst = []
-        data[key] = lst
-    lst.append(value)
+    if key == "tags" and value == "":
+        return
+    list_value = (
+        getattr(data, key, []) if hasattr(data, key) else data.get(key, [])
+    )
+    list_value.append(value)
+    if hasattr(data, key):
+        setattr(data, key, list_value)
+    elif isinstance(data, dict):
+        data[key] = list_value
 
 
 def data_extend(
diff --git a/src/wiktextract/page.py b/src/wiktextract/page.py
index 648666da..a16fb8ab 100644
--- a/src/wiktextract/page.py
+++ b/src/wiktextract/page.py
@@ -9,6 +9,7 @@
 from mediawiki_langcodes import get_all_names, name_to_code
 from wikitextprocessor import NodeKind, WikiNode
+
 from wiktextract.wxr_context import WiktextractContext
 
 from .clean import clean_value
@@ -392,7 +393,7 @@ def clean_node_handler_fn(node):
             cat = cat.strip()
             if not cat:
                 continue
-            if cat not in sense_data.get("categories", ()):
+            if not sense_data_has_value(sense_data, "categories", cat):
                 data_append(wxr, sense_data, "categories", cat)
     else:
         for m in re.finditer(
@@ -408,7 +409,7 @@ def clean_node_handler_fn(node):
                 cat = cat.strip()
                 if not cat:
                     continue
-                if cat not in sense_data.get("categories", ()):
+                if not sense_data_has_value(sense_data, "categories", cat):
                     data_append(wxr, sense_data, "categories", cat)
             elif not m.group(1):
                 if m.group(5):
@@ -429,7 +430,7 @@ def clean_node_handler_fn(node):
                 if not ltext and ltarget:
                     ltext = ltarget
                 ltuple = (ltext, ltarget)
-                if ltuple not in sense_data.get("links", ()):
+                if not sense_data_has_value(sense_data, "links", ltuple):
                     data_append(wxr, sense_data, "links", ltuple)
 
     v = clean_value(wxr, v)
@@ -448,3 +449,15 @@ def clean_node_handler_fn(node):
     # some Korean Hanja form
     v = re.sub(r"\^\?", "", v)
     return v
+
+
+def sense_data_has_value(sense_data, name, value):
+    """
+    Return True if `value` is in the attribute `name` of `sense_data`, or
+    in the value of the key `name` if `sense_data` is a dictionary.
+    """
+    if hasattr(sense_data, name):
+        return value in getattr(sense_data, name)
+    elif isinstance(sense_data, dict):
+        return value in sense_data.get(name, ())
+    return False

From ba2492ef6880d17152e960d88900448bcc35b5ea Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Tue, 28 Nov 2023 15:48:58 +0800
Subject: [PATCH 2/3] Remove unused `wxr` argument from `data_append()` and
 `data_extend()`

---
 src/wiktextract/datautils.py         |  19 +---
 src/wiktextract/extractor/en/page.py | 132 +++++++++++++--------------
 src/wiktextract/form_descriptions.py |  82 ++++++++---------
 src/wiktextract/inflection.py        |   4 +-
 src/wiktextract/linkages.py          |   8 +-
 src/wiktextract/page.py              |  18 ++--
 src/wiktextract/pronunciations.py    |  26 +++---
 src/wiktextract/translations.py      |  22 ++---
 8 files changed, 150 insertions(+), 161 deletions(-)

diff --git a/src/wiktextract/datautils.py b/src/wiktextract/datautils.py
index 3d7d5094..25844eb2 100644
--- a/src/wiktextract/datautils.py
+++ b/src/wiktextract/datautils.py
@@ -4,7 +4,7 @@
 import copy
 import re
 from collections import defaultdict
-from functools import lru_cache, partial
+from functools import partial
 from typing import Any, Dict, Iterable, List, Tuple
 
 from wiktextract.wxr_context import WiktextractContext
@@ -26,12 +26,9 @@
 )
 
 
-def data_append(
-    wxr: WiktextractContext, data: Dict, key: str, value: Any
-) -> None:
+def data_append(data: Dict, key: str, value: Any) -> None:
     """Appends ``value`` under ``key`` in the dictionary ``data``.  The key is
     created if it does not exist."""
-    assert isinstance(wxr, WiktextractContext)
     assert isinstance(key, str)
 
     if key in STR_KEYS:
@@ -50,11 +47,8 @@
         data[key] = list_value
 
 
-def data_extend(
-    wxr: WiktextractContext, data: Dict, key: str, values: Iterable
-) -> None:
+def data_extend(data: Dict, key: str, values: Iterable) -> None:
     """Appends all values in a list under ``key`` in the dictionary ``data``."""
-    assert isinstance(wxr, WiktextractContext)
     assert isinstance(data, dict)
     assert isinstance(key, str)
     assert isinstance(values, (list, tuple))
     # Note: we copy values, just in case it would actually be the same list
     # to which we are appending.  This has happened, and can lead to running
     # out of memory.  Other ways of avoiding the sharing may be more
     # complex.
for x in tuple(values): - data_append(wxr, data, key, x) - - -@lru_cache(maxsize=20) -def make_split_re(seps): - """Cached helper function for split_at_comma_semi.""" + data_append(data, key, x) def split_at_comma_semi(text: str, separators=(",", ";", ",", "،"), extra=() diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index 8c3d63c1..ce563da4 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -525,14 +525,14 @@ def parse_sense_linkage(wxr, data, name, ht): dt = {"word": w} if tags: - data_extend(wxr, dt, "tags", tags) + data_extend(dt, "tags", tags) if topics: - data_extend(wxr, dt, "topics", topics) + data_extend(dt, "topics", topics) if english: dt["english"] = english if alt: dt["alt"] = alt - data_append(wxr, data, field, dt) + data_append(data, field, dt) def init_head_tag_re(wxr): @@ -738,7 +738,7 @@ def parse_language(wxr, langnode, language, lang_code): base_data = {"word": word, "lang": language, "lang_code": lang_code} if is_reconstruction: - data_append(wxr, base_data, "tags", "reconstruction") + data_append(base_data, "tags", "reconstruction") sense_data = {} pos_data = {} # For a current part-of-speech etym_data = {} # For one etymology @@ -811,15 +811,15 @@ def push_sense(): tags, lst = ret assert isinstance(lst, (list, tuple)) if "form-of" in tags: - data_extend(wxr, sense_data, "form_of", lst) - data_extend(wxr, sense_data, "tags", tags) + data_extend(sense_data, "form_of", lst) + data_extend(sense_data, "tags", tags) elif "alt-of" in tags: - data_extend(wxr, sense_data, "alt_of", lst) - data_extend(wxr, sense_data, "tags", tags) + data_extend(sense_data, "alt_of", lst) + data_extend(sense_data, "tags", tags) if (not sense_data.get("glosses") and "no-gloss" not in sense_data.get("tags", ())): - data_append(wxr, sense_data, "tags", "no-gloss") + data_append(sense_data, "tags", "no-gloss") pos_datas.append(sense_data) sense_data = {} @@ -876,15 +876,15 @@ def head_post_template_fn(name, ht, expansion): # be removed? t = ht.get(2, "") if t == "pinyin": - data_append(wxr, pos_data, "tags", "Pinyin") + data_append(pos_data, "tags", "Pinyin") elif t == "romanization": - data_append(wxr, pos_data, "tags", "romanization") + data_append(pos_data, "tags", "romanization") m = re.search(head_tag_re, name) if m: args_ht = clean_template_args(wxr, ht) cleaned_expansion = clean_node(wxr, None, expansion) dt = {"name": name, "args": args_ht, "expansion": cleaned_expansion} - data_append(wxr, pos_data, "head_templates", dt) + data_append(pos_data, "head_templates", dt) # The following are both captured in head_templates and parsed # separately @@ -1192,8 +1192,8 @@ def parse_part_of_speech(posnode, pos): # keep tags extracted from the head for the dummy sense. 
push_sense() # Make sure unfinished data pushed, and start clean sense if not pos_datas: - data_extend(wxr, sense_data, "tags", header_tags) - data_append(wxr, sense_data, "tags", "no-gloss") + data_extend(sense_data, "tags", header_tags) + data_append(sense_data, "tags", "no-gloss") push_sense() def process_gloss_header( @@ -1418,15 +1418,15 @@ def sense_template_fn( langid = clean_node(wxr, None, ht.get(1, ())) arg = clean_node(wxr, sense_base, ht.get(2, ())) if re.match(r"Q\d+$", arg): - data_append(wxr, sense_base, "wikidata", arg) - data_append(wxr, sense_base, "senseid", + data_append(sense_base, "wikidata", arg) + data_append(sense_base, "senseid", langid + ":" + arg) if name in sense_linkage_templates: # print(f"SENSE_TEMPLATE_FN: {name}") parse_sense_linkage(wxr, sense_base, name, ht) return "" if name == "†" or name == "zh-obsolete": - data_append(wxr, sense_base, "tags", "obsolete") + data_append(sense_base, "tags", "obsolete") return "" if name in { "ux", @@ -1527,7 +1527,7 @@ def extract_link_texts(item): # Generate no gloss for translation hub pages, but add the # "translation-hub" tag for them if rawgloss == "(This entry is a translation hub.)": - data_append(wxr, sense_data, "tags", "translation-hub") + data_append(sense_data, "tags", "translation-hub") return push_sense() # Remove certain substrings specific to outer glosses @@ -1552,7 +1552,7 @@ def extract_link_texts(item): # parenthesized tags/topics if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()): - data_append(wxr, sense_base, "raw_glosses", subglosses[1]) + data_append(sense_base, "raw_glosses", subglosses[1]) m = re.match(r"\(([^()]+)\):?\s*", rawgloss) # ( ..\1.. ): ... or ( ..\1.. ) ... if m: @@ -1560,17 +1560,17 @@ def extract_link_texts(item): rawgloss = rawgloss[m.end():].strip() parse_sense_qualifier(wxr, q, sense_base) if rawgloss == "A pejorative:": - data_append(wxr, sense_base, "tags", "pejorative") + data_append(sense_base, "tags", "pejorative") rawgloss = None elif rawgloss == "Short forms.": - data_append(wxr, sense_base, "tags", "abbreviation") + data_append(sense_base, "tags", "abbreviation") rawgloss = None elif rawgloss == "Technical or specialized senses.": rawgloss = None if rawgloss: - data_append(wxr, sense_base, "glosses", rawgloss) + data_append(sense_base, "glosses", rawgloss) if rawgloss in ("A person:",): - data_append(wxr, sense_base, "tags", "g-person") + data_append(sense_base, "tags", "g-person") # The main recursive call (except for the exceptions at the # start of this function). @@ -1629,11 +1629,11 @@ def extract_link_texts(item): len(infl_tags) == 1): # Interpret others as a particular form under # "inflection of" - data_extend(wxr, sense_base, "tags", infl_tags) - data_extend(wxr, sense_base, "form_of", infl_dts) + data_extend(sense_base, "tags", infl_tags) + data_extend(sense_base, "form_of", infl_dts) subglosses = subglosses[1:] elif not infl_dts: - data_extend(wxr, sense_base, "tags", infl_tags) + data_extend(sense_base, "tags", infl_tags) subglosses = subglosses[1:] # Create senses for remaining subglosses @@ -1647,26 +1647,26 @@ def extract_link_texts(item): if push_sense(): added = True # if gloss not in sense_data.get("raw_glosses", ()): - # data_append(wxr, sense_data, "raw_glosses", gloss) + # data_append(sense_data, "raw_glosses", gloss) if gloss_i == 0 and examples: # In a multi-line gloss, associate examples # with only one of them. # XXX or you could use gloss_i == len(subglosses) # to associate examples with the *last* one. 
- data_extend(wxr, sense_data, "examples", examples) + data_extend(sense_data, "examples", examples) # If the gloss starts with †, mark as obsolete if gloss.startswith("^†"): - data_append(wxr, sense_data, "tags", "obsolete") + data_append(sense_data, "tags", "obsolete") gloss = gloss[2:].strip() elif gloss.startswith("^‡"): - data_extend(wxr, sense_data, "tags", ["obsolete", "historical"]) + data_extend(sense_data, "tags", ["obsolete", "historical"]) gloss = gloss[2:].strip() # Copy data for all senses to this sense for k, v in sense_base.items(): if isinstance(v, (list, tuple)): if k != "tags": # Tags handled below (countable/uncountable special) - data_extend(wxr, sense_data, k, v) + data_extend(sense_data, k, v) else: assert k not in ("tags", "categories", "topics") sense_data[k] = v @@ -1700,10 +1700,10 @@ def extract_link_texts(item): if gloss.startswith("N. of "): gloss = "Name of " + gloss[6:] if gloss.startswith("†"): - data_append(wxr, sense_data, "tags", "obsolete") + data_append(sense_data, "tags", "obsolete") gloss = gloss[1:] elif gloss.startswith("^†"): - data_append(wxr, sense_data, "tags", "obsolete") + data_append(sense_data, "tags", "obsolete") gloss = gloss[2:] # Copy tags from sense_base if any. This will not copy @@ -1719,11 +1719,11 @@ def extract_link_texts(item): countability_tags.append(tag) continue if tag not in sense_tags: - data_append(wxr, sense_data, "tags", tag) + data_append(sense_data, "tags", tag) if countability_tags: if ("countable" not in sense_tags and "uncountable" not in sense_tags): - data_extend(wxr, sense_data, "tags", countability_tags) + data_extend(sense_data, "tags", countability_tags) # If outer gloss specifies a form-of ("inflection of", see # aquamarine/German), try to parse the inner glosses as @@ -1735,13 +1735,13 @@ def extract_link_texts(item): infl_tags, infl_dts = parsed if not infl_dts and infl_tags: # Interpret as a particular form under "inflection of" - data_extend(wxr, sense_data, "tags", infl_tags) + data_extend(sense_data, "tags", infl_tags) if not gloss: - data_append(wxr, sense_data, "tags", "empty-gloss") + data_append(sense_data, "tags", "empty-gloss") elif gloss != "-" and gloss not in sense_data.get("glosses", []): # Add the gloss for the sense. 
- data_append(wxr, sense_data, "glosses", gloss) + data_append(sense_data, "glosses", gloss) # Kludge: there are cases (e.g., etc./Swedish) where there are # two abbreviations in the same sense, both generated by the @@ -1766,27 +1766,27 @@ def extract_link_texts(item): continue tags, dts = parsed if not dts and tags: - data_extend(wxr, sense_data, "tags", tags) + data_extend(sense_data, "tags", tags) continue for dt in dts: ftags = list(tag for tag in tags if tag != "form-of") if "alt-of" in tags: - data_extend(wxr, sense_data, "tags", ftags) - data_append(wxr, sense_data, "alt_of", dt) + data_extend(sense_data, "tags", ftags) + data_append(sense_data, "alt_of", dt) elif "compound-of" in tags: - data_extend(wxr, sense_data, "tags", ftags) - data_append(wxr, sense_data, "compound_of", dt) + data_extend(sense_data, "tags", ftags) + data_append(sense_data, "compound_of", dt) elif "synonym-of" in tags: - data_extend(wxr, dt, "tags", ftags) - data_append(wxr, sense_data, "synonyms", dt) + data_extend(dt, "tags", ftags) + data_append(sense_data, "synonyms", dt) elif tags and dt.get("word", "").startswith("of "): dt["word"] = dt["word"][3:] - data_append(wxr, sense_data, "tags", "form-of") - data_extend(wxr, sense_data, "tags", ftags) - data_append(wxr, sense_data, "form_of", dt) + data_append(sense_data, "tags", "form-of") + data_extend(sense_data, "tags", ftags) + data_append(sense_data, "form_of", dt) elif "form-of" in tags: - data_extend(wxr, sense_data, "tags", tags) - data_append(wxr, sense_data, "form_of", dt) + data_extend(sense_data, "tags", tags) + data_append(sense_data, "form_of", dt) if len(sense_data) == 0: if len(sense_base.get("tags")) == 0: @@ -1827,7 +1827,7 @@ def inflection_template_fn(name, ht): if m: args_ht = clean_template_args(wxr, ht) dt = {"name": name, "args": args_ht} - data_append(wxr, pos_data, "inflection_templates", dt) + data_append(pos_data, "inflection_templates", dt) return None @@ -2355,7 +2355,7 @@ def contains_kind(children, nodekind): else: parse_zh_synonyms(parsed.children, synonyms, [], "") #print(json.dumps(synonyms, indent=4, ensure_ascii=False)) - data_extend(wxr, data, "synonyms", synonyms) + data_extend(data, "synonyms", synonyms) parse_linkage_recurse(parsed.children, field, None) if not data.get(field) and not have_panel_template: text = "".join(toplevel_text).strip() @@ -3003,7 +3003,7 @@ def skip_template_fn(name, ht): parse_part_of_speech(node, pos) if "tags" in dt: for pdata in pos_datas: - data_extend(wxr, pdata, "tags", dt["tags"]) + data_extend(pdata, "tags", dt["tags"]) elif t_no_number in wxr.config.LINKAGE_SUBTITLES: rel = wxr.config.LINKAGE_SUBTITLES.get(t_no_number) data = select_data() @@ -3296,7 +3296,7 @@ def usex_template_fn(name, ht): if "tags" in data: del data["tags"] for sense in data["senses"]: - data_extend(wxr, sense, "tags", tags) + data_extend(sense, "tags", tags) return ret @@ -3309,9 +3309,9 @@ def parse_wikipedia_template(wxr, data, ht): langid = clean_node(wxr, data, ht.get("lang", ())) pagename = clean_node(wxr, data, ht.get(1, ())) or wxr.wtp.title if langid: - data_append(wxr, data, "wikipedia", langid + ":" + pagename) + data_append(data, "wikipedia", langid + ":" + pagename) else: - data_append(wxr, data, "wikipedia", pagename) + data_append(data, "wikipedia", pagename) def parse_top_template(wxr, node, data): @@ -3352,7 +3352,7 @@ def top_template_fn(name, ht): if name == "wikidata": arg = clean_node(wxr, data, ht.get(1, ())) if arg.startswith("Q") or arg.startswith("Lexeme:L"): - data_append(wxr, data, 
"wikidata", arg) + data_append(data, "wikidata", arg) return "" wxr.wtp.debug("UNIMPLEMENTED top-level template: {} {}" .format(name, ht), @@ -3533,7 +3533,7 @@ def multitrans_post_fn(name, ht, text): continue for k, v in top_data.items(): assert isinstance(v, (list, tuple)) - data_extend(wxr, data, k, v) + data_extend(data, k, v) by_lang[data["lang"]].append(data) # XXX this code is clearly out of date. There is no longer a "conjugation" @@ -3601,19 +3601,19 @@ def multitrans_post_fn(name, ht, text): def add_form_of_tags(wxr, template_name, form_of_templates, sense_data): # https://en.wiktionary.org/wiki/Category:Form-of_templates if template_name in form_of_templates: - data_append(wxr, sense_data, "tags", "form-of") + data_append(sense_data, "tags", "form-of") if template_name in ("abbreviation of", "abbr of"): - data_append(wxr, sense_data, "tags", "abbreviation") + data_append(sense_data, "tags", "abbreviation") elif template_name.startswith(("alt ", "alternative")): - data_append(wxr, sense_data, "tags", "alt-of") + data_append(sense_data, "tags", "alt-of") elif template_name.startswith(("female", "feminine")): - data_append(wxr, sense_data, "tags", "feminine") + data_append(sense_data, "tags", "feminine") elif template_name == "initialism of": - data_extend(wxr, sense_data, "tags", ["abbreviation", "initialism"]) + data_extend(sense_data, "tags", ["abbreviation", "initialism"]) elif template_name.startswith("masculine"): - data_append(wxr, sense_data, "tags", "masculine") + data_append(sense_data, "tags", "masculine") elif template_name.startswith("misspelling"): - data_append(wxr, sense_data, "tags", "misspelling") + data_append(sense_data, "tags", "misspelling") elif template_name.startswith(("obsolete", "obs ")): - data_append(wxr, sense_data, "tags", "obsolete") + data_append(sense_data, "tags", "obsolete") diff --git a/src/wiktextract/form_descriptions.py b/src/wiktextract/form_descriptions.py index 6ef230a3..219291b1 100644 --- a/src/wiktextract/form_descriptions.py +++ b/src/wiktextract/form_descriptions.py @@ -1535,25 +1535,25 @@ def check_related(related): dt["ruby"] = ruby if "alt-of" in tags2: check_related(related) - data_extend(wxr, data, "tags", tags1) - data_extend(wxr, data, "tags", tags2) - data_extend(wxr, data, "topics", topics1) - data_extend(wxr, data, "topics", topics2) - data_append(wxr, data, "alt_of", dt) + data_extend(data, "tags", tags1) + data_extend(data, "tags", tags2) + data_extend(data, "topics", topics1) + data_extend(data, "topics", topics2) + data_append(data, "alt_of", dt) elif "form-of" in tags2: check_related(related) - data_extend(wxr, data, "tags", tags1) - data_extend(wxr, data, "tags", tags2) - data_extend(wxr, data, "topics", topics1) - data_extend(wxr, data, "topics", topics2) - data_append(wxr, data, "form_of", dt) + data_extend(data, "tags", tags1) + data_extend(data, "tags", tags2) + data_extend(data, "topics", topics1) + data_extend(data, "topics", topics2) + data_append(data, "form_of", dt) elif "compound-of" in tags2: check_related(related) - data_extend(wxr, data, "tags", tags1) - data_extend(wxr, data, "tags", tags2) - data_extend(wxr, data, "topics", topics1) - data_extend(wxr, data, "topics", topics2) - data_append(wxr, data, "compound", related) + data_extend(data, "tags", tags1) + data_extend(data, "tags", tags2) + data_extend(data, "topics", topics1) + data_extend(data, "topics", topics2) + data_append(data, "compound", related) else: lang = wxr.wtp.section related, final_tags = parse_head_final_tags(wxr, lang, @@ -1570,8 
+1570,8 @@ def check_related(related): form["roman"] = roman if ruby: form["ruby"] = ruby - data_extend(wxr, form, "topics", topics1) - data_extend(wxr, form, "topics", topics2) + data_extend(form, "topics", topics1) + data_extend(form, "topics", topics2) if topics1 or topics2: wxr.wtp.debug("word head form has topics: {}".format(form), sortid="form_descriptions/1233") @@ -1586,22 +1586,22 @@ def check_related(related): continue if (related != titleword or add_all_canonicals or topics1 or topics2 or ruby): - data_extend(wxr, form, "tags", + data_extend(form, "tags", list(sorted(set(tags)))) else: # We won't add canonical form here filtered_tags = list(x for x in tags if x != "canonical") - data_extend(wxr, data, "tags", filtered_tags) + data_extend(data, "tags", filtered_tags) continue else: - data_extend(wxr, form, "tags", list(sorted(set(tags)))) + data_extend(form, "tags", list(sorted(set(tags)))) # Only insert if the form is not already there for old in data.get("forms", ()): if form == old: break else: - data_append(wxr, data, "forms", form) + data_append(data, "forms", form) # If this form had pre-tags that started with "both" or "all", add those # tags also to following related forms that don't have their own tags @@ -1670,7 +1670,7 @@ def parse_word_head(wxr, pos, text, data, is_reconstruction, m = re.search(head_end_re, base) if m: tags = head_end_map[m.group(1).lower()].split() - data_extend(wxr, data, "tags", tags) + data_extend(data, "tags", tags) base = base[:m.start()] # Special case: handle Hán Nôm readings for Vietnamese characters @@ -1767,9 +1767,9 @@ def parse_word_head(wxr, pos, text, data, is_reconstruction, if alt_i > 0: tagsets, topics = decode_tags(" ".join(baseparts)) if not any("error-unknown-tag" in x for x in tagsets): - data_extend(wxr, data, "topics", topics) + data_extend(data, "topics", topics) for tags in tagsets: - data_extend(wxr, data, "tags", tags) + data_extend(data, "tags", tags) continue alt, tags = parse_head_final_tags(wxr, language, alt) @@ -2143,7 +2143,7 @@ def strokes_repl(m): prev_tags = new_prev_tags continue for tags in tagsets: - data_extend(wxr, data, "tags", tags) + data_extend(data, "tags", tags) prev_tags = tagsets following_tags = None @@ -2186,7 +2186,7 @@ def parse_sense_qualifier(wxr, text, data): # .format(semi, cls)) if cls == "tags": tagsets, topics = decode_tags(semi) - data_extend(wxr, data, "topics", topics) + data_extend(data, "topics", topics) # XXX should think how to handle distinct options better, # e.g., "singular and plural genitive"; that can't really be # done with changing the calling convention of this function. 
@@ -2208,7 +2208,7 @@ def parse_sense_qualifier(wxr, text, data): .format(text), sortid="form_descriptions/1831") sense_tags = list(sorted(set(sense_tags))) - data_extend(wxr, data, "tags", sense_tags) + data_extend(data, "tags", sense_tags) def parse_pronunciation_tags(wxr, text, data): @@ -2222,13 +2222,13 @@ def parse_pronunciation_tags(wxr, text, data): notes = [] if cls == "tags": tagsets, topics = decode_tags(text) - data_extend(wxr, data, "topics", topics) + data_extend(data, "topics", topics) for tagset in tagsets: for t in tagset: if " " in t: notes.append(t) else: - data_append(wxr, data, "tags", t) + data_append(data, "tags", t) else: notes.append(text) if notes: @@ -2322,16 +2322,16 @@ def parse_translation_desc(wxr, lang, text, tr): if cls == "tags": tagsets, topics = decode_tags(lst[0]) for t in tagsets: - data_extend(wxr, tr, "tags", t) - data_extend(wxr, tr, "topics", topics) + data_extend(tr, "tags", t) + data_extend(tr, "topics", topics) lst = lst[1:] continue cls = classify_desc(lst[-1]) if cls == "tags": tagsets, topics = decode_tags(lst[-1]) for t in tagsets: - data_extend(wxr, tr, "tags", t) - data_extend(wxr, tr, "topics", topics) + data_extend(tr, "tags", t) + data_extend(tr, "topics", topics) lst = lst[:-1] continue break @@ -2351,14 +2351,14 @@ def parse_translation_desc(wxr, lang, text, tr): if par == text: pass if par == "f": - data_append(wxr, tr, "tags", "feminine") + data_append(tr, "tags", "feminine") elif par == "m": - data_append(wxr, tr, "tags", "masculine") + data_append(tr, "tags", "masculine") elif cls == "tags": tagsets, topics = decode_tags(par) for tags in tagsets: - data_extend(wxr, tr, "tags", tags) - data_extend(wxr, tr, "topics", topics) + data_extend(tr, "tags", tags) + data_extend(tr, "topics", topics) elif cls == "english": # If the text contains any of certain grammatical words, treat it # as a "note" instead of "english" @@ -2398,7 +2398,7 @@ def parse_translation_desc(wxr, lang, text, tr): .format(tr["taxonomic"], par), sortid="form_descriptions/2019") if re.match(r"×[A-Z]", par): - data_append(wxr, tr, "tags", "extinct") + data_append(tr, "tags", "extinct") par = par[1:] tr["taxonomic"] = par elif cls == "other": @@ -2414,7 +2414,7 @@ def parse_translation_desc(wxr, lang, text, tr): # Check for gender indications in suffix text, final_tags = parse_head_final_tags(wxr, lang, text) - data_extend(wxr, tr, "tags", final_tags) + data_extend(tr, "tags", final_tags) # Restore those parts that we did not want to remove (they are often # optional words or words that are always used with the given translation) @@ -2430,10 +2430,10 @@ def parse_translation_desc(wxr, lang, text, tr): roman = tr.get("roman") if roman: if roman.endswith(" f"): - data_append(wxr, tr, "tags", "feminine") + data_append(tr, "tags", "feminine") tr["roman"] = roman[:-2].strip() elif roman.endswith(" m"): - data_append(wxr, tr, "tags", "masculine") + data_append(tr, "tags", "masculine") tr["roman"] = roman[:-2].strip() # If the word now has "english" field but no "roman" field, and diff --git a/src/wiktextract/inflection.py b/src/wiktextract/inflection.py index 2ef3ea51..bca01050 100644 --- a/src/wiktextract/inflection.py +++ b/src/wiktextract/inflection.py @@ -2356,7 +2356,7 @@ def merge_row_and_column_tags(form, some_has_covered_text): dt["tags"] = list(dt.get("tags", [])) # This strange copy-assigning shuffle is preventative black # magic; do not touch lest you invoke deep bugs. 
- data_append(wxr, dt, "tags", "multiword-construction") + data_append(dt, "tags", "multiword-construction") new_ret.append(dt) ret = new_ret @@ -2431,7 +2431,7 @@ def handle_generic_table(wxr, tablecontext, data, else: if "table-tags" not in tags: have_forms.add(fdt) - data_append(wxr, data, "forms", dt) + data_append(data, "forms", dt) def determine_header(wxr, tablecontext, lang, word, pos, table_kind, kind, style, diff --git a/src/wiktextract/linkages.py b/src/wiktextract/linkages.py index 08fe4cd1..5efbbea4 100644 --- a/src/wiktextract/linkages.py +++ b/src/wiktextract/linkages.py @@ -835,14 +835,14 @@ def add(w, r): dt["english"] = english.strip() if taxonomic: if re.match(r"×[A-Z]", taxonomic): - data_append(wxr, dt, "tags", "extinct") + data_append(dt, "tags", "extinct") taxonomic = taxonomic[1:] dt["taxonomic"] = taxonomic if re.match(r"×[A-Z]", w): - data_append(wxr, dt, "tags", "extinct") + data_append(dt, "tags", "extinct") w = w[1:] # Remove × before dead species names if alt and re.match(r"×[A-Z]", alt): - data_append(wxr, dt, "tags", "extinct") + data_append(dt, "tags", "extinct") alt = alt[1:] # Remove × before dead species names if alt and alt.strip() != w: dt["alt"] = alt.strip() @@ -854,7 +854,7 @@ def add(w, r): if dt == old: break else: - data_append(wxr, data, field, dt) + data_append(data, field, dt) # Handle exceptional linkage splits and other linkage # conversions (including expanding to variant forms) diff --git a/src/wiktextract/page.py b/src/wiktextract/page.py index a16fb8ab..957ccd66 100644 --- a/src/wiktextract/page.py +++ b/src/wiktextract/page.py @@ -180,7 +180,7 @@ def inject_linkages(wxr: WiktextractContext, page_data: List[Dict]) -> None: dt["roman"] = term.roman if term.language_variant is not None: dt["language_variant"] = term.language_variant - data_append(wxr, data, term.linkage, dt) + data_append(data, term.linkage, dt) def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None: @@ -210,7 +210,7 @@ def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None: assert data.get(field) is not vals if data.get("alt_of") or data.get("form_of"): continue # Don't add to alt-of/form-of entries - data_extend(wxr, data, field, vals) + data_extend(data, field, vals) continue if len(lst) != 1: continue @@ -224,7 +224,7 @@ def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None: if field in data: v = data[field] del data[field] - data_extend(wxr, senses[0], field, v) + data_extend(senses[0], field, v) # If the last part-of-speech of the last language (i.e., last item in "ret") # has categories or topics not bound to a sense, propagate those @@ -240,7 +240,7 @@ def process_categories(wxr: WiktextractContext, page_data: List[Dict]) -> None: for data in page_data[:-1]: if data.get("form_of") or data.get("alt_of"): continue # Don't add to form_of or alt_of entries - data_extend(wxr, data, field, lst) + data_extend(data, field, lst) # Regexp for matching category tags that start with a language name. # Group 2 will be the language name. 
The category tag should be without @@ -379,9 +379,9 @@ def clean_node_handler_fn(node): if sense_data is not None: # Check for Lua execution error if 'Lua execution error' in v: - data_append(wxr, sense_data, "tags", "error-lua-exec") + data_append(sense_data, "tags", "error-lua-exec") if 'Lua timeout error' in v: - data_append(wxr, sense_data, "tags", "error-lua-timeout") + data_append(sense_data, "tags", "error-lua-timeout") # Capture Category tags if not collect_links: for m in re.finditer( @@ -394,7 +394,7 @@ def clean_node_handler_fn(node): if not cat: continue if not sense_data_has_value(sense_data, "categories", cat): - data_append(wxr, sense_data, "categories", cat) + data_append(sense_data, "categories", cat) else: for m in re.finditer( r"(?is)\[\[:?(\s*([^][|:]+):)?\s*([^]|]+)(\|([^]|]+))?\]\]", @@ -410,7 +410,7 @@ def clean_node_handler_fn(node): if not cat: continue if not sense_data_has_value(sense_data, "categories", cat): - data_append(wxr, sense_data, "categories", cat) + data_append(sense_data, "categories", cat) elif not m.group(1): if m.group(5): ltext = clean_value(wxr, m.group(5)) @@ -431,7 +431,7 @@ def clean_node_handler_fn(node): ltext = ltarget ltuple = (ltext, ltarget) if not sense_data_has_value(sense_data, "links", ltuple): - data_append(wxr, sense_data, "links", ltuple) + data_append(sense_data, "links", ltuple) v = clean_value(wxr, v) # print("After clean_value:", repr(v)) diff --git a/src/wiktextract/pronunciations.py b/src/wiktextract/pronunciations.py index 99cd4fa7..324038a9 100644 --- a/src/wiktextract/pronunciations.py +++ b/src/wiktextract/pronunciations.py @@ -141,7 +141,7 @@ def parse_pronunciation_template_fn(name, ht): # parse_pronunciation_tags(wxr, dial, pron) # if country: # parse_pronunciation_tags(wxr, country, pron) - # data_append(wxr, data, "sounds", pron) + # data_append(data, "sounds", pron) return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" return None @@ -296,7 +296,7 @@ def generate_pron(v, new_parent_hdrs, new_specific_hdrs): if pron: pron["tags"] = list(sorted(pron["tags"])) if pron not in data.get("sounds", ()): - data_append(wxr, data, "sounds", pron) + data_append(data, "sounds", pron) elif "→" in v: vals = re.split("→", v) for v in vals: @@ -314,7 +314,7 @@ def generate_pron(v, new_parent_hdrs, new_specific_hdrs): pron["tags"] = list(sorted(pron["tags"])) if pron not in data.get("sounds", ()): - data_append(wxr, data, "sounds", pron) + data_append(data, "sounds", pron) else: # split alternative pronunciations split # with "," or " / " @@ -335,7 +335,7 @@ def generate_pron(v, new_parent_hdrs, new_specific_hdrs): if pron: pron["tags"] = list(sorted(pron["tags"])) if pron not in data.get("sounds", ()): - data_append(wxr, data, "sounds", pron) + data_append(data, "sounds", pron) else: new_parent_hdrs.append(text) @@ -465,7 +465,7 @@ def split_cleaned_node_on_newlines(contents): if m: pron = {field: m.group(1)} if active_pos: pron["pos"] = active_pos - data_append(wxr, data, "sounds", pron) + data_append(data, "sounds", pron) # have_pronunciations = True continue @@ -477,7 +477,7 @@ def split_cleaned_node_on_newlines(contents): if ending: pron = {"rhymes": ending} if active_pos: pron["pos"] = active_pos - data_append(wxr, data, "sounds", pron) + data_append(data, "sounds", pron) # have_pronunciations = True continue @@ -489,7 +489,7 @@ def split_cleaned_node_on_newlines(contents): if w: pron = {"homophone": w} if active_pos: pron["pos"] = active_pos - data_append(wxr, data, "sounds", pron) + data_append(data, "sounds", pron) 
# have_pronunciations = True continue @@ -503,12 +503,12 @@ def split_cleaned_node_on_newlines(contents): seen.add(w) pron = {"hangeul": w} if active_pos: pron["pos"] = active_pos - data_append(wxr, data, "sounds", pron) + data_append(data, "sounds", pron) # have_pronunciations = True m = re.search(r"\b(Syllabification|Hyphenation): ([^\s,]*)", text) if m: - data_append(wxr, data, "hyphenation", m.group(2)) + data_append(data, "hyphenation", m.group(2)) # have_pronunciations = True # See if it contains a word prefix restricting which forms the @@ -546,7 +546,7 @@ def split_cleaned_node_on_newlines(contents): tag = pron_romanizations[prefix] form = {"form": w, "tags": tag.split()} - data_append(wxr, data, "forms", form) + data_append(data, "forms", form) # Find IPA pronunciations for m in re.finditer(r"(?m)/[^][\n/,]+?/" @@ -576,7 +576,7 @@ def split_cleaned_node_on_newlines(contents): parse_pronunciation_tags(wxr, tagstext, pron) if active_pos: pron["pos"] = active_pos - data_append(wxr, data, "sounds", pron) + data_append(data, "sounds", pron) # have_pronunciations = True # XXX what about {{hyphenation|...}}, {{hyph|...}} @@ -634,7 +634,7 @@ def split_cleaned_node_on_newlines(contents): audio["mp3_url"] = mp3 if active_pos: audio["pos"] = active_pos if audio not in data.get("sounds", ()): - data_append(wxr, data, "sounds", audio) + data_append(data, "sounds", audio) # have_pronunciations = True audios =[] for enpr in enprs: @@ -645,7 +645,7 @@ def split_cleaned_node_on_newlines(contents): if active_pos: pron["pos"] = active_pos if pron not in data.get("sounds", ()): - data_append(wxr, data, "sounds", pron) + data_append(data, "sounds", pron) # have_pronunciations = True enprs = [] diff --git a/src/wiktextract/translations.py b/src/wiktextract/translations.py index 287d41c6..090423f6 100644 --- a/src/wiktextract/translations.py +++ b/src/wiktextract/translations.py @@ -498,8 +498,8 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, if cls == "tags": tagsets2, topics2 = decode_tags(par) for t in tagsets2: - data_extend(wxr, tr, "tags", t) - data_extend(wxr, tr, "topics", topics2) + data_extend(tr, "tags", t) + data_extend(tr, "topics", topics2) part = rest # Check if this part ends with (tags). 
Note that @@ -513,8 +513,8 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, if cls == "tags": tagsets2, topics2 = decode_tags(par) for t in tagsets2: - data_extend(wxr, tr, "tags", t) - data_extend(wxr, tr, "topics", topics2) + data_extend(tr, "tags", t) + data_extend(tr, "topics", topics2) part = rest # Check if this part starts with ": " @@ -530,8 +530,8 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, if cls == "tags": tagsets2, topics2 = decode_tags(par) for t in tagsets2: - data_extend(wxr, tr, "tags", t) - data_extend(wxr, tr, "topics", topics2) + data_extend(tr, "tags", t) + data_extend(tr, "topics", topics2) part = rest elif cls == "english": if re.search(tr_note_re, par): @@ -569,7 +569,7 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, ): if part.endswith(suffix): part = part[:-len(suffix)] - data_append(wxr, tr, "tags", t) + data_append(tr, "tags", t) break # Handle certain prefixes in translations @@ -578,7 +578,7 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, ): if part.startswith(prefix): part = part[len(prefix):] - data_append(wxr, tr, "tags", t) + data_append(tr, "tags", t) break # Skip certain one-character translations entirely @@ -587,7 +587,7 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, continue if "english" in tr and tr["english"] in english_to_tags: - data_extend(wxr, tr, "tags", + data_extend(tr, "tags", english_to_tags[tr["english"]].split()) del tr["english"] @@ -646,7 +646,7 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, # If we have only notes, add as-is if "word" not in tr: - data_append(wxr, data, "translations", tr) + data_append(data, "translations", tr) continue # Split if it contains no spaces @@ -665,7 +665,7 @@ def parse_translation_item_text(wxr, word, data, item, sense, pos_datas, if not alt: continue tr1["word"] = alt - data_append(wxr, data, "translations", tr1) + data_append(data, "translations", tr1) # Return the language name, in case we have subitems return lang From 73c0aec2f2c1baa5fd8ee3155048b78823c74c0a Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 28 Nov 2023 15:53:26 +0800 Subject: [PATCH 3/3] Remove unused code for Chinese Wiktionary from English extractor --- src/wiktextract/config.py | 7 ------- src/wiktextract/extractor/en/page.py | 24 ------------------------ 2 files changed, 31 deletions(-) diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py index 98d44f58..1680902a 100644 --- a/src/wiktextract/config.py +++ b/src/wiktextract/config.py @@ -48,9 +48,6 @@ class WiktionaryConfig: "POS_TYPES", "OTHER_SUBTITLES", "ZH_PRON_TAGS", - "FR_FORM_TABLES", - "DE_FORM_TABLES", - "FORM_OF_TEMPLATES", "analyze_templates", "extract_thesaurus_pages", ) @@ -112,10 +109,6 @@ def __init__( self.data_folder = files("wiktextract") / "data" / dump_file_lang_code self.init_subtitles() self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json") - if dump_file_lang_code == "zh": - self.set_attr_from_json( - "FORM_OF_TEMPLATES", "form_of_templates.json" - ) self.analyze_templates = True # find templates that need pre-expand self.extract_thesaurus_pages = True self.load_edition_settings() diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index ce563da4..3062c838 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -1470,9 +1470,6 @@ def sense_template_fn( v = v.strip() if v and "<" not in v: 
gloss_template_args.add(v) - if wxr.config.dump_file_lang_code == "zh": - add_form_of_tags(wxr, name, - wxr.config.FORM_OF_TEMPLATES, sense_base) return None def extract_link_texts(item): @@ -3596,24 +3593,3 @@ def multitrans_post_fn(name, ht, text): ) x["original_title"] = word return ret - - -def add_form_of_tags(wxr, template_name, form_of_templates, sense_data): - # https://en.wiktionary.org/wiki/Category:Form-of_templates - if template_name in form_of_templates: - data_append(sense_data, "tags", "form-of") - - if template_name in ("abbreviation of", "abbr of"): - data_append(sense_data, "tags", "abbreviation") - elif template_name.startswith(("alt ", "alternative")): - data_append(sense_data, "tags", "alt-of") - elif template_name.startswith(("female", "feminine")): - data_append(sense_data, "tags", "feminine") - elif template_name == "initialism of": - data_extend(sense_data, "tags", ["abbreviation", "initialism"]) - elif template_name.startswith("masculine"): - data_append(sense_data, "tags", "masculine") - elif template_name.startswith("misspelling"): - data_append(sense_data, "tags", "misspelling") - elif template_name.startswith(("obsolete", "obs ")): - data_append(sense_data, "tags", "obsolete")
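
---

Reviewer note: below is a minimal usage sketch (not part of the patches) of
the API this series converges on. After PATCH 1/3 and 2/3, `data_append()`
takes no `wxr` argument and works on either a plain dict or an object that
exposes the key as a list attribute (e.g. a pydantic model); `data_extend()`
still asserts that `data` is a dict, so it is only exercised with a dict
here. The `Sense` model is hypothetical and exists only for illustration.

    from typing import List

    from pydantic import BaseModel

    from wiktextract.datautils import data_append, data_extend
    from wiktextract.page import sense_data_has_value


    class Sense(BaseModel):
        # Hypothetical stand-in for an extractor's pydantic sense model.
        tags: List[str] = []
        glosses: List[str] = []


    # Dict path: the value is stored under data[key].
    sense_dict = {}
    data_append(sense_dict, "tags", "obsolete")
    data_extend(sense_dict, "glosses", ["a gloss", "another gloss"])
    assert sense_data_has_value(sense_dict, "tags", "obsolete")

    # Attribute path: the value is read with getattr() and stored with setattr().
    sense_model = Sense()
    data_append(sense_model, "tags", "obsolete")
    assert sense_model.tags == ["obsolete"]
    assert sense_data_has_value(sense_model, "tags", "obsolete")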