Implemented checks for valid output data format (in wiktionary.py).

The checks generate debug messages. Fixed a couple of typos in README.md.
tatuylonen · Feb 23, 2024 · d135b89 · d135b89 · xxyzz · Feb 23, 2024
1 parent efa31c5
commit d135b89
Show file tree

Hide file tree

Showing 2 changed files with 339 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -712,14 +712,14 @@ sense is a dictionary that may contain the following keys (among others, and mor
 * ``hypernyms`` - sense-disambiguated hypernym linkages for the word (see below)
 * ``holonyms`` - sense-disambiguated linkages indicating being part of something (see below) (not systematically encoded)
 * ``meronyms`` - sense-disambiguated linkages indicating having a part (see below) (fairly rare)
-* ``coordianate_terms`` - sense-disambiguated coordinate_terms linkages (see below)
+* ``coordinate_terms`` - sense-disambiguated coordinate_terms linkages (see below)
 * ``derived`` - sense-disambiguated derived word linkages for the word (see below)
 * ``related`` - sense-disambiguated related word linkages for the word (see below)
 * ``senseid`` - list of textual identifiers collected for the sense.  If there is a QID for the entry (e.g., Q123), those are stored in the ``wikidata`` field.
 * ``wikidata`` - list of QIDs (e.g., Q123) for the sense
 * ``wikipedia`` - list of Wikipedia page titles (with optional language code prefix)
 * ``examples`` - list of usage examples, each example being a dictionary with ``text`` field containing the example text, optional ``ref`` field containing a source reference, optional ``english`` field containing English translation, optional ``type`` field containing example type (currently ``example`` or ``quotation`` if present), optional ``roman`` field containing romanization (for some languages written in non-Latin scripts), and optional (rare) ``note`` field contains English-language parenthesized note from the beginning of a non-english example.
-* ``english`` - if the word sense has a qualifier that could not be parsed, that qualifier is put in this field (rare).  Most qualifiers parsed into ``tags`` and/or ``topics``.  The gloss with the qualifier still present can be found in ``raw_glosses``.
+* ``english`` - if the word sense has a qualifier that could not be parsed, that qualifier is put in this field (rare).  Most qualifiers are parsed into ``tags`` and/or ``topics``.  The gloss with the qualifier still present can be found in ``raw_glosses``.
 
 ### Pronunciation
 

diff --git a/src/wiktextract/wiktionary.py b/src/wiktextract/wiktionary.py
@@ -2,7 +2,7 @@
 # from wiktionary.  This file contains code to uncompress the Wiktionary
 # dump file and to separate it into individual pages.
 #
-# Copyright (c) 2018-2022 Tatu Ylonen.  See file LICENSE and https://ylonen.org
+# Copyright (c) 2018-2022, 2024 Tatu Ylonen.  See file LICENSE and https://ylonen.org
 
 import io
 import json
@@ -162,6 +162,341 @@ def init_worker_process(worker_func, wxr: WiktextractContext) -> None:
     worker_func.wxr = wxr
 
 
+def check_error(
+        wxr: WiktextractContext,
+        dt: dict,
+        word: str,
+        lang: str,
+        pos: str,
+        msg: str
+) -> None:
+    """Formats and outputs an error message about data format checks."""
+    msg += ": " + json.dumps(dt, sort_keys=True)
+    prefix = word or ""
+    if lang:
+        prefix += "/" + lang
+    if pos:
+        prefix += "/" + pos
+    if prefix:
+        msg = prefix + ": " + msg
+    print(msg)
+    config = wxr.config
+    dt = { "msg": msg,
+           "trace": "",
+           "title": word,
+           "section": lang,
+           "subsection": pos,
+           "called_from": "XYZunsorted",
+           "path": [] }
+    config.debugs.append(dt)
+
+
+def check_tags(
+        wxr: WiktextractContext,
+        dt: dict,
+        word: str,
+        lang: str,
+        pos: str,
+        item: dict
+) -> None:
+    assert isinstance(item, dict)
+    tags = dt.get("tags")
+    if tags is None:
+        return
+    if not isinstance(tags, str):
+        check_error(wxr, dt, word, lang, pos,
+                    "\"tags\" field value must be a string: {}"
+                    .format(repr(tags)))
+        return
+    if tags.find("  ") >= 0:
+        check_error(wxr, dt, word, lang, pos,
+                    "\"tags\" field should not contain duplicate spaces "
+                    "(perhaps an empty tag was added?): {}"
+                    .format(repr(tags)))
+    # XXX enable the following later (currently too many bogus tags in
+    # non-English editions).  Tag values should be standardized across
+    # editions, except for uppercase tags (e.g., regional variants).
+    if wtp.lang_code in ("en",):  # Check edition
+        for tag in tags.split():
+            if tag not in valid_tags:
+                check_error(wxr, dt, word, lang, pos,
+                            "invalid tag {} not in valid_tags (or "
+                            "uppercase_tags)"
+                            .format(repr(tag)))
+
+def check_str_fields(
+        wxr: WiktextractContext,
+        dt: dict,
+        word: str,
+        lang: str,
+        pos: str,
+        item: dict,
+        fields: list[str],
+        mandatory: bool = False,
+        empty_ok: bool = False
+) -> None:
+    """Checks that each of the listed fields contains a non-empty string.
+    Non-existent fields are ok unless ``mandatory`` is True."""
+    assert isinstance(item, dict)
+    for field in fields:
+        v = item.get(field)
+        if v is None:
+            if mandatory:
+                check_error(wxr, dt, word, lang, pos,
+                            "{!r} should be a non-empty string (it is a "
+                            "mandatory field): {}"
+                            .format(field,
+                                    json.dumps(item, sort_keys=True)))
+            continue
+        if not isinstance(v, str):
+            check_error(wxr, dt, word, lang, pos,
+                        "{!r} should be a non-empty string: {}"
+                        .format(field,
+                                json.dumps(item, sort_keys=True)))
+        if not v and not empty_ok:
+            check_error(wxr, dt, word, lang, pos,
+                        "{!r} should contain a non-empty string: {}"
+                        .format(field, json.dumps(item, sort_keys=True)))
+
+
+
+def check_dict_list_fields(
+        wxr: WiktextractContext,
+        dt: dict,
+        word: str,
+        lang: str,
+        pos: str,
+        item: dict,
+        fields: list[str]
+) -> bool:
+    """Checks that each listed field, if present, is a list of dicts."""
+    assert isinstance(item, dict)
+    for field in fields:
+        lst = item.get(field)
+        if lst is None:
+            continue
+        if not isinstance(lst, (list, tuple)):
+            check_error(wxr, dt, word, lang, pos,
+                        "{!r} should be a list of dicts: {}"
+                        .format(field,
+                                json.dumps(lst, sort_keys=True)))
+            return False
+        for x in lst:
+            if not isinstance(x, dict):
+                check_error(wxr, dt, word, lang, pos,
+                            "{!r} should be a list of dicts: {}"
+                            .format(field,
+                                    json.dumps(lst, sort_keys=True)))
+                return False
+    return True
+
+
+def check_str_list_fields(
+        wxr: WiktextractContext,
+        dt: dict,
+        word: str,
+        lang: str,
+        pos: str,
+        item: dict,
+        fields: list[str]
+) -> None:
+    """Checks that each of the listed fields contains a list of non-empty
+    strings or is not present."""
+    assert isinstance(item, dict)
+    for field in fields:
+        lst = item.get(field)
+        if lst is None:
+            continue
+        if not isinstance(lst, (list, tuple)):
+            check_error(wxr, dt, word, lang, pos,
+                        "{!r} should be a list of dicts: {}"
+                        .format(field, json.dumps(item, sort_keys=True)))
+            continue
+        for x in lst:
+            if not isinstance(x, str) or not x:
+                check_error(wxr, dt, word, lang, pos,
+                            "{!r} should be a list of non-empty strings: {}"
+                            .format(field,
+                                    json.dumps(item, sort_keys=True)))
+                break
+
+
+def check_json_data(
+        wxr: WiktextractContext,
+        dt: dict
+) -> None:
+    """Performs some basic checks on the generated data."""
+    word = dt.get("word")
+    if not word:
+        check_error(wxr, dt, None, None, None,
+                    "missing \"word\" field in data")
+        return
+    lang = dt.get("lang")
+    if not lang:
+        check_error(wxr, dt, word, None, None,
+                    "missing \"lang\" field in data")
+        return
+    pos = dt.get("pos")
+    if not pos:
+        check_error(wxr, dt, word, lang, pos,
+                    "missing \"pos\" field in data")
+        return
+    if not dt.get("lang_code"):
+        check_error(wxr, dt, word, lang, pos,
+                    "missing \"lang_code\" field in data")
+    check_tags(wxr, dt, word, lang, pos, dt)
+    check_str_fields(wxr, dt, word, lang, pos, dt,
+                     ["etymology_text"])
+    num = dt.get("etymology_number")
+    if num is not None and not isinstance(num, int):
+        check_error(wxr, dt, word, lang, pos,
+                    "\"etymology_number\" must be an int")
+    # Check that certain fields, if present, contain lists of dicts
+    if not check_dict_list_fields(wxr, dt, word, lang, pos, dt,
+                                  ["forms", "senses",
+                                   "synonyms", "antonyms", "hypernyms",
+                                   "holonyms", "meronyms",
+                                   "coordinate_terms", "derived",
+                                   "related",
+                                   "sounds", "translations",
+                                   "descendants", "etymology_templates",
+                                   "head_templates",
+                                   "inflection_templates"]):
+        return  # Avoid further processing because it would cause type errors
+    # Check the "forms" field
+    forms = dt.get("forms") or []
+    for form in forms:
+        check_str_fields(wxr, dt, word, lang, pos, form, ["form"],
+                         mandatory=True)
+        check_tags(wxr, dt, word, lang, pos, form)
+    check_str_list_fields(wxr, dt, word, lang, pos, dt,
+                          ["categories", "topics", "wikidata",
+                           "wikipedia"])
+    # Check the "senses" field
+    senses = dt.get("senses") or []
+    if not senses:
+        check_error(wxr, dt, word, lang, pos,
+                    "missing \"senses\" in data (must have at least one "
+                    "sense, add empty sense with \"no-gloss\" tag if none "
+                    "otherwise available)")
+        return
+    for sense in senses:
+        check_str_list_fields(wxr, dt, word, lang, pos, sense,
+                              ["glosses", "raw_glosses"])
+        # Extra check: should have no-gloss tag if no glosses
+        for field in ("glosses", "raw_glosses"):
+            glosses = sense.get(field) or []
+            if (not glosses and
+                isinstance(sense.get("tags"), str) and
+                "no-gloss" not in sense.get("tags", "").split()):
+                check_error(wxr, dt, word, lang, pos,
+                            "{!r} should have at least one gloss or "
+                            "\"no-gloss\" in \"tags\""
+                            .format(field))
+                continue
+        check_tags(wxr, dt, word, lang, pos, sense)
+        check_str_list_fields(wxr, dt, word, lang, pos, sense,
+                              ["categories", "topics", "wikidata",
+                               "wikipedia"])
+        check_str_fields(wxr, dt, word, lang, pos, sense,
+                         ["english"])
+        if not check_dict_list_fields(wxr, dt, word, lang, pos, sense,
+                                      ["alt_of", "form_of",
+                                       "synonyms", "antonyms", "hypernyms",
+                                       "holonyms", "meronyms",
+                                       "coordinate_terms", "derived",
+                                       "related"]):
+            continue
+        for field in ("alt_of", "form_of"):
+            lst = sense.get(field)
+            if lst is None:
+                continue
+            for item in lst:
+                check_str_fields(wxr, dt, word, lang, pos, item,
+                                 ["word"], mandatory=True)
+                check_str_fields(wxr, dt, word, lang, pos, item,
+                                 ["extra"], mandatory=False)
+
+        for field in ("synonyms", "antonyms", "hypernyms", "holonyms",
+                      "meronyms", "coordinate_terms", "derived", "related"):
+            lst = sense.get(field)
+            if lst is None:
+                continue
+            for item in lst:
+                check_str_fields(wxr, dt, word, lang, pos, item,
+                                 ["word"], mandatory=True)
+                check_tags(wxr, dt, word, lang, pos, item)
+                check_str_fields(wxr, dt, word, lang, pos, item,
+                                 ["english", "roman", "sense", "taxonomic"],
+                                 mandatory=False, empty_ok=True)
+                check_str_list_fields(wxr, dt, word, lang, pos, item,
+                                      ["topics"])
+    # Check the "sounds" field
+    # We will permit having any number of different types (ipa, enpr, etc)
+    # in the same sound entry or in different sound entries.
+    sounds = dt.get("sounds") or []
+    for item in sounds:
+        check_str_fields(wxr, dt, word, lang, pos, item,
+                         ["ipa", "enpr", "audio", "ogg_url", "mp3_url",
+                          "audio-ipa", "text"])
+        check_tags(wxr, dt, word, lang, pos, item)
+        check_str_list_fields(wxr, dt, word, lang, pos, item,
+                              ["homophones", "hyphenation"])
+    # Check the "translations" field
+    translations = dt.get("translations") or []
+    for item in translations:
+        check_str_fields(wxr, dt, word, lang, pos, item, ["word"],
+                         mandatory=True)
+        check_tags(wxr, dt, word, lang, pos, item)
+        check_str_fields(wxr, dt, word, lang, pos, item,
+                         ["alt", "code", "english", "lang", "note", "roman",
+                          "sense", "taxonomic"])
+        if not item.get("code") and not item.get("lang"):
+            check_error(wxr, dt, word, lang, pos,
+                        "\"translations\" items must contain at least one "
+                        "of \"code\" and \"lang\" (normally both): {}"
+                        .format(json.dumps(item, sort_keys=True)))
+    # Check the "etymology_templates", "head_templates", and
+    # "inflection_templates" fields
+    for field in ["etymology_templates", "head_templates",
+                  "inflection_templates"]:
+        lst = dt.get(field)
+        if lst is None:
+            continue
+        for item in lst:
+            check_str_fields(wxr, dt, word, lang, pos, item, ["name"],
+                             mandatory=True)
+            check_str_fields(wxr, dt, word, lang, pos, item, ["expansion"],
+                             mandatory=False)
+            args = item.get("args")
+            if args is None:
+                continue
+            if not isinstance(args, dict):
+                check_error(wxr, dt, word, lang, pos,
+                            "{!r} item \"args\" value must be a dict: {}"
+                            .format(field, json.dumps(args, sort_keys=True)))
+                continue
+            for k, v in args.items():
+                if not isinstance(k, str) or not isinstance(v, str):
+                    check_error(wxr, dt, word, lang, pos,
+                                "{!r} item \"args\" must be a dict with "
+                                "string keys and values: {}"
+                                .format(field,
+                                        json.dumps(args, sort_keys=True)))
+                continue
+    # Check the "descendants" field
+    descendants = dt.get("descendants") or []
+    for item in descendants:
+        check_str_fields(wxr, dt, word, lang, pos, item, ["text"])
+        depth = item.get("depth")
+        if depth is not None and not isinstance(depth, int):
+            check_error(wxr, dt, word, lang, pos,
+                        "\"descentants\" field \"depth\" must be an int")
+        check_dict_list_fields(wxr, dt, word, lang, pos, item, ["templates"])
+        # XXX should check that they are valid templates, perhaps turn
+        # template checking code above into a function
+
+
 def reprocess_wiktionary(
     wxr: WiktextractContext,
     num_processes: Optional[int],
@@ -205,6 +540,7 @@ def reprocess_wiktionary(
         ):
             wxr.config.merge_return(wtp_stats)
             for dt in page_data:
+                check_json_data(wxr, dt)
                 write_json_data(dt, out_f, human_readable)
                 word = dt.get("word")
                 lang_code = dt.get("lang_code")