Skip to content

Commit

Permalink
Implemented checks for valid output data format (in wiktionary.py).
Browse files Browse the repository at this point in the history
The checks generate debug messages.

Fixed a couple of typos in README.md.
  • Loading branch information
tatuylonen committed Feb 23, 2024
1 parent efa31c5 commit d135b89
Show file tree
Hide file tree
Showing 2 changed files with 339 additions and 3 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -712,14 +712,14 @@ sense is a dictionary that may contain the following keys (among others, and mor
* ``hypernyms`` - sense-disambiguated hypernym linkages for the word (see below)
* ``holonyms`` - sense-disambiguated linkages indicating being part of something (see below) (not systematically encoded)
* ``meronyms`` - sense-disambiguated linkages indicating having a part (see below) (fairly rare)
* ``coordianate_terms`` - sense-disambiguated coordinate_terms linkages (see below)
* ``coordinate_terms`` - sense-disambiguated coordinate_terms linkages (see below)
* ``derived`` - sense-disambiguated derived word linkages for the word (see below)
* ``related`` - sense-disambiguated related word linkages for the word (see below)
* ``senseid`` - list of textual identifiers collected for the sense. If there is a QID for the entry (e.g., Q123), those are stored in the ``wikidata`` field.
* ``wikidata`` - list of QIDs (e.g., Q123) for the sense
* ``wikipedia`` - list of Wikipedia page titles (with optional language code prefix)
* ``examples`` - list of usage examples, each example being a dictionary with ``text`` field containing the example text, optional ``ref`` field containing a source reference, optional ``english`` field containing English translation, optional ``type`` field containing example type (currently ``example`` or ``quotation`` if present), optional ``roman`` field containing romanization (for some languages written in non-Latin scripts), and optional (rare) ``note`` field contains English-language parenthesized note from the beginning of a non-english example.
* ``english`` - if the word sense has a qualifier that could not be parsed, that qualifier is put in this field (rare). Most qualifiers parsed into ``tags`` and/or ``topics``. The gloss with the qualifier still present can be found in ``raw_glosses``.
* ``english`` - if the word sense has a qualifier that could not be parsed, that qualifier is put in this field (rare). Most qualifiers are parsed into ``tags`` and/or ``topics``. The gloss with the qualifier still present can be found in ``raw_glosses``.

### Pronunciation

Expand Down
338 changes: 337 additions & 1 deletion src/wiktextract/wiktionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# from wiktionary. This file contains code to uncompress the Wiktionary
# dump file and to separate it into individual pages.
#
# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE and https://ylonen.org

import io
import json
Expand Down Expand Up @@ -162,6 +162,341 @@ def init_worker_process(worker_func, wxr: WiktextractContext) -> None:
worker_func.wxr = wxr


def check_error(
wxr: WiktextractContext,
dt: dict,
word: str,
lang: str,
pos: str,
msg: str
) -> None:
"""Formats and outputs an error message about data format checks."""
msg += ": " + json.dumps(dt, sort_keys=True)
prefix = word or ""
if lang:
prefix += "/" + lang
if pos:
prefix += "/" + pos
if prefix:
msg = prefix + ": " + msg
print(msg)
config = wxr.config
dt = { "msg": msg,
"trace": "",
"title": word,
"section": lang,
"subsection": pos,
"called_from": "XYZunsorted",
"path": [] }
config.debugs.append(dt)


def check_tags(
wxr: WiktextractContext,
dt: dict,
word: str,
lang: str,
pos: str,
item: dict
) -> None:
assert isinstance(item, dict)
tags = dt.get("tags")
if tags is None:
return
if not isinstance(tags, str):
check_error(wxr, dt, word, lang, pos,
"\"tags\" field value must be a string: {}"
.format(repr(tags)))
return
if tags.find(" ") >= 0:
check_error(wxr, dt, word, lang, pos,
"\"tags\" field should not contain duplicate spaces "
"(perhaps an empty tag was added?): {}"
.format(repr(tags)))
# XXX enable the following later (currently too many bogus tags in
# non-English editions). Tag values should be standardized across
# editions, except for uppercase tags (e.g., regional variants).
if wtp.lang_code in ("en",): # Check edition
for tag in tags.split():
if tag not in valid_tags:
check_error(wxr, dt, word, lang, pos,
"invalid tag {} not in valid_tags (or "
"uppercase_tags)"
.format(repr(tag)))

def check_str_fields(
wxr: WiktextractContext,
dt: dict,
word: str,
lang: str,
pos: str,
item: dict,
fields: list[str],
mandatory: bool = False,
empty_ok: bool = False
) -> None:
"""Checks that each of the listed fields contains a non-empty string.
Non-existent fields are ok unless ``mandatory`` is True."""
assert isinstance(item, dict)
for field in fields:
v = item.get(field)
if v is None:
if mandatory:
check_error(wxr, dt, word, lang, pos,
"{!r} should be a non-empty string (it is a "
"mandatory field): {}"
.format(field,
json.dumps(item, sort_keys=True)))
continue
if not isinstance(v, str):
check_error(wxr, dt, word, lang, pos,
"{!r} should be a non-empty string: {}"
.format(field,
json.dumps(item, sort_keys=True)))
if not v and not empty_ok:
check_error(wxr, dt, word, lang, pos,
"{!r} should contain a non-empty string: {}"
.format(field, json.dumps(item, sort_keys=True)))



def check_dict_list_fields(
wxr: WiktextractContext,
dt: dict,
word: str,
lang: str,
pos: str,
item: dict,
fields: list[str]
) -> bool:
"""Checks that each listed field, if present, is a list of dicts."""
assert isinstance(item, dict)
for field in fields:
lst = item.get(field)
if lst is None:
continue
if not isinstance(lst, (list, tuple)):
check_error(wxr, dt, word, lang, pos,
"{!r} should be a list of dicts: {}"
.format(field,
json.dumps(lst, sort_keys=True)))
return False
for x in lst:
if not isinstance(x, dict):
check_error(wxr, dt, word, lang, pos,
"{!r} should be a list of dicts: {}"
.format(field,
json.dumps(lst, sort_keys=True)))
return False
return True


def check_str_list_fields(
wxr: WiktextractContext,
dt: dict,
word: str,
lang: str,
pos: str,
item: dict,
fields: list[str]
) -> None:
"""Checks that each of the listed fields contains a list of non-empty
strings or is not present."""
assert isinstance(item, dict)
for field in fields:
lst = item.get(field)
if lst is None:
continue
if not isinstance(lst, (list, tuple)):
check_error(wxr, dt, word, lang, pos,
"{!r} should be a list of dicts: {}"
.format(field, json.dumps(item, sort_keys=True)))
continue
for x in lst:
if not isinstance(x, str) or not x:
check_error(wxr, dt, word, lang, pos,
"{!r} should be a list of non-empty strings: {}"
.format(field,
json.dumps(item, sort_keys=True)))
break


def check_json_data(
wxr: WiktextractContext,
dt: dict
) -> None:
"""Performs some basic checks on the generated data."""
word = dt.get("word")
if not word:
check_error(wxr, dt, None, None, None,
"missing \"word\" field in data")
return
lang = dt.get("lang")
if not lang:
check_error(wxr, dt, word, None, None,
"missing \"lang\" field in data")
return
pos = dt.get("pos")
if not pos:
check_error(wxr, dt, word, lang, pos,
"missing \"pos\" field in data")
return
if not dt.get("lang_code"):
check_error(wxr, dt, word, lang, pos,
"missing \"lang_code\" field in data")
check_tags(wxr, dt, word, lang, pos, dt)
check_str_fields(wxr, dt, word, lang, pos, dt,
["etymology_text"])
num = dt.get("etymology_number")
if num is not None and not isinstance(num, int):
check_error(wxr, dt, word, lang, pos,
"\"etymology_number\" must be an int")
# Check that certain fields, if present, contain lists of dicts
if not check_dict_list_fields(wxr, dt, word, lang, pos, dt,
["forms", "senses",
"synonyms", "antonyms", "hypernyms",
"holonyms", "meronyms",
"coordinate_terms", "derived",
"related",
"sounds", "translations",
"descendants", "etymology_templates",
"head_templates",
"inflection_templates"]):
return # Avoid further processing because it would cause type errors
# Check the "forms" field
forms = dt.get("forms") or []
for form in forms:
check_str_fields(wxr, dt, word, lang, pos, form, ["form"],
mandatory=True)
check_tags(wxr, dt, word, lang, pos, form)
check_str_list_fields(wxr, dt, word, lang, pos, dt,
["categories", "topics", "wikidata",
"wikipedia"])
# Check the "senses" field
senses = dt.get("senses") or []
if not senses:
check_error(wxr, dt, word, lang, pos,
"missing \"senses\" in data (must have at least one "
"sense, add empty sense with \"no-gloss\" tag if none "
"otherwise available)")
return
for sense in senses:
check_str_list_fields(wxr, dt, word, lang, pos, sense,
["glosses", "raw_glosses"])
# Extra check: should have no-gloss tag if no glosses
for field in ("glosses", "raw_glosses"):
glosses = sense.get(field) or []
if (not glosses and
isinstance(sense.get("tags"), str) and
"no-gloss" not in sense.get("tags", "").split()):
check_error(wxr, dt, word, lang, pos,
"{!r} should have at least one gloss or "
"\"no-gloss\" in \"tags\""
.format(field))
continue
check_tags(wxr, dt, word, lang, pos, sense)
check_str_list_fields(wxr, dt, word, lang, pos, sense,
["categories", "topics", "wikidata",
"wikipedia"])
check_str_fields(wxr, dt, word, lang, pos, sense,
["english"])
if not check_dict_list_fields(wxr, dt, word, lang, pos, sense,
["alt_of", "form_of",
"synonyms", "antonyms", "hypernyms",
"holonyms", "meronyms",
"coordinate_terms", "derived",
"related"]):
continue
for field in ("alt_of", "form_of"):
lst = sense.get(field)
if lst is None:
continue
for item in lst:
check_str_fields(wxr, dt, word, lang, pos, item,
["word"], mandatory=True)
check_str_fields(wxr, dt, word, lang, pos, item,
["extra"], mandatory=False)

for field in ("synonyms", "antonyms", "hypernyms", "holonyms",
"meronyms", "coordinate_terms", "derived", "related"):
lst = sense.get(field)
if lst is None:
continue
for item in lst:
check_str_fields(wxr, dt, word, lang, pos, item,
["word"], mandatory=True)
check_tags(wxr, dt, word, lang, pos, item)
check_str_fields(wxr, dt, word, lang, pos, item,
["english", "roman", "sense", "taxonomic"],
mandatory=False, empty_ok=True)
check_str_list_fields(wxr, dt, word, lang, pos, item,
["topics"])
# Check the "sounds" field
# We will permit having any number of different types (ipa, enpr, etc)
# in the same sound entry or in different sound entries.
sounds = dt.get("sounds") or []
for item in sounds:
check_str_fields(wxr, dt, word, lang, pos, item,
["ipa", "enpr", "audio", "ogg_url", "mp3_url",
"audio-ipa", "text"])
check_tags(wxr, dt, word, lang, pos, item)
check_str_list_fields(wxr, dt, word, lang, pos, item,
["homophones", "hyphenation"])
# Check the "translations" field
translations = dt.get("translations") or []
for item in translations:
check_str_fields(wxr, dt, word, lang, pos, item, ["word"],
mandatory=True)
check_tags(wxr, dt, word, lang, pos, item)
check_str_fields(wxr, dt, word, lang, pos, item,
["alt", "code", "english", "lang", "note", "roman",
"sense", "taxonomic"])
if not item.get("code") and not item.get("lang"):
check_error(wxr, dt, word, lang, pos,
"\"translations\" items must contain at least one "
"of \"code\" and \"lang\" (normally both): {}"
.format(json.dumps(item, sort_keys=True)))
# Check the "etymology_templates", "head_templates", and
# "inflection_templates" fields
for field in ["etymology_templates", "head_templates",
"inflection_templates"]:
lst = dt.get(field)
if lst is None:
continue
for item in lst:
check_str_fields(wxr, dt, word, lang, pos, item, ["name"],
mandatory=True)
check_str_fields(wxr, dt, word, lang, pos, item, ["expansion"],
mandatory=False)
args = item.get("args")
if args is None:
continue
if not isinstance(args, dict):
check_error(wxr, dt, word, lang, pos,
"{!r} item \"args\" value must be a dict: {}"
.format(field, json.dumps(args, sort_keys=True)))
continue
for k, v in args.items():
if not isinstance(k, str) or not isinstance(v, str):
check_error(wxr, dt, word, lang, pos,
"{!r} item \"args\" must be a dict with "
"string keys and values: {}"
.format(field,
json.dumps(args, sort_keys=True)))
continue
# Check the "descendants" field
descendants = dt.get("descendants") or []
for item in descendants:
check_str_fields(wxr, dt, word, lang, pos, item, ["text"])
depth = item.get("depth")
if depth is not None and not isinstance(depth, int):
check_error(wxr, dt, word, lang, pos,
"\"descentants\" field \"depth\" must be an int")
check_dict_list_fields(wxr, dt, word, lang, pos, item, ["templates"])
# XXX should check that they are valid templates, perhaps turn
# template checking code above into a function


def reprocess_wiktionary(
wxr: WiktextractContext,
num_processes: Optional[int],
Expand Down Expand Up @@ -205,6 +540,7 @@ def reprocess_wiktionary(
):
wxr.config.merge_return(wtp_stats)
for dt in page_data:
check_json_data(wxr, dt)
write_json_data(dt, out_f, human_readable)
word = dt.get("word")
lang_code = dt.get("lang_code")
Expand Down

1 comment on commit d135b89

@xxyzz
Copy link
Collaborator

@xxyzz xxyzz commented on d135b89 Feb 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO some of the tests probably are not needed for non-English extractors because they use pydantic models, invalid types or fields can't be created.

Please sign in to comment.