diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 041f076e..89da35f1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -29,16 +29,16 @@ jobs: cache: 'pip' - run: | python -m pip install -U pip - python -m pip install --use-pep517 '.[dev]' - - run: make test + python -m pip install --use-pep517 -e '.[dev]' + - run: make test_coverage env: PYTHONWARNINGS: default - - name: Remove huge file taxondata_py.html - run: rm -f htmlcov/*_taxondata_py.html + - run: | + make coverage_report + make github_pages REPO=${{ github.repository }} SHA=${{ github.sha }} + if: github.ref_name == 'master' && matrix.python-version == '3.12' - uses: actions/upload-pages-artifact@v2 if: github.ref_name == 'master' && matrix.python-version == '3.12' - with: - path: htmlcov deploy: if: github.ref_name == 'master' diff --git a/.gitignore b/.gitignore index a9b02002..1271ff87 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,6 @@ usertools/data/ bac-wikt-* pagesbac/ wikt-db* + +# GitHub Pages +_site diff --git a/Makefile b/Makefile index be49138b..7ca5dfcb 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,20 @@ -# Run "make test" to run tests (with coverage analysis left in ./htmlcov) +# Run "make test" to run tests # Run "make clean" to remove automatically generated files +REPO ?= tatuylonen/wiktextract +SHA ?= HEAD test: - rm -rf .coverage htmlcov - python -m nose2 --output-buffer --pretty-assert --with-coverage --coverage-report=html -quicktest: - python -m nose2 --output-buffer --pretty-assert + python -m unittest discover -b -s tests +test_coverage: + python -m coverage erase + python -m coverage run -m unittest discover -b -s tests +coverage_report: + python -m coverage combine + python -m coverage html +github_pages: + python tools/generate_schema.py + cp json_schema/*.json _site + python tools/github_pages.py $(REPO) $(SHA) clean: - rm -rf __pycache__ - rm -rf .coverage* htmlcov* + python -m coverage erase + rm -rf __pycache__ _site diff --git a/README.md b/README.md index b4aff931..bf3aec7c 100644 --- a/README.md +++ b/README.md @@ -322,13 +322,11 @@ python -m pip install -U pip python -m pip install --use-pep517 . ``` -This software requires Python 3. - ### Running tests This package includes tests written using the `unittest` framework. -They can be run using, for example, `nose2`, which can be installed -using `python -m pip install --use-pep517 -e ".[dev]"`. +The test dependencies can be installed with command +`python -m pip install --use-pep517 -e ".[dev]"`. To run the tests, use the following command in the top-level directory: @@ -358,22 +356,6 @@ updated regularly with the latest Wiktionary dump. Using the pre-extracted data may be the easiest option unless you have special needs or want to modify the code. -### Installing and running tests on Windows with VS Code - -Tested with Python 3.9.4. - -- Create [a Python virtual environment](https://code.visualstudio.com/docs/python/environments#_creating-environments) -(venv) in the VS Code workspace with the cloned repo. It should automatically install the package. - -- Open a new terminal. It should be PowerShell. You may need to [fix terminal permissions](https://stackoverflow.com/questions/56199111/visual-studio-code-cmd-error-cannot-be-loaded-because-running-scripts-is-disabl/67420296#67420296) -in order for it to pick up the virtual environment correclty. - -- In the terminal run this command: - -``` -py -m nose2 -B -``` - ## Using the command-line tool The ``wiktwords`` script is the easiest way to extract data from diff --git a/json_schema/de.json b/json_schema/de.json deleted file mode 100644 index a5d645d4..00000000 --- a/json_schema/de.json +++ /dev/null @@ -1,932 +0,0 @@ -{ - "$defs": { - "Example": { - "additionalProperties": false, - "properties": { - "ref": { - "anyOf": [ - { - "$ref": "#/$defs/Reference" - }, - { - "type": "null" - } - ], - "default": null, - "description": "" - }, - "text": { - "default": null, - "description": "Example usage sentence", - "title": "Text", - "type": "string" - } - }, - "title": "Example", - "type": "object" - }, - "Reference": { - "additionalProperties": false, - "properties": { - "accessdate": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Date of access of online reference", - "title": "Accessdate" - }, - "author": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Author's name", - "title": "Author" - }, - "collection": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Name of collection that reference was published in", - "title": "Collection" - }, - "comment": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Comment on the reference", - "title": "Comment" - }, - "date": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Date of publication", - "title": "Date" - }, - "day": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Day of publication", - "title": "Day" - }, - "edition": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Edition number", - "title": "Edition" - }, - "editor": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Editor", - "title": "Editor" - }, - "isbn": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "ISBN number", - "title": "Isbn" - }, - "month": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Month of publication", - "title": "Month" - }, - "number": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Issue number", - "title": "Number" - }, - "pages": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Page numbers", - "title": "Pages" - }, - "place": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Place of publication", - "title": "Place" - }, - "publisher": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Published by", - "title": "Publisher" - }, - "raw_ref": { - "default": null, - "description": "Raw reference string", - "title": "Raw Ref", - "type": "string" - }, - "title": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Title of the reference", - "title": "Title" - }, - "title_complement": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Complement to the title", - "title": "Title Complement" - }, - "translator": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Translator", - "title": "Translator" - }, - "url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "A web link. Not necessarily well-formated.", - "title": "Url" - }, - "volume": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Volume number", - "title": "Volume" - }, - "year": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Year of publication", - "title": "Year" - } - }, - "title": "Reference", - "type": "object" - }, - "Sense": { - "additionalProperties": false, - "properties": { - "antonyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Antonyms" - }, - "categories": { - "default": [], - "description": "list of sense-disambiguated category names extracted from (a subset) of the Category links on the page", - "items": { - "type": "string" - }, - "title": "Categories", - "type": "array" - }, - "coordinate_terms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Coordinate Terms" - }, - "derived": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Derived" - }, - "examples": { - "default": [], - "description": "List of examples", - "items": { - "$ref": "#/$defs/Example" - }, - "title": "Examples", - "type": "array" - }, - "expressions": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Expressions" - }, - "glosses": { - "default": [], - "description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.", - "items": { - "type": "string" - }, - "title": "Glosses", - "type": "array" - }, - "holonyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Holonyms" - }, - "hypernyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hypernyms" - }, - "hyponyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hyponyms" - }, - "proverbs": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Proverbs" - }, - "raw_glosses": { - "default": [], - "description": "list of uncleaned raw glosses for the word sense (usually only one).", - "items": { - "type": "string" - }, - "title": "Raw Glosses", - "type": "array" - }, - "senseid": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Sense number used in Wiktionary", - "title": "Senseid" - }, - "synonyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Synonyms" - }, - "tags": { - "default": [], - "description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.", - "items": { - "type": "string" - }, - "title": "Tags", - "type": "array" - }, - "translations": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Translation" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Translations" - } - }, - "title": "Sense", - "type": "object" - }, - "Sound": { - "additionalProperties": false, - "properties": { - "audio": { - "default": [], - "description": "Audio file name", - "items": { - "type": "string" - }, - "title": "Audio", - "type": "array" - }, - "flac_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Flac Url", - "type": "array" - }, - "ipa": { - "default": [], - "description": "International Phonetic Alphabet", - "items": { - "type": "string" - }, - "title": "Ipa", - "type": "array" - }, - "lang_code": { - "default": [], - "description": "Wiktionary language code", - "items": { - "type": "string" - }, - "title": "Lang Code", - "type": "array" - }, - "lang_name": { - "default": [], - "description": "Localized language name", - "items": { - "type": "string" - }, - "title": "Lang Name", - "type": "array" - }, - "mp3_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Mp3 Url", - "type": "array" - }, - "oga_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Oga Url", - "type": "array" - }, - "ogg_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Ogg Url", - "type": "array" - }, - "tags": { - "default": [], - "description": "Specifying the variant of the pronunciation", - "items": { - "type": "string" - }, - "title": "Tags", - "type": "array" - }, - "wav_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Wav Url", - "type": "array" - } - }, - "title": "Sound", - "type": "object" - }, - "Translation": { - "additionalProperties": false, - "properties": { - "lang_code": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Wiktionary language code of the translation term", - "title": "Lang Code" - }, - "lang_name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Localized language name", - "title": "Lang Name" - }, - "notes": { - "default": [], - "description": "A list of notes", - "items": { - "type": "string" - }, - "title": "Notes", - "type": "array" - }, - "roman": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Transliteration in roman characters", - "title": "Roman" - }, - "sense": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "A gloss of the sense being translated", - "title": "Sense" - }, - "tags": { - "default": [], - "description": "Tags specifying the translated term, usually gender information", - "items": { - "type": "string" - }, - "title": "Tags", - "type": "array" - }, - "uncertain": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "null" - } - ], - "default": false, - "description": "Translation marked as uncertain", - "title": "Uncertain" - }, - "word": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Translation term", - "title": "Word" - } - }, - "title": "Translation", - "type": "object" - } - }, - "$id": "https://kaikki.org/de.json", - "$schema": "https://json-schema.org/draft/2020-12/schema", - "additionalProperties": false, - "description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.", - "properties": { - "antonyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Antonyms" - }, - "coordinate_terms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Coordinate Terms" - }, - "derived": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Derived" - }, - "expressions": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Expressions" - }, - "holonyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Holonyms" - }, - "hypernyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hypernyms" - }, - "hyponyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hyponyms" - }, - "lang_code": { - "description": "Wiktionary language code", - "examples": [ - "es" - ], - "title": "Lang Code", - "type": "string" - }, - "lang_name": { - "description": "Localized language name of the word", - "examples": [ - "español" - ], - "title": "Lang Name", - "type": "string" - }, - "pos": { - "default": null, - "description": "Part of speech type", - "title": "Pos", - "type": "string" - }, - "proverbs": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Proverbs" - }, - "senses": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Sense" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Senses" - }, - "sounds": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Sound" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Sounds" - }, - "synonyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Synonyms" - }, - "translations": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Translation" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Translations" - }, - "word": { - "description": "word string", - "title": "Word", - "type": "string" - } - }, - "required": [ - "word", - "lang_code", - "lang_name" - ], - "title": "German Wiktionary", - "type": "object" -} \ No newline at end of file diff --git a/json_schema/es.json b/json_schema/es.json deleted file mode 100644 index 7dc1d6f2..00000000 --- a/json_schema/es.json +++ /dev/null @@ -1,876 +0,0 @@ -{ - "$defs": { - "Example": { - "additionalProperties": false, - "properties": { - "ref": { - "anyOf": [ - { - "$ref": "#/$defs/Reference" - }, - { - "type": "null" - } - ], - "default": null, - "description": "" - }, - "text": { - "description": "Example usage sentence", - "title": "Text", - "type": "string" - }, - "translation": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Spanish translation of the example sentence", - "title": "Translation" - } - }, - "required": [ - "text" - ], - "title": "Example", - "type": "object" - }, - "Linkage": { - "additionalProperties": false, - "properties": { - "alternative_spelling": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Alternative spelling of the word", - "title": "Alternative Spelling" - }, - "note": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Note" - }, - "word": { - "title": "Word", - "type": "string" - } - }, - "required": [ - "word" - ], - "title": "Linkage", - "type": "object" - }, - "Reference": { - "additionalProperties": false, - "properties": { - "chapter": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Chapter name", - "title": "Chapter" - }, - "date": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Date of publication", - "title": "Date" - }, - "editor": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Editor", - "title": "Editor" - }, - "first_name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Author's first name", - "title": "First Name" - }, - "journal": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Name of journal", - "title": "Journal" - }, - "last_name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Author's last name", - "title": "Last Name" - }, - "pages": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Page numbers", - "title": "Pages" - }, - "place": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Place of publication", - "title": "Place" - }, - "title": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Title of the reference", - "title": "Title" - }, - "url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "A web link", - "title": "Url" - }, - "year": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Year of publication", - "title": "Year" - } - }, - "title": "Reference", - "type": "object" - }, - "Sense": { - "additionalProperties": false, - "properties": { - "antonyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Antonyms" - }, - "categories": { - "default": [], - "description": "list of sense-disambiguated category names extracted from (a subset) of the Category links on the page", - "items": { - "type": "string" - }, - "title": "Categories", - "type": "array" - }, - "compounds": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Compounds" - }, - "derived": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Derived" - }, - "examples": { - "default": [], - "description": "List of examples", - "items": { - "$ref": "#/$defs/Example" - }, - "title": "Examples", - "type": "array" - }, - "glosses": { - "description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.", - "items": { - "type": "string" - }, - "title": "Glosses", - "type": "array" - }, - "hypernyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hypernyms" - }, - "hyponyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hyponyms" - }, - "idioms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Idioms" - }, - "meronyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Meronyms" - }, - "related": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Related" - }, - "senseid": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Sense number used in Wiktionary", - "title": "Senseid" - }, - "synonyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Synonyms" - }, - "tags": { - "default": [], - "description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.", - "items": { - "type": "string" - }, - "title": "Tags", - "type": "array" - } - }, - "required": [ - "glosses" - ], - "title": "Sense", - "type": "object" - }, - "Sound": { - "additionalProperties": false, - "properties": { - "audio": { - "default": [], - "description": "Audio file name", - "items": { - "type": "string" - }, - "title": "Audio", - "type": "array" - }, - "flac_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Flac Url", - "type": "array" - }, - "ipa": { - "default": [], - "description": "International Phonetic Alphabet", - "items": { - "type": "string" - }, - "title": "Ipa", - "type": "array" - }, - "mp3_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Mp3 Url", - "type": "array" - }, - "ogg_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Ogg Url", - "type": "array" - }, - "phonetic_transcription": { - "default": [], - "description": "Phonetic transcription, less exact than IPA.", - "items": { - "type": "string" - }, - "title": "Phonetic Transcription", - "type": "array" - }, - "roman": { - "default": [], - "description": "Translitaration to Roman characters", - "items": { - "type": "string" - }, - "title": "Roman", - "type": "array" - }, - "syllabic": { - "default": [], - "description": "Syllabic transcription", - "items": { - "type": "string" - }, - "title": "Syllabic", - "type": "array" - }, - "tag": { - "default": [], - "description": "Specifying the variant of the pronunciation", - "items": { - "type": "string" - }, - "title": "Tag", - "type": "array" - }, - "wav_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Wav Url", - "type": "array" - } - }, - "title": "Sound", - "type": "object" - }, - "Spelling": { - "additionalProperties": false, - "properties": { - "alternative": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Alternative spelling with same pronunciation", - "title": "Alternative" - }, - "note": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Note regarding alternative spelling", - "title": "Note" - }, - "same_pronunciation": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Whether the alternative spelling has the same pronunciation as the default spelling", - "title": "Same Pronunciation" - } - }, - "title": "Spelling", - "type": "object" - }, - "Translation": { - "additionalProperties": false, - "properties": { - "lang_code": { - "description": "Wiktionary language code of the translation term", - "title": "Lang Code", - "type": "string" - }, - "notes": { - "default": [], - "description": "A list of notes", - "items": { - "type": "string" - }, - "title": "Notes", - "type": "array" - }, - "roman": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Transliteration in roman characters", - "title": "Roman" - }, - "senseids": { - "default": [], - "description": "List of senseids where this translation applies", - "items": { - "type": "string" - }, - "title": "Senseids", - "type": "array" - }, - "tags": { - "default": [], - "description": "Tags specifying the translated term, usually gender information", - "items": { - "type": "string" - }, - "title": "Tags", - "type": "array" - }, - "word": { - "description": "Translation term", - "title": "Word", - "type": "string" - } - }, - "required": [ - "word", - "lang_code" - ], - "title": "Translation", - "type": "object" - } - }, - "$id": "https://kaikki.org/es.json", - "$schema": "https://json-schema.org/draft/2020-12/schema", - "additionalProperties": false, - "description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.", - "properties": { - "antonyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Antonyms" - }, - "categories": { - "default": [], - "description": "list of non-disambiguated categories for the word", - "items": { - "type": "string" - }, - "title": "Categories", - "type": "array" - }, - "compounds": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Compounds" - }, - "derived": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Derived" - }, - "hypernyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hypernyms" - }, - "hyponyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hyponyms" - }, - "idioms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Idioms" - }, - "lang_code": { - "description": "Wiktionary language code", - "examples": [ - "es" - ], - "title": "Lang Code", - "type": "string" - }, - "lang_name": { - "description": "Localized language name of the word", - "examples": [ - "español" - ], - "title": "Lang Name", - "type": "string" - }, - "meronyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Meronyms" - }, - "pos": { - "default": null, - "description": "Part of speech type", - "title": "Pos", - "type": "string" - }, - "pos_title": { - "default": null, - "description": "Original POS title", - "title": "Pos Title", - "type": "string" - }, - "related": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Related" - }, - "senses": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Sense" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Senses" - }, - "sounds": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Sound" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Sounds" - }, - "spellings": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Spelling" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Spellings" - }, - "synonyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Synonyms" - }, - "translations": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Translation" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Translations" - }, - "word": { - "description": "word string", - "title": "Word", - "type": "string" - } - }, - "required": [ - "word", - "lang_code", - "lang_name" - ], - "title": "Spanish Wiktionary", - "type": "object" -} \ No newline at end of file diff --git a/json_schema/ru.json b/json_schema/ru.json deleted file mode 100644 index 8955fbee..00000000 --- a/json_schema/ru.json +++ /dev/null @@ -1,199 +0,0 @@ -{ - "$defs": { - "Sound": { - "additionalProperties": false, - "properties": { - "audio": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Audio file name", - "title": "Audio" - }, - "flac_url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Flac Url" - }, - "homophones": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "description": "Words with same pronunciation", - "title": "Homophones" - }, - "ipa": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "International Phonetic Alphabet", - "title": "Ipa" - }, - "mp3_url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Mp3 Url" - }, - "oga_url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Oga Url" - }, - "ogg_url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Ogg Url" - }, - "tags": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "description": "Specifying the variant of the pronunciation", - "title": "Tags" - }, - "wav_url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Wav Url" - } - }, - "title": "Sound", - "type": "object" - } - }, - "$id": "https://kaikki.org/ru.json", - "$schema": "https://json-schema.org/draft/2020-12/schema", - "additionalProperties": false, - "description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.", - "properties": { - "categories": { - "default": [], - "description": "list of non-disambiguated categories for the word", - "items": { - "type": "string" - }, - "title": "Categories", - "type": "array" - }, - "lang_code": { - "description": "Wiktionary language code", - "examples": [ - "ru" - ], - "title": "Lang Code", - "type": "string" - }, - "lang_name": { - "description": "Localized language name of the word", - "examples": [ - "Русский" - ], - "title": "Lang Name", - "type": "string" - }, - "pos": { - "default": null, - "description": "Part of speech type", - "title": "Pos", - "type": "string" - }, - "pos_title": { - "default": null, - "description": "Original POS title", - "title": "Pos Title", - "type": "string" - }, - "sounds": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Sound" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Sounds" - }, - "word": { - "description": "word string", - "title": "Word", - "type": "string" - } - }, - "required": [ - "word", - "lang_code", - "lang_name" - ], - "title": "Russian Wiktionary", - "type": "object" -} \ No newline at end of file diff --git a/json_schema/validate.py b/json_schema/validate.py deleted file mode 100644 index 1fb53e46..00000000 --- a/json_schema/validate.py +++ /dev/null @@ -1,36 +0,0 @@ -import argparse -import json -from concurrent.futures import ProcessPoolExecutor -from functools import partial -from pathlib import Path - - -def worker(line, schema={}): - from jsonschema import validate - - validate(instance=json.loads(line), schema=schema) - - -def main(): - """ - Validate extracted JSONL file with JSON schema. - """ - parser = argparse.ArgumentParser() - parser.add_argument("jsonl_path", type=Path) - parser.add_argument("schema_path", type=Path) - args = parser.parse_args() - - with ( - args.jsonl_path.open(encoding="utf-8") as jsonl_f, - args.schema_path.open(encoding="utf-8") as schema_f, - ProcessPoolExecutor() as executor, - ): - schema = json.load(schema_f) - for _ in executor.map( - partial(worker, schema=schema), jsonl_f, chunksize=1000 - ): - pass - - -if __name__ == "__main__": - main() diff --git a/pyproject.toml b/pyproject.toml index d596dbfc..9b0bbe93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,11 +40,9 @@ dependencies = [ [project.optional-dependencies] dev = [ "black", - "jsonschema", + "coverage[toml]", "mypy", - "nose2[coverage_plugin]", "ruff", - "tomli; python_version <= '3.10'", # for coverage parsing TOML file ] [project.scripts] @@ -65,7 +63,13 @@ wiktextract = [ [tool.coverage.run] branch = true concurrency = ["multiprocessing"] -omit = ["tests/*"] +omit = [ + "tests/*", + "src/wiktextract/taxondata.py", # huge file +] + +[tool.coverage.html] +directory = "_site/htmlcov" [tool.black] line-length = 80 diff --git a/json_schema/generate_schema.py b/tools/generate_schema.py similarity index 86% rename from json_schema/generate_schema.py rename to tools/generate_schema.py index edfaa471..e224406f 100644 --- a/json_schema/generate_schema.py +++ b/tools/generate_schema.py @@ -1,5 +1,6 @@ import importlib import json +from pathlib import Path from importlib.resources import files @@ -11,6 +12,8 @@ def main() -> None: """ extractor_folder = files("wiktextract") / "extractor" + output_path = Path("_site") + output_path.mkdir(exist_ok=True) for extractor_folder in filter( lambda p: p.is_dir(), (files("wiktextract") / "extractor").iterdir() ): @@ -24,8 +27,8 @@ def main() -> None: model_schema[ "$schema" ] = "https://json-schema.org/draft/2020-12/schema" - with open( - f"json_schema/{lang_code}.json", "w", encoding="utf-8" + with (output_path / f"{lang_code}.json").open( + "w", encoding="utf-8" ) as f: json.dump( model_schema, diff --git a/tools/github_pages.py b/tools/github_pages.py new file mode 100644 index 00000000..64720749 --- /dev/null +++ b/tools/github_pages.py @@ -0,0 +1,56 @@ +import argparse +import json +from pathlib import Path + + +def main(): + """ + Generate a simple HTML page to list files in the `_site` folder. + """ + parser = argparse.ArgumentParser() + parser.add_argument("repo", help="The owner and repository name.") + parser.add_argument("sha", help="The commit SHA.") + args = parser.parse_args() + + html = """ + + + + + + wiktextract + + +

wiktextract

+

Coverage report

+

JSON schema

+ + + + + """ + + schema_paths = [ + path + for path in Path("_site").iterdir() + if path.is_file() and path.suffix == ".json" + ] + schema_paths.sort(key=lambda p: p.name) + schema_list_html = "" + for schema_path in schema_paths: + with schema_path.open(encoding="utf-8") as f: + schema_data = json.load(f) + schema_list_html += f"
  • {schema_data.get('title')}
  • " + html = html.replace("", schema_list_html) + + commit_sha = f"

    Commit: {args.sha[:7]}

    " + html = html.replace("", commit_sha) + + with open("_site/index.html", "w", encoding="utf-8") as f: + f.write(html) + + +if __name__ == "__main__": + main()