From 353dac958d4e3cfba5be5ec33ab86078d93cead3 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 6 Dec 2023 14:53:13 +0800 Subject: [PATCH 1/5] Use the `coverage` package directly ignore `taxondata.py` to speed up tests and generate HTML report --- .github/workflows/test.yml | 14 +++++++++----- Makefile | 16 +++++++++------- README.md | 22 ++-------------------- pyproject.toml | 8 +++++--- 4 files changed, 25 insertions(+), 35 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 041f076e..ee90bf08 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,19 +30,23 @@ jobs: - run: | python -m pip install -U pip python -m pip install --use-pep517 '.[dev]' - - run: make test + - run: make test_coverage env: PYTHONWARNINGS: default - - name: Remove huge file taxondata_py.html - run: rm -f htmlcov/*_taxondata_py.html + + coverage_report: + if: github.ref_name == 'master' + needs: test + runs-on: ubuntu-latest + steps: + - run: make coverage_report - uses: actions/upload-pages-artifact@v2 - if: github.ref_name == 'master' && matrix.python-version == '3.12' with: path: htmlcov deploy: if: github.ref_name == 'master' - needs: test + needs: coverage_report permissions: pages: write id-token: write diff --git a/Makefile b/Makefile index be49138b..67a9eb61 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,13 @@ # Run "make test" to run tests (with coverage analysis left in ./htmlcov) # Run "make clean" to remove automatically generated files - test: - rm -rf .coverage htmlcov - python -m nose2 --output-buffer --pretty-assert --with-coverage --coverage-report=html -quicktest: - python -m nose2 --output-buffer --pretty-assert + python -m unittest discover -s tests +test_coverage: + python -m coverage erase + python -m coverage run -m unittest discover -s tests +coverage_report: + python -m coverage combine + python -m coverage html clean: - rm -rf __pycache__ - rm -rf .coverage* htmlcov* + python -m coverage erase + rm -rf __pycache__ htmlcov* diff --git a/README.md b/README.md index b4aff931..bf3aec7c 100644 --- a/README.md +++ b/README.md @@ -322,13 +322,11 @@ python -m pip install -U pip python -m pip install --use-pep517 . ``` -This software requires Python 3. - ### Running tests This package includes tests written using the `unittest` framework. -They can be run using, for example, `nose2`, which can be installed -using `python -m pip install --use-pep517 -e ".[dev]"`. +The test dependencies can be installed with command +`python -m pip install --use-pep517 -e ".[dev]"`. To run the tests, use the following command in the top-level directory: @@ -358,22 +356,6 @@ updated regularly with the latest Wiktionary dump. Using the pre-extracted data may be the easiest option unless you have special needs or want to modify the code. -### Installing and running tests on Windows with VS Code - -Tested with Python 3.9.4. - -- Create [a Python virtual environment](https://code.visualstudio.com/docs/python/environments#_creating-environments) -(venv) in the VS Code workspace with the cloned repo. It should automatically install the package. - -- Open a new terminal. It should be PowerShell. You may need to [fix terminal permissions](https://stackoverflow.com/questions/56199111/visual-studio-code-cmd-error-cannot-be-loaded-because-running-scripts-is-disabl/67420296#67420296) -in order for it to pick up the virtual environment correclty. - -- In the terminal run this command: - -``` -py -m nose2 -B -``` - ## Using the command-line tool The ``wiktwords`` script is the easiest way to extract data from diff --git a/pyproject.toml b/pyproject.toml index d596dbfc..88d44ccb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,11 +40,10 @@ dependencies = [ [project.optional-dependencies] dev = [ "black", + "coverage[toml]", "jsonschema", "mypy", - "nose2[coverage_plugin]", "ruff", - "tomli; python_version <= '3.10'", # for coverage parsing TOML file ] [project.scripts] @@ -65,7 +64,10 @@ wiktextract = [ [tool.coverage.run] branch = true concurrency = ["multiprocessing"] -omit = ["tests/*"] +omit = [ + "tests/*", + "src/wiktextract/taxondata.py", # huge file +] [tool.black] line-length = 80 From ea62b0ad0854fd6ba72607d32a7e26cd0beb82d2 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 6 Dec 2023 16:44:09 +0800 Subject: [PATCH 2/5] Add JSON schema files to GitHub Pages --- .github/workflows/test.yml | 18 +- .gitignore | 3 + Makefile | 12 +- json_schema/de.json | 932 --------------------------------- json_schema/es.json | 876 ------------------------------- json_schema/generate_schema.py | 7 +- json_schema/ru.json | 199 ------- pyproject.toml | 3 + tools/github_pages.py | 43 ++ 9 files changed, 69 insertions(+), 2024 deletions(-) delete mode 100644 json_schema/de.json delete mode 100644 json_schema/es.json delete mode 100644 json_schema/ru.json create mode 100644 tools/github_pages.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ee90bf08..e463410f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -29,24 +29,20 @@ jobs: cache: 'pip' - run: | python -m pip install -U pip - python -m pip install --use-pep517 '.[dev]' + python -m pip install --use-pep517 -e '.[dev]' - run: make test_coverage env: PYTHONWARNINGS: default - - coverage_report: - if: github.ref_name == 'master' - needs: test - runs-on: ubuntu-latest - steps: - - run: make coverage_report + - run: | + make coverage_report + make github_pages + if: github.ref_name == 'master' && matrix.python-version == '3.12' - uses: actions/upload-pages-artifact@v2 - with: - path: htmlcov + if: github.ref_name == 'master' && matrix.python-version == '3.12' deploy: if: github.ref_name == 'master' - needs: coverage_report + needs: test permissions: pages: write id-token: write diff --git a/.gitignore b/.gitignore index a9b02002..1271ff87 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,6 @@ usertools/data/ bac-wikt-* pagesbac/ wikt-db* + +# GitHub Pages +_site diff --git a/Makefile b/Makefile index 67a9eb61..eff16dda 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,17 @@ -# Run "make test" to run tests (with coverage analysis left in ./htmlcov) +# Run "make test" to run tests # Run "make clean" to remove automatically generated files test: - python -m unittest discover -s tests + python -m unittest discover -b -s tests test_coverage: python -m coverage erase - python -m coverage run -m unittest discover -s tests + python -m coverage run -m unittest discover -b -s tests coverage_report: python -m coverage combine python -m coverage html +github_pages: + python json_schema/generate_schema.py + cp json_schema/*.json _site + python tools/github_pages.py clean: python -m coverage erase - rm -rf __pycache__ htmlcov* + rm -rf __pycache__ _site diff --git a/json_schema/de.json b/json_schema/de.json deleted file mode 100644 index a5d645d4..00000000 --- a/json_schema/de.json +++ /dev/null @@ -1,932 +0,0 @@ -{ - "$defs": { - "Example": { - "additionalProperties": false, - "properties": { - "ref": { - "anyOf": [ - { - "$ref": "#/$defs/Reference" - }, - { - "type": "null" - } - ], - "default": null, - "description": "" - }, - "text": { - "default": null, - "description": "Example usage sentence", - "title": "Text", - "type": "string" - } - }, - "title": "Example", - "type": "object" - }, - "Reference": { - "additionalProperties": false, - "properties": { - "accessdate": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Date of access of online reference", - "title": "Accessdate" - }, - "author": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Author's name", - "title": "Author" - }, - "collection": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Name of collection that reference was published in", - "title": "Collection" - }, - "comment": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Comment on the reference", - "title": "Comment" - }, - "date": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Date of publication", - "title": "Date" - }, - "day": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Day of publication", - "title": "Day" - }, - "edition": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Edition number", - "title": "Edition" - }, - "editor": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Editor", - "title": "Editor" - }, - "isbn": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "ISBN number", - "title": "Isbn" - }, - "month": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Month of publication", - "title": "Month" - }, - "number": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Issue number", - "title": "Number" - }, - "pages": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Page numbers", - "title": "Pages" - }, - "place": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Place of publication", - "title": "Place" - }, - "publisher": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Published by", - "title": "Publisher" - }, - "raw_ref": { - "default": null, - "description": "Raw reference string", - "title": "Raw Ref", - "type": "string" - }, - "title": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Title of the reference", - "title": "Title" - }, - "title_complement": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Complement to the title", - "title": "Title Complement" - }, - "translator": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Translator", - "title": "Translator" - }, - "url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "A web link. Not necessarily well-formated.", - "title": "Url" - }, - "volume": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Volume number", - "title": "Volume" - }, - "year": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Year of publication", - "title": "Year" - } - }, - "title": "Reference", - "type": "object" - }, - "Sense": { - "additionalProperties": false, - "properties": { - "antonyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Antonyms" - }, - "categories": { - "default": [], - "description": "list of sense-disambiguated category names extracted from (a subset) of the Category links on the page", - "items": { - "type": "string" - }, - "title": "Categories", - "type": "array" - }, - "coordinate_terms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Coordinate Terms" - }, - "derived": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Derived" - }, - "examples": { - "default": [], - "description": "List of examples", - "items": { - "$ref": "#/$defs/Example" - }, - "title": "Examples", - "type": "array" - }, - "expressions": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Expressions" - }, - "glosses": { - "default": [], - "description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.", - "items": { - "type": "string" - }, - "title": "Glosses", - "type": "array" - }, - "holonyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Holonyms" - }, - "hypernyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hypernyms" - }, - "hyponyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hyponyms" - }, - "proverbs": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Proverbs" - }, - "raw_glosses": { - "default": [], - "description": "list of uncleaned raw glosses for the word sense (usually only one).", - "items": { - "type": "string" - }, - "title": "Raw Glosses", - "type": "array" - }, - "senseid": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Sense number used in Wiktionary", - "title": "Senseid" - }, - "synonyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Synonyms" - }, - "tags": { - "default": [], - "description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.", - "items": { - "type": "string" - }, - "title": "Tags", - "type": "array" - }, - "translations": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Translation" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Translations" - } - }, - "title": "Sense", - "type": "object" - }, - "Sound": { - "additionalProperties": false, - "properties": { - "audio": { - "default": [], - "description": "Audio file name", - "items": { - "type": "string" - }, - "title": "Audio", - "type": "array" - }, - "flac_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Flac Url", - "type": "array" - }, - "ipa": { - "default": [], - "description": "International Phonetic Alphabet", - "items": { - "type": "string" - }, - "title": "Ipa", - "type": "array" - }, - "lang_code": { - "default": [], - "description": "Wiktionary language code", - "items": { - "type": "string" - }, - "title": "Lang Code", - "type": "array" - }, - "lang_name": { - "default": [], - "description": "Localized language name", - "items": { - "type": "string" - }, - "title": "Lang Name", - "type": "array" - }, - "mp3_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Mp3 Url", - "type": "array" - }, - "oga_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Oga Url", - "type": "array" - }, - "ogg_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Ogg Url", - "type": "array" - }, - "tags": { - "default": [], - "description": "Specifying the variant of the pronunciation", - "items": { - "type": "string" - }, - "title": "Tags", - "type": "array" - }, - "wav_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Wav Url", - "type": "array" - } - }, - "title": "Sound", - "type": "object" - }, - "Translation": { - "additionalProperties": false, - "properties": { - "lang_code": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Wiktionary language code of the translation term", - "title": "Lang Code" - }, - "lang_name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Localized language name", - "title": "Lang Name" - }, - "notes": { - "default": [], - "description": "A list of notes", - "items": { - "type": "string" - }, - "title": "Notes", - "type": "array" - }, - "roman": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Transliteration in roman characters", - "title": "Roman" - }, - "sense": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "A gloss of the sense being translated", - "title": "Sense" - }, - "tags": { - "default": [], - "description": "Tags specifying the translated term, usually gender information", - "items": { - "type": "string" - }, - "title": "Tags", - "type": "array" - }, - "uncertain": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "null" - } - ], - "default": false, - "description": "Translation marked as uncertain", - "title": "Uncertain" - }, - "word": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Translation term", - "title": "Word" - } - }, - "title": "Translation", - "type": "object" - } - }, - "$id": "https://kaikki.org/de.json", - "$schema": "https://json-schema.org/draft/2020-12/schema", - "additionalProperties": false, - "description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.", - "properties": { - "antonyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Antonyms" - }, - "coordinate_terms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Coordinate Terms" - }, - "derived": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Derived" - }, - "expressions": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Expressions" - }, - "holonyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Holonyms" - }, - "hypernyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hypernyms" - }, - "hyponyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hyponyms" - }, - "lang_code": { - "description": "Wiktionary language code", - "examples": [ - "es" - ], - "title": "Lang Code", - "type": "string" - }, - "lang_name": { - "description": "Localized language name of the word", - "examples": [ - "español" - ], - "title": "Lang Name", - "type": "string" - }, - "pos": { - "default": null, - "description": "Part of speech type", - "title": "Pos", - "type": "string" - }, - "proverbs": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Proverbs" - }, - "senses": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Sense" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Senses" - }, - "sounds": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Sound" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Sounds" - }, - "synonyms": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Synonyms" - }, - "translations": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Translation" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Translations" - }, - "word": { - "description": "word string", - "title": "Word", - "type": "string" - } - }, - "required": [ - "word", - "lang_code", - "lang_name" - ], - "title": "German Wiktionary", - "type": "object" -} \ No newline at end of file diff --git a/json_schema/es.json b/json_schema/es.json deleted file mode 100644 index 7dc1d6f2..00000000 --- a/json_schema/es.json +++ /dev/null @@ -1,876 +0,0 @@ -{ - "$defs": { - "Example": { - "additionalProperties": false, - "properties": { - "ref": { - "anyOf": [ - { - "$ref": "#/$defs/Reference" - }, - { - "type": "null" - } - ], - "default": null, - "description": "" - }, - "text": { - "description": "Example usage sentence", - "title": "Text", - "type": "string" - }, - "translation": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Spanish translation of the example sentence", - "title": "Translation" - } - }, - "required": [ - "text" - ], - "title": "Example", - "type": "object" - }, - "Linkage": { - "additionalProperties": false, - "properties": { - "alternative_spelling": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Alternative spelling of the word", - "title": "Alternative Spelling" - }, - "note": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Note" - }, - "word": { - "title": "Word", - "type": "string" - } - }, - "required": [ - "word" - ], - "title": "Linkage", - "type": "object" - }, - "Reference": { - "additionalProperties": false, - "properties": { - "chapter": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Chapter name", - "title": "Chapter" - }, - "date": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Date of publication", - "title": "Date" - }, - "editor": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Editor", - "title": "Editor" - }, - "first_name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Author's first name", - "title": "First Name" - }, - "journal": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Name of journal", - "title": "Journal" - }, - "last_name": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Author's last name", - "title": "Last Name" - }, - "pages": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Page numbers", - "title": "Pages" - }, - "place": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Place of publication", - "title": "Place" - }, - "title": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Title of the reference", - "title": "Title" - }, - "url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "A web link", - "title": "Url" - }, - "year": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Year of publication", - "title": "Year" - } - }, - "title": "Reference", - "type": "object" - }, - "Sense": { - "additionalProperties": false, - "properties": { - "antonyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Antonyms" - }, - "categories": { - "default": [], - "description": "list of sense-disambiguated category names extracted from (a subset) of the Category links on the page", - "items": { - "type": "string" - }, - "title": "Categories", - "type": "array" - }, - "compounds": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Compounds" - }, - "derived": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Derived" - }, - "examples": { - "default": [], - "description": "List of examples", - "items": { - "$ref": "#/$defs/Example" - }, - "title": "Examples", - "type": "array" - }, - "glosses": { - "description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.", - "items": { - "type": "string" - }, - "title": "Glosses", - "type": "array" - }, - "hypernyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hypernyms" - }, - "hyponyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hyponyms" - }, - "idioms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Idioms" - }, - "meronyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Meronyms" - }, - "related": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Related" - }, - "senseid": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Sense number used in Wiktionary", - "title": "Senseid" - }, - "synonyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Synonyms" - }, - "tags": { - "default": [], - "description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.", - "items": { - "type": "string" - }, - "title": "Tags", - "type": "array" - } - }, - "required": [ - "glosses" - ], - "title": "Sense", - "type": "object" - }, - "Sound": { - "additionalProperties": false, - "properties": { - "audio": { - "default": [], - "description": "Audio file name", - "items": { - "type": "string" - }, - "title": "Audio", - "type": "array" - }, - "flac_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Flac Url", - "type": "array" - }, - "ipa": { - "default": [], - "description": "International Phonetic Alphabet", - "items": { - "type": "string" - }, - "title": "Ipa", - "type": "array" - }, - "mp3_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Mp3 Url", - "type": "array" - }, - "ogg_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Ogg Url", - "type": "array" - }, - "phonetic_transcription": { - "default": [], - "description": "Phonetic transcription, less exact than IPA.", - "items": { - "type": "string" - }, - "title": "Phonetic Transcription", - "type": "array" - }, - "roman": { - "default": [], - "description": "Translitaration to Roman characters", - "items": { - "type": "string" - }, - "title": "Roman", - "type": "array" - }, - "syllabic": { - "default": [], - "description": "Syllabic transcription", - "items": { - "type": "string" - }, - "title": "Syllabic", - "type": "array" - }, - "tag": { - "default": [], - "description": "Specifying the variant of the pronunciation", - "items": { - "type": "string" - }, - "title": "Tag", - "type": "array" - }, - "wav_url": { - "default": [], - "items": { - "type": "string" - }, - "title": "Wav Url", - "type": "array" - } - }, - "title": "Sound", - "type": "object" - }, - "Spelling": { - "additionalProperties": false, - "properties": { - "alternative": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Alternative spelling with same pronunciation", - "title": "Alternative" - }, - "note": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Note regarding alternative spelling", - "title": "Note" - }, - "same_pronunciation": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Whether the alternative spelling has the same pronunciation as the default spelling", - "title": "Same Pronunciation" - } - }, - "title": "Spelling", - "type": "object" - }, - "Translation": { - "additionalProperties": false, - "properties": { - "lang_code": { - "description": "Wiktionary language code of the translation term", - "title": "Lang Code", - "type": "string" - }, - "notes": { - "default": [], - "description": "A list of notes", - "items": { - "type": "string" - }, - "title": "Notes", - "type": "array" - }, - "roman": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Transliteration in roman characters", - "title": "Roman" - }, - "senseids": { - "default": [], - "description": "List of senseids where this translation applies", - "items": { - "type": "string" - }, - "title": "Senseids", - "type": "array" - }, - "tags": { - "default": [], - "description": "Tags specifying the translated term, usually gender information", - "items": { - "type": "string" - }, - "title": "Tags", - "type": "array" - }, - "word": { - "description": "Translation term", - "title": "Word", - "type": "string" - } - }, - "required": [ - "word", - "lang_code" - ], - "title": "Translation", - "type": "object" - } - }, - "$id": "https://kaikki.org/es.json", - "$schema": "https://json-schema.org/draft/2020-12/schema", - "additionalProperties": false, - "description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.", - "properties": { - "antonyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Antonyms" - }, - "categories": { - "default": [], - "description": "list of non-disambiguated categories for the word", - "items": { - "type": "string" - }, - "title": "Categories", - "type": "array" - }, - "compounds": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Compounds" - }, - "derived": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Derived" - }, - "hypernyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hypernyms" - }, - "hyponyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Hyponyms" - }, - "idioms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Idioms" - }, - "lang_code": { - "description": "Wiktionary language code", - "examples": [ - "es" - ], - "title": "Lang Code", - "type": "string" - }, - "lang_name": { - "description": "Localized language name of the word", - "examples": [ - "español" - ], - "title": "Lang Name", - "type": "string" - }, - "meronyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Meronyms" - }, - "pos": { - "default": null, - "description": "Part of speech type", - "title": "Pos", - "type": "string" - }, - "pos_title": { - "default": null, - "description": "Original POS title", - "title": "Pos Title", - "type": "string" - }, - "related": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Related" - }, - "senses": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Sense" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Senses" - }, - "sounds": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Sound" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Sounds" - }, - "spellings": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Spelling" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Spellings" - }, - "synonyms": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Linkage" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Synonyms" - }, - "translations": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Translation" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Translations" - }, - "word": { - "description": "word string", - "title": "Word", - "type": "string" - } - }, - "required": [ - "word", - "lang_code", - "lang_name" - ], - "title": "Spanish Wiktionary", - "type": "object" -} \ No newline at end of file diff --git a/json_schema/generate_schema.py b/json_schema/generate_schema.py index edfaa471..e224406f 100644 --- a/json_schema/generate_schema.py +++ b/json_schema/generate_schema.py @@ -1,5 +1,6 @@ import importlib import json +from pathlib import Path from importlib.resources import files @@ -11,6 +12,8 @@ def main() -> None: """ extractor_folder = files("wiktextract") / "extractor" + output_path = Path("_site") + output_path.mkdir(exist_ok=True) for extractor_folder in filter( lambda p: p.is_dir(), (files("wiktextract") / "extractor").iterdir() ): @@ -24,8 +27,8 @@ def main() -> None: model_schema[ "$schema" ] = "https://json-schema.org/draft/2020-12/schema" - with open( - f"json_schema/{lang_code}.json", "w", encoding="utf-8" + with (output_path / f"{lang_code}.json").open( + "w", encoding="utf-8" ) as f: json.dump( model_schema, diff --git a/json_schema/ru.json b/json_schema/ru.json deleted file mode 100644 index 8955fbee..00000000 --- a/json_schema/ru.json +++ /dev/null @@ -1,199 +0,0 @@ -{ - "$defs": { - "Sound": { - "additionalProperties": false, - "properties": { - "audio": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "Audio file name", - "title": "Audio" - }, - "flac_url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Flac Url" - }, - "homophones": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "description": "Words with same pronunciation", - "title": "Homophones" - }, - "ipa": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "description": "International Phonetic Alphabet", - "title": "Ipa" - }, - "mp3_url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Mp3 Url" - }, - "oga_url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Oga Url" - }, - "ogg_url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Ogg Url" - }, - "tags": { - "anyOf": [ - { - "items": { - "type": "string" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "description": "Specifying the variant of the pronunciation", - "title": "Tags" - }, - "wav_url": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Wav Url" - } - }, - "title": "Sound", - "type": "object" - } - }, - "$id": "https://kaikki.org/ru.json", - "$schema": "https://json-schema.org/draft/2020-12/schema", - "additionalProperties": false, - "description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.", - "properties": { - "categories": { - "default": [], - "description": "list of non-disambiguated categories for the word", - "items": { - "type": "string" - }, - "title": "Categories", - "type": "array" - }, - "lang_code": { - "description": "Wiktionary language code", - "examples": [ - "ru" - ], - "title": "Lang Code", - "type": "string" - }, - "lang_name": { - "description": "Localized language name of the word", - "examples": [ - "Русский" - ], - "title": "Lang Name", - "type": "string" - }, - "pos": { - "default": null, - "description": "Part of speech type", - "title": "Pos", - "type": "string" - }, - "pos_title": { - "default": null, - "description": "Original POS title", - "title": "Pos Title", - "type": "string" - }, - "sounds": { - "anyOf": [ - { - "items": { - "$ref": "#/$defs/Sound" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "default": [], - "title": "Sounds" - }, - "word": { - "description": "word string", - "title": "Word", - "type": "string" - } - }, - "required": [ - "word", - "lang_code", - "lang_name" - ], - "title": "Russian Wiktionary", - "type": "object" -} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 88d44ccb..26ab1dcd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,9 @@ omit = [ "src/wiktextract/taxondata.py", # huge file ] +[tool.coverage.html] +directory = "_site/htmlcov" + [tool.black] line-length = 80 diff --git a/tools/github_pages.py b/tools/github_pages.py new file mode 100644 index 00000000..7fb1cfeb --- /dev/null +++ b/tools/github_pages.py @@ -0,0 +1,43 @@ +from pathlib import Path + + +def main(): + """ + Generate a simple HTML page to list files in the `_site` folder. + """ + html = """ + + + + + + wiktextract + + +

wiktextract

+

Coverage report

+

JSON schema

+ + + + """ + + with open("_site/index.html", "w", encoding="utf-8") as f: + f.write(html) + + +if __name__ == "__main__": + main() From 48865502a0e3802f60c442b9950e4cb6fb07ae9c Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 7 Dec 2023 09:45:39 +0800 Subject: [PATCH 3/5] Move `generate_schema.py` to `tools` folder --- Makefile | 2 +- json_schema/validate.py | 36 ----------------------- pyproject.toml | 1 - {json_schema => tools}/generate_schema.py | 0 4 files changed, 1 insertion(+), 38 deletions(-) delete mode 100644 json_schema/validate.py rename {json_schema => tools}/generate_schema.py (100%) diff --git a/Makefile b/Makefile index eff16dda..5d52f200 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ coverage_report: python -m coverage combine python -m coverage html github_pages: - python json_schema/generate_schema.py + python tools/generate_schema.py cp json_schema/*.json _site python tools/github_pages.py clean: diff --git a/json_schema/validate.py b/json_schema/validate.py deleted file mode 100644 index 1fb53e46..00000000 --- a/json_schema/validate.py +++ /dev/null @@ -1,36 +0,0 @@ -import argparse -import json -from concurrent.futures import ProcessPoolExecutor -from functools import partial -from pathlib import Path - - -def worker(line, schema={}): - from jsonschema import validate - - validate(instance=json.loads(line), schema=schema) - - -def main(): - """ - Validate extracted JSONL file with JSON schema. - """ - parser = argparse.ArgumentParser() - parser.add_argument("jsonl_path", type=Path) - parser.add_argument("schema_path", type=Path) - args = parser.parse_args() - - with ( - args.jsonl_path.open(encoding="utf-8") as jsonl_f, - args.schema_path.open(encoding="utf-8") as schema_f, - ProcessPoolExecutor() as executor, - ): - schema = json.load(schema_f) - for _ in executor.map( - partial(worker, schema=schema), jsonl_f, chunksize=1000 - ): - pass - - -if __name__ == "__main__": - main() diff --git a/pyproject.toml b/pyproject.toml index 26ab1dcd..9b0bbe93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,6 @@ dependencies = [ dev = [ "black", "coverage[toml]", - "jsonschema", "mypy", "ruff", ] diff --git a/json_schema/generate_schema.py b/tools/generate_schema.py similarity index 100% rename from json_schema/generate_schema.py rename to tools/generate_schema.py From afcf31c927b714e5bedc9a7ba271ff8781802f5f Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 7 Dec 2023 10:18:46 +0800 Subject: [PATCH 4/5] Use schema title in file link --- tools/github_pages.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tools/github_pages.py b/tools/github_pages.py index 7fb1cfeb..967eb38a 100644 --- a/tools/github_pages.py +++ b/tools/github_pages.py @@ -1,3 +1,4 @@ +import json from pathlib import Path @@ -18,23 +19,24 @@ def main():

Coverage report

JSON schema

    + +
+ + """ - json_schemas = [ - path.name + schema_paths = [ + path for path in Path("_site").iterdir() if path.is_file() and path.suffix == ".json" ] - json_schemas.sort() - for schema in json_schemas: - html += f"
  • {schema}
  • " - - html += """ - - - - """ - + schema_paths.sort(key=lambda p: p.name) + schema_list_html = "" + for schema_path in schema_paths: + with schema_path.open(encoding="utf-8") as f: + schema_data = json.load(f) + schema_list_html += f"
  • {schema_data.get('title')}
  • " + html = html.replace("", schema_list_html) with open("_site/index.html", "w", encoding="utf-8") as f: f.write(html) From 4620fc82b00f270daa02f6ed35881ed4e1ef7e2b Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 7 Dec 2023 10:36:55 +0800 Subject: [PATCH 5/5] Add commit link that triggered the action --- .github/workflows/test.yml | 2 +- Makefile | 5 ++++- tools/github_pages.py | 11 +++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e463410f..89da35f1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -35,7 +35,7 @@ jobs: PYTHONWARNINGS: default - run: | make coverage_report - make github_pages + make github_pages REPO=${{ github.repository }} SHA=${{ github.sha }} if: github.ref_name == 'master' && matrix.python-version == '3.12' - uses: actions/upload-pages-artifact@v2 if: github.ref_name == 'master' && matrix.python-version == '3.12' diff --git a/Makefile b/Makefile index 5d52f200..7ca5dfcb 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,8 @@ # Run "make test" to run tests # Run "make clean" to remove automatically generated files +REPO ?= tatuylonen/wiktextract +SHA ?= HEAD + test: python -m unittest discover -b -s tests test_coverage: @@ -11,7 +14,7 @@ coverage_report: github_pages: python tools/generate_schema.py cp json_schema/*.json _site - python tools/github_pages.py + python tools/github_pages.py $(REPO) $(SHA) clean: python -m coverage erase rm -rf __pycache__ _site diff --git a/tools/github_pages.py b/tools/github_pages.py index 967eb38a..64720749 100644 --- a/tools/github_pages.py +++ b/tools/github_pages.py @@ -1,3 +1,4 @@ +import argparse import json from pathlib import Path @@ -6,6 +7,11 @@ def main(): """ Generate a simple HTML page to list files in the `_site` folder. """ + parser = argparse.ArgumentParser() + parser.add_argument("repo", help="The owner and repository name.") + parser.add_argument("sha", help="The commit SHA.") + args = parser.parse_args() + html = """ @@ -21,6 +27,7 @@ def main():
    + """ @@ -37,6 +44,10 @@ def main(): schema_data = json.load(f) schema_list_html += f"
  • {schema_data.get('title')}
  • " html = html.replace("", schema_list_html) + + commit_sha = f"

    Commit: {args.sha[:7]}

    " + html = html.replace("", commit_sha) + with open("_site/index.html", "w", encoding="utf-8") as f: f.write(html)