diff --git a/.github/workflows/compare.yaml b/.github/workflows/compare.yaml new file mode 100644 index 00000000..d9d98038 --- /dev/null +++ b/.github/workflows/compare.yaml @@ -0,0 +1,20 @@ +name: Compare spec to docs +on: + workflow_dispatch: +jobs: + compare: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: Install deps + working-directory: ./_docmatch + run: pip install . + - name: Fetch docs + working-directory: ./_docmatch + run: curl -L https://spoonacular.com/food-api/docs > docs.html + - name: Compare + working-directory: ./_docmatch + run: python src/docmatch.py diff --git a/_docmatch/.gitignore b/_docmatch/.gitignore new file mode 100644 index 00000000..16c37027 --- /dev/null +++ b/_docmatch/.gitignore @@ -0,0 +1,70 @@ +docs.html + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm-project.org/#use-with-ide +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ diff --git a/_docmatch/README.md b/_docmatch/README.md new file mode 100644 index 00000000..ba293903 --- /dev/null +++ b/_docmatch/README.md @@ -0,0 +1,14 @@ +# docmatch + +This is a script that compares the spec with the HTML docs. + +Using [pdm](https://pdm-project.org/) for dependencies: + +```shell +curl -L https://spoonacular.com/food-api/docs > docs.html +pdm install +pdm run src/docmatch.py +``` + +(just pip can be used for deps too, `pip install .`, +with a manually managed virtualenv or in an otherwise isolated environment like a container) diff --git a/_docmatch/pdm.lock b/_docmatch/pdm.lock new file mode 100644 index 00000000..9a316ab9 --- /dev/null +++ b/_docmatch/pdm.lock @@ -0,0 +1,64 @@ +# This file is @generated by PDM. +# It is not intended for manual editing. + +[metadata] +groups = ["default", "dev"] +strategy = ["cross_platform", "inherit_metadata"] +lock_version = "4.4.1" +content_hash = "sha256:73271ac30340365272424743c809342bbbed0255cbc87da4d053a6e366e73874" + +[[package]] +name = "beautifulsoup4" +version = "4.12.3" +requires_python = ">=3.6.0" +summary = "Screen-scraping library" +groups = ["default"] +dependencies = [ + "soupsieve>1.2", +] +files = [ + {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, + {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, +] + +[[package]] +name = "genson" +version = "1.2.2" +summary = "GenSON is a powerful, user-friendly JSON Schema generator." 
+groups = ["default"] +files = [ + {file = "genson-1.2.2.tar.gz", hash = "sha256:8caf69aa10af7aee0e1a1351d1d06801f4696e005f06cedef438635384346a16"}, +] + +[[package]] +name = "json-repair" +version = "0.17.1" +requires_python = ">=3.7" +summary = "A package to repair broken json strings" +groups = ["default"] +files = [ + {file = "json_repair-0.17.1-py3-none-any.whl", hash = "sha256:e649d7b7f35a7ecfcef795a460b033dd4cd542dd8637235e8918f3dbee9f8827"}, + {file = "json_repair-0.17.1.tar.gz", hash = "sha256:d32768d3397dc9256b3cabdc731812c11ac60e6baa2f50df3ad8b5f006363ede"}, +] + +[[package]] +name = "jsonref" +version = "1.1.0" +requires_python = ">=3.7" +summary = "jsonref is a library for automatic dereferencing of JSON Reference objects for Python." +groups = ["default"] +files = [ + {file = "jsonref-1.1.0-py3-none-any.whl", hash = "sha256:590dc7773df6c21cbf948b5dac07a72a251db28b0238ceecce0a2abfa8ec30a9"}, + {file = "jsonref-1.1.0.tar.gz", hash = "sha256:32fe8e1d85af0fdefbebce950af85590b22b60f9e95443176adbde4e1ecea552"}, +] + +[[package]] +name = "soupsieve" +version = "2.5" +requires_python = ">=3.8" +summary = "A modern CSS selector implementation for Beautiful Soup." 
+groups = ["default"] +files = [ + {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, + {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, +] diff --git a/_docmatch/pyproject.toml b/_docmatch/pyproject.toml new file mode 100644 index 00000000..99344465 --- /dev/null +++ b/_docmatch/pyproject.toml @@ -0,0 +1,19 @@ +[project] +name = "docmatch" +version = "0.1.0" +description = "Matches the spoonacular API docs to the OpenAPI spec" +authors = [ + {name = "Val Packett", email = "val@packett.cool"}, +] +dependencies = [ + "beautifulsoup4>=4.12.3", + "genson>=1.2.2", + "json-repair>=0.17.1", + "jsonref>=1.1.0", +] +requires-python = ">=3.10.0" +readme = "README.md" +license = {text = "MIT"} + +[tool.pdm] +distribution = false diff --git a/_docmatch/src/docmatch.py b/_docmatch/src/docmatch.py new file mode 100644 index 00000000..916e6688 --- /dev/null +++ b/_docmatch/src/docmatch.py @@ -0,0 +1,214 @@ +import json +import jsonref +import json_repair +import bs4 +import sys +import re +from collections import defaultdict +from pprint import pprint +from genson import SchemaBuilder +from typing import List, Dict, Optional +from dataclasses import dataclass + +def eprint(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + +# https://github.com/n-takumasa/json-with-comments/blob/main/jsonc/_util.py +_REMOVE_C_COMMENT = r""" + ( # String Literal + \"(?:\\.|[^\\\"])*?\" + ) + | + ( # Comment + \/\*.*?\*\/ + | + \/\/[^\r\n]*?(?:[\r\n]) + ) + """ + +def _remove_c_comment(text: str) -> str: + if text[-1] != "\n": + text = text + "\n" + return re.sub( + _REMOVE_C_COMMENT, + lambda x: x.group(1), + text, + flags=re.DOTALL | re.VERBOSE, + ) + +def t(el: bs4.Tag | bs4.NavigableString | None) -> bs4.Tag: + assert isinstance(el, bs4.Tag) + return el + +def json2schema(els: List[str]) -> dict: + builder = SchemaBuilder() + for el in els: + el 
= _remove_c_comment(el.replace("...\n", "\n")) + try: + # builder.add_object(pyjson5.decode(el)) + builder.add_object(json_repair.loads(el)) + except Exception as e: + eprint('JSON failed to parse', el, e) + schema = builder.to_schema() + del schema['$schema'] + return schema + +@dataclass +class Param: + description: str + where: str + typ: str + example: str + required: bool + +@dataclass +class Doc: + method: str + url: str + summary: str + description: str + params: Dict[str, Param] + req_schema: Optional[dict] + resp_schema: dict + +def parsedoc(doc: bs4.Tag): + if doc.find('div', class_='api-method') is None: + return + method = t(doc.find('div', class_='api-method')).get_text() + url = t(doc.find('div', class_='api-path')).get_text().replace('https://api.spoonacular.com', '') + params = {} + tbl = doc.find('table', class_='api-parameter-table') + for param in (t(tbl).find_all('tr') if tbl else []): + tds = param.find_all('td') + if len(tds) == 0: + continue + [par, typ, exa, des] = tds + req = not (par.find('b') is None) + par = t(par.find('code')) + par_where = [c for c in par['class'] if c.startswith('api-in-')][0] + par = par.get_text() + typ = typ.get_text() + exa = exa.get_text() + if typ == 'number': + exa = float(exa) + if typ == 'boolean': + exa = exa == 'true' + params[par] = Param( + description=des.get_text(), + where=par_where.replace('api-in-', ''), + typ=typ, + example=exa, + required=req, + ) + examples = {} + for hdr in doc.find_all('div', class_='apiPathHeader'): + ex = [] + for sib in hdr.find_next_siblings(): + if not isinstance(sib, bs4.Tag): + continue + if sib.name == 'div' and 'apiPathHeader' in sib['class']: + break + if sib.name == 'pre' and 'language-json' in sib['class']: + ex.append(sib.get_text()) + if len(ex) > 0: + examples[hdr.get_text()] = ex + examples_req = [] + examples_res = [] + if '(post body)' in params: + examples_req.append(params['(post body)'].example) + del params['(post body)'] + for hdr, exs in examples.items(): 
def compare_schema(location, l_name, l, r_name, r):
    """Recursively diff two schema mappings and print every mismatch found.

    Args:
        location: Path label of the node being compared; used as the prefix
            of each printed message and extended with ``.<property>`` or
            ``[]`` when descending.
        l_name: Human-readable label for the left schema in messages.
        l: Left schema mapping (read via ``get`` and ``in``).
        r_name: Human-readable label for the right schema in messages.
        r: Right schema mapping.

    Returns:
        None. Findings are written to stdout.
    """
    l_type = l.get('type')
    r_type = r.get('type')
    if l_type != r_type:
        # Type pairs treated as equivalent, in either direction.
        interchangeable = (
            ('integer', 'number'),
            ('number', 'integer'),
            ('string', 'text'),
            ('text', 'string'),
        )
        if (l_type, r_type) not in interchangeable:
            print(f"** ! `{location}`: type mismatch: {l_name} `{l_type}` vs {r_name} `{r_type}`")
        # Differing types end the comparison of this node either way.
        return

    l_props = l.get('properties', {})
    r_props = r.get('properties', {})

    # Report properties that exist only on the right side first.
    for key in r_props:
        if key not in l_props:
            print(f"** ! `{location}`: property `{key}` present in {r_name} but not in {l_name}")

    # Then walk the left side: report left-only properties, recurse into shared ones.
    for key, l_sub in l_props.items():
        if key in r_props:
            compare_schema(location + '.' + key, l_name, l_sub, r_name, r_props[key])
        else:
            print(f"** ! `{location}`: property `{key}` present in {l_name} but not in {r_name}")

    # Element schemas are compared only when both sides declare `items`.
    if 'items' not in l or 'items' not in r:
        return
    compare_schema(location + '[]', l_name, l['items'], r_name, r['items'])
`{pdoc.url}` not found in spec") + continue + scm = spec['paths'][pdoc.url] + if not pdoc.method.lower() in scm: + print(f"* ! method `{pdoc.method.lower()}` not found in spec for `{pdoc.url}`") + continue + scm = scm[pdoc.method.lower()] + print(f"* `{pdoc.method.lower()}` `{pdoc.url}`") + + spec_params = {} + for p in scm.get('parameters', []): + if not 'name' in p: + print(f"** ! unresolved/unnamed param in spec: {p}") + continue + spec_params[p['name']] = p + for name in spec_params.keys(): + if not name in pdoc.params: + print(f"** ! param `{name}` found in spec, but not in doc") + for name, doc_param in pdoc.params.items(): + if not name in spec_params: + print(f"** ! param `{name}` found in doc, but not in spec") + continue + spec_param = spec_params[name] + def print_cmp(name, par, spec, doc): + if (spec == 'integer' and doc == 'number') or (spec == 'string' and doc == 'text'): + return + if spec != doc: + print(f"** ! param `{name}`.`{par}`: spec `{spec}` vs doc `{doc}`") + print_cmp(name, 'required', spec_param.get('required'), doc_param.required) + print_cmp(name, 'type', spec_param.get('schema', {}).get('type'), doc_param.typ) + print_cmp(name, 'in', spec_param.get('in'), doc_param.where) + + spec_req_sch = scm.get('requestBody', {}).get('content', {}).get('application/json', {}).get('schema') + if spec_req_sch and not pdoc.req_schema: + print(f"** request body schema found in spec, but could not infer one from docs") + if not spec_req_sch and pdoc.req_schema: + print(f"** ! request body schema not found in spec, but could infer one from docs") + if spec_req_sch and pdoc.req_schema: + compare_schema('(req body)', 'spec', spec_req_sch, 'docs-inferred', pdoc.req_schema)