Add validate scopes command #570

Open · wants to merge 7 commits into base: master
7 changes: 7 additions & 0 deletions .pre-commit-hooks.yaml
@@ -11,3 +11,10 @@
language: python
exclude: ^.*$ # prevent passing any files
always_run: true
- id: validate-scopes
name: Validate Schema scopes
description: Validate Amsterdam Schema scope files
entry: schema validate-scopes
language: python
exclude: ^.*$ # prevent passing any files
always_run: true
4 changes: 3 additions & 1 deletion src/schematools/__init__.py
@@ -18,7 +18,9 @@
PUBLISHER_DIR: Final[str] = "publishers"
# Files that can exist in publishers directory but should be ignored by
# the FileLoaders
PUBLISHER_EXCLUDE_FILES: Final[str] = ["publishers.json", "index.json"]
PUBLISHER_EXCLUDE_FILES: Final[list[str]] = ["publishers.json", "index.json"]
# The directory where all scope objects are defined for amsterdam-schema
SCOPE_DIR: Final[str] = "scopes"

# Common coordinate reference systems
CRS_WGS84: Final[str] = "EPSG:4326" # World Geodetic System 1984, used in GPS
79 changes: 77 additions & 2 deletions src/schematools/cli.py
@@ -57,7 +57,7 @@
revoke_permissions,
)
from schematools.provenance.create import ProvenanceIteration
from schematools.types import DatasetSchema, Publisher, SemVer
from schematools.types import DatasetSchema, Publisher, Scope, SemVer

# Configure a simple stdout logger for permissions output
logger = logging.getLogger("schematools.permissions")
@@ -372,7 +372,10 @@ def _fetch_json(location: str) -> dict[str, Any]:
JSON data as a dictionary.
"""
if not location.startswith("http"):
with open(location) as f:
schema_file = Path(location)
if schema_file.is_dir():
schema_file = schema_file / "schema.json"
with open(schema_file) as f:
json_obj = json.load(f)
else:
response = requests.get(location, timeout=60)
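
With the directory fallback above, callers can point the private helper `_fetch_json` at either a dataset directory or at its `schema.json` file directly. A minimal sketch of the intended behaviour, assuming a hypothetical local path `datasets/bag` that contains a `schema.json`:

```python
# Illustrative only: both calls should load the same schema.json, because a
# directory path gets "schema.json" appended before the file is opened.
from schematools.cli import _fetch_json

from_file = _fetch_json("datasets/bag/schema.json")  # explicit file path (hypothetical)
from_dir = _fetch_json("datasets/bag")               # directory path, falls back to schema.json
assert from_file == from_dir
```
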
@@ -524,6 +527,63 @@ def validate_publishers(schema_url: str, meta_schema_url: tuple[str]) -> None:
sys.exit(1)


@schema.command()
@option_schema_url
@click.argument("meta_schema_url", nargs=-1)
def validate_scopes(schema_url: str, meta_schema_url: tuple[str]) -> None:
"""Validate all scopes against the Amsterdam Schema meta schema.

Args:

\b
META_SCHEMA_URL: URL where the meta schema for Amsterdam Schema definitions can be found.
If multiple are given, schematools will try to validate against the largest version,
working backwards and stopping at the first version that the objects are valid against.

Options:

\b
SCHEMA_URL: URL where the datasets for Amsterdam Schema definitions can be found. The path
component of this uri is dropped to find the scopes in the root. For example, if
SCHEMA_URL=https://example.com/datasets, the scopes are extracted from
https://example.com/scopes.
""" # noqa: D301,D412,D417
for meta_schema_version, url in sorted(
[(version_from_metaschema_url(u), u) for u in set(meta_schema_url)],
reverse=True,
):
meta_schema = _fetch_json(url)
if meta_schema_version.major not in COMPATIBLE_METASCHEMAS:
raise IncompatibleMetaschema(
f"Schematools {pkg_version} is not"
f"compatible with metaschema {meta_schema_version}"
)

click.echo(f"Validating against metaschema {meta_schema_version}")
scopes = _get_scopes(schema_url)
structural_errors = False
for id_, scope in scopes.items():
try:
click.echo(f"Validating scope with id {id_}")
jsonschema.validate(
instance=scope.json_data(),
schema=meta_schema,
format_checker=draft7_format_checker,
)
except (jsonschema.ValidationError, jsonschema.SchemaError) as e:
click.echo("Structural validation: ", nl=False)
structural_errors = True
click.echo(format_schema_error(e), err=True)

if structural_errors:
continue
click.echo(f"All scopes are structurally valid against {meta_schema_version}.")
sys.exit(0)
click.echo("Scopes are structurally invalid against all supplied metaschema versions")
sys.exit(1)
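
For completeness, a hedged sketch of driving the new subcommand through Click's test runner; the URLs below are placeholders, and the `--schema-url` option name is assumed to be what the shared `option_schema_url` decorator defines:

```python
# Hypothetical invocation; the endpoints do not exist and are for illustration only.
from click.testing import CliRunner

from schematools.cli import schema

runner = CliRunner()
result = runner.invoke(
    schema,
    [
        "validate-scopes",
        "--schema-url", "https://example.com/datasets",
        "https://example.com/schema@v3.1.0",  # one or more META_SCHEMA_URL arguments
    ],
)
print(result.exit_code, result.output)
```
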


@schema.command()
@click.argument("meta_schema_url")
@click.argument("schema_files", nargs=-1)
Expand Down Expand Up @@ -888,6 +948,21 @@ def _get_publishers(schema_url: str) -> dict[str, Publisher]:
raise click.ClickException(str(e)) from None


def _get_scopes(schema_url: str) -> dict[str, Scope]:
"""Find the scopes from the given schema_url.

Args:
schema_url: url of the location where the collection of amsterdam schemas is found.
"""
loader = get_schema_loader(schema_url)
try:
return loader.get_all_scopes()
except SchemaObjectNotFound as e:
raise click.ClickException(str(e)) from None


@create.command("extra_index")
@option_db_url
@option_schema_url
38 changes: 37 additions & 1 deletion src/schematools/loaders.py
@@ -16,13 +16,21 @@
DEFAULT_SCHEMA_URL,
PUBLISHER_DIR,
PUBLISHER_EXCLUDE_FILES,
SCOPE_DIR,
)
from schematools.exceptions import (
DatasetNotFound,
DatasetTableNotFound,
SchemaObjectNotFound,
)
from schematools.types import DatasetSchema, DatasetTableSchema, Json, ProfileSchema, Publisher
from schematools.types import (
DatasetSchema,
DatasetTableSchema,
Json,
ProfileSchema,
Publisher,
Scope,
)

__all__ = (
"get_schema_loader",
@@ -66,6 +74,13 @@ def get_all_publishers(self) -> dict[str, Publisher]:
"""
raise NotImplementedError

def get_all_scopes(self) -> dict[str, Scope]:
"""Get all scopes from the schema location

The return value maps scope ids to Scope objects.
"""
raise NotImplementedError

def get_publisher(self, publisher_id: str) -> dict[str, Publisher]:
raise NotImplementedError

@@ -90,8 +105,10 @@ def __init__(self, loader: SchemaLoader | None):
self._loader = loader
self._cache: dict[str, DatasetSchema] = {}
self._publisher_cache: dict[str, Publisher] = {}
self._scopes_cache: dict[str, Scope] = {}
self._table_cache: dict[tuple[str, str], DatasetTableSchema] = {}
self._has_all_publishers = False
self._has_all_scopes = False
self._has_all = False

def __repr__(self):
@@ -175,6 +192,17 @@ def get_all_publishers(self) -> dict[str, Publisher]:

return self._publisher_cache

def get_all_scopes(self) -> dict[str, Scope]:
"""Load all publishers, and fill the cache"""
if not self._has_all_scopes:
if self._loader is None:
raise RuntimeError("This dataset collection can't retrieve new scopes")

self._scopes_cache = self._loader.get_all_scopes()
self._has_all_scopes = True

return self._scopes_cache
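
A short sketch of the caching behaviour added here, assuming this wrapper is the `CachedSchemaLoader` class and using a placeholder URL: the first call delegates to the wrapped loader, later calls are served from memory.

```python
# Assumed class names and placeholder URL; the constructor signature matches the diff above.
from schematools.loaders import CachedSchemaLoader, URLSchemaLoader

cached = CachedSchemaLoader(URLSchemaLoader("https://example.com/datasets/"))
first = cached.get_all_scopes()   # fetches via the wrapped loader and fills _scopes_cache
second = cached.get_all_scopes()  # served from the cache; no second fetch
assert first is second
```
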


class _FileBasedSchemaLoader(SchemaLoader):
"""Common logic for any schema loader that works with files (URLs or paths)"""
@@ -298,6 +326,14 @@ def get_all_publishers(self) -> dict[str, Publisher]:

return result

def get_all_scopes(self) -> dict[str, Scope]:
result = {}
for subdir in (self.root.parent / SCOPE_DIR).iterdir():
for file in subdir.glob("*.json"):
scope = Scope.from_dict(_read_json_path(file))
result[scope.id] = scope
return result
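
The iteration above expects scope files in per-owner subdirectories of a `scopes/` directory that sits next to the dataset root, mirroring the fixtures added under `tests/files/scopes`. A minimal sketch, assuming a hypothetical local checkout laid out like the test files:

```python
# Expected layout (illustrative), with scopes/ as a sibling of the dataset root:
#
#   files/
#     datasets/...                 # the loader's root
#     scopes/
#       GLEBZ/glebzscope.json
#       HARRY/harryscope1.json
#       HARRY/harryscope2.json
from schematools.loaders import get_schema_loader

loader = get_schema_loader("tests/files/datasets")  # hypothetical local path
scopes = loader.get_all_scopes()
print(sorted(scopes))  # e.g. ['GLEBZ', 'HARRY/ONE', 'HARRY/TWO']
```
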


class FileSystemSchemaLoader(_FileBasedSchemaLoader):
"""Loader that loads dataset schemas from the filesystem."""
24 changes: 21 additions & 3 deletions src/schematools/types.py
@@ -241,7 +241,7 @@ def __missing__(self, key: str) -> NoReturn:


class SchemaType(JsonDict):
"""Base class for top-level schema objects (dataset, table, profile, publisher).
"""Base class for top-level schema objects (dataset, table, profile, publisher, scope).

Each object should have an "id" and "type" property.
"""
@@ -2239,10 +2239,10 @@ def _normalize_scopes(auth: None | str | list | tuple) -> frozenset[str]:
return frozenset({_PUBLIC_SCOPE})
elif isinstance(auth, (list, tuple, set)):
# Multiple scopes act as choices (OR match).
return frozenset(auth)
return frozenset([str(a) for a in auth])
else:
# Normalize single scope to set return type too.
return frozenset({auth})
return frozenset({str(auth)})


@dataclasses.dataclass
Expand All @@ -2262,3 +2262,21 @@ def from_file(cls, filename: str) -> Publisher:
@classmethod
def from_dict(cls, obj: Json) -> Publisher:
return cls(copy.deepcopy(obj))


class Scope(SchemaType):
id: str
name: str
owner: dict[str, str]

def __str__(self) -> str:
return self.id

@classmethod
def from_file(cls, filename: str) -> Scope:
with open(filename) as fh:
return cls.from_dict(json.load(fh))

@classmethod
def from_dict(cls, obj: Json) -> Scope:
return cls(copy.deepcopy(obj))
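
A small sketch of working with the new `Scope` type; the payload mirrors the GLEBZ fixture added in this PR, and dict-style access is assumed from the `SchemaType`/`JsonDict` base classes:

```python
from schematools.types import Scope

scope = Scope.from_dict(
    {
        "name": "GLEBZscope",
        "id": "GLEBZ",
        "owner": {"$ref": "publishers/GLEBZ"},
    }
)
print(scope)           # "GLEBZ" -- __str__ returns the scope id
print(scope["owner"])  # {'$ref': 'publishers/GLEBZ'}; the raw dict stays accessible
```
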
7 changes: 7 additions & 0 deletions tests/files/scopes/GLEBZ/glebzscope.json
@@ -0,0 +1,7 @@
{
"name": "GLEBZscope",
"id": "GLEBZ",
"owner": {
"$ref": "publishers/GLEBZ"
}
}
7 changes: 7 additions & 0 deletions tests/files/scopes/HARRY/harryscope1.json
@@ -0,0 +1,7 @@
{
"name": "HARRYscope1",
"id": "HARRY/ONE",
"owner": {
"$ref": "publishers/HARRY"
}
}
7 changes: 7 additions & 0 deletions tests/files/scopes/HARRY/harryscope2.json
@@ -0,0 +1,7 @@
{
"name": "HARRYscope2",
"id": "HARRY/TWO",
"owner": {
"$ref": "publishers/HARRY"
}
}
30 changes: 30 additions & 0 deletions tests/test_loaders.py
@@ -1,6 +1,7 @@
from __future__ import annotations

from schematools.loaders import URLSchemaLoader
from schematools.types import Scope


def test_load_all_publishers(schema_loader):
@@ -30,3 +31,32 @@ def test_load_all_publishers(schema_loader):
def test_publisher_url():
loader = URLSchemaLoader("https://foo.bar/baz/datasets/")
assert loader._get_publisher_url() == "https://foo.bar/baz/publishers"


def test_load_all_scopes(schema_loader):
scopes = schema_loader.get_all_scopes()
# Unclear why this needs the Scope() objects, while the test_load_all_publishers
# test does not need the Publisher() objects.
assert scopes == {
"GLEBZ": Scope(
{
"name": "GLEBZscope",
"id": "GLEBZ",
"owner": {"$ref": "publishers/GLEBZ"},
}
),
"HARRY/ONE": Scope(
{
"name": "HARRYscope1",
"id": "HARRY/ONE",
"owner": {"$ref": "publishers/HARRY"},
}
),
"HARRY/TWO": Scope(
{
"name": "HARRYscope2",
"id": "HARRY/TWO",
"owner": {"$ref": "publishers/HARRY"},
}
),
}