Add validate scopes command #570

Open · wants to merge 7 commits into base: master
7 changes: 7 additions & 0 deletions .pre-commit-hooks.yaml
@@ -11,3 +11,10 @@
language: python
exclude: ^.*$ # prevent passing any files
always_run: true
- id: validate-scopes
name: Validate Schema scopes
description: Validate Amsterdam Schema scope files
entry: schema validate-scopes
language: python
exclude: ^.*$ # prevent passing any files
always_run: true
4 changes: 3 additions & 1 deletion src/schematools/__init__.py
@@ -18,7 +18,9 @@
PUBLISHER_DIR: Final[str] = "publishers"
# Files that can exist in publishers directory but should be ignored by
# the FileLoaders
PUBLISHER_EXCLUDE_FILES: Final[str] = ["publishers.json", "index.json"]
PUBLISHER_EXCLUDE_FILES: Final[list[str]] = ["publishers.json", "index.json"]
# The directory where all scope objects are defined for amsterdam-schema
SCOPE_DIR: Final[str] = "scopes"

# Common coordinate reference systems
CRS_WGS84: Final[str] = "EPSG:4326" # World Geodetic System 1984, used in GPS
79 changes: 77 additions & 2 deletions src/schematools/cli.py
@@ -57,7 +57,7 @@
revoke_permissions,
)
from schematools.provenance.create import ProvenanceIteration
from schematools.types import DatasetSchema, Publisher, SemVer
from schematools.types import DatasetSchema, Publisher, Scope, SemVer

# Configure a simple stdout logger for permissions output
logger = logging.getLogger("schematools.permissions")
@@ -372,7 +372,10 @@ def _fetch_json(location: str) -> dict[str, Any]:
JSON data as a dictionary.
"""
if not location.startswith("http"):
with open(location) as f:
schema_file = Path(location)
if schema_file.is_dir():
schema_file = schema_file / "schema.json"
with open(schema_file) as f:
json_obj = json.load(f)
else:
response = requests.get(location, timeout=60)
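
With the directory fallback above, callers can point the private helper `_fetch_json` at either a dataset directory or at its `schema.json` file directly. A minimal sketch of the intended behaviour, assuming a hypothetical local path `datasets/bag` that contains a `schema.json`:

```python
# Illustrative only: both calls should load the same schema.json, because a
# directory path gets "schema.json" appended before the file is opened.
from schematools.cli import _fetch_json

from_file = _fetch_json("datasets/bag/schema.json")  # explicit file path (hypothetical)
from_dir = _fetch_json("datasets/bag")               # directory path, falls back to schema.json
assert from_file == from_dir
```
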
@@ -524,6 +527,63 @@ def validate_publishers(schema_url: str, meta_schema_url: tuple[str]) -> None:
sys.exit(1)


@schema.command()
@option_schema_url
@click.argument("meta_schema_url", nargs=-1)
def validate_scopes(schema_url: str, meta_schema_url: tuple[str]) -> None:
"""Validate all scopes against the Amsterdam Schema meta schema.

Args:

\b
META_SCHEMA_URL: URL where the meta schema for Amsterdam Schema definitions can be found.
If multiple are given, schematools will try to validate against the largest version,
working backwards and stopping at the first version that the objects are valid against.

Options:

\b
SCHEMA_URL: URL where the datasets for Amsterdam Schema definitions can be found. The path
component of this uri is dropped to find the scopes in the root. For example, if
SCHEMA_URL=https://example.com/datasets, the scopes are extracted from
https://example.com/scopes.
""" # noqa: D301,D412,D417
for meta_schema_version, url in sorted(
[(version_from_metaschema_url(u), u) for u in set(meta_schema_url)],
reverse=True,
):
meta_schema = _fetch_json(url)
if meta_schema_version.major not in COMPATIBLE_METASCHEMAS:
raise IncompatibleMetaschema(
f"Schematools {pkg_version} is not"
f"compatible with metaschema {meta_schema_version}"
)

click.echo(f"Validating against metaschema {meta_schema_version}")
scopes = _get_scopes(schema_url)
structural_errors = False
for id_, scope in scopes.items():
try:
click.echo(f"Validating scope with id {id_}")
jsonschema.validate(
instance=scope.json_data(),
schema=meta_schema,
format_checker=draft7_format_checker,
)
except (jsonschema.ValidationError, jsonschema.SchemaError) as e:
click.echo("Structural validation: ", nl=False)
structural_errors = True
click.echo(format_schema_error(e), err=True)

if structural_errors:
continue
click.echo(f"All scopes are structurally valid against {meta_schema_version}.")
sys.exit(0)
click.echo("Scopes are structurally invalid against all supplied metaschema versions")
sys.exit(1)
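
For completeness, a hedged sketch of driving the new subcommand through Click's test runner; the URLs below are placeholders, and the `--schema-url` option name is assumed to be what the shared `option_schema_url` decorator defines:

```python
# Hypothetical invocation; the endpoints do not exist and are for illustration only.
from click.testing import CliRunner

from schematools.cli import schema

runner = CliRunner()
result = runner.invoke(
    schema,
    [
        "validate-scopes",
        "--schema-url", "https://example.com/datasets",
        "https://example.com/schema@v3.1.0",  # one or more META_SCHEMA_URL arguments
    ],
)
print(result.exit_code, result.output)
```
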


@schema.command()
@click.argument("meta_schema_url")
@click.argument("schema_files", nargs=-1)
Expand Down Expand Up @@ -888,6 +948,21 @@ def _get_publishers(schema_url: str) -> dict[str, Publisher]:
raise click.ClickException(str(e)) from None


def _get_scopes(schema_url: str) -> dict[str, Scope]:
"""Find the scopes from the given schema_url.

Args:
schema_url: url of the location where the collection of amsterdam schemas is found.
"""
loader = get_schema_loader(schema_url)
try:
return loader.get_all_scopes()
except SchemaObjectNotFound as e:
raise click.ClickException(str(e)) from None


@create.command("extra_index")
@option_db_url
@option_schema_url
38 changes: 37 additions & 1 deletion src/schematools/loaders.py
@@ -16,13 +16,21 @@
DEFAULT_SCHEMA_URL,
PUBLISHER_DIR,
PUBLISHER_EXCLUDE_FILES,
SCOPE_DIR,
)
from schematools.exceptions import (
DatasetNotFound,
DatasetTableNotFound,
SchemaObjectNotFound,
)
from schematools.types import DatasetSchema, DatasetTableSchema, Json, ProfileSchema, Publisher
from schematools.types import (
DatasetSchema,
DatasetTableSchema,
Json,
ProfileSchema,
Publisher,
Scope,
)

__all__ = (
"get_schema_loader",
@@ -66,6 +74,13 @@ def get_all_publishers(self) -> dict[str, Publisher]:
"""
raise NotImplementedError

def get_all_scopes(self) -> dict[str, Scope]:
"""Get all scopes from the schema location

The return value maps scope ids to Scope objects.
"""
raise NotImplementedError

def get_publisher(self, publisher_id: str) -> dict[str, Publisher]:
raise NotImplementedError

@@ -90,8 +105,10 @@ def __init__(self, loader: SchemaLoader | None):
self._loader = loader
self._cache: dict[str, DatasetSchema] = {}
self._publisher_cache: dict[str, Publisher] = {}
self._scopes_cache: dict[str, Scope] = {}
self._table_cache: dict[tuple[str, str], DatasetTableSchema] = {}
self._has_all_publishers = False
self._has_all_scopes = False
self._has_all = False

def __repr__(self):
@@ -175,6 +192,17 @@ def get_all_publishers(self) -> dict[str, Publisher]:

return self._publisher_cache

def get_all_scopes(self) -> dict[str, Scope]:
"""Load all publishers, and fill the cache"""
if not self._has_all_scopes:
if self._loader is None:
raise RuntimeError("This dataset collection can't retrieve new scopes")

self._scopes_cache = self._loader.get_all_scopes()
self._has_all_scopes = True

return self._scopes_cache
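
A short sketch of the caching behaviour added here, assuming this wrapper is the `CachedSchemaLoader` class and using a placeholder URL: the first call delegates to the wrapped loader, later calls are served from memory.

```python
# Assumed class names and placeholder URL; the constructor signature matches the diff above.
from schematools.loaders import CachedSchemaLoader, URLSchemaLoader

cached = CachedSchemaLoader(URLSchemaLoader("https://example.com/datasets/"))
first = cached.get_all_scopes()   # fetches via the wrapped loader and fills _scopes_cache
second = cached.get_all_scopes()  # served from the cache; no second fetch
assert first is second
```
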


class _FileBasedSchemaLoader(SchemaLoader):
"""Common logic for any schema loader that works with files (URLs or paths)"""
@@ -298,6 +326,14 @@ def get_all_publishers(self) -> dict[str, Publisher]:

return result

def get_all_scopes(self) -> dict[str, Scope]:
result = {}
for subdir in (self.root.parent / SCOPE_DIR).iterdir():
for file in subdir.glob("*.json"):
scope = Scope.from_dict(_read_json_path(file))
result[scope.id] = scope
return result
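
The iteration above expects scope files in per-owner subdirectories of a `scopes/` directory that sits next to the dataset root, mirroring the fixtures added under `tests/files/scopes`. A minimal sketch, assuming a hypothetical local checkout laid out like the test files:

```python
# Expected layout (illustrative), with scopes/ as a sibling of the dataset root:
#
#   files/
#     datasets/...                 # the loader's root
#     scopes/
#       GLEBZ/glebzscope.json
#       HARRY/harryscope1.json
#       HARRY/harryscope2.json
from schematools.loaders import get_schema_loader

loader = get_schema_loader("tests/files/datasets")  # hypothetical local path
scopes = loader.get_all_scopes()
print(sorted(scopes))  # e.g. ['GLEBZ', 'HARRY/ONE', 'HARRY/TWO']
```
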


class FileSystemSchemaLoader(_FileBasedSchemaLoader):
"""Loader that loads dataset schemas from the filesystem."""
24 changes: 21 additions & 3 deletions src/schematools/types.py
@@ -241,7 +241,7 @@ def __missing__(self, key: str) -> NoReturn:


class SchemaType(JsonDict):
"""Base class for top-level schema objects (dataset, table, profile, publisher).
"""Base class for top-level schema objects (dataset, table, profile, publisher, scope).

Each object should have an "id" and "type" property.
"""
@@ -2239,10 +2239,10 @@ def _normalize_scopes(auth: None | str | list | tuple) -> frozenset[str]:
return frozenset({_PUBLIC_SCOPE})
elif isinstance(auth, (list, tuple, set)):
# Multiple scopes act as choices (OR match).
return frozenset(auth)
return frozenset([str(a) for a in auth])
else:
# Normalize single scope to set return type too.
return frozenset({auth})
return frozenset({str(auth)})


@dataclasses.dataclass
Expand All @@ -2262,3 +2262,21 @@ def from_file(cls, filename: str) -> Publisher:
@classmethod
def from_dict(cls, obj: Json) -> Publisher:
return cls(copy.deepcopy(obj))


class Scope(SchemaType):
id: str
name: str
owner: dict[str, str]

def __str__(self) -> str:
return self.id

@classmethod
def from_file(cls, filename: str) -> Scope:
with open(filename) as fh:
return cls.from_dict(json.load(fh))

@classmethod
def from_dict(cls, obj: Json) -> Scope:
return cls(copy.deepcopy(obj))
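
A small sketch of working with the new `Scope` type; the payload mirrors the GLEBZ fixture added in this PR, and dict-style access is assumed from the `SchemaType`/`JsonDict` base classes:

```python
from schematools.types import Scope

scope = Scope.from_dict(
    {
        "name": "GLEBZscope",
        "id": "GLEBZ",
        "owner": {"$ref": "publishers/GLEBZ"},
    }
)
print(scope)           # "GLEBZ" -- __str__ returns the scope id
print(scope["owner"])  # {'$ref': 'publishers/GLEBZ'}; the raw dict stays accessible
```
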
7 changes: 7 additions & 0 deletions tests/files/scopes/GLEBZ/glebzscope.json
@@ -0,0 +1,7 @@
{
"name": "GLEBZscope",
"id": "GLEBZ",
"owner": {
"$ref": "publishers/GLEBZ"
}
}
7 changes: 7 additions & 0 deletions tests/files/scopes/HARRY/harryscope1.json
@@ -0,0 +1,7 @@
{
"name": "HARRYscope1",
"id": "HARRY/ONE",
"owner": {
"$ref": "publishers/HARRY"
}
}
7 changes: 7 additions & 0 deletions tests/files/scopes/HARRY/harryscope2.json
@@ -0,0 +1,7 @@
{
"name": "HARRYscope2",
"id": "HARRY/TWO",
"owner": {
"$ref": "publishers/HARRY"
}
}
30 changes: 30 additions & 0 deletions tests/test_loaders.py
@@ -1,6 +1,7 @@
from __future__ import annotations

from schematools.loaders import URLSchemaLoader
from schematools.types import Scope


def test_load_all_publishers(schema_loader):
@@ -30,3 +31,32 @@ def test_load_all_publishers(schema_loader):
def test_publisher_url():
loader = URLSchemaLoader("https://foo.bar/baz/datasets/")
assert loader._get_publisher_url() == "https://foo.bar/baz/publishers"


def test_load_all_scopes(schema_loader):
scopes = schema_loader.get_all_scopes()
# Unclear why this needs the Scope() objects, while the test_load_all_publishers
# test does not need the Publisher() objects.
assert scopes == {
"GLEBZ": Scope(
{
"name": "GLEBZscope",
"id": "GLEBZ",
"owner": {"$ref": "publishers/GLEBZ"},
}
),
"HARRY/ONE": Scope(
{
"name": "HARRYscope1",
"id": "HARRY/ONE",
"owner": {"$ref": "publishers/HARRY"},
}
),
"HARRY/TWO": Scope(
{
"name": "HARRYscope2",
"id": "HARRY/TWO",
"owner": {"$ref": "publishers/HARRY"},
}
),
}