Skip to content

Commit

Permalink
Merge branch 'shacl-validation'
Browse files Browse the repository at this point in the history
  • Loading branch information
amercader committed Jul 15, 2024
2 parents 515586f + 7cdef98 commit 51d6513
Show file tree
Hide file tree
Showing 18 changed files with 1,681 additions and 170 deletions.
1 change: 0 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,3 @@ jobs:
ckan -c test.ini db pending-migrations --apply
- name: Run tests
run: pytest --ckan-ini=test.ini --cov=ckanext.dcat --cov-report=term-missing --cov-append --disable-warnings ckanext/dcat/tests

12 changes: 11 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,21 @@
in the `ckanext/dcat/schemas` folder. See the [documentation](https://github.com/ckan/ckanext-dcat?tab=readme-ov-file#schemas)
for all details. Some highlights of the new scheming based profiles:

* Actual list support in the API ooutput for list properties like `dct:language`
* Actual list support in the API output for list properties like `dct:language`
* Multiple objects now allowed for properties like `dcat:ContactPoint`, `dct:spatial` or `dct:temporal`
* Custom validators for date values that allow `xsd:gYear`, `xsd:gYearMonth`, `xsd:date` and `xsd:dateTime`

(#281)
* [SHACL validation](https://github.com/SEMICeu/DCAT-AP/tree/master/releases/2.1.1) for DCAT-AP 2.1.1 profile (scheming and legacy).
SHACL validation surfaced the following issues in the existing profiles, which are now fixed:
* Cast `dcat:byteSize` and `dcat:spatialResolutionInMeters` as Decimal, not float
* Allow only one value of `dcat:spatialResolutionInMeters` and `dcat:temporalResolution`
* Only output the WKT version of geometries in `locn:geometry`, `dcat:bbox` and `dcat:centroid`. Sites that for some reason
require GeoJSON (or both) can use the `ckanext.dcat.output_spatial_format` config option
to choose which format to use
* When using the `euro_dcat_ap_2` profile, output the temporal extent only in the `dcat` namespace
(`dcat:startDate` and `dcat:endDate`) instead of in both the `schema` and `dcat` namespaces
(#288)
* New `ckan dcat consume` and `ckan dcat produce` CLI commands (#279)
* Parse dcat:spatialResolutionInMeters as float (#285)
* Split profile classes into their own separate files (#282)
Expand Down
53 changes: 35 additions & 18 deletions ckanext/dcat/profiles/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS
from geomet import wkt, InvalidGeoJSONException

from ckantoolkit import config, url_for, asbool, get_action, ObjectNotFound
from ckantoolkit import config, url_for, asbool, aslist, get_action, ObjectNotFound
from ckan.model.license import LicenseRegister
from ckan.lib.helpers import resource_formats
from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS
Expand Down Expand Up @@ -46,6 +46,8 @@

GEOJSON_IMT = "https://www.iana.org/assignments/media-types/application/vnd.geo+json"

DEFAULT_SPATIAL_FORMATS = ["wkt"]

ROOT_DATASET_FIELDS = [
'name',
'title',
Expand Down Expand Up @@ -728,26 +730,41 @@ def _read_list_value(self, value):

def _add_spatial_value_to_graph(self, spatial_ref, predicate, value):
"""
Adds spatial triples to the graph.
Adds spatial triples to the graph. Assumes that value is a GeoJSON string
or object.
"""
# GeoJSON
self.g.add((spatial_ref, predicate, Literal(value, datatype=GEOJSON_IMT)))
# WKT, because GeoDCAT-AP says so
try:
if isinstance(value, str):
spatial_formats = aslist(
config.get(
"ckanext.dcat.output_spatial_format", DEFAULT_SPATIAL_FORMATS
)
)

if isinstance(value, str):
try:
value = json.loads(value)
self.g.add(
(
spatial_ref,
predicate,
Literal(
wkt.dumps(value, decimals=4),
datatype=GSP.wktLiteral,
),
except (TypeError, ValueError):
return

if "wkt" in spatial_formats:
# WKT, because GeoDCAT-AP says so
try:
self.g.add(
(
spatial_ref,
predicate,
Literal(
wkt.dumps(value, decimals=4),
datatype=GSP.wktLiteral,
),
)
)
)
except (TypeError, ValueError, InvalidGeoJSONException) as e:
pass
except (TypeError, ValueError, InvalidGeoJSONException):
pass

if "geojson" in spatial_formats:
# GeoJSON
self.g.add((spatial_ref, predicate, Literal(json.dumps(value), datatype=GEOJSON_IMT)))


def _add_spatial_to_dict(self, dataset_dict, key, spatial):
if spatial.get(key):
Expand Down
5 changes: 3 additions & 2 deletions ckanext/dcat/profiles/euro_dcat_ap.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
from decimal import Decimal, DecimalException

from rdflib import term, URIRef, BNode, Literal
import ckantoolkit as toolkit
Expand Down Expand Up @@ -545,10 +546,10 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
(
distribution,
DCAT.byteSize,
Literal(float(resource_dict["size"]), datatype=XSD.decimal),
Literal(Decimal(resource_dict["size"]), datatype=XSD.decimal),
)
)
except (ValueError, TypeError):
except (ValueError, TypeError, DecimalException):
g.add((distribution, DCAT.byteSize, Literal(resource_dict["size"])))
# Checksum
if resource_dict.get("hash"):
Expand Down
50 changes: 36 additions & 14 deletions ckanext/dcat/profiles/euro_dcat_ap_2.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
from decimal import Decimal, DecimalException

from rdflib import URIRef, BNode, Literal
from ckanext.dcat.utils import resource_uri
Expand All @@ -11,6 +12,7 @@
DCATAP,
DCT,
XSD,
SCHEMA,
)

from .euro_dcat_ap import EuropeanDCATAPProfile
Expand All @@ -31,9 +33,13 @@ def parse_dataset(self, dataset_dict, dataset_ref):
# call super method
super(EuropeanDCATAP2Profile, self).parse_dataset(dataset_dict, dataset_ref)

# Standard values
value = self._object_value(dataset_ref, DCAT.temporalResolution)
if value:
dataset_dict["extras"].append({"key": "temporal_resolution", "value": value})

# Lists
for key, predicate in (
("temporal_resolution", DCAT.temporalResolution),
("is_referenced_by", DCT.isReferencedBy),
("applicable_legislation", DCATAP.applicableLegislation),
("hvd_category", DCATAP.hvdCategory),
Expand All @@ -54,14 +60,20 @@ def parse_dataset(self, dataset_dict, dataset_ref):
self._add_spatial_to_dict(dataset_dict, key, spatial)

# Spatial resolution in meters
spatial_resolution_in_meters = self._object_value_float_list(
spatial_resolution = self._object_value_float_list(
dataset_ref, DCAT.spatialResolutionInMeters
)
if spatial_resolution_in_meters:
if spatial_resolution:
# For some reason we incorrectly allowed lists in this property at some point
# keep support for it but default to single value
value = (
spatial_resolution[0] if len(spatial_resolution) == 1
else json.dumps(spatial_resolution)
)
dataset_dict["extras"].append(
{
"key": "spatial_resolution_in_meters",
"value": json.dumps(spatial_resolution_in_meters),
"value": value,
}
)

Expand Down Expand Up @@ -147,15 +159,17 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
dataset_dict, dataset_ref
)

# Standard values
self._add_triple_from_dict(
dataset_dict,
dataset_ref,
DCAT.temporalResolution,
"temporal_resolution",
_datatype=XSD.duration,
)

# Lists
for key, predicate, fallbacks, type, datatype in (
(
"temporal_resolution",
DCAT.temporalResolution,
None,
Literal,
XSD.duration,
),
("is_referenced_by", DCT.isReferencedBy, None, URIRefOrLiteral, None),
(
"applicable_legislation",
Expand All @@ -178,6 +192,14 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
)

# Temporal

# The profile for DCAT-AP 1 stored triples using schema:startDate,
# remove them to avoid duplication
for temporal in self.g.objects(dataset_ref, DCT.temporal):
if SCHEMA.startDate in [t for t in self.g.predicates(temporal, None)]:
self.g.remove((temporal, None, None))
self.g.remove((dataset_ref, DCT.temporal, temporal))

start = self._get_dataset_value(dataset_dict, "temporal_start")
end = self._get_dataset_value(dataset_dict, "temporal_end")
if start or end:
Expand Down Expand Up @@ -216,10 +238,10 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
(
dataset_ref,
DCAT.spatialResolutionInMeters,
Literal(float(value), datatype=XSD.decimal),
Literal(Decimal(value), datatype=XSD.decimal),
)
)
except (ValueError, TypeError):
except (ValueError, TypeError, DecimalException):
self.g.add(
(dataset_ref, DCAT.spatialResolutionInMeters, Literal(value))
)
Expand Down Expand Up @@ -278,7 +300,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref):
("license", DCT.license, None, URIRefOrLiteral),
("access_rights", DCT.accessRights, None, URIRefOrLiteral),
("title", DCT.title, None, Literal),
("endpoint_description", DCAT.endpointDescription, None, Literal),
("endpoint_description", DCAT.endpointDescription, None, URIRefOrLiteral),
("description", DCT.description, None, Literal),
]

Expand Down
4 changes: 2 additions & 2 deletions ckanext/dcat/profiles/euro_dcat_ap_scheming.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,9 @@ def _not_empty_dict(data_dict):
temporal_ref = BNode()
self.g.add((temporal_ref, RDF.type, DCT.PeriodOfTime))
if item.get("start"):
self._add_date_triple(temporal_ref, SCHEMA.startDate, item["start"])
self._add_date_triple(temporal_ref, DCAT.startDate, item["start"])
if item.get("end"):
self._add_date_triple(temporal_ref, SCHEMA.endDate, item["end"])
self._add_date_triple(temporal_ref, DCAT.endDate, item["end"])
self.g.add((dataset_ref, DCT.temporal, temporal_ref))

spatial = dataset_dict.get("spatial_coverage")
Expand Down
13 changes: 9 additions & 4 deletions ckanext/dcat/schemas/dcat_ap_2.1_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,6 @@ dataset_fields:

- field_name: temporal_resolution
label: Temporal resolution
preset: multiple_text
validators: ignore_missing scheming_multiple_text
help_text: Minimum time period resolvable in the dataset.

- field_name: spatial_coverage
Expand All @@ -169,8 +167,6 @@ dataset_fields:

- field_name: spatial_resolution_in_meters
label: Spatial resolution in meters
preset: multiple_text
validators: ignore_missing scheming_multiple_number
help_text: Minimum spatial separation resolvable in a dataset, measured in meters.

- field_name: access_rights
Expand Down Expand Up @@ -368,9 +364,18 @@ resource_fields:
- field_name: title
label: Title

- field_name: endpoint_description
label: Endpoint description

- field_name: endpoint_url
label: Endpoint URL
preset: multiple_text

- field_name: serves_dataset
label: Serves dataset
preset: multiple_text
validators: ignore_missing scheming_multiple_text

help_text: A data service that gives access to the resource.

# Note: if not provided, this will be autogenerated
Expand Down
Loading

0 comments on commit 51d6513

Please sign in to comment.