Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New ckan dcat consume and ckan dcat produce commands #279

Merged
merged 6 commits into from
May 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ It also offers other features related to Semantic Data like exposing the necessa
- [XML DCAT harvester (deprecated)](#xml-dcat-harvester-deprecated)
- [Translation of fields](#translation-of-fields)
- [Structured Data and Google Dataset Search indexing](#structured-data-and-google-dataset-search-indexing)
- [CLI](#cli)
- [Running the Tests](#running-the-tests)
- [Releases](#releases)
- [Acknowledgements](#acknowledgements)
Expand Down Expand Up @@ -944,6 +945,25 @@ Example output of structured data in JSON-LD:
</html>


## CLI

The `ckan dcat` command offers utilities to transform between DCAT RDF Serializations and CKAN datasets (`ckan dcat consume`) and
vice versa (`ckan dcat produce`). In both cases the input can be provided as a path to a file:

ckan dcat consume -f ttl examples/dataset.ttl

ckan dcat produce -f jsonld examples/ckan_datasets.json

or be read from stdin:

ckan dcat consume -

The latter form allows chaining commands for more complex metadata processing, e.g.:

curl https://demo.ckan.org/api/action/package_search | jq .result.results | ckan dcat produce -f jsonld -

For the full list of options check `ckan dcat consume --help` and `ckan dcat produce --help`.

## Running the Tests

To run the tests do:
Expand Down
127 changes: 117 additions & 10 deletions ckanext/dcat/cli.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,132 @@
# -*- coding: utf-8 -*-
import json

import click

import ckan.plugins.toolkit as tk

import ckanext.dcat.utils as utils
from ckanext.dcat.processors import RDFParser, RDFSerializer, DEFAULT_RDF_PROFILES

@click.group()
def generate_static():
"""Generates static files containing all datasets.

"""
@click.group()
def dcat():
"""DCAT utilities for CKAN"""
pass

@generate_static.command()
@click.argument('output', type=click.File(mode="w"))
def json(output):
"""The generate command will generate a static file containing all of
the datasets in the catalog in JSON format.

@dcat.command()
@click.argument("output", type=click.File(mode="w"))
def generate_static(output):
"""[Deprecated] Generate a static datasets file in JSON format
(requires the dcat_json_interface plugin) .
"""
utils.generate_static_json(output)


@dcat.command(context_settings={"show_default": True})
@click.argument("input", type=click.File(mode="r"))
@click.option(
    "-o",
    "--output",
    type=click.File(mode="w"),
    default="-",
    help="By default the command will output the result to stdout, "
    "alternatively you can provide a file path with this option",
)
@click.option(
    "-f", "--format", default="xml", help="Serialization format (eg ttl, jsonld)"
)
@click.option(
    "-p",
    "--profiles",
    default=" ".join(DEFAULT_RDF_PROFILES),
    help="RDF profiles to use",
)
@click.option(
    "-P", "--pretty", is_flag=True, help="Make the output more human readable"
)
@click.option(
    "-m", "--compat_mode", is_flag=True, help="Compatibility mode (deprecated)"
)
def consume(input, output, format, profiles, pretty, compat_mode):
    """
    Parses DCAT RDF graphs into CKAN dataset JSON objects.

    The input serializations can be provided as a path to a file, e.g.:

        ckan dcat consume -f ttl examples/dataset.ttl

    Or be read from stdin:

        ckan dcat consume -
    """
    # NOTE: the docstring above doubles as the `--help` text shown by click.
    # `input` / `format` shadow builtins, but the names are dictated by the
    # click argument/option names and must stay as they are.
    contents = input.read()

    # The option is a single space-separated string; the parser is handed a
    # list of profile names instead.
    if profiles:
        profiles = profiles.split()
    parser = RDFParser(profiles=profiles, compatibility_mode=compat_mode)
    parser.parse(contents, _format=format)

    # Materialise every parsed dataset so it can be dumped as one JSON array.
    ckan_datasets = list(parser.datasets())

    # `indent=None` yields the compact single-line JSON form.
    indent = 4 if pretty else None
    out = json.dumps(ckan_datasets, indent=indent)

    output.write(out)


@dcat.command(context_settings={"show_default": True})
@click.argument("input", type=click.File(mode="r"))
@click.option(
    "-o",
    "--output",
    type=click.File(mode="w"),
    default="-",
    help="By default the command will output the result to stdout, "
    "alternatively you can provide a file path with this option",
)
@click.option(
    "-f", "--format", default="xml", help="Serialization format (eg ttl, jsonld)"
)
@click.option(
    "-p",
    "--profiles",
    default=" ".join(DEFAULT_RDF_PROFILES),
    help="RDF profiles to use",
)
@click.option(
    "-m", "--compat_mode", is_flag=True, help="Compatibility mode (deprecated)"
)
def produce(input, output, format, profiles, compat_mode):
    """
    Transforms CKAN dataset JSON objects into DCAT RDF serializations.

    The input datasets can be provided as a path to a file, e.g.:

        ckan dcat produce examples/ckan_dataset.json

    Or be read from stdin:

        ckan dcat produce -
    """
    # NOTE: the docstring above doubles as the `--help` text shown by click.
    # `input` / `format` shadow builtins, but the names are dictated by the
    # click argument/option names and must stay as they are.
    contents = input.read()

    # The option is a single space-separated string; the serializer is handed
    # a list of profile names instead.
    if profiles:
        profiles = profiles.split()
    serializer = RDFSerializer(
        profiles=profiles,
        compatibility_mode=compat_mode
    )

    # The input may hold either a single dataset object or a list of them
    # (e.g. the `results` array of a `package_search` response).
    dataset = json.loads(contents)
    if isinstance(dataset, list):
        out = serializer.serialize_datasets(dataset, _format=format)
    else:
        out = serializer.serialize_dataset(dataset, _format=format)

    output.write(out)


def get_commands():
return [generate_static]
return [dcat]
72 changes: 16 additions & 56 deletions ckanext/dcat/processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,22 @@ def serialize_dataset(self, dataset_dict, _format='xml'):

return output

def serialize_datasets(self, dataset_dicts, _format='xml'):
    '''
    Serializes several CKAN dataset dicts to RDF in one call

    Each dict in `dataset_dicts` is passed, in order, to
    `serialize_dataset` together with `_format` (defaults to `xml`), and
    the individual results are joined with a newline character.

    Returns a single string; an empty list yields an empty string
    '''
    # NOTE(review): the per-dataset outputs are simply concatenated, so for
    # multiple datasets the result may not be one well-formed document in
    # formats like xml or jsonld -- confirm this is what callers expect.
    return '\n'.join(
        self.serialize_dataset(single_dict, _format)
        for single_dict in dataset_dicts
    )



def serialize_catalog(self, catalog_dict=None, dataset_dicts=None,
_format='xml', pagination_info=None):
'''
Expand Down Expand Up @@ -394,59 +410,3 @@ def _get_from_extra(key):
g.add((agent, predicate, _type(val)))

return catalog_ref


if __name__ == '__main__':

parser = argparse.ArgumentParser(
description='DCAT RDF - CKAN operations')
parser.add_argument('mode',
default='consume',
help='''
Operation mode.
`consume` parses DCAT RDF graphs to CKAN dataset JSON objects.
`produce` serializes CKAN dataset JSON objects into DCAT RDF.
''')
parser.add_argument('file', nargs='?', type=argparse.FileType('r'),
default=sys.stdin,
help='Input file. If omitted will read from stdin')
parser.add_argument('-f', '--format',
default='xml',
help='''Serialization format (as understood by rdflib)
eg: xml, n3 ... Defaults to \'xml\'.''')
parser.add_argument('-P', '--pretty',
action='store_true',
help='Make the output more human readable')
parser.add_argument('-p', '--profile', nargs='*',
action='store',
help='RDF Profiles to use, defaults to euro_dcat_ap_2')
parser.add_argument('-m', '--compat-mode',
action='store_true',
help='Enable compatibility mode')

parser.add_argument('-s', '--subcatalogs', action='store_true', dest='subcatalogs',
default=False,
help="Enable subcatalogs handling (dct:hasPart support)")
args = parser.parse_args()

contents = args.file.read()

config.update({DCAT_EXPOSE_SUBCATALOGS: args.subcatalogs})

if args.mode == 'produce':
serializer = RDFSerializer(profiles=args.profile,
compatibility_mode=args.compat_mode)

dataset = json.loads(contents)
out = serializer.serialize_dataset(dataset, _format=args.format)
print(out)
else:
parser = RDFParser(profiles=args.profile,
compatibility_mode=args.compat_mode)

parser.parse(contents, _format=args.format)

ckan_datasets = [d for d in parser.datasets()]

indent = 4 if args.pretty else None
print(json.dumps(ckan_datasets, indent=indent))
33 changes: 33 additions & 0 deletions ckanext/dcat/tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import json
import os

from ckanext.dcat.cli import dcat as dcat_cli


def test_consume(cli):
    # Parse the bundled Turtle example through `ckan dcat consume` and check
    # that the first dataset emitted on stdout carries the expected title.
    tests_dir = os.path.dirname(__file__)
    path = os.path.join(
        tests_dir, "..", "..", "..", "examples", "dataset_afs.ttl"
    )

    args = ["consume", "-f", "ttl", path]
    result = cli.invoke(dcat_cli, args)
    assert result.exit_code == 0

    datasets = json.loads(result.stdout)
    assert datasets[0]["title"] == "A test dataset on your catalogue"


def test_produce(cli):
    # Serialize the bundled CKAN dataset example as JSON-LD through
    # `ckan dcat produce` and check the `dcat` prefix in the emitted context.
    tests_dir = os.path.dirname(__file__)
    path = os.path.join(
        tests_dir, "..", "..", "..", "examples", "full_ckan_dataset.json"
    )

    args = ["produce", "-f", "jsonld", path]
    result = cli.invoke(dcat_cli, args)
    assert result.exit_code == 0

    document = json.loads(result.stdout)
    assert document["@context"]["dcat"] == "http://www.w3.org/ns/dcat#"
Loading