diff --git a/README.md b/README.md index aaec4f33..c79fc710 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ It also offers other features related to Semantic Data like exposing the necessa - [XML DCAT harvester (deprecated)](#xml-dcat-harvester-deprecated) - [Translation of fields](#translation-of-fields) - [Structured Data and Google Dataset Search indexing](#structured-data-and-google-dataset-search-indexing) +- [CLI](#cli) - [Running the Tests](#running-the-tests) - [Releases](#releases) - [Acknowledgements](#acknowledgements) @@ -944,6 +945,25 @@ Example output of structured data in JSON-LD: +## CLI + +The `ckan dcat` command offers utilities to transform between DCAT RDF Serializations and CKAN datasets (`ckan dcat consume`) and +vice versa (`ckan dcat produce`). In both cases the input can be provided as a path to a file: + + ckan dcat consume -f ttl examples/dataset.ttl + + ckan dcat produce -f jsonld examples/ckan_datasets.json + +or be read from stdin: + + ckan dcat consume - + +The latter form allows chaining commands for more complex metadata processing, e.g.: + + curl https://demo.ckan.org/api/action/package_search | jq .result.results | ckan dcat produce -f jsonld - + +For the full list of options check `ckan dcat consume --help` and `ckan dcat produce --help`. + ## Running the Tests To run the tests do: diff --git a/ckanext/dcat/cli.py b/ckanext/dcat/cli.py index ade76959..83d8b539 100644 --- a/ckanext/dcat/cli.py +++ b/ckanext/dcat/cli.py @@ -1,25 +1,132 @@ # -*- coding: utf-8 -*- +import json import click + import ckan.plugins.toolkit as tk + import ckanext.dcat.utils as utils +from ckanext.dcat.processors import RDFParser, RDFSerializer, DEFAULT_RDF_PROFILES -@click.group() -def generate_static(): - """Generates static files containing all datasets. 
- """ +@click.group() +def dcat(): + """DCAT utilities for CKAN""" pass -@generate_static.command() -@click.argument('output', type=click.File(mode="w")) -def json(output): - """The generate command will generate a static file containing all of - the datasets in the catalog in JSON format. +@dcat.command() +@click.argument("output", type=click.File(mode="w")) +def generate_static(output): + """[Deprecated] Generate a static datasets file in JSON format + (requires the dcat_json_interface plugin). """ utils.generate_static_json(output) +@dcat.command(context_settings={"show_default": True}) +@click.argument("input", type=click.File(mode="r")) +@click.option( + "-o", + "--output", + type=click.File(mode="w"), + default="-", + help="By default the command will output the result to stdout, " + "alternatively you can provide a file path with this option", +) +@click.option( + "-f", "--format", default="xml", help="Serialization format (eg ttl, jsonld)" +) +@click.option( + "-p", + "--profiles", + default=" ".join(DEFAULT_RDF_PROFILES), + help="RDF profiles to use", +) +@click.option( + "-P", "--pretty", is_flag=True, help="Make the output more human readable" +) +@click.option( + "-m", "--compat_mode", is_flag=True, help="Compatibility mode (deprecated)" +) +def consume(input, output, format, profiles, pretty, compat_mode): + """ + Parses DCAT RDF graphs into CKAN dataset JSON objects. 
+ + The input serializations can be provided as a path to a file, e.g.: + + ckan dcat consume examples/dataset.ttl + + Or be read from stdin: + + ckan dcat consume - + """ + contents = input.read() + + if profiles: + profiles = profiles.split() + parser = RDFParser(profiles=profiles, compatibility_mode=compat_mode) + parser.parse(contents, _format=format) + + ckan_datasets = [d for d in parser.datasets()] + + indent = 4 if pretty else None + out = json.dumps(ckan_datasets, indent=indent) + + output.write(out) + + +@dcat.command(context_settings={"show_default": True}) +@click.argument("input", type=click.File(mode="r")) +@click.option( + "-o", + "--output", + type=click.File(mode="w"), + default="-", + help="By default the command will output the result to stdout, " + "alternatively you can provide a file path with this option", +) +@click.option( + "-f", "--format", default="xml", help="Serialization format (eg ttl, jsonld)" +) +@click.option( + "-p", + "--profiles", + default=" ".join(DEFAULT_RDF_PROFILES), + help="RDF profiles to use", +) +@click.option( + "-m", "--compat_mode", is_flag=True, help="Compatibility mode (deprecated)" +) +def produce(input, output, format, profiles, compat_mode): + """ + Transforms CKAN dataset JSON objects into DCAT RDF serializations. 
+ + The input datasets can be provided as a path to a file, e.g.: + + ckan dcat produce examples/ckan_dataset.json + + Or be read from stdin: + + ckan dcat produce - + """ + contents = input.read() + + if profiles: + profiles = profiles.split() + serializer = RDFSerializer( + profiles=profiles, + compatibility_mode=compat_mode + ) + + dataset = json.loads(contents) + if isinstance(dataset, list): + out = serializer.serialize_datasets(dataset, _format=format) + else: + out = serializer.serialize_dataset(dataset, _format=format) + + output.write(out) + + def get_commands(): - return [generate_static] + return [dcat] diff --git a/ckanext/dcat/processors.py b/ckanext/dcat/processors.py index 864f734c..e6093443 100644 --- a/ckanext/dcat/processors.py +++ b/ckanext/dcat/processors.py @@ -291,6 +291,22 @@ def serialize_dataset(self, dataset_dict, _format='xml'): return output + def serialize_datasets(self, dataset_dicts, _format='xml'): + ''' + Given a list of CKAN dataset dicts, returns an RDF serialization + + The serialization format can be defined using the `_format` parameter. + It must be one of the ones supported by RDFLib, defaults to `xml`. + + Returns a string with the serialized datasets + ''' + out = [] + for dataset_dict in dataset_dicts: + out.append(self.serialize_dataset(dataset_dict, _format)) + return '\n'.join(out) + + + def serialize_catalog(self, catalog_dict=None, dataset_dicts=None, _format='xml', pagination_info=None): ''' @@ -394,59 +410,3 @@ def _get_from_extra(key): g.add((agent, predicate, _type(val))) return catalog_ref - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser( - description='DCAT RDF - CKAN operations') - parser.add_argument('mode', - default='consume', - help=''' -Operation mode. -`consume` parses DCAT RDF graphs to CKAN dataset JSON objects. -`produce` serializes CKAN dataset JSON objects into DCAT RDF. 
- ''') - parser.add_argument('file', nargs='?', type=argparse.FileType('r'), - default=sys.stdin, - help='Input file. If omitted will read from stdin') - parser.add_argument('-f', '--format', - default='xml', - help='''Serialization format (as understood by rdflib) - eg: xml, n3 ... Defaults to \'xml\'.''') - parser.add_argument('-P', '--pretty', - action='store_true', - help='Make the output more human readable') - parser.add_argument('-p', '--profile', nargs='*', - action='store', - help='RDF Profiles to use, defaults to euro_dcat_ap_2') - parser.add_argument('-m', '--compat-mode', - action='store_true', - help='Enable compatibility mode') - - parser.add_argument('-s', '--subcatalogs', action='store_true', dest='subcatalogs', - default=False, - help="Enable subcatalogs handling (dct:hasPart support)") - args = parser.parse_args() - - contents = args.file.read() - - config.update({DCAT_EXPOSE_SUBCATALOGS: args.subcatalogs}) - - if args.mode == 'produce': - serializer = RDFSerializer(profiles=args.profile, - compatibility_mode=args.compat_mode) - - dataset = json.loads(contents) - out = serializer.serialize_dataset(dataset, _format=args.format) - print(out) - else: - parser = RDFParser(profiles=args.profile, - compatibility_mode=args.compat_mode) - - parser.parse(contents, _format=args.format) - - ckan_datasets = [d for d in parser.datasets()] - - indent = 4 if args.pretty else None - print(json.dumps(ckan_datasets, indent=indent)) diff --git a/ckanext/dcat/tests/test_cli.py b/ckanext/dcat/tests/test_cli.py new file mode 100644 index 00000000..ecacdc37 --- /dev/null +++ b/ckanext/dcat/tests/test_cli.py @@ -0,0 +1,33 @@ +import json +import os + +from ckanext.dcat.cli import dcat as dcat_cli + + +def test_consume(cli): + + path = os.path.join( + os.path.dirname(__file__), "..", "..", "..", "examples", "dataset_afs.ttl" + ) + + result = cli.invoke(dcat_cli, ["consume", "-f", "ttl", path]) + assert result.exit_code == 0 + + assert json.loads(result.stdout)[0]["title"] 
== "A test dataset on your catalogue" + + +def test_produce(cli): + + path = os.path.join( + os.path.dirname(__file__), + "..", + "..", + "..", + "examples", + "full_ckan_dataset.json", + ) + + result = cli.invoke(dcat_cli, ["produce", "-f", "jsonld", path]) + assert result.exit_code == 0 + + assert json.loads(result.stdout)["@context"]["dcat"] == "http://www.w3.org/ns/dcat#"