From 72ed34cdb102528b4d910f923268f1068146759f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cristiano=20K=C3=B6hler?= <42555442+kohlerca@users.noreply.github.com> Date: Wed, 6 Dec 2023 10:46:05 +0100 Subject: [PATCH] Support for semantic annotations in the captured provenance using ontologies (#26) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Support for annotating provenance with an ontology (functions with arguments and returns, data objects with attribute and metadata annotations) * Documentation * Support for multiple class annotations --------- Co-authored-by: Cristiano Köhler --- alpaca/alpaca_types.py | 2 +- alpaca/data_information.py | 7 + alpaca/decorator.py | 10 + alpaca/ontology/annotation.py | 293 +++++++ alpaca/serialization/neo.py | 75 +- alpaca/serialization/prov.py | 166 +++- alpaca/test/test_code_analysis.py | 1 + alpaca/test/test_ontology_annotation.py | 978 ++++++++++++++++++++++++ doc/api.rst | 1 + doc/reference/ontologies.rst | 6 + 10 files changed, 1485 insertions(+), 54 deletions(-) create mode 100644 alpaca/ontology/annotation.py create mode 100644 alpaca/test/test_ontology_annotation.py create mode 100644 doc/reference/ontologies.rst diff --git a/alpaca/alpaca_types.py b/alpaca/alpaca_types.py index 6610cbe..c2d96bf 100644 --- a/alpaca/alpaca_types.py +++ b/alpaca/alpaca_types.py @@ -41,7 +41,7 @@ 'execution_id') ) -FunctionInfo = namedtuple('FunctionInfo', ('name', 'module', 'version')) +FunctionInfo = namedtuple('FunctionInfo', ('name', 'module', 'version',)) # NAMED TUPLE TO STORE ARGUMENTS THAT ARE CONTAINERS diff --git a/alpaca/data_information.py b/alpaca/data_information.py index 03dc8d3..08611ef 100644 --- a/alpaca/data_information.py +++ b/alpaca/data_information.py @@ -16,6 +16,7 @@ from pathlib import Path import logging from collections.abc import Iterable +from collections import defaultdict import joblib import numpy as np @@ -23,6 +24,7 @@ from dill._dill import save_function from alpaca.alpaca_types import DataObject, File +from alpaca.ontology.annotation import _OntologyInformation, ONTOLOGY_INFORMATION # Need to use `dill` pickling function to support lambdas. # Some objects may have attributes that are lambdas. 
One example is the
@@ -318,6 +320,11 @@ def info(self, obj):
         elif obj_type in self._store_values:
             obj_value = str(obj)
 
+        # Add ontology information if not already present
+        if (not ONTOLOGY_INFORMATION.get(obj_type) and
+                _OntologyInformation.get_ontology_information(obj)):
+            ONTOLOGY_INFORMATION[obj_type] = _OntologyInformation(obj)
+
         return DataObject(hash=obj_hash, hash_method=hash_method,
                           type=obj_type, id=obj_id, details=details,
                           value=obj_value)
diff --git a/alpaca/decorator.py b/alpaca/decorator.py
index da16640..3434d21 100644
--- a/alpaca/decorator.py
+++ b/alpaca/decorator.py
@@ -6,6 +6,8 @@
 from functools import wraps
 import itertools
 from collections.abc import Iterable
+from collections import defaultdict
+
 from importlib.metadata import version, PackageNotFoundError
 import inspect
 import ast
@@ -18,8 +20,10 @@
 from alpaca.code_analysis.ast import _CallAST
 from alpaca.code_analysis.source_code import _SourceCode
 from alpaca.serialization import AlpacaProvDocument
+from alpaca.serialization.identifiers import _get_function_name
 from alpaca.utils.files import RDF_FILE_FORMAT_MAP
 from alpaca.settings import _ALPACA_SETTINGS
+from alpaca.ontology.annotation import _OntologyInformation, ONTOLOGY_INFORMATION
 
 from pprint import pprint
 
@@ -419,6 +423,12 @@ def _capture_code_and_function_provenance(self, lineno, function):
         function_info = FunctionInfo(name=function_name, module=module,
                                      version=module_version)
 
+        function_id = _get_function_name(function_info)
+        if not ONTOLOGY_INFORMATION.get(function_id):
+            if _OntologyInformation.get_ontology_information(function):
+                ONTOLOGY_INFORMATION[function_id] = \
+                    _OntologyInformation(function)
+
         return source_line, ast_tree, return_targets, function_info
 
     def _capture_input_and_parameters_provenance(self, function, args, kwargs,
diff --git a/alpaca/ontology/annotation.py b/alpaca/ontology/annotation.py
new file mode 100644
index 0000000..0bd2228
--- /dev/null
+++ b/alpaca/ontology/annotation.py
@@ -0,0 +1,293 @@
+"""
+Alpaca can embed semantic information provided by ontologies into the
+captured provenance, by reading annotations that were inserted into Python
+objects. The annotations are used by the :class:`alpaca.AlpacaProvDocument`
+class when serializing the provenance information as RDF. The annotations are
+inserted as additional `rdf:type` triples alongside the classes already
+defined by the Alpaca PROV model.
+
+Alpaca expects the Python object to have a dictionary stored as the special
+`__ontology__` attribute. All the specific annotations for the Python object
+are contained in this dictionary. The annotations are URIs of the
+relevant ontology classes that represent the Python object (e.g., a function)
+or one of its elements (e.g., the parameters of the function). If providing a
+full URI (i.e., not a CURIE with an ontology namespace as prefix), the URI
+must start with `<` and end with `>` (e.g.,
+`<http://my-ontology#ProcessClass>`). CURIEs are allowed in the form
+`ontology:Class`.
+
+Currently, annotations for two main kinds of Python objects are supported:
+functions (intended for a Python function object) and data objects (intended
+for instances of a Python class). Specific keys in the `__ontology__`
+dictionary define the main URI describing either the function or the data
+object:
+
+* 'function' : str or list of str
+  A URI to the ontology class representing the Python function. Multiple URIs
+  can be passed as a list, if the function is represented by multiple classes.
+* 'data_object' : str or list of str
+  A URI to the ontology class representing the Python data object. Multiple
+  URIs can be passed as a list, if the object is represented by multiple
+  classes.
+
+Additional annotations can be stored depending on whether a function or data
+object is being annotated.
+
+For functions, the additional items that can be stored in the `__ontology__`
+dictionary are:
+
+* 'arguments' : dict
+  A dictionary where the keys are argument names (cf. the function
+  declaration in the `def` statement) and the values are the URI(s)
+  to the ontology class(es) representing the argument.
+* 'returns' : dict
+  A dictionary where the keys are function outputs, and the values define the
+  URI(s) to the ontology class(es) representing each output identified by a
+  key.
+  The keys in the `returns` dictionary can take three possible forms:
+  1. a string with the name of one output (if this is the name of an
+     argument, cf. the function declaration in the `def` statement, the
+     function is assumed to use that argument as an output), or
+  2. an integer corresponding to the order of the output (as defined by the
+     function `return` statements), or
+  3. a string of `*`s. In this case, the function returns a container (e.g.,
+     a list, or list of lists), and the number of `*`s defines the depth
+     within the container whose elements are defined by the ontology class in
+     the value. For instance, if the function returns a list of lists
+     `[[obj1, obj2], [obj3, obj4], ...]`, the objects are at the third level.
+     If each element `objX` is represented by an ontology class defined by
+     `<http://my-ontology#ObjectClass>`, one can annotate the elements with a
+     key `'***'` and value `'<http://my-ontology#ObjectClass>'`. An
+     annotation with key `'*'` would be used to annotate the main list
+     returned, and the key `'**'` would be used to annotate each inner list
+     inside the main list (e.g., `[obj1, obj2]` or `[obj3, obj4]`). This is
+     useful when using ontologies that define concepts representing groups of
+     elements.
+
+For data objects, the additional items that can be stored in the
+`__ontology__` dictionary are:
+
+* 'attributes' : dict
+  A dictionary where the keys are object attribute names and the values are
+  the URI(s) to the ontology class(es) representing the attribute.
+* 'annotations' : dict
+  A dictionary where the keys are annotation names and the values are the
+  URI(s) to the ontology class(es) representing the annotation. Annotations
+  are key-value pairs specified in a dictionary stored as one attribute of
+  the object (e.g., `obj.annotations`).
+
+Finally, the ontology annotations can be defined using namespaces so that the
+URIs are shortened (CURIEs). Namespaces are defined for both functions and
+data objects using the `namespaces` key in the `__ontology__` dictionary:
+
+* 'namespaces' : dict
+  A dictionary where the keys are the names used as prefixes in the
+  annotations in `__ontology__` and the values are the prefix URIs to be
+  expanded to get the final URI. For example, for an ontology with URIs with
+  prefix `http://example.org/ontology#`, such as
+  `<http://example.org/ontology#ClassA>` and
+  `<http://example.org/ontology#ClassB>`, a namespace
+  `'ontology'="http://example.org/ontology#"` can be defined so that the
+  classes can be referred to as `ontology:ClassA` and `ontology:ClassB`.
+
+
+Examples
+--------
+
+Consider the Python function `process` that takes an input `data`, is
+controlled by a parameter `param`, and returns a tuple of two elements:
+
+>>> def process(data, param):
+>>>     ...  # process `data` into `output1` and `output2` using `param`
+>>>     return output1, output2
+
+To use an ontology defined by the base URI `<http://my-ontology#>` to
+annotate the `process` Python function, whose concept is defined by
+`<http://my-ontology#ProcessClass>`, and where `data` is represented by
+`<http://my-ontology#DataClass>`, `param` by
+`<http://my-ontology#ParameterClass>`, and the second return element
+(`output2`) needs to be annotated with class
+`<http://my-ontology#ProcessOutputClass>`, the following dictionary could be
+inserted into the `process` function object:
+
+>>> process.__ontology__ = {
+>>>     'function': "my_ontology:ProcessClass",
+>>>     'arguments': {'data': "my_ontology:DataClass",
+>>>                   'param': "my_ontology:ParameterClass"},
+>>>     'returns': {1: "my_ontology:ProcessOutputClass"},
+>>>     'namespaces': {"my_ontology": "http://my-ontology#"}
+>>> }
+
+For a Python function `process_container`, represented by
+`<http://my-ontology#ProcessContainerClass>`, that takes the same inputs but
+returns a list of objects that need to be annotated with the ontology class
+`<http://my-ontology#ProcessOutputElementClass>`, the following dictionary
+could be used:
+
+>>> def process_container(data, param):
+>>>     ...  # process `data` into several outputs grouped as a list `output`
+>>>     return output  # [obj1, obj2, ...]
+
+>>> process_container.__ontology__ = {
+>>>     'function': "my_ontology:ProcessContainerClass",
+>>>     'arguments': {'data': "my_ontology:DataClass",
+>>>                   'param': "my_ontology:ParameterClass"},
+>>>     'returns': {'**': "my_ontology:ProcessOutputElementClass"},
+>>>     'namespaces': {"my_ontology": "http://my-ontology#"}
+>>> }
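+
+Data objects are annotated in the same way, by inserting the dictionary into
+the class. As a minimal sketch (the classes below, reusing the hypothetical
+`my_ontology` namespace, are for illustration only), instances of `Data` are
+represented by `<http://my-ontology#DataClass>`, the attribute `unit` by
+`<http://my-ontology#UnitClass>`, and the annotation `quality` (stored in the
+`annotations` dictionary attribute) by `<http://my-ontology#QualityClass>`:
+
+>>> class Data:
+>>>     __ontology__ = {
+>>>         'data_object': "my_ontology:DataClass",
+>>>         'attributes': {'unit': "my_ontology:UnitClass"},
+>>>         'annotations': {'quality': "my_ontology:QualityClass"},
+>>>         'namespaces': {"my_ontology": "http://my-ontology#"}
+>>>     }
+>>>
+>>>     def __init__(self, values, unit, quality):
+>>>         self.values = values
+>>>         self.unit = unit
+>>>         self.annotations = {'quality': quality}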
+
+"""
+
+import rdflib
+from copy import deepcopy
+
+
+# Two types of Python objects can be annotated: functions or data objects.
+# For each, specific additional information can also be annotated (e.g.,
+# the parameters of a function). This dictionary defines which information
+# can be annotated for each type of object, and the strings that are used as
+# keys in the `__ontology__` dictionary.
+VALID_INFORMATION = {
+    'data_object': {'namespaces', 'attributes', 'annotations'},
+    'function': {'namespaces', 'arguments', 'returns'}
+}
+VALID_OBJECTS = set(VALID_INFORMATION.keys())
+
+
+# Global dictionary to store ontology information during the capture.
+# This is used later for the serialization.
+ONTOLOGY_INFORMATION = {}
+
+
+class _OntologyInformation(object):
+    """
+    Class used to parse information from the `__ontology__` annotation
+    dictionary of Python functions or data objects.
+
+    This class provides easy access to the definitions when serializing the
+    provenance information with extended ontology annotations. It also
+    manages namespaces across different objects and functions, such that no
+    ambiguities or multiple definitions are introduced, and the full URIs can
+    be retrieved.
+
+    This class is used internally by Alpaca when serializing the provenance
+    as RDF.
+
+    Parameters
+    ----------
+    obj : object
+        Python function or data object with an attribute named `__ontology__`
+        that stores a dictionary with specific ontology annotations.
+    """
+
+    namespaces = {}
+
+    @classmethod
+    def add_namespace(cls, name, uri):
+        if name in cls.namespaces:
+            if cls.namespaces[name] != uri:
+                raise ValueError("Attempting to redefine an existing "
+                                 "namespace. This is not allowed, as other "
+                                 "annotations already expect the URI "
+                                 "previously bound to this prefix.")
+        else:
+            cls.namespaces[name] = rdflib.Namespace(uri)
+
+    @classmethod
+    def bind_namespaces(cls, namespace_manager):
+        for name, namespace in cls.namespaces.items():
+            namespace_manager.bind(name, namespace)
+
+    @staticmethod
+    def get_ontology_information(obj):
+        if hasattr(obj, "__ontology__"):
+            return getattr(obj, "__ontology__")
+        elif (hasattr(obj, "__wrapped__") and
+              hasattr(obj.__wrapped__, "__ontology__")):
+            return getattr(obj.__wrapped__, "__ontology__")
+        return None
+
+    def __init__(self, obj):
+
+        ontology_info = self.get_ontology_information(obj)
+        if ontology_info:
+            # An ontology annotation with semantic information is present.
+            # Store each element inside this object.
+
+            for information_type, information in ontology_info.items():
+                if information_type in VALID_OBJECTS:
+                    # Function or data object URI
+                    setattr(self, information_type, information)
+                elif information_type == "namespaces":
+                    # Add all namespaces, checking for inconsistencies
+                    for prefix, uri in information.items():
+                        self.add_namespace(prefix, uri)
+                else:
+                    # Add additional information on the function or data
+                    # object
+                    setattr(self, information_type, deepcopy(information))
+
+    def has_information(self, information_type):
+        return hasattr(self, information_type)
+
+    def get_container_returns(self):
+        returns = getattr(self, 'returns', None)
+        if returns:
+            return [key for key in returns.keys() if isinstance(key, str) and
+                    key == '*' * len(key)]
+        return None
+
+    def get_uri(self, information_type, element=None):
+        if information_type in VALID_OBJECTS:
+            # Information on 'function' and 'data_object' are strings or
+            # lists, stored directly as attributes
+            information_value = getattr(self, information_type)
+        else:
+            # Specific information of 'function' and 'data_object' is
+            # stored in dictionaries (e.g., 'attributes', 'arguments'...)
+            information = getattr(self, information_type, None)
+
+            if information is None:
+                # No information available
+                return None
+
+            # If annotating all elements (e.g., multiple returns in a
+            # container), the actual element will not be present, but
+            # there will be an entry identified by one or more '*'s.
+            information_value = information.get(element, None)
+            if not information_value:
+                return None
+
+        if not isinstance(information_value, list):
+            information_value = [information_value]
+
+        # Process URI(s) to get `rdflib.URIRef` elements, resolving any
+        # namespace.
+        uris = []
+        for uri in information_value:
+            if (uri[0], uri[-1]) == ("<", ">"):
+                # This is a full URI
+                uris.append(rdflib.URIRef(uri[1:-1]))
+            else:
+                # If not a full URI, the information must be a CURIE.
+                # Get the `URIRef` from the namespace.
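+                # (e.g., given a namespace
+                # {'ontology': "http://example.org/ontology#"}, the CURIE
+                # "ontology:ClassA" resolves to the URIRef
+                # <http://example.org/ontology#ClassA>)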
+                prefix, value = uri.split(":")
+                uris.append(self.namespaces[prefix][value])
+
+        if len(uris) == 1:
+            # Return annotation with a single URI directly
+            return uris[0]
+        return uris
+
+    def __repr__(self):
+        repr_str = "OntologyInformation("
+        information = []
+        for obj_type in VALID_OBJECTS:
+            if self.has_information(obj_type):
+                information.append(f"{obj_type}='{getattr(self, obj_type)}'")
+                for specific_information in \
+                        sorted(VALID_INFORMATION[obj_type]):
+                    if self.has_information(specific_information):
+                        specific_info = getattr(self, specific_information)
+                        info_str = str(specific_info) \
+                            if not isinstance(specific_info, str) else \
+                            f"'{specific_info}'"
+                        information.append(
+                            f"{specific_information}={info_str}")
+        repr_str = f"{repr_str}{', '.join(information)})"
+        return repr_str
diff --git a/alpaca/serialization/neo.py b/alpaca/serialization/neo.py
index 5d36a63..af6adfb 100644
--- a/alpaca/serialization/neo.py
+++ b/alpaca/serialization/neo.py
@@ -1,11 +1,14 @@
 """
-Module to handle the serialization of Neo objects.
+Module to handle the serialization of Neo objects. This defines plugin
+functions that are used by Alpaca when serializing information from Neo
+objects in the captured provenance.
 
 As Neo provides a specific data model for electrophysiology data, some
-attributes need special handling. The key component is the annotation
+attributes need special handling. The key component is the `annotations`
 dictionary, that cannot be stored as a standard Python attribute as the
 information would not be accessible. A special property `hasAnnotation` is
-used for that case.
+used for those cases, where each key-value pair in the annotations dictionary
+is identified as object metadata.
 
 A special converter for attribute values is also provided, so that they can
 be properly serialized to strings.
@@ -37,32 +40,29 @@ def _neo_to_prov(value, displayed_attributes=DISPLAYED_ATTRIBUTES):
     type_information = type(value)
     neo_class = f"{type_information.__module__}.{type_information.__name__}"
 
-    repr = f"{neo_class}("
-
-    counter = 0
-
+    attr_repr = []
     for attribute in displayed_attributes:
         if hasattr(value, attribute):
-            if counter > 0:
-                repr += ", "
-
             attr_value = getattr(value, attribute)
-            repr += f"{attribute}={_ensure_type(attr_value)}"
+            attr_repr.append(f"{attribute}={_ensure_type(attr_value)}")
 
-            counter += 1
-
-    repr += ")"
-    return repr
+    neo_repr = f"{neo_class}({', '.join(attr_repr)})"
+    return neo_repr
 
 
 def _neo_object_metadata(graph, uri, metadata):
     # Adds metadata of a Neo object to an entity in the RDF graph `graph`.
     # `uri` is the identifier of the object in the graph, and `metadata` is
     # the dictionary of object metadata captured by Alpaca.
+    # Returns a dictionary mapping attribute/annotation names to blank node
+    # URIs, which are used later for inserting semantic information if
+    # ontology annotations are defined.
     from alpaca.serialization.converters import _ensure_type
     from alpaca.serialization.prov import _add_name_value_pair
 
+    metadata_nodes = {'attributes': {}, 'annotations': {}}
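+    # (e.g., after processing a Neo object, this may look like
+    # {'attributes': {'name': <BNode>, ...},
+    #  'annotations': {'quality': <BNode>, ...}})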
+
     for name, value in metadata.items():
 
         if name in NEO_COLLECTIONS:
@@ -73,25 +73,22 @@ def _neo_object_metadata(graph, uri, metadata):
             # readable name of each Neo object in the list. They will be
             # enclosed in brackets [] as the value stored in the
             # serialized file.
-            attr_value = "["
-            counter = 0
+                attr_values = []
                 for item in value:
-                    if counter > 0:
-                        attr_value += ", "
-                    attr_value += _neo_to_prov(item)
-                    counter += 1
-                attr_value += "]"
+                    attr_values.append(_neo_to_prov(item))
+                attr_value = f"[{', '.join(attr_values)}]"
             else:
                 # This is a container Neo object. Just get the readable
                 # name of the object
                 attr_value = _neo_to_prov(value)
 
             # Add the attribute relationship to the object Entity
-            _add_name_value_pair(graph,
-                                 uri=uri,
-                                 predicate=ALPACA.hasAttribute,
-                                 name=name,
-                                 value=attr_value)
+            blank_node = _add_name_value_pair(graph,
+                                              uri=uri,
+                                              predicate=ALPACA.hasAttribute,
+                                              name=name,
+                                              value=attr_value)
+            metadata_nodes['attributes'][name] = blank_node
 
         elif name in ('annotations', 'array_annotations') and \
                 isinstance(value, dict):
@@ -103,19 +100,23 @@ def _neo_object_metadata(graph, uri, metadata):
                 annotation_value = _ensure_type(annotation_value)
 
                 # Add the annotation relationship
-                _add_name_value_pair(graph,
-                                     uri=uri,
-                                     predicate=ALPACA.hasAnnotation,
-                                     name=annotation,
-                                     value=annotation_value)
+                blank_node = _add_name_value_pair(graph,
+                                                  uri=uri,
+                                                  predicate=ALPACA.hasAnnotation,
+                                                  name=annotation,
+                                                  value=annotation_value)
+                metadata_nodes['annotations'][annotation] = blank_node
 
         else:
             # Other attributes, just add them
             value = _ensure_type(value)
 
             # Add attribute relationship
-            _add_name_value_pair(graph,
-                                 uri=uri,
-                                 predicate=ALPACA.hasAttribute,
-                                 name=name,
-                                 value=value)
+            blank_node = _add_name_value_pair(graph,
+                                              uri=uri,
+                                              predicate=ALPACA.hasAttribute,
+                                              name=name,
+                                              value=value)
+            metadata_nodes['attributes'][name] = blank_node
+
+    return metadata_nodes
diff --git a/alpaca/serialization/prov.py b/alpaca/serialization/prov.py
index 7b4f188..5f80cf9 100644
--- a/alpaca/serialization/prov.py
+++ b/alpaca/serialization/prov.py
@@ -9,7 +9,7 @@
 
 """
 
-from itertools import product
+from itertools import product, chain
 
 import numpy as np
 import numbers
@@ -21,16 +21,19 @@
                                                file_identifier,
                                                function_identifier,
                                                script_identifier,
-                                               execution_identifier)
+                                               execution_identifier,
+                                               _get_function_name)
 from alpaca.serialization.converters import _ensure_type
 from alpaca.serialization.neo import _neo_object_metadata
 from alpaca.utils.files import _get_prov_file_format
 from alpaca.alpaca_types import DataObject, File, Container
 from alpaca.settings import _ALPACA_SETTINGS
+from alpaca.ontology.annotation import _OntologyInformation, ONTOLOGY_INFORMATION
 
 from tqdm import tqdm
 
+
 def _add_name_value_pair(graph, uri, predicate, name, value):
     # Add a relationship defined by `predicate` using a blank node as object.
     # The object will be of type `alpaca:NameValuePair`.
@@ -39,6 +42,7 @@ def _add_name_value_pair(graph, uri, predicate, name, value):
     graph.add((blank_node, RDF.type, ALPACA.NameValuePair))
     graph.add((blank_node, ALPACA.pairName, Literal(name)))
     graph.add((blank_node, ALPACA.pairValue, Literal(value)))
+    return blank_node
 
 
 class AlpacaProvDocument(object):
@@ -71,14 +75,23 @@ class AlpacaProvDocument(object):
 
     def __init__(self):
         self.graph = Graph()
-        self.graph.namespace_manager.bind('alpaca', ALPACA)
-        self.graph.namespace_manager.bind('prov', PROV)
+        namespace_manager = self.graph.namespace_manager
+        namespace_manager.bind('alpaca', ALPACA)
+        namespace_manager.bind('prov', PROV)
         self._authority = _ALPACA_SETTINGS['authority']
 
+        # Get all OntologyInformation objects generated with annotation
+        # information during the run, and update the current graph
+        # namespaces accordingly
+        _OntologyInformation.bind_namespaces(namespace_manager)
+
         # Metadata plugins are used for packages (e.g., Neo) that require
         # special handling of metadata when adding to the PROV records.
         # Plugins are external functions that take the graph, the object URI,
-        # and the metadata dict as parameters.
+        # and the metadata dict as parameters. The function should return a
+        # dictionary mapping the attribute and annotation names to the blank
+        # nodes generated for them, to allow the use of any semantic
+        # information defined by ontology annotations (i.e., the
+        # `__ontology__` attribute).
         self._metadata_plugins = {
             'neo': _neo_object_metadata
         }
@@ -87,6 +100,14 @@ def __init__(self):
         # there is a fast lookup
         self._entity_uris = set()
 
+        # Store functions that have container output ontology annotations,
+        # to add the identification to the objects after the graph is built
+        self._container_output_functions = {}
+        for obj_type, info in ONTOLOGY_INFORMATION.items():
+            container_returns = info.get_container_returns()
+            if container_returns:
+                self._container_output_functions[obj_type] = container_returns
+
     # PROV relationships methods
 
     def _wasAttributedTo(self, entity, agent):
@@ -128,14 +149,29 @@ def _add_Function(self, function_info):
                         Literal(function_info.version)))
         return uri
 
+    def _add_ontology_information(self, target_uri, ontology_info,
+                                  information_type, element=None):
+        class_info = ontology_info.get_uri(information_type, element)
+        if class_info:
+            if isinstance(class_info, list):
+                for class_uri in class_info:
+                    self.graph.add((target_uri, RDF.type, class_uri))
+            else:
+                self.graph.add((target_uri, RDF.type, class_info))
+
     def _add_FunctionExecution(self, script_info, session_id, execution_id,
                                function_info, params, execution_order,
-                               code_statement, start, end, function):
+                               code_statement, start, end, function,
+                               ontology_info=None):
         # Adds a FunctionExecution record from the Alpaca PROV model
         uri = URIRef(execution_identifier(
             script_info, function_info, session_id, execution_id,
             self._authority))
         self.graph.add((uri, RDF.type, ALPACA.FunctionExecution))
+
+        if ontology_info:
+            self._add_ontology_information(uri, ontology_info, 'function')
+
         self.graph.add((uri, PROV.startedAtTime,
                         Literal(start, datatype=XSD.dateTime)))
         self.graph.add((uri, PROV.endedAtTime,
@@ -147,8 +183,13 @@ def _add_FunctionExecution(self, script_info, session_id, execution_id,
 
         for name, value in params.items():
             value = _ensure_type(value)
-            _add_name_value_pair(self.graph, uri, ALPACA.hasParameter,
-                                 name, value)
+            parameter_node = _add_name_value_pair(self.graph, uri,
+                                                  ALPACA.hasParameter,
+                                                  name, value)
+            if ontology_info:
+                self._add_ontology_information(parameter_node,
+                                               ontology_info, 'arguments',
+                                               name)
         return uri
 
     # Entity methods
@@ -184,6 +225,7 @@ def _add_DataObjectEntity(self, info):
         if uri in self._entity_uris:
             return uri
 
+
         self.graph.add((uri, RDF.type, ALPACA.DataObjectEntity))
         self.graph.add((uri, ALPACA.hashSource,
                         Literal(info.hash_method)))
@@ -192,7 +234,11 @@ def _add_DataObjectEntity(self, info):
             self.graph.add((uri, PROV.value, Literal(info.value,
                                                      datatype=value_datatype)))
 
-        self._add_entity_metadata(uri, info)
+        ontology_info = ONTOLOGY_INFORMATION.get(info.type, None)
+        if ontology_info:
+            self._add_ontology_information(uri, ontology_info, 'data_object')
+
+        self._add_entity_metadata(uri, info, ontology_info)
 
         self._entity_uris.add(uri)
         return uri
@@ -204,7 +250,7 @@ def _add_FileEntity(self, info):
                         Literal(info.path, datatype=XSD.string)))
         return uri
 
-    def _add_entity_metadata(self, uri, info):
+    def _add_entity_metadata(self, uri, info, ontology_info=None):
         # Add data object metadata (attributes, annotations) to the entities,
         # using properties from the Alpaca PROV model
         package_name = info.type.split(".")[0]
@@ -213,17 +259,28 @@ def _add_entity_metadata(self, uri, info):
         if package_name in self._metadata_plugins:
             # Handle objects like Neo objects (i.e., to avoid dumping all the
             # information in collections such as `segments` or `events`)
-            self._metadata_plugins[package_name](self.graph, uri, metadata)
+            metadata_nodes = self._metadata_plugins[package_name](
+                self.graph, uri, metadata)
+
+            # Process the metadata nodes of the object, if ontology
+            # information is defined
+            if ontology_info:
+                for metadata_type, elements in metadata_nodes.items():
+                    for element, node in elements.items():
+                        self._add_ontology_information(node, ontology_info,
                                                        metadata_type, element)
         else:
             # Add metadata using default handling, i.e., all attributes
             for name, value in metadata.items():
                 # Make sure that types such as list and Quantity are handled
                 value = _ensure_type(value)
-                _add_name_value_pair(self.graph, uri=uri,
-                                     predicate=ALPACA.hasAttribute,
-                                     name=name,
-                                     value=value)
+                blank_node = _add_name_value_pair(self.graph, uri=uri,
+                    predicate=ALPACA.hasAttribute, name=name, value=value)
+
+                if ontology_info:
+                    self._add_ontology_information(blank_node, ontology_info,
+                                                   'attributes', name)
 
     def _add_membership(self, container, child, params):
         # Add membership relationships according to the standard PROV model
@@ -274,6 +331,10 @@ def _is_membership(function_info):
             # This is a function execution. Add Function activity
             cur_function = self._add_Function(function_info)
 
+            # ID to identify ontology annotations
+            info_id = _get_function_name(function_info)
+            ontology_info = ONTOLOGY_INFORMATION.get(info_id)
+
             # Get the FunctionExecution node with function parameters and
             # other provenance info
             cur_activity = self._add_FunctionExecution(
@@ -284,7 +345,7 @@ def _is_membership(function_info):
                 code_statement=execution.code_statement,
                 start=execution.time_stamp_start,
                 end=execution.time_stamp_end,
-                function=cur_function,
+                function=cur_function, ontology_info=ontology_info
             )
 
             # Add all the inputs as entities, and create a `used` association
@@ -293,6 +354,8 @@ def _is_membership(function_info):
             input_entities = []
             for key, value in execution.input.items():
                 cur_entities = []
+                has_input_uri = ontology_info and \
+                    bool(ontology_info.get_uri('arguments', key))
 
                 if isinstance(value, Container):
                     # If this is a Container, several objects are inside.
@@ -307,6 +370,10 @@ def _is_membership(function_info):
                     self._used(activity=cur_activity, entity=cur_entity)
                     self._wasAttributedTo(entity=cur_entity,
                                           agent=script_agent)
+                    if has_input_uri:
+                        self._add_ontology_information(cur_entity,
+                                                       ontology_info,
+                                                       'arguments', key)
 
             # Add all the outputs as entities, and create the `wasGenerated`
             # relationship.
@@ -316,6 +383,9 @@ def _is_membership(function_info):
                 output_entities.append(cur_entity)
                 self._wasGeneratedBy(entity=cur_entity, activity=cur_activity)
                 self._wasAttributedTo(entity=cur_entity, agent=script_agent)
+                if ontology_info:
+                    self._add_ontology_information(cur_entity, ontology_info,
+                                                   'returns', element=key)
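+                    # (`key` is the output index or name captured by the
+                    # decorator; '*'-style container annotations are added
+                    # later by `_add_annotations_for_container_outputs`)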
 
             # Iterate over the input/output pairs to add the `wasDerived`
             # relationship
@@ -327,6 +397,69 @@ def _is_membership(function_info):
 
             # Associate the activity to the script
             self._wasAssociatedWith(activity=cur_activity, agent=script_agent)
 
+    def _add_annotations_for_container_outputs(self):
+        # For functions where the Provenance decorator identified elements
+        # inside returned containers, the elements linked by `prov:hadMember`
+        # relationships need to be annotated. The list of functions was
+        # already stored when the document was initialized. Iterate over the
+        # nodes of each function and annotate the correct level of membership.
+
+        for info_id, levels in self._container_output_functions.items():
+
+            # Initialize a container to store the URIs of elements of each
+            # output level starting from the function. Since the capture can
+            # ignore root levels, and to avoid recursion, we will map
+            # container entities up to the maximum possible level taken from
+            # the 'returns' annotations. Later, we take the annotations
+            # starting from the deepest level.
+
+            int_levels = list(map(len, levels))
+            max_level = max(int_levels)
+            elements_by_level = {level: [] for level in range(max_level)}
+
+            # Fetch information on the function, to identify nodes in the graph
+            ontology_info = ONTOLOGY_INFORMATION[info_id]
+            function_type = ontology_info.get_uri('function')
+            executions = self.graph.subjects(RDF.type, function_type)
+
+            # For every execution, get the output nodes.
+            # This is the first level
+            for execution in executions:
+                elements_by_level[0].extend(
+                    self.graph.subjects(PROV.wasGeneratedBy, execution))
+
+            # Traverse the remaining levels
+            for level in range(1, max_level):
+                for element in chain(elements_by_level[level-1]):
+                    members = self.graph.objects(element, PROV.hadMember)
+                    elements_by_level[level].extend(members)
+
+            # Go from the deepest annotation level, annotating the deepest
+            # node level with elements
+            level_depth = max_level - 1
+            level_str = '*' * max_level
+            obj_uri = ontology_info.get_uri('returns', level_str)
+
+            while level_depth >= 0:
+                if obj_uri:
+                    has_elements = False
+                    for element in chain(elements_by_level[level_depth]):
+                        has_elements = True
+                        self.graph.add((element, RDF.type, obj_uri))
+                else:
+                    # No annotation requested for this level.
+                    # Consider the level traversed
+                    has_elements = True
+
+                if has_elements:
+                    # Fetch annotation information for the parent level
+                    level_str = '*' * (len(level_str) - 1)
+                    obj_uri = ontology_info.get_uri('returns', level_str)
+
+                # If no element was found, keep the annotation level, but
+                # try to annotate the elements of an upper node level
+                level_depth -= 1
+
     def add_history(self, script_info, session_id, history,
                     show_progress=False):
         """
@@ -352,6 +485,7 @@ def add_history(self, script_info, session_id, history,
                               disable=not show_progress):
             self._add_function_execution(execution, script_agent,
                                          script_info, session_id)
+        self._add_annotations_for_container_outputs()
 
     def read_records(self, file_name, file_format='turtle'):
         """
diff --git a/alpaca/test/test_code_analysis.py b/alpaca/test/test_code_analysis.py
index 957de08..2da810c 100644
--- a/alpaca/test/test_code_analysis.py
+++ b/alpaca/test/test_code_analysis.py
@@ -16,6
+16,7 @@ from alpaca import Provenance, activate, deactivate from alpaca.alpaca_types import FunctionInfo, DataObject +from alpaca.ontology.annotation import _OntologyInformation # Shortcut for SHA1 hashing using joblib diff --git a/alpaca/test/test_ontology_annotation.py b/alpaca/test/test_ontology_annotation.py new file mode 100644 index 0000000..0750e48 --- /dev/null +++ b/alpaca/test/test_ontology_annotation.py @@ -0,0 +1,978 @@ +import unittest +import io +from rdflib import Literal, URIRef, Namespace, Graph, RDF, PROV + +from alpaca import activate, deactivate, Provenance, save_provenance +from alpaca.ontology.annotation import (_OntologyInformation, + ONTOLOGY_INFORMATION) +from alpaca.ontology import ALPACA + +from collections import Counter + +# Ontology namespace definition used for the tests +EXAMPLE_NS = {'ontology': "http://example.org/ontology#"} + + +############################## +# Test objects to be annotated +############################## + +class InputObject: + __ontology__ = { + "data_object": "ontology:InputObject", + "namespaces": EXAMPLE_NS} + + +class OutputObject: + __ontology__ = { + "data_object": "ontology:OutputObject", + "attributes": {'name': "ontology:Attribute"}, + "namespaces": EXAMPLE_NS} + + def __init__(self, name, channel): + self.name = name + self.channel = channel + + +class InputObjectURI: + __ontology__ = {"data_object": ""} + + +####################################################### +# Test functions to be annotated and provenance tracked +####################################################### + +@Provenance(inputs=['input']) +def process(input, param_1): + return OutputObject("SpikeTrain#1", 45) + +process.__wrapped__.__ontology__ = { + "function": "ontology:ProcessFunction", + "namespaces": EXAMPLE_NS, + "arguments": {'param_1': "ontology:Parameter"}, + "returns": {0: "ontology:ProcessedData"} +} + + +@Provenance(inputs=['input']) +def process_one_and_process_two(input, param_1): + return OutputObject("SpikeTrain#1", 45) + +process_one_and_process_two.__wrapped__.__ontology__ = { + "function": ["ontology:Process1Function", "ontology:Process2Function"], + "namespaces": EXAMPLE_NS, + "arguments": {'param_1': "ontology:Parameter"}, + "returns": {0: "ontology:ProcessedData"} +} + + +@Provenance(inputs=['input']) +def process_multiple(input, param_1): + return "not_annotated", OutputObject("SpikeTrain#2", 34) + +process_multiple.__wrapped__.__ontology__ = { + "function": "ontology:ProcessFunctionMultiple", + "namespaces": EXAMPLE_NS, + "arguments": {'param_1': "ontology:Parameter"}, + "returns": {1: "ontology:ProcessedDataMultiple"} +} + + +@Provenance(inputs=[], container_output=True) +def process_container_output(): + return list(range(3)) + +process_container_output.__wrapped__.__ontology__ = { + "function": "ontology:ProcessContainerOutput", + "namespaces": EXAMPLE_NS, + "returns": {'*': "ontology:ProcessedContainerOutput"} +} + + +@Provenance(inputs=[], container_output=1) +def process_multiple_container_output(): + return [list(range(i, i + 3)) for i in range(0, 7, 3)] + +process_multiple_container_output.__wrapped__.__ontology__ = { + "function": "ontology:ProcessMultipleContainerOutput", + "namespaces": EXAMPLE_NS, + "returns": {'***': "ontology:ProcessedMultipleContainerOutput"} +} + + +@Provenance(inputs=[], container_output=1) +def process_multiple_container_output_multiple_annotations(): + return [list(range(i, i + 3)) for i in range(0, 4, 3)] + +process_multiple_container_output_multiple_annotations.__wrapped__.__ontology__ = { + 
"function": "ontology:ProcessMultipleContainerOutputMultipleAnnotations", + "namespaces": EXAMPLE_NS, + "returns": { + '**': "ontology:ProcessedMultipleContainerOutputLevel1", + '***': "ontology:ProcessedMultipleContainerOutputLevel2"} +} + + +@Provenance(inputs=[], container_output=1) +def process_multiple_container_output_multiple_annotations_root(): + return [list(range(i, i + 3)) for i in range(0, 4, 3)] + +process_multiple_container_output_multiple_annotations_root.__wrapped__.__ontology__ = { + "function": "ontology:ProcessMultipleContainerOutputMultipleAnnotationsRoot", + "namespaces": EXAMPLE_NS, + "returns": { + '*': "ontology:ProcessedMultipleContainerOutputLevel0", + '**': "ontology:ProcessedMultipleContainerOutputLevel1", + '***': "ontology:ProcessedMultipleContainerOutputLevel2"} +} + + +@Provenance(inputs=[], container_output=(1, 1)) +def process_multiple_container_output_multiple_annotations_range(): + return [list(range(i, i + 3)) for i in range(0, 4, 3)] + +process_multiple_container_output_multiple_annotations_range.__wrapped__.__ontology__ = { + "function": "ontology:ProcessMultipleContainerOutputMultipleAnnotationsRange", + "namespaces": EXAMPLE_NS, + "returns": { + '*': "ontology:ProcessedMultipleContainerOutputLevel0", + '**': "ontology:ProcessedMultipleContainerOutputLevel1", + '***': "ontology:ProcessedMultipleContainerOutputLevel2"} +} + + +@Provenance(inputs=['input']) +def process_input_annotation(input, param): + return input + 2 + +process_input_annotation.__wrapped__.__ontology__ = { + "function": "ontology:ProcessInputAnnotation", + "namespaces": EXAMPLE_NS, + "arguments": {'input': "ontology:Input", 'param': "ontology:Param"} +} + + +@Provenance(inputs=['input'], container_input=['input_list']) +def process_container_input_annotation(input, input_list, param): + return [i + input for i in input_list] + +process_container_input_annotation.__wrapped__.__ontology__ = { + "function": "ontology:ProcessContainerInputAnnotation", + "namespaces": EXAMPLE_NS, + "arguments": {'input': "ontology:Input", + 'input_list': "ontology:ContainerElementInput", + 'param': "ontology:Param"} +} + + +@Provenance(inputs=['input']) +def process_no_annotations(input): + return input + 2 + + +############ +# Unit tests +############ + +class OntologyAnnotationTestCase(unittest.TestCase): + + @classmethod + def setUpClass(cls): + # Create rdflib Namespace for tests + cls.ONTOLOGY = Namespace(EXAMPLE_NS['ontology']) + + def setUp(self): + _OntologyInformation.namespaces.clear() + ONTOLOGY_INFORMATION.clear() + + def test_redefine_namespaces(self): + obj = InputObject() + self.assertDictEqual(_OntologyInformation.namespaces, {}) + info = _OntologyInformation(obj) + + # At this point, the class should be updated with the 'ontology' + # namespace + with self.assertRaises(ValueError): + info.add_namespace('ontology', "http://purl.org/ontology") + + info.add_namespace('purl_ontology', "http://purl.org/ontology") + self.assertEqual(len(info.namespaces), 2) + + self.assertEqual(info.namespaces['ontology'], self.ONTOLOGY) + self.assertEqual(info.namespaces['purl_ontology'], + Namespace("http://purl.org/ontology")) + + def test_annotation_object_input_uri(self): + obj = InputObjectURI() + self.assertIsNotNone( + _OntologyInformation.get_ontology_information(obj)) + info = _OntologyInformation(obj) + self.assertEqual( + info.get_uri("data_object"), + URIRef("http://purl.org/ontology#InputObject")) + + # Namespaces included in representation as this is a class attribute + self.assertEqual( + 
str(info), + "OntologyInformation(data_object='" + "', namespaces={})" + ) + + def test_annotation_object_input(self): + obj = InputObject() + self.assertIsNotNone( + _OntologyInformation.get_ontology_information(obj)) + info = _OntologyInformation(obj) + self.assertEqual( + info.get_uri("data_object"), + URIRef("http://example.org/ontology#InputObject")) + self.assertEqual( + str(info), + "OntologyInformation(data_object='ontology:InputObject', " + f"namespaces={{'ontology': {repr(self.ONTOLOGY)}}})" + ) + + def test_annotation_object_output(self): + obj = OutputObject("test", 45) + self.assertIsNotNone( + _OntologyInformation.get_ontology_information(obj)) + info = _OntologyInformation(obj) + self.assertEqual( + info.get_uri("data_object"), + URIRef("http://example.org/ontology#OutputObject")) + self.assertEqual( + info.get_uri("attributes", "name"), + URIRef("http://example.org/ontology#Attribute")) + self.assertEqual( + str(info), + "OntologyInformation(data_object='ontology:OutputObject', " + "attributes={'name': 'ontology:Attribute'}, " + f"namespaces={{'ontology': {repr(self.ONTOLOGY)}}})" + ) + + def test_annotation_function(self): + self.assertIsNotNone( + _OntologyInformation.get_ontology_information(process)) + info = _OntologyInformation(process) + self.assertEqual( + info.get_uri("function"), + URIRef("http://example.org/ontology#ProcessFunction")) + self.assertEqual( + info.get_uri("arguments", "param_1"), + URIRef("http://example.org/ontology#Parameter")) + self.assertEqual( + info.get_uri("returns", 0), + URIRef("http://example.org/ontology#ProcessedData")) + self.assertEqual( + str(info), + "OntologyInformation(function='ontology:ProcessFunction', " + "arguments={'param_1': 'ontology:Parameter'}, " + f"namespaces={{'ontology': {repr(self.ONTOLOGY)}}}, " + "returns={0: 'ontology:ProcessedData'})" + ) + + def test_annotation_function_multiple_annotations(self): + self.assertIsNotNone( + _OntologyInformation.get_ontology_information( + process_one_and_process_two)) + info = _OntologyInformation(process_one_and_process_two) + self.assertListEqual( + info.get_uri("function"), + [URIRef("http://example.org/ontology#Process1Function"), + URIRef("http://example.org/ontology#Process2Function")]) + self.assertEqual( + info.get_uri("arguments", "param_1"), + URIRef("http://example.org/ontology#Parameter")) + self.assertEqual( + info.get_uri("returns", 0), + URIRef("http://example.org/ontology#ProcessedData")) + self.assertEqual( + str(info), + "OntologyInformation(function='['ontology:Process1Function', " + "'ontology:Process2Function']', " + "arguments={'param_1': 'ontology:Parameter'}, " + f"namespaces={{'ontology': {repr(self.ONTOLOGY)}}}, " + "returns={0: 'ontology:ProcessedData'})") + + def test_annotation_function_multiple(self): + self.assertIsNotNone( + _OntologyInformation.get_ontology_information(process_multiple)) + info = _OntologyInformation(process_multiple) + self.assertEqual( + info.get_uri("function"), + URIRef("http://example.org/ontology#ProcessFunctionMultiple")) + self.assertEqual( + info.get_uri("arguments", "param_1"), + URIRef("http://example.org/ontology#Parameter")) + self.assertEqual( + info.get_uri("returns", 1), + URIRef("http://example.org/ontology#ProcessedDataMultiple")) + self.assertEqual( + str(info), + "OntologyInformation(function='ontology:ProcessFunctionMultiple', " + "arguments={'param_1': 'ontology:Parameter'}, " + f"namespaces={{'ontology': {repr(self.ONTOLOGY)}}}, " + "returns={1: 'ontology:ProcessedDataMultiple'})" + ) + + def 
test_invalid_object_annotations(self): + obj = InputObject() + info = _OntologyInformation(obj) + self.assertIsNone(info.get_uri("attributes", "name")) + self.assertIsNone(info.get_uri("attributes", "channel")) + self.assertIsNone(info.get_uri("non_existent")) + self.assertIsNone(info.get_uri("non_existent", "test")) + + output_obj = OutputObject("test", 45) + output_info = _OntologyInformation(output_obj) + self.assertIsNotNone(output_info.get_uri("attributes", "name")) + self.assertIsNone(output_info.get_uri("attributes", "channel")) + self.assertIsNone(output_info.get_uri("non_existent")) + self.assertIsNone(output_info.get_uri("non_existent", "test")) + + def test_namespaces(self): + input_obj = InputObject() + output_obj = OutputObject("test", 45) + + input_info = _OntologyInformation(input_obj) + output_info = _OntologyInformation(output_obj) + function_info = _OntologyInformation(process) + + for info in (input_info, output_info, function_info): + self.assertEqual(info.namespaces['ontology'], self.ONTOLOGY) + self.assertTupleEqual(tuple(info.namespaces.keys()), ('ontology',)) + + def test_provenance_no_annotation(self): + activate(clear=True) + result = process_no_annotations(5) + deactivate() + + prov_data = save_provenance() + + # Read PROV information as RDF + prov_graph = Graph() + with io.StringIO(prov_data) as data_stream: + prov_graph.parse(data_stream, format='turtle') + + # Check that no other annotations are present + execution_uri = list( + prov_graph.subjects(RDF.type, ALPACA.FunctionExecution))[0] + types = list(prov_graph.objects(execution_uri, RDF.type)) + self.assertListEqual(types, [ALPACA.FunctionExecution]) + + def test_provenance_annotation(self): + activate(clear=True) + input_object = InputObject() + output_object = process(input_object, 34) + deactivate() + + prov_data = save_provenance() + + # Read PROV information as RDF + prov_graph = Graph() + with io.StringIO(prov_data) as data_stream: + prov_graph.parse(data_stream, format='turtle') + + # Check that the annotations exist (1 per class is expected) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.Parameter))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessFunction))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessedData))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.InputObject))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.OutputObject))) + ), 1) + + # FunctionExecution is ProcessFunction + execution_uri = list( + prov_graph.subjects(RDF.type, ALPACA.FunctionExecution))[0] + self.assertTrue((execution_uri, + RDF.type, + self.ONTOLOGY.ProcessFunction) in prov_graph) + + # Check parameter name + parameter_node = list( + prov_graph.subjects(RDF.type, self.ONTOLOGY.Parameter))[0] + self.assertTrue((parameter_node, + ALPACA.pairName, Literal("param_1")) in prov_graph) + self.assertTrue((parameter_node, + ALPACA.pairValue, Literal(34)) in prov_graph) + + # Check returned value + output_node = list( + prov_graph.subjects(RDF.type, self.ONTOLOGY.ProcessedData))[0] + self.assertTrue((output_node, + PROV.wasGeneratedBy, execution_uri) in prov_graph) + self.assertTrue((output_node, + RDF.type, ALPACA.DataObjectEntity) in prov_graph) + self.assertTrue((output_node, + RDF.type, self.ONTOLOGY.OutputObject) in prov_graph) + + # Check attributes of returned value + expected_attributes = { + 
'name': "SpikeTrain#1", + 'channel': 45, + } + for attribute in prov_graph.objects(output_node, ALPACA.hasAttribute): + name = prov_graph.value(attribute, ALPACA.pairName).toPython() + value = prov_graph.value(attribute, ALPACA.pairValue).toPython() + self.assertEqual(value, expected_attributes[name]) + + # Check if attribute annotation is present for `name` + if name == 'name': + self.assertTrue((attribute, RDF.type, self.ONTOLOGY.Attribute) + in prov_graph) + + # Check input value + input_node = list( + prov_graph.subjects(RDF.type, self.ONTOLOGY.InputObject))[0] + self.assertTrue((execution_uri, PROV.used, input_node) in prov_graph) + self.assertTrue((input_node, + RDF.type, ALPACA.DataObjectEntity) in prov_graph) + self.assertTrue((output_node, + PROV.wasDerivedFrom, input_node) in prov_graph) + + def test_provenance_multiple_annotations(self): + activate(clear=True) + input_object = InputObject() + output_object = process_one_and_process_two(input_object, 34) + deactivate() + + prov_data = save_provenance() + + # Read PROV information as RDF + prov_graph = Graph() + with io.StringIO(prov_data) as data_stream: + prov_graph.parse(data_stream, format='turtle') + + # Check that the annotations exist (1 per class is expected) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.Parameter))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.Process1Function))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.Process2Function))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessedData))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.InputObject))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.OutputObject))) + ), 1) + + # FunctionExecution is ProcessFunction + execution_uri = list( + prov_graph.subjects(RDF.type, ALPACA.FunctionExecution))[0] + self.assertTrue((execution_uri, + RDF.type, + self.ONTOLOGY.Process1Function) in prov_graph) + + self.assertTrue((execution_uri, + RDF.type, + self.ONTOLOGY.Process2Function) in prov_graph) + + # Check parameter name + parameter_node = list( + prov_graph.subjects(RDF.type, self.ONTOLOGY.Parameter))[0] + self.assertTrue((parameter_node, + ALPACA.pairName, Literal("param_1")) in prov_graph) + self.assertTrue((parameter_node, + ALPACA.pairValue, Literal(34)) in prov_graph) + + # Check returned value + output_node = list( + prov_graph.subjects(RDF.type, self.ONTOLOGY.ProcessedData))[0] + self.assertTrue((output_node, + PROV.wasGeneratedBy, execution_uri) in prov_graph) + self.assertTrue((output_node, + RDF.type, ALPACA.DataObjectEntity) in prov_graph) + self.assertTrue((output_node, + RDF.type, self.ONTOLOGY.OutputObject) in prov_graph) + + # Check attributes of returned value + expected_attributes = { + 'name': "SpikeTrain#1", + 'channel': 45, + } + for attribute in prov_graph.objects(output_node, ALPACA.hasAttribute): + name = prov_graph.value(attribute, ALPACA.pairName).toPython() + value = prov_graph.value(attribute, ALPACA.pairValue).toPython() + self.assertEqual(value, expected_attributes[name]) + + # Check if attribute annotation is present for `name` + if name == 'name': + self.assertTrue((attribute, RDF.type, self.ONTOLOGY.Attribute) + in prov_graph) + + # Check input value + input_node = list( + prov_graph.subjects(RDF.type, self.ONTOLOGY.InputObject))[0] + self.assertTrue((execution_uri, 
PROV.used, input_node) in prov_graph) + self.assertTrue((input_node, + RDF.type, ALPACA.DataObjectEntity) in prov_graph) + self.assertTrue((output_node, + PROV.wasDerivedFrom, input_node) in prov_graph) + + def test_provenance_annotation_multiple_returns(self): + activate(clear=True) + input_object = InputObject() + name, output_object = process_multiple(input_object, 45) + deactivate() + + prov_data = save_provenance() + + # Read PROV information as RDF + prov_graph = Graph() + with io.StringIO(prov_data) as data_stream: + prov_graph.parse(data_stream, format='turtle') + + # Check that the annotations exist (1 per class is expected. None + # are expected for the classes of `process`) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.Parameter))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessFunctionMultiple))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessedDataMultiple))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.InputObject))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.OutputObject))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessFunction))) + ), 0) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessedData))) + ), 0) + + # FunctionExecution is ProcessFunctionMultiple + execution_uri = list( + prov_graph.subjects(RDF.type, ALPACA.FunctionExecution))[0] + self.assertTrue((execution_uri, RDF.type, + self.ONTOLOGY.ProcessFunctionMultiple) in prov_graph) + + # Check parameter name + parameter_node = list( + prov_graph.subjects(RDF.type, self.ONTOLOGY.Parameter))[0] + self.assertTrue((parameter_node, + ALPACA.pairName, Literal("param_1")) in prov_graph) + self.assertTrue((parameter_node, + ALPACA.pairValue, Literal(45)) in prov_graph) + + # Check returned value + output_node = list( + prov_graph.subjects(RDF.type, + self.ONTOLOGY.ProcessedDataMultiple))[0] + self.assertTrue((output_node, + PROV.wasGeneratedBy, execution_uri) in prov_graph) + self.assertTrue((output_node, + RDF.type, ALPACA.DataObjectEntity) in prov_graph) + self.assertTrue((output_node, + RDF.type, self.ONTOLOGY.OutputObject) in prov_graph) + + # Check attributes of returned value + expected_attributes = { + 'name': "SpikeTrain#2", + 'channel': 34, + } + for attribute in prov_graph.objects(output_node, ALPACA.hasAttribute): + name = prov_graph.value(attribute, ALPACA.pairName).toPython() + value = prov_graph.value(attribute, ALPACA.pairValue).toPython() + self.assertEqual(value, expected_attributes[name]) + + # Check if attribute annotation is present for `name` + if name == 'name': + self.assertTrue((attribute, RDF.type, self.ONTOLOGY.Attribute) + in prov_graph) + + # Check input value + input_node = list( + prov_graph.subjects(RDF.type, self.ONTOLOGY.InputObject))[0] + self.assertTrue((execution_uri, + PROV.used, input_node) in prov_graph) + self.assertTrue((input_node, + RDF.type, ALPACA.DataObjectEntity) in prov_graph) + self.assertTrue((output_node, + PROV.wasDerivedFrom, input_node) in prov_graph) + + def test_provenance_annotation_container_output(self): + activate(clear=True) + container = process_container_output() + deactivate() + + prov_data = save_provenance() + + # Read PROV information as RDF + prov_graph = Graph() + with io.StringIO(prov_data) as data_stream: + 
prov_graph.parse(data_stream, format='turtle') + + # Check that the annotations exist (1 per class is expected. None + # are expected for the classes of `process`) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessContainerOutput))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessedContainerOutput))) + ), 3) + + # FunctionExecution is ProcessContainerOutput + execution_uri = list( + prov_graph.subjects(RDF.type, ALPACA.FunctionExecution))[0] + self.assertTrue((execution_uri, RDF.type, + self.ONTOLOGY.ProcessContainerOutput) in prov_graph) + + # Check returned values + output_nodes = prov_graph.subjects(RDF.type, + self.ONTOLOGY.ProcessedContainerOutput) + for output_node in output_nodes: + self.assertTrue((output_node, + PROV.wasGeneratedBy, execution_uri) in prov_graph) + self.assertTrue((output_node, + RDF.type, ALPACA.DataObjectEntity) in prov_graph) + self.assertTrue((output_node, + RDF.type, self.ONTOLOGY.ProcessedContainerOutput) + in prov_graph) + + def test_provenance_annotation_container_multiple_output(self): + activate(clear=True) + container = process_multiple_container_output() + deactivate() + + prov_data = save_provenance() + + # Read PROV information as RDF + prov_graph = Graph() + with io.StringIO(prov_data) as data_stream: + prov_graph.parse(data_stream, format='turtle') + + # Check that the annotations exist + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessMultipleContainerOutput))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutput))) + ), 9) + + # FunctionExecution is ProcessMultipleContainerOutput + execution_uri = list( + prov_graph.subjects(RDF.type, ALPACA.FunctionExecution))[0] + self.assertTrue((execution_uri, RDF.type, + self.ONTOLOGY.ProcessMultipleContainerOutput) in prov_graph) + + # Check returned values + output_nodes = prov_graph.subjects(RDF.type, + self.ONTOLOGY.ProcessedMultipleContainerOutput) + for output_node in output_nodes: + self.assertTrue((None, PROV.hadMember, output_node) in prov_graph) + self.assertTrue((output_node, + RDF.type, ALPACA.DataObjectEntity) in prov_graph) + self.assertTrue((output_node, + RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutput) + in prov_graph) + members = list(prov_graph.objects(output_node, PROV.hadMember)) + self.assertEqual(len(members), 0) + + def test_provenance_annotation_container_multiple_output_multiple_annotations(self): + activate(clear=True) + container = process_multiple_container_output_multiple_annotations() + deactivate() + + prov_data = save_provenance() + + # Read PROV information as RDF + prov_graph = Graph() + with io.StringIO(prov_data) as data_stream: + prov_graph.parse(data_stream, format='turtle') + + # Check that the annotations exist + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessMultipleContainerOutputMultipleAnnotations))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutputLevel2))) + ), 6) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutputLevel1))) + ), 2) + + # FunctionExecution is ProcessMultipleContainerOutputMultipleAnnotations + execution_uri = list( + prov_graph.subjects(RDF.type, ALPACA.FunctionExecution))[0] + self.assertTrue((execution_uri, RDF.type, + 
self.ONTOLOGY.ProcessMultipleContainerOutputMultipleAnnotations) in prov_graph) + + # Check returned values + output_nodes = prov_graph.subjects(RDF.type, + self.ONTOLOGY.ProcessedMultipleContainerOutputLevel1) + for output_node in output_nodes: + self.assertTrue((None, PROV.hadMember, output_node) in prov_graph) + self.assertTrue((output_node, + RDF.type, ALPACA.DataObjectEntity) in prov_graph) + self.assertTrue((output_node, + RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutputLevel1) + in prov_graph) + members = list(prov_graph.objects(output_node, PROV.hadMember)) + self.assertEqual(len(members), 3) + for element in prov_graph.objects(output_node, PROV.hadMember): + self.assertTrue( + (element, RDF.type, ALPACA.DataObjectEntity) in prov_graph) + self.assertTrue( + (element, RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutputLevel2) in prov_graph) + members = list(prov_graph.objects(element, PROV.hadMember)) + self.assertEqual(len(members), 0) + + def test_provenance_annotation_container_multiple_output_multiple_annotations_root(self): + activate(clear=True) + container = process_multiple_container_output_multiple_annotations_root() + deactivate() + + prov_data = save_provenance() + + # Read PROV information as RDF + prov_graph = Graph() + with io.StringIO(prov_data) as data_stream: + prov_graph.parse(data_stream, format='turtle') + + # Check that the annotations exist + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessMultipleContainerOutputMultipleAnnotationsRoot))) + ), 1) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutputLevel2))) + ), 6) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutputLevel1))) + ), 2) + self.assertEqual( + len(list(prov_graph.triples( + (None, RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutputLevel0))) + ), 1) + + # FunctionExecution is ProcessMultipleContainerOutputMultipleAnnotationsRoot + execution_uri = list( + prov_graph.subjects(RDF.type, ALPACA.FunctionExecution))[0] + self.assertTrue((execution_uri, RDF.type, + self.ONTOLOGY.ProcessMultipleContainerOutputMultipleAnnotationsRoot) in prov_graph) + + # Check returned values + output_nodes = prov_graph.subjects(RDF.type, + self.ONTOLOGY.ProcessedMultipleContainerOutputLevel0) + for output_level0 in output_nodes: + self.assertTrue((output_level0, PROV.wasGeneratedBy, execution_uri) in prov_graph) + self.assertTrue((output_level0, + RDF.type, ALPACA.DataObjectEntity) in prov_graph) + self.assertTrue((output_level0, + RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutputLevel0) + in prov_graph) + members = list(prov_graph.objects(output_level0, PROV.hadMember)) + self.assertEqual(len(members), 2) + for output_level1 in prov_graph.objects(output_level0, PROV.hadMember): + self.assertTrue( + (output_level1, RDF.type, ALPACA.DataObjectEntity) in prov_graph) + self.assertTrue( + (output_level1, RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutputLevel1) in prov_graph) + members = list(prov_graph.objects(output_level1, PROV.hadMember)) + self.assertEqual(len(members), 3) + for output_level2 in prov_graph.objects(output_level1, PROV.hadMember): + self.assertTrue( + (output_level2, RDF.type, ALPACA.DataObjectEntity) in prov_graph) + self.assertTrue( + (output_level2, RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutputLevel2) in prov_graph) + members = list(prov_graph.objects(output_level2, PROV.hadMember)) + 
+                    self.assertEqual(len(members), 0)
+
+    def test_provenance_annotation_container_multiple_output_multiple_annotations_range(self):
+        activate(clear=True)
+        container = process_multiple_container_output_multiple_annotations_range()
+        deactivate()
+
+        prov_data = save_provenance()
+
+        # Read PROV information as RDF
+        prov_graph = Graph()
+        with io.StringIO(prov_data) as data_stream:
+            prov_graph.parse(data_stream, format='turtle')
+
+        # Check that the annotations exist
+        self.assertEqual(
+            len(list(prov_graph.triples(
+                (None, RDF.type, self.ONTOLOGY.ProcessMultipleContainerOutputMultipleAnnotationsRange)))
+            ), 1)
+        self.assertEqual(
+            len(list(prov_graph.triples(
+                (None, RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutputLevel2)))
+            ), 6)
+        self.assertEqual(
+            len(list(prov_graph.triples(
+                (None, RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutputLevel1)))
+            ), 2)
+        self.assertEqual(
+            len(list(prov_graph.triples(
+                (None, RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutputLevel0)))
+            ), 0)
+
+        # FunctionExecution is ProcessMultipleContainerOutputMultipleAnnotationsRange
+        execution_uri = list(
+            prov_graph.subjects(RDF.type, ALPACA.FunctionExecution))[0]
+        self.assertTrue((execution_uri, RDF.type,
+                         self.ONTOLOGY.ProcessMultipleContainerOutputMultipleAnnotationsRange) in prov_graph)
+
+        # Check returned values
+        # The root level (Level 0) was skipped, so only the classes for
+        # Levels 1 and 2 should be present
+        output_nodes = prov_graph.subjects(RDF.type,
+                                           self.ONTOLOGY.ProcessedMultipleContainerOutputLevel1)
+        for output_level1 in output_nodes:
+            self.assertTrue((output_level1, PROV.wasGeneratedBy, execution_uri) in prov_graph)
+            self.assertTrue((output_level1,
+                             RDF.type, ALPACA.DataObjectEntity) in prov_graph)
+            self.assertTrue((output_level1,
+                             RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutputLevel1)
+                            in prov_graph)
+            members = list(prov_graph.objects(output_level1, PROV.hadMember))
+            self.assertEqual(len(members), 3)
+            for output_level2 in prov_graph.objects(output_level1, PROV.hadMember):
+                self.assertTrue(
+                    (output_level2, RDF.type, ALPACA.DataObjectEntity) in prov_graph)
+                self.assertTrue(
+                    (output_level2, RDF.type, self.ONTOLOGY.ProcessedMultipleContainerOutputLevel2) in prov_graph)
+                members = list(prov_graph.objects(output_level2, PROV.hadMember))
+                self.assertEqual(len(members), 0)
+
+    def test_provenance_annotation_input(self):
+        activate(clear=True)
+        result = process_input_annotation(5, 6)
+        deactivate()
+
+        prov_data = save_provenance()
+
+        # Read PROV information as RDF
+        prov_graph = Graph()
+        with io.StringIO(prov_data) as data_stream:
+            prov_graph.parse(data_stream, format='turtle')
+
+        # Check that the annotations exist
+        self.assertEqual(
+            len(list(prov_graph.triples(
+                (None, RDF.type, self.ONTOLOGY.ProcessInputAnnotation)))
+            ), 1)
+        self.assertEqual(
+            len(list(prov_graph.triples(
+                (None, RDF.type, self.ONTOLOGY.Input)))
+            ), 1)
+        self.assertEqual(
+            len(list(prov_graph.triples(
+                (None, RDF.type, self.ONTOLOGY.Param)))
+            ), 1)
+
+        # FunctionExecution is ProcessInputAnnotation
+        execution_uri = list(
+            prov_graph.subjects(RDF.type, ALPACA.FunctionExecution))[0]
+        self.assertTrue((execution_uri, RDF.type,
+                         self.ONTOLOGY.ProcessInputAnnotation) in prov_graph)
+
+        # Check input values
+        input_nodes = prov_graph.objects(execution_uri, PROV.used)
+        for input_node in input_nodes:
+            self.assertTrue((execution_uri, PROV.used, input_node) in prov_graph)
+            self.assertTrue((input_node,
+                             RDF.type, ALPACA.DataObjectEntity) in prov_graph)
+            self.assertTrue((input_node,
+                             RDF.type, self.ONTOLOGY.Input) in prov_graph)
+
+    def test_provenance_annotation_container_input(self):
+        activate(clear=True)
+        input_list = [20, 30, 40]
+        result = process_container_input_annotation(5, input_list, 6)
+        deactivate()
+
+        prov_data = save_provenance()
+
+        # Read PROV information as RDF
+        prov_graph = Graph()
+        with io.StringIO(prov_data) as data_stream:
+            prov_graph.parse(data_stream, format='turtle')
+
+        # Check that the annotations exist
+        self.assertEqual(
+            len(list(prov_graph.triples(
+                (None, RDF.type, self.ONTOLOGY.ProcessContainerInputAnnotation)))
+            ), 1)
+        self.assertEqual(
+            len(list(prov_graph.triples(
+                (None, RDF.type, self.ONTOLOGY.Input)))
+            ), 1)
+        self.assertEqual(
+            len(list(prov_graph.triples(
+                (None, RDF.type, self.ONTOLOGY.ContainerElementInput)))
+            ), 3)
+        self.assertEqual(
+            len(list(prov_graph.triples(
+                (None, RDF.type, self.ONTOLOGY.Param)))
+            ), 1)
+
+        # FunctionExecution is ProcessContainerInputAnnotation
+        execution_uri = list(
+            prov_graph.subjects(RDF.type, ALPACA.FunctionExecution))[0]
+        self.assertTrue((execution_uri, RDF.type,
+                         self.ONTOLOGY.ProcessContainerInputAnnotation) in prov_graph)
+
+        # Check input values have the expected number of classes
+        input_nodes = prov_graph.objects(execution_uri, PROV.used)
+        input_node_count = Counter()
+        for input_node in input_nodes:
+            self.assertTrue((execution_uri, PROV.used, input_node) in prov_graph)
+            for input_node_type in prov_graph.objects(input_node, RDF.type):
+                input_node_count[input_node_type] += 1
+
+        self.assertEqual(input_node_count[ALPACA.DataObjectEntity], 4)
+        self.assertEqual(input_node_count[self.ONTOLOGY.Input], 1)
+        self.assertEqual(input_node_count[self.ONTOLOGY.ContainerElementInput], 3)
diff --git a/doc/api.rst b/doc/api.rst
index 5609d8e..4ef9d8b 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -9,5 +9,6 @@ API Reference
    reference/decorator
    reference/graph
    reference/serialization
+   reference/ontologies
    reference/utility
    reference/settings
diff --git a/doc/reference/ontologies.rst b/doc/reference/ontologies.rst
new file mode 100644
index 0000000..23ae541
--- /dev/null
+++ b/doc/reference/ontologies.rst
@@ -0,0 +1,19 @@
+**************************************************************
+Annotate provenance with semantic information using ontologies
+**************************************************************
+
+.. automodule:: alpaca.ontology.annotation
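+
+As a minimal sketch, a function tracked by Alpaca can be annotated by
+attaching the special ``__ontology__`` dictionary. The URI below is a
+placeholder rather than a class from a real ontology:
+
+.. code-block:: python
+
+    def process(data):
+        return data
+
+    # Hypothetical ontology class URI describing the function
+    process.__ontology__ = {
+        "function": "<http://example.org/ontology#ProcessFunction>"
+    }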