From 404be3b6a8e147208fd8836585e9f7cd52b0886a Mon Sep 17 00:00:00 2001 From: David Habgood Date: Mon, 26 Aug 2024 16:38:34 +1000 Subject: [PATCH] Implement RDF Patch serializer (#2877) * Implement RDF Patch serializer. Supports serialization from Dataset instances only; triples and quads within a Dataset are supported. * Add examples for Patch serialization. * Remove unnecessary use of addN -> add * Handle RDFLib graph/dataset addition quirks. Should fix static analysis issues. * Fix mypy errors * Attempt to fix failing test. * Add Patch documentation + docstring --------- Co-authored-by: Ashley Sommer --- docs/plugin_serializers.rst | 6 + examples/patch_serializer_example.py | 64 +++++++ rdflib/plugin.py | 6 + rdflib/plugins/serializers/patch.py | 105 +++++++++++ .../test_serializers/test_serializer_patch.py | 178 ++++++++++++++++++ 5 files changed, 359 insertions(+) create mode 100644 examples/patch_serializer_example.py create mode 100644 rdflib/plugins/serializers/patch.py create mode 100644 test/test_serializers/test_serializer_patch.py diff --git a/docs/plugin_serializers.rst b/docs/plugin_serializers.rst index 39d00df7f..3721bb9f8 100644 --- a/docs/plugin_serializers.rst +++ b/docs/plugin_serializers.rst @@ -21,6 +21,7 @@ n3 :class:`~rdflib.plugins.serializers.n3.N3Serializer` nquads :class:`~rdflib.plugins.serializers.nquads.NQuadsSerializer` nt :class:`~rdflib.plugins.serializers.nt.NTSerializer` hext :class:`~rdflib.plugins.serializers.hext.HextuplesSerializer` +patch :class:`~rdflib.plugins.serializers.patch.PatchSerializer` pretty-xml :class:`~rdflib.plugins.serializers.rdfxml.PrettyXMLSerializer` trig :class:`~rdflib.plugins.serializers.trig.TrigSerializer` trix :class:`~rdflib.plugins.serializers.trix.TriXSerializer` @@ -34,6 +35,11 @@ JSON-LD ------- JSON-LD - 'json-ld' - has been incorporated into RDFLib since v6.0.0. +RDF Patch +--------- + +The RDF Patch Serializer - 'patch' - uses the RDF Patch format defined at https://afs.github.io/rdf-patch/. 
It supports serializing context-aware stores as either addition or deletion patches, and also supports serializing the difference between two context-aware stores as a Patch of additions and deletions.
+
 HexTuples
 ---------
 The HexTuples Serializer - 'hext' - uses the HexTuples format defined at https://github.com/ontola/hextuples.
diff --git a/examples/patch_serializer_example.py b/examples/patch_serializer_example.py
new file mode 100644
index 000000000..748124bc5
--- /dev/null
+++ b/examples/patch_serializer_example.py
@@ -0,0 +1,65 @@
+from rdflib import Dataset, Graph, Literal, URIRef
+
+
+def main():
+    """Print example add and diff patches from the 'patch' serializer."""
+    # example for adding a quad
+    ds = Dataset()
+    g = Graph(identifier=URIRef("http://graph-a"))
+    ds.add_graph(g)
+    triple = (URIRef("http://subj-a"), URIRef("http://pred-a"), Literal("obj-a"))
+    ds.get_context(g.identifier).add(triple)
+    result = ds.serialize(format="patch", operation="add")
+    print("Add Quad Patch:")
+    print(result)
+
+    # alternate example for adding a quad
+    ds = Dataset()
+    quad = (
+        URIRef("http://subj-a"),
+        URIRef("http://pred-a"),
+        Literal("obj-a"),
+        Graph(identifier=URIRef("http://graph-a")),
+    )
+    ds.add(quad)
+    result = ds.serialize(format="patch", operation="add")
+    print("Add Quad Patch:")
+    print(result)
+
+    # example for adding a triple
+    ds = Dataset()
+    ds.add(triple)
+    result = ds.serialize(format="patch", operation="add")
+    print("\nAdd Triple Patch:")
+    print(result)
+
+    # Example for diff quads
+    quad_1 = (
+        URIRef("http://subj-a"),
+        URIRef("http://pred-a"),
+        Literal("obj-a"),
+        Graph(identifier=URIRef("http://graph-a")),
+    )
+    quad_2 = (
+        URIRef("http://subj-b"),
+        URIRef("http://pred-b"),
+        Literal("obj-b"),
+        Graph(identifier=URIRef("http://graph-b")),
+    )
+    quad_3 = (
+        URIRef("http://subj-c"),
+        URIRef("http://pred-c"),
+        Literal("obj-c"),
+        Graph(identifier=URIRef("http://graph-c")),
+    )
+    ds1 = Dataset()
+    ds2 = Dataset()
+    ds1.addN([quad_1, quad_2])
+    ds2.addN([quad_2, quad_3])
+    result = ds1.serialize(format="patch", target=ds2)
+    print("Diff Quad Patch:")
+    print(result)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/rdflib/plugin.py b/rdflib/plugin.py
index 82b46ad86..23699e68d 100644
--- a/rdflib/plugin.py
+++ b/rdflib/plugin.py
@@ -363,6 +363,12 @@ def plugins(
     "rdflib.plugins.serializers.hext",
     "HextuplesSerializer",
 )
+register(
+    "patch",
+    Serializer,
+    "rdflib.plugins.serializers.patch",
+    "PatchSerializer",
+)
 
 # Register Triple Parsers
 register(
diff --git a/rdflib/plugins/serializers/patch.py b/rdflib/plugins/serializers/patch.py
new file mode 100644
index 000000000..f548cbe3d
--- /dev/null
+++ b/rdflib/plugins/serializers/patch.py
@@ -0,0 +1,117 @@
+from __future__ import annotations
+
+import warnings
+from typing import IO, Optional
+from uuid import uuid4
+
+from rdflib import Dataset
+from rdflib.plugins.serializers.nquads import _nq_row
+from rdflib.plugins.serializers.nt import _nt_row
+from rdflib.serializer import Serializer
+
+# Maps the ``operation`` keyword argument to its RDF Patch row code.
+add_remove_methods = {"add": "A", "remove": "D"}
+
+
+class PatchSerializer(Serializer):
+    """
+    Creates an RDF patch file to add and remove triples/quads.
+    Can either:
+    - Create an add or delete patch for a single Dataset.
+    - Create a patch to represent the difference between two Datasets.
+    """
+
+    def __init__(
+        self,
+        store: Dataset,
+    ):
+        self.store: Dataset = store
+        super().__init__(store)
+
+    def serialize(
+        self,
+        stream: IO[bytes],
+        base: Optional[str] = None,
+        encoding: Optional[str] = None,
+        **kwargs,
+    ) -> None:
+        """
+        Serialize the store to the given stream as one RDF Patch transaction.
+
+        :param stream: The stream to serialize to.
+        :param base: Not supported; a warning is emitted if it is given.
+        :param encoding: Not configurable; a warning is emitted if it differs
+            from the serializer's own encoding, which is always used.
+        :param kwargs: Additional keyword arguments.
+        Supported keyword arguments:
+        - operation: The operation to perform. Either 'add' or 'remove'.
+        - target: The target Dataset to compare against.
+        NB: Only one of 'operation' or 'target' should be provided. If both
+        are given, 'operation' is used; if neither is, no rows are written.
+        - header_id: The header ID to use. Defaults to a random ``uuid:`` ID.
+        - header_prev: The previous header ID to use.
+        :raises ValueError: If 'operation' is not 'add' or 'remove'.
+        """
+        operation = kwargs.get("operation")
+        target = kwargs.get("target")
+        header_id = kwargs.get("header_id")
+        header_prev = kwargs.get("header_prev")
+        if not header_id:
+            header_id = f"uuid:{uuid4()}"
+        if base is not None:
+            warnings.warn("PatchSerializer does not support base.")
+        # Check the requested encoding *before* overriding it; doing it the
+        # other way round compares the value with itself and never warns.
+        if encoding is not None and encoding.lower() != self.encoding.lower():
+            warnings.warn(
+                "PatchSerializer does not use custom encoding. "
+                f"Given encoding was: {encoding}"
+            )
+        encoding = self.encoding
+        # Raise instead of ``assert`` so the check survives ``python -O``.
+        if operation and operation not in add_remove_methods:
+            raise ValueError(f"Invalid operation: {operation}")
+
+        def write(text):
+            stream.write(text.encode(encoding, "replace"))
+
+        def write_header():
+            write(f"H id <{header_id}> .\n")
+            if header_prev:
+                # Terminate with " ." as the H id, TX and TC rows are.
+                write(f"H prev <{header_prev}> .\n")
+            write("TX .\n")
+
+        def write_triples(contexts, op_code, use_passed_contexts=False):
+            for context in contexts:
+                if not use_passed_contexts:
+                    # Look the graph up again in this serializer's store.
+                    context = self.store.get_context(context.identifier)
+                for triple in context:
+                    write(self._patch_row(triple, context.identifier, op_code))
+
+        write_header()
+        if operation:
+            write_triples(self.store.contexts(), add_remove_methods[operation])
+        elif target is not None:
+            # ``is not None`` rather than truthiness: a Dataset holding no
+            # triples has length 0 and would otherwise be skipped, although
+            # diffing against it must still emit the deletions.
+            to_add, to_remove = self._diff(target)
+            write_triples(to_add.contexts(), "A", use_passed_contexts=True)
+            write_triples(to_remove.contexts(), "D", use_passed_contexts=True)
+
+        write("TC .\n")
+
+    def _diff(self, target):
+        """Return what must be added to and removed from the store."""
+        rows_to_add = target - self.store
+        rows_to_remove = self.store - target
+        return rows_to_add, rows_to_remove
+
+    def _patch_row(self, triple, context_id, operation):
+        """Format a row as a triple (default graph) or a quad (other graphs)."""
+        if context_id == self.store.default_context.identifier:
+            return f"{operation} {_nt_row(triple)}"
+        else:
+            return f"{operation} {_nq_row(triple, context_id)}"
diff --git
a/test/test_serializers/test_serializer_patch.py b/test/test_serializers/test_serializer_patch.py
new file mode 100644
index 000000000..6d8a05055
--- /dev/null
+++ b/test/test_serializers/test_serializer_patch.py
@@ -0,0 +1,187 @@
+from rdflib import Dataset, Graph, Literal, URIRef
+
+
+def test_add_quad():
+    """An 'add' patch emits an A row that carries the quad's graph IRI."""
+    ds = Dataset()
+    ds.add(
+        (
+            URIRef("http://example.org/subject1"),
+            URIRef("http://example.org/predicate2"),
+            Literal("object2"),
+            Graph(identifier=URIRef("http://example.org/graph1")),
+        )
+    )
+    result = ds.serialize(format="patch", operation="add")
+    assert (
+        """A <http://example.org/subject1> <http://example.org/predicate2> "object2" <http://example.org/graph1> .
+"""
+        in result
+    )
+
+
+def test_delete_quad():
+    """A 'remove' patch emits a D row that carries the quad's graph IRI."""
+    ds = Dataset()
+    ds.add(
+        (
+            URIRef("http://example.org/subject1"),
+            URIRef("http://example.org/predicate2"),
+            Literal("object2"),
+            Graph(identifier=URIRef("http://example.org/graph1")),
+        )
+    )
+    result = ds.serialize(format="patch", operation="remove")
+    assert (
+        """D <http://example.org/subject1> <http://example.org/predicate2> "object2" <http://example.org/graph1> .
+"""
+        in result
+    )
+
+
+def test_diff_quad():
+    """A diff emits an A row for a quad that only the target contains."""
+    quad_1 = (
+        URIRef("http://example.org/subject1"),
+        URIRef("http://example.org/predicate2"),
+        Literal("object2"),
+        Graph(identifier=URIRef("http://example.org/graph1")),
+    )
+    quad_2 = (
+        URIRef("http://example.org/subject2"),
+        URIRef("http://example.org/predicate3"),
+        Literal("object3"),
+        Graph(identifier=URIRef("http://example.org/graph2")),
+    )
+    ds1 = Dataset()
+    ds2 = Dataset()
+    ds1.add(quad_1)
+    ds2.addN([quad_1, quad_2])
+    result = ds1.serialize(format="patch", target=ds2)
+    assert (
+        """A <http://example.org/subject2> <http://example.org/predicate3> "object3" <http://example.org/graph2> ."""
+        in result
+    )
+
+
+def test_add_triple():
+    """An 'add' patch emits a default-graph triple without a graph IRI."""
+    ds = Dataset()
+    ds.add(
+        (
+            URIRef("http://example.org/subject1"),
+            URIRef("http://example.org/predicate2"),
+            Literal("object2"),
+        )
+    )
+    result = ds.serialize(format="patch", operation="add")
+    assert (
+        """A <http://example.org/subject1> <http://example.org/predicate2> "object2" ."""
+        in result
+    )
+
+
+def test_delete_triple():
+    """A 'remove' patch emits a default-graph triple without a graph IRI."""
+    ds = Dataset()
+    ds.add(
+        (
+            URIRef("http://example.org/subject1"),
+            URIRef("http://example.org/predicate2"),
+            Literal("object2"),
+        )
+    )
+    result = ds.serialize(format="patch", operation="remove")
+    assert (
+        """D <http://example.org/subject1> <http://example.org/predicate2> "object2" ."""
+        in result
+    )
+
+
+def test_diff_triple():
+    """A diff emits an A row for a triple that only the target contains."""
+    triple_1 = (
+        URIRef("http://example.org/subject1"),
+        URIRef("http://example.org/predicate2"),
+        Literal("object2"),
+    )
+    triple_2 = (
+        URIRef("http://example.org/subject2"),
+        URIRef("http://example.org/predicate3"),
+        Literal("object3"),
+    )
+    ds1 = Dataset()
+    ds2 = Dataset()
+    ds1.add(triple_1)
+    ds2.add(triple_1)
+    ds2.add(triple_2)
+    result = ds1.serialize(format="patch", target=ds2)
+    assert (
+        """A <http://example.org/subject2> <http://example.org/predicate3> "object3" ."""
+        in result
+    )
+
+
+def test_diff_quad_overlap():
+    """A diff deletes source-only quads and adds target-only quads."""
+    quad_1 = (
+        URIRef("http://example.org/subject1"),
+        URIRef("http://example.org/predicate1"),
+        Literal("object1"),
+        Graph(identifier=URIRef("http://example.org/graph1")),
+    )
+    quad_2 = (
+        URIRef("http://example.org/subject2"),
+        URIRef("http://example.org/predicate2"),
+        Literal("object2"),
+        Graph(identifier=URIRef("http://example.org/graph2")),
+    )
+    quad_3 = (
+        URIRef("http://example.org/subject3"),
+        URIRef("http://example.org/predicate3"),
+        Literal("object3"),
+        Graph(identifier=URIRef("http://example.org/graph3")),
+    )
+    ds1 = Dataset()
+    ds2 = Dataset()
+    ds1.addN([quad_1, quad_2])
+    ds2.addN([quad_2, quad_3])
+    result = ds1.serialize(format="patch", target=ds2)
+    # first quad needs to be removed
+    assert (
+        """D <http://example.org/subject1> <http://example.org/predicate1> "object1" <http://example.org/graph1> ."""
+        in result
+    )
+    # third quad needs to be added
+    assert (
+        """A <http://example.org/subject3> <http://example.org/predicate3> "object3" <http://example.org/graph3> ."""
+        in result
+    )
+
+
+def test_header_id():
+    """A caller-supplied header_id is written as the 'H id' header row."""
+    ds = Dataset()
+    ds.add(
+        (
+            URIRef("http://example.org/subject1"),
+            URIRef("http://example.org/predicate2"),
+            Literal("object2"),
+        )
+    )
+    result = ds.serialize(format="patch", operation="add", header_id="uuid:123")
+    assert """H id <uuid:123> .""" in result
+
+
+def test_prev_header():
+    """A caller-supplied header_prev is written as an 'H prev' header row."""
+    ds = Dataset()
+    ds.add(
+        (
+            URIRef("http://example.org/subject1"),
+            URIRef("http://example.org/predicate2"),
+            Literal("object2"),
+        )
+    )
+    result = ds.serialize(format="patch", operation="add", header_prev="uuid:123")
+    assert """H prev <uuid:123>""" in result