From 9baa63ded9cab77bf2d0b1f70ffd6607fdc6cb10 Mon Sep 17 00:00:00 2001 From: Thomas Tanon Date: Thu, 17 Oct 2024 18:39:16 +0200 Subject: [PATCH] Parsers: support quad formats and other stores --- oxrdflib/_converter.py | 18 ---- oxrdflib/parser.py | 124 ++++++++++---------------- pyproject.toml | 4 + tests/data/test.nt | 6 -- tests/data/test.rdf | 16 ---- tests/data/test.ttl | 10 --- tests/test_parser.py | 188 ++++++++++++++------------------------- tests/test_serializer.py | 1 + 8 files changed, 120 insertions(+), 247 deletions(-) delete mode 100644 tests/data/test.nt delete mode 100644 tests/data/test.rdf delete mode 100644 tests/data/test.ttl diff --git a/oxrdflib/_converter.py b/oxrdflib/_converter.py index f5ce721..67e037f 100644 --- a/oxrdflib/_converter.py +++ b/oxrdflib/_converter.py @@ -108,21 +108,3 @@ def from_ox( if isinstance(term, ox.Triple): return from_ox(term.subject), from_ox(term.predicate), from_ox(term.object) raise ValueError(f"Unexpected Oxigraph term: {term!r}") - - -def guess_rdf_format(rdflib_type: str) -> ox.RdfFormat: - """Convert an rdflib type to a MIME type.""" - rdflib_type = ox_to_rdflib_type(rdflib_type) - rdf_format = ( - ox.RdfFormat.from_media_type(rdflib_type) - or ox.RdfFormat.from_extension(rdflib_type) - or ox.RdfFormat.from_media_type(f"application/{rdflib_type}") - ) - if rdf_format is None: - raise ValueError(f"Unsupported rdflib type: {rdflib_type}") - return rdf_format - - -def ox_to_rdflib_type(ox_format: str) -> str: - """Convert an Oxigraph format to a rdflib parser format.""" - return ox_format[len("ox-") :] if ox_format.startswith("ox-") else ox_format diff --git a/oxrdflib/parser.py b/oxrdflib/parser.py index 18cf30c..41ad5df 100644 --- a/oxrdflib/parser.py +++ b/oxrdflib/parser.py @@ -1,7 +1,8 @@ -import warnings -from typing import Any, Optional +from abc import ABC, abstractmethod +from typing import Optional -from rdflib import ConjunctiveGraph, Graph +from pyoxigraph import DefaultGraph, RdfFormat, parse +from rdflib import Graph from rdflib.exceptions import ParserError from rdflib.parser import ( FileInputSource, @@ -11,113 +12,84 @@ create_input_source, ) -from oxrdflib._converter import guess_rdf_format, ox_to_rdflib_type, to_ox +from oxrdflib._converter import from_ox, to_ox from oxrdflib.store import OxigraphStore __all__ = [ + "OxigraphN3Parser", "OxigraphTurtleParser", "OxigraphNTriplesParser", "OxigraphRdfXmlParser", + "OxigraphTriGParser", + "OxigraphNQuadsParser", ] -class OxigraphParser(Parser): +class _OxigraphParser(Parser, ABC): def parse( self, source: InputSource, sink: Graph, - format: str, + *, encoding: Optional[str] = "utf-8", - **kwargs: Any, + transactional: bool = True, ) -> None: if encoding not in (None, "utf-8"): - raise ParserError("N3/Turtle files are always utf-8 encoded, I was passed: {encoding}") - - if not isinstance(sink.store, OxigraphStore): - warnings.warn( - "Graph store should be an instance of OxigraphStore, " - f"got {type(sink.store).__name__} store instead." - " Attempting to parse using rdflib native parser.", - stacklevel=2, - ) - sink.parse(source, format=ox_to_rdflib_type(format)) - return - + raise ParserError(f"Only the 'utf-8' encoding is supported, '{encoding}' given") base_iri = sink.absolutize(source.getPublicId() or source.getSystemId() or "") - args = { - "format": guess_rdf_format(format), + "format": self._format, "base_iri": base_iri, - "to_graph": to_ox(sink.identifier), } + if isinstance(source, URLInputSource): - source = create_input_source(source.url, format=ox_to_rdflib_type(format)) + source = create_input_source(source.url, format=self._format.file_extension) if isinstance(source, FileInputSource): args["path"] = source.file.name else: args["input"] = source.getByteStream() - if kwargs.get("transactional", True): - sink.store._inner.load(**args) + if isinstance(sink.store, OxigraphStore): + if transactional: + sink.store._inner.load(**args, to_graph=to_ox(sink.identifier)) + else: + sink.store._inner.bulk_load(**args, to_graph=to_ox(sink.identifier)) else: - sink.store._inner.bulk_load(**args) + sink.store.addN( + ( + from_ox(quad.subject), + from_ox(quad.predicate), + from_ox(quad.object), + sink.identifier if isinstance(quad.graph_name, DefaultGraph) else from_ox(quad.graph_name), + ) + for quad in parse(**args) + ) + @property + @abstractmethod + def _format(self) -> RdfFormat: + pass -class OxigraphTurtleParser(OxigraphParser): - def parse( - self, - source: InputSource, - sink: Graph, - format: str = "ox-turtle", - encoding: Optional[str] = "utf-8", - **kwargs: Any, - ) -> None: - super().parse(source, sink, format, encoding, **kwargs) +class OxigraphTurtleParser(_OxigraphParser): + _format = RdfFormat.TURTLE -class OxigraphNTriplesParser(OxigraphParser): - def parse( - self, - source: InputSource, - sink: Graph, - format: str = "ox-nt", - encoding: Optional[str] = None, - **kwargs: Any, - ) -> None: - super().parse(source, sink, format, encoding, **kwargs) +class OxigraphNTriplesParser(_OxigraphParser): + _format = RdfFormat.N_TRIPLES -class OxigraphRdfXmlParser(OxigraphParser): - def parse( - self, - source: FileInputSource, - sink: Graph, - format: str = "ox-xml", - encoding: Optional[str] = None, - **kwargs: Any, - ) -> None: - super().parse(source, sink, format, encoding, **kwargs) +class OxigraphRdfXmlParser(_OxigraphParser): + _format = RdfFormat.RDF_XML -class OxigraphNQuadsParser(OxigraphParser): - def parse( - self, - source: InputSource, - sink: ConjunctiveGraph, - format: str, - encoding: Optional[str] = None, - **kwargs: Any, - ) -> None: - raise NotImplementedError("N-Quads is not supported yet") +class OxigraphN3Parser(_OxigraphParser): + _format = RdfFormat.N3 -class OxigraphTriGParser(OxigraphParser): - def parse( - self, - source: InputSource, - sink: Graph, - format: str, - encoding: Optional[str] = "utf-8", - **kwargs: Any, - ) -> None: - raise NotImplementedError("TriG parser is not supported yet") + +class OxigraphNQuadsParser(_OxigraphParser): + _format = RdfFormat.N_QUADS + + +class OxigraphTriGParser(_OxigraphParser): + _format = RdfFormat.TRIG diff --git a/pyproject.toml b/pyproject.toml index 3ee58cf..51305c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,11 @@ oxigraph = "oxrdflib.store:OxigraphStore" ox-turtle = "oxrdflib.parser:OxigraphTurtleParser" ox-ttl = "oxrdflib.parser:OxigraphTurtleParser" ox-ntriples = "oxrdflib.parser:OxigraphNTriplesParser" +ox-n3 = "oxrdflib.parser:OxigraphN3Parser" +ox-nquads = "oxrdflib.parser:OxigraphNQuadsParser" ox-nt = "oxrdflib.parser:OxigraphNTriplesParser" +ox-nt11 = "oxrdflib.parser:OxigraphNTriplesParser" +ox-trig = "oxrdflib.parser:OxigraphTriGParser" ox-xml = "oxrdflib.parser:OxigraphRdfXmlParser" [project.entry-points."rdf.plugins.serializer"] diff --git a/tests/data/test.nt b/tests/data/test.nt deleted file mode 100644 index 0f097e6..0000000 --- a/tests/data/test.nt +++ /dev/null @@ -1,6 +0,0 @@ - . - "Example Document" . - . - . - "John Doe" . - . diff --git a/tests/data/test.rdf b/tests/data/test.rdf deleted file mode 100644 index 5f78ef6..0000000 --- a/tests/data/test.rdf +++ /dev/null @@ -1,16 +0,0 @@ - - - - - Example Document - - - John Doe - - - - - - - diff --git a/tests/data/test.ttl b/tests/data/test.ttl deleted file mode 100644 index edc53ab..0000000 --- a/tests/data/test.ttl +++ /dev/null @@ -1,10 +0,0 @@ -@prefix ex: . -@prefix foaf: . - -ex:document a foaf:Document ; - foaf:title "Example Document" ; - foaf:creator ex:creator . - -ex:creator a foaf:Person ; - foaf:name "John Doe" ; - foaf:mbox . diff --git a/tests/test_parser.py b/tests/test_parser.py index e7c18c6..f9bf863 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,131 +1,77 @@ import unittest -import warnings +from io import StringIO from pathlib import Path -import rdflib +from rdflib import Dataset, Graph, URIRef +from rdflib.graph import DATASET_DEFAULT_GRAPH_ID _TEST_DIR = Path(__file__).resolve().parent -class TestGraphParsing(unittest.TestCase): - def test_parsing_ox_turtle_bulk_load(self): - graph = rdflib.Graph(store="Oxigraph") - graph.parse(_TEST_DIR / "data/test.ttl", format="ox-turtle", transactional=False) - self.assertEqual(len(graph), 6) - - def test_parsing_ox_turtle_load(self): - graph = rdflib.Graph(store="Oxigraph") - graph.parse(_TEST_DIR / "data/test.ttl", format="ox-turtle", transactional=True) - - self.assertEqual(len(graph), 6) - - def test_parsing_ox_turtle_fallback(self): - graph = rdflib.Graph() - with warnings.catch_warnings(record=True) as warning: - graph.parse(_TEST_DIR / "data/test.ttl", format="ox-turtle", transactional=False) - - self.assertEqual( - warning[0].message.args[0], - ( - "Graph store should be an instance of OxigraphStore, got Memory" - " store instead. Attempting to parse using rdflib native parser." - ), - ) - self.assertEqual(len(graph), 6) - - def test_parsing_ox_url_turtle(self): - graph = rdflib.Graph(store="Oxigraph") - graph.parse( - "https://i-adopt.github.io/ontology/ontology.ttl", - format="ox-turtle", - transactional=True, - ) - self.assertIsNotNone(graph) - - def test_parsing_ox_ntriples_bulk_load(self): - graph = rdflib.Graph(store="Oxigraph") - graph.parse(_TEST_DIR / "data/test.nt", format="ox-ntriples", transactional=False) - self.assertEqual(len(graph), 6) - - def test_parsing_ox_ntriples_load(self): - graph = rdflib.Graph(store="Oxigraph") - graph.parse(_TEST_DIR / "data/test.nt", format="ox-ntriples", transactional=True) - - self.assertEqual(len(graph), 6) - - def test_parsing_ox_ntriples_fallback(self): - graph = rdflib.Graph() - with warnings.catch_warnings(record=True) as warning: - graph.parse(_TEST_DIR / "data/test.nt", format="ox-ntriples", transactional=False) - - self.assertEqual( - warning[0].message.args[0], - ( - "Graph store should be an instance of OxigraphStore, got Memory" - " store instead. Attempting to parse using rdflib native parser." - ), - ) - self.assertEqual(len(graph), 6) - - def test_parsing_ox_url_ntriples(self): - graph = rdflib.Graph(store="Oxigraph") - graph.parse( - "https://i-adopt.github.io/ontology/ontology.nt", - format="ox-ntriples", - transactional=True, - ) - self.assertIsNotNone(graph) - - def test_parsing_ox_rdfxml_bulk_load(self): - graph = rdflib.Graph(store="Oxigraph") - graph.parse( - _TEST_DIR / "data/test.rdf", - publicID="http://example.com/", - format="ox-xml", - transactional=False, - ) - - self.assertEqual(len(graph), 6) - self.assertTrue(next(iter(graph))[0].startswith("http://example.com/")) - - def test_parsing_ox_rdfxml_load(self): - graph = rdflib.Graph(store="Oxigraph") - graph.parse( - _TEST_DIR / "data/test.rdf", - publicID="http://example.com/", - format="ox-xml", - transactional=True, - ) - self.assertEqual(len(graph), 6) - self.assertTrue(next(iter(graph))[0].startswith("http://example.com/")) - - def test_parsing_ox_url_rdfxml_load(self): - graph = rdflib.Graph(store="Oxigraph") - graph.parse( - "https://i-adopt.github.io/ontology/ontology.xml", - format="ox-xml", - transactional=True, - ) - self.assertIsNotNone(graph) - - def test_parsing_ox_rdfxml_fallback(self): - graph = rdflib.Graph() - with warnings.catch_warnings(record=True) as warning: - graph.parse( - _TEST_DIR / "data/test.rdf", - publicID="http://example.com/", - format="ox-xml", - transactional=False, - ) - - self.assertEqual( - warning[0].message.args[0], - ( - "Graph store should be an instance of OxigraphStore, got Memory" - " store instead. Attempting to parse using rdflib native parser." - ), - ) - self.assertEqual(len(graph), 6) +s = URIRef("http://example.com/s") +p = URIRef("http://example.com/vocab#p") +o = URIRef("http://example.com/o") +g = URIRef("http://example.com/g") + + +class TestParser(unittest.TestCase): + def test_parse_graph(self): + for store in ("default", "oxigraph"): + for transactional in (True, False): + for fmt, serialization in ( + ("ox-turtle", "@prefix v: . v:p ."), + ("ox-ttl", "@prefix v: . v:p ."), + ("ox-ntriples", " .\n"), + ("ox-n3", " .\n"), + ("ox-nquads", " .\n"), + ("ox-nt", " .\n"), + ("ox-nt11", " .\n"), + ("ox-trig", "@prefix v: . v:p ."), + ( + "ox-xml", + """ + + +

+ + """, + ), + ): + with self.subTest(store=store, format=fmt, transactional=transactional): + graph = Graph(store=store, identifier="http://example.com/") + graph.parse( + StringIO(serialization), + format=fmt, + publicID="http://example.com/", + transactional=transactional, + ) + self.assertEqual(list(graph), [(s, p, o)]) + # TODO: pyoxigraph 0.4.2: test that prefixes are properly loaded + + def test_parse_dataset(self): + for store in ("default", "oxigraph"): + for transactional in (True, False): + for fmt, serialization in ( + ( + "ox-nquads", + " .\n" + " " + " .\n", + ), + ( + "ox-trig", + "@prefix v: . v:p . { v:p }", + ), + ): + with self.subTest(store=store, format=fmt, transactional=transactional): + dataset = Dataset(store=store) + dataset.parse( + StringIO(serialization), + format=fmt, + publicID="http://example.com/", + transactional=transactional, + ) + self.assertEqual(set(dataset), {(s, p, o, g), (s, p, o, DATASET_DEFAULT_GRAPH_ID)}) if __name__ == "__main__": diff --git a/tests/test_serializer.py b/tests/test_serializer.py index 78345c3..9562509 100644 --- a/tests/test_serializer.py +++ b/tests/test_serializer.py @@ -35,6 +35,7 @@ def test_serialize_graph(self): graph.add((s, p, o)) graph.store.add((o, p, s), context=g) # Should not be serialized self.assertEqual(graph.serialize(format=fmt), serialization) + # TODO: pyoxigraph 0.4.2: test that prefixes and base are properly used def test_serialize_dataset(self): for store in ("default", "oxigraph"):