From 139508200e3937654ffb1f715824f8932a947b9d Mon Sep 17 00:00:00 2001
From: Ashley Sommer
Date: Sat, 27 Jul 2024 02:20:23 +1000
Subject: [PATCH 01/10] Initial orjson support for the hextuples parser and
 serializer, and first stages of the json-ld parser. This relies on the
 merging of the BytesIOWrapper PR.

---
 README.md                                     |   2 +-
 devtools/constraints.min                      |   1 +
 pyproject.toml                                |   2 +
 rdflib/plugins/parsers/hext.py                |  72 ++++++++---
 rdflib/plugins/serializers/hext.py            | 119 +++++++++++++-----
 rdflib/plugins/shared/jsonld/util.py          | 110 ++++++++++++----
 test/test_parsers/test_parser_hext.py         |  27 ++++
 test/test_serializers/test_serializer_hext.py |  18 ++-
 tox.ini                                       |   3 +-
 9 files changed, 275 insertions(+), 79 deletions(-)

diff --git a/README.md b/README.md
index d48f211f6..78283f534 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@ The stable release of RDFLib may be installed with Python's package management t
 
 Some features of RDFLib require optional dependencies which may be installed using *pip* extras:
 
-    $ pip install rdflib[berkeleydb,networkx,html,lxml]
+    $ pip install rdflib[berkeleydb,networkx,html,lxml,orjson]
 
 Alternatively manually download the package from the Python Package Index (PyPI) at https://pypi.python.org/pypi/rdflib
 
diff --git a/devtools/constraints.min b/devtools/constraints.min
index 18788bc54..0034ea304 100644
--- a/devtools/constraints.min
+++ b/devtools/constraints.min
@@ -8,3 +8,4 @@ berkeleydb==18.1.2
 networkx==2.0
 html5lib==1.0.1
 lxml==4.3.0
+orjson==3.9.14
diff --git a/pyproject.toml b/pyproject.toml
index 4ca58a019..57941ee57 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,6 +45,7 @@ berkeleydb = {version = "^18.1.0", optional = true}
 networkx = {version = ">=2,<4", optional = true}
 html5lib = {version = "^1.0", optional = true}
 lxml = {version = ">=4.3,<6.0", optional = true}
+orjson = {version = ">=3.9.14,<4", optional = true}
 
 [tool.poetry.group.dev.dependencies]
 black = "24.4.2"
@@ -74,6 +75,7 @@ berkeleydb = ["berkeleydb"]
 networkx = ["networkx"]
 html = ["html5lib"]
 lxml = ["lxml"]
+orjson = ["orjson"]
 
 [build-system]
 requires = ["poetry-core>=1.4.0"]
diff --git a/rdflib/plugins/parsers/hext.py b/rdflib/plugins/parsers/hext.py
index 4a9c00b8d..4a043a153 100644
--- a/rdflib/plugins/parsers/hext.py
+++ b/rdflib/plugins/parsers/hext.py
@@ -9,12 +9,23 @@
 import json
 import warnings
 from io import TextIOWrapper
-from typing import Any, BinaryIO, List, Optional, TextIO, Union
+from typing import TYPE_CHECKING, Any, BinaryIO, List, Optional, TextIO, Union
 
 from rdflib.graph import ConjunctiveGraph, Graph
 from rdflib.parser import InputSource, Parser
 from rdflib.term import BNode, Literal, URIRef
 
+try:
+    import orjson
+
+    _HAS_ORJSON = True
+except ImportError:
+    orjson = None
+    _HAS_ORJSON = False
+
+if TYPE_CHECKING:
+    from io import BufferedReader
+
 __all__ = ["HextuplesParser"]
 
 
@@ -27,16 +38,6 @@ class HextuplesParser(Parser):
     def __init__(self):
         pass
 
-    def _load_json_line(self, line: str) -> List[Optional[Any]]:
-        # this complex handing is because the 'value' component is
-        # allowed to be "" but not None
-        # all other "" values are treated as None
-        ret1 = json.loads(line)
-        ret2 = [x if x != "" else None for x in ret1]
-        if ret1[2] == "":
-            ret2[2] = ""
-        return ret2
-
     def _parse_hextuple(
         self, cg: ConjunctiveGraph, tup: List[Union[str, None]]
     ) -> None:
@@ -98,19 +99,50 @@ def parse(self, source: InputSource, graph: Graph, **kwargs: Any) -> None:  # ty
         cg = ConjunctiveGraph(store=graph.store, identifier=graph.identifier)
         cg.default_context = graph
 
-        text_stream: Optional[TextIO] = source.getCharacterStream()
-        if text_stream is None:
+        try:
+            text_stream: Optional[TextIO] = source.getCharacterStream()
+        except (AttributeError, LookupError):
+            text_stream = None
+        try:
             binary_stream: Optional[BinaryIO] = source.getByteStream()
-            if binary_stream is None:
-                raise ValueError(
-                    f"Source does not have a character stream or a byte stream and cannot be used {type(source)}"
-                )
-            text_stream = TextIOWrapper(binary_stream, encoding="utf-8")
+        except (AttributeError, LookupError):
+            binary_stream = None
+
+        if text_stream is None and binary_stream is None:
+            raise ValueError(
+                f"Source does not have a character stream or a byte stream and cannot be used {type(source)}"
+            )
+        if TYPE_CHECKING:
+            assert text_stream is not None or binary_stream is not None
+        use_stream: Union[TextIO, BinaryIO]
+        if _HAS_ORJSON:
+            if binary_stream is not None:
+                use_stream = binary_stream
+            else:
+                if TYPE_CHECKING:
+                    assert isinstance(text_stream, TextIOWrapper)
+                use_stream = text_stream
+            loads = orjson.loads
+        else:
+            if text_stream is not None:
+                use_stream = text_stream
+            else:
+                if TYPE_CHECKING:
+                    assert isinstance(binary_stream, BufferedReader)
+                use_stream = TextIOWrapper(binary_stream, encoding="utf-8")
+            loads = json.loads
 
-        for line in text_stream:
+        for line in use_stream:  # type: Union[str, bytes]
             if len(line) == 0 or line.isspace():
                 # Skipping empty lines because this is what was being done before for the first and last lines, albeit in a rather indirect way.
                 # The result is that we accept input that would otherwise be invalid.
                 # Possibly we should just let this result in an error.
                 continue
-            self._parse_hextuple(cg, self._load_json_line(line))
+            # this complex handling is because the 'value' component is
+            # allowed to be "" but not None
+            # all other "" values are treated as None
+            raw_line: List[str] = loads(line)
+            hex_tuple_line = [x if x != "" else None for x in raw_line]
+            if raw_line[2] == "":
+                hex_tuple_line[2] = ""
+            self._parse_hextuple(cg, hex_tuple_line)
diff --git a/rdflib/plugins/serializers/hext.py b/rdflib/plugins/serializers/hext.py
index 00a02c5ce..51d3b9299 100644
--- a/rdflib/plugins/serializers/hext.py
+++ b/rdflib/plugins/serializers/hext.py
@@ -7,12 +7,20 @@
 
 import json
 import warnings
-from typing import IO, Optional, Type, Union
+from typing import IO, Callable, List, Optional, Type, Union
 
-from rdflib.graph import DATASET_DEFAULT_GRAPH_ID, ConjunctiveGraph, Graph
+from rdflib.graph import DATASET_DEFAULT_GRAPH_ID, ConjunctiveGraph, Dataset, Graph
 from rdflib.namespace import RDF, XSD
 from rdflib.serializer import Serializer
-from rdflib.term import BNode, Literal, Node, URIRef
+from rdflib.term import BNode, IdentifiedNode, Literal, URIRef
+
+try:
+    import orjson
+
+    _HAS_ORJSON = True
+except ImportError:
+    orjson = None
+    _HAS_ORJSON = False
 
 __all__ = ["HextuplesSerializer"]
 
@@ -22,11 +30,37 @@ class HextuplesSerializer(Serializer):
     Serializes RDF graphs to Hextuples format.
""" - def __init__(self, store: Union[Graph, ConjunctiveGraph]): - self.default_context: Optional[Node] - self.graph_type: Type[Graph] - if isinstance(store, ConjunctiveGraph): - self.graph_type = ConjunctiveGraph + contexts: List[Union[Graph, IdentifiedNode]] + dumps: Callable + + def __new__(cls, store: Union[Graph, Dataset, ConjunctiveGraph]): + if _HAS_ORJSON: + cls.str_local_id = orjson.Fragment(b'"localId"') + cls.str_global_id = orjson.Fragment(b'"globalId"') + cls.empty = orjson.Fragment(b'""') + cls.newline: Union[bytes, str] = b"\n" + cls.lang_str = orjson.Fragment(b'"' + RDF.langString.encode("utf-8") + b'"') + cls.xsd_string = orjson.Fragment(b'"' + XSD.string.encode("utf-8") + b'"') + dumps = orjson.dumps + else: + cls.str_local_id = "localId" + cls.str_global_id = "globalId" + cls.empty = "" + cls.newline = "\n" + cls.lang_str = f"{RDF.langString}" + cls.xsd_string = f"{XSD.string}" + dumps = json.dumps + self = super(cls, cls).__new__(cls) + self.dumps = dumps + return self + + def __init__(self, store: Union[Graph, Dataset, ConjunctiveGraph]): + self.default_context: Optional[Union[Graph, IdentifiedNode]] + self.graph_type: Union[Type[Graph], Type[Dataset], Type[ConjunctiveGraph]] + if isinstance(store, (Dataset, ConjunctiveGraph)): + self.graph_type = ( + Dataset if isinstance(store, Dataset) else ConjunctiveGraph + ) self.contexts = list(store.contexts()) if store.default_context: self.default_context = store.default_context @@ -64,14 +98,26 @@ def serialize( raise Exception( "Hextuple serialization can't (yet) handle formula-aware stores" ) - + context: Union[Graph, IdentifiedNode] + context_str: Union[bytes, str] for context in self.contexts: for triple in context: - hl = self._hex_line(triple, context) + # Generate context string just once, because it doesn't change + # for every triple in this context + context_str = ( + self.empty + if self.graph_type is Graph + else ( + orjson.Fragment('"' + self._context_str(context) + '"') + if _HAS_ORJSON + else self._context_str(context) + ) + ) + hl = self._hex_line(triple, context_str) if hl is not None: - stream.write(hl.encode()) + stream.write(hl if _HAS_ORJSON else hl.encode()) - def _hex_line(self, triple, context): + def _hex_line(self, triple, context_str: Union[bytes, str]): if isinstance( triple[0], (URIRef, BNode) ): # exclude QuotedGraph and other objects @@ -85,18 +131,18 @@ def _hex_line(self, triple, context): # datatype if isinstance(triple[2], URIRef): # datatype = "http://www.w3.org/1999/02/22-rdf-syntax-ns#namedNode" - datatype = "globalId" + datatype = self.str_global_id elif isinstance(triple[2], BNode): # datatype = "http://www.w3.org/1999/02/22-rdf-syntax-ns#blankNode" - datatype = "localId" + datatype = self.str_local_id elif isinstance(triple[2], Literal): if triple[2].datatype is not None: datatype = f"{triple[2].datatype}" else: if triple[2].language is not None: # language - datatype = RDF.langString + datatype = self.lang_str else: - datatype = XSD.string + datatype = self.xsd_string else: return None # can't handle non URI, BN or Literal Object (QuotedGraph) @@ -105,22 +151,22 @@ def _hex_line(self, triple, context): if triple[2].language is not None: language = f"{triple[2].language}" else: - language = "" + language = self.empty else: - language = "" + language = self.empty return ( - json.dumps( + self.dumps( [ self._iri_or_bn(triple[0]), triple[1], value, datatype, language, - self._context(context), + context_str, ] ) - + "\n" + + self.newline ) else: # do not return anything for non-IRIs or 
BNs, e.g. QuotedGraph, Subjects return None @@ -133,17 +179,28 @@ def _iri_or_bn(self, i_): else: return None - def _context(self, context): - if self.graph_type == Graph: - return "" - if context.identifier == DATASET_DEFAULT_GRAPH_ID: + def _context_str(self, context: Union[Graph, IdentifiedNode]) -> str: + context_identifier: IdentifiedNode = ( + context.identifier if isinstance(context, Graph) else context + ) + if context_identifier == DATASET_DEFAULT_GRAPH_ID: return "" - elif context is not None and self.default_context is not None: - # type error: "Node" has no attribute "identifier" - if context.identifier == self.default_context.identifier: # type: ignore[attr-defined] + if self.default_context is not None: + if ( + isinstance(self.default_context, IdentifiedNode) + and context_identifier == self.default_context + ): return "" + elif ( + isinstance(self.default_context, Graph) + and context_identifier == self.default_context.identifier + ): + return "" + if self.graph_type is Graph: + # Only emit a context name when serializing a Dataset or ConjunctiveGraph + return "" return ( - context.identifier - if isinstance(context.identifier, URIRef) - else context.identifier.n3() + f"{context_identifier}" + if isinstance(context_identifier, URIRef) + else context_identifier.n3() ) diff --git a/rdflib/plugins/shared/jsonld/util.py b/rdflib/plugins/shared/jsonld/util.py index ae2ceb3b9..2fc9719b6 100644 --- a/rdflib/plugins/shared/jsonld/util.py +++ b/rdflib/plugins/shared/jsonld/util.py @@ -1,22 +1,21 @@ # https://github.com/RDFLib/rdflib-jsonld/blob/feature/json-ld-1.1/rdflib_jsonld/util.py from __future__ import annotations +import json import pathlib +from io import StringIO, TextIOWrapper +from posixpath import normpath, sep from typing import IO, TYPE_CHECKING, Any, Optional, TextIO, Tuple, Union +from urllib.parse import urljoin, urlsplit, urlunsplit -if TYPE_CHECKING: - import json -else: - try: - import json +try: + import orjson - assert json # workaround for pyflakes issue #13 - except ImportError: - import simplejson as json + _HAS_ORJSON = True +except ImportError: + orjson = None + _HAS_ORJSON = False -from io import TextIOBase, TextIOWrapper -from posixpath import normpath, sep -from urllib.parse import urljoin, urlsplit, urlunsplit from rdflib.parser import ( BytesIOWrapper, @@ -37,22 +36,91 @@ def source_to_json( return source.data if isinstance(source, StringInputSource): - return json.load(source.getCharacterStream()) + # We can get the original string from the StringInputSource + # It's hidden in the BytesIOWrapper 'wrapped' attribute + b_stream = source.getByteStream() + original_string: Optional[str] = None + if isinstance(b_stream, BytesIOWrapper): + wrapped_inner = b_stream.wrapped + if isinstance(wrapped_inner, str): + original_string = wrapped_inner + elif isinstance(wrapped_inner, StringIO): + original_string = wrapped_inner.getvalue() + + if _HAS_ORJSON: + if original_string is not None: + return orjson.loads(original_string) + elif isinstance(b_stream, BytesIOWrapper): + # use the CharacterStream instead + c_stream = source.getCharacterStream() + return orjson.loads(c_stream.read()) + else: + return orjson.loads(b_stream.read()) + else: + if original_string is not None: + return json.loads(original_string) + return json.load(source.getCharacterStream()) # TODO: conneg for JSON (fix support in rdflib's URLInputSource!) 
source = create_input_source(source, format="json-ld") - stream = source.getByteStream() try: - if isinstance(stream, BytesIOWrapper): - stream = stream.wrapped - # Use character stream as-is, or interpret byte stream as UTF-8 - if isinstance(stream, TextIOBase): - use_stream = stream + b_stream = source.getByteStream() + except (AttributeError, LookupError): + b_stream = None + try: + c_stream = source.getCharacterStream() + except (AttributeError, LookupError): + c_stream = None + if b_stream is None and c_stream is None: + raise ValueError( + f"Source does not have a character stream or a byte stream and cannot be used {type(source)}" + ) + underlying_string: Optional[str] = None + if b_stream is not None and isinstance(b_stream, BytesIOWrapper): + # Try to find an underlying string to use? + wrapped_inner = b_stream.wrapped + if isinstance(wrapped_inner, str): + underlying_string = wrapped_inner + elif isinstance(wrapped_inner, StringIO): + underlying_string = wrapped_inner.getvalue() + try: + if _HAS_ORJSON: + if underlying_string is not None: + return orjson.loads(underlying_string) + elif ( + (b_stream is not None and isinstance(b_stream, BytesIOWrapper)) + or b_stream is None + ) and c_stream is not None: + # use the CharacterStream instead + return orjson.loads(c_stream.read()) + else: + if TYPE_CHECKING: + assert b_stream is not None + # b_stream is not None + return orjson.loads(b_stream.read()) else: - use_stream = TextIOWrapper(stream, encoding="utf-8") - return json.load(use_stream) + if underlying_string is not None: + return json.loads(underlying_string) + if c_stream is not None: + use_stream = c_stream + else: + if TYPE_CHECKING: + assert b_stream is not None + # b_stream is not None + use_stream = TextIOWrapper(b_stream, encoding="utf-8") + return json.load(use_stream) + finally: - stream.close() + if b_stream is not None: + try: + b_stream.close() + except AttributeError: + pass + if c_stream is not None: + try: + c_stream.close() + except AttributeError: + pass VOCAB_DELIMS = ("#", "/", ":") diff --git a/test/test_parsers/test_parser_hext.py b/test/test_parsers/test_parser_hext.py index 908c4950d..c71bd1a49 100644 --- a/test/test_parsers/test_parser_hext.py +++ b/test/test_parsers/test_parser_hext.py @@ -57,6 +57,33 @@ def test_small_string(): assert len(d) == 10 +def test_small_bytes_string(): + s = b"""\ + ["http://example.com/s01", "http://example.com/a", "http://example.com/Type1", "globalId", "", "https://example.com/graph/1"] + ["http://example.com/s01", "http://example.com/label", "This is a Label", "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString", "en", "_:graph-2"] + ["http://example.com/s01", "http://example.com/comment", "This is a comment", "http://www.w3.org/2001/XMLSchema#string", "", ""] + ["http://example.com/s01", "http://example.com/creationDate", "2021-12-01", "http://www.w3.org/2001/XMLSchema#date", "", ""] + ["http://example.com/s01", "http://example.com/creationTime", "2021-12-01T12:13:00", "http://www.w3.org/2001/XMLSchema#dateTime", "", ""] + ["http://example.com/s01", "http://example.com/age", "42", "http://www.w3.org/2001/XMLSchema#integer", "", ""] + ["http://example.com/s01", "http://example.com/trueFalse", "false", ",http://www.w3.org/2001/XMLSchema#boolean", "", ""] + ["http://example.com/s01", "http://example.com/op1", "http://example.com/o1", "globalId", "", ""] + ["http://example.com/s01", "http://example.com/op1", "http://example.com/o2", "globalId", "", ""] + ["http://example.com/s01", "http://example.com/op2", 
"http://example.com/o3", "globalId", "", ""] + """ + d = Dataset() + d.parse(data=s, format="hext") + + expected_graph_names = ( + URIRef(DATASET_DEFAULT_GRAPH_ID), + URIRef("https://example.com/graph/1"), + BNode("graph-2"), + ) + for graph in d.contexts(): + assert graph.identifier in expected_graph_names + + assert len(d) == 10 + + def test_small_string_cg(): s = """ ["http://example.com/s01", "http://example.com/a", "http://example.com/Type1", "globalId", "", "https://example.com/graph/1"] diff --git a/test/test_serializers/test_serializer_hext.py b/test/test_serializers/test_serializer_hext.py index 2a75cc895..2b0577bc1 100644 --- a/test/test_serializers/test_serializer_hext.py +++ b/test/test_serializers/test_serializer_hext.py @@ -79,8 +79,10 @@ def test_hext_graph(): ], ] for line in out.splitlines(): + normalized_line = line.replace(", ", ",").strip() for test in testing_lines: - if test[1] in line: + normalized_test = test[1].replace(", ", ",").strip() + if normalized_test in normalized_line: test[0] = True assert all([x[0] for x in testing_lines]) @@ -127,7 +129,7 @@ def test_hext_cg(): """ d.parse(data=trig_data, format="trig", publicID=d.default_context.identifier) out = d.serialize(format="hext") - # note: cant' test for BNs in result as they will be different ever time + # note: cant' test for BNs in result as they will be different every time testing_lines = [ [ False, @@ -200,8 +202,10 @@ def test_hext_cg(): ], ] for line in out.splitlines(): + normalized_line = line.replace(", ", ",").strip() for test in testing_lines: - if test[1] in line: + normalized_test = test[1].replace(", ", ",").strip() + if normalized_test in normalized_line: test[0] = True assert all([x[0] for x in testing_lines]) @@ -316,8 +320,10 @@ def test_hext_dataset(): ], ] for line in out.splitlines(): + normalized_line = line.replace(", ", ",").strip() for test in testing_lines: - if test[1] in line: + normalized_test = test[1].replace(", ", ",").strip() + if normalized_test in normalized_line: test[0] = True assert all([x[0] for x in testing_lines]) @@ -396,10 +402,12 @@ def test_roundtrip(): str(Path(__file__).parent.parent / "data/test_parser_hext_multigraph.ndjson") ) as i: ordered_input = "".join(sorted(i.readlines())).strip() + normalized_ordered_input = ordered_input.replace(", ", ",") ordered_output = "\n".join(sorted(d.serialize(format="hext").split("\n"))).strip() + normalized_ordered_output = ordered_output.replace(", ", ",") - assert ordered_output == ordered_input + assert normalized_ordered_output == normalized_ordered_input # def _make_large_graph(): diff --git a/tox.ini b/tox.ini index e5baffcc5..9ec80d516 100644 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,7 @@ setenv = COVERAGE_FILE = {env:COVERAGE_FILE:{toxinidir}/.coverage.{envname}} MYPY_CACHE_DIR = {envdir}/.mypy_cache docs: POETRY_ARGS_docs = --only=docs - extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=html + extensive: POETRY_ARGS_extensive = --extras=berkeleydb --extras=networkx --extras=html --extras=orjson lxml: POETRY_ARGS_lxml = --extras=lxml commands_pre = py3{8,9,10,11}: python -c 'import os; print("\n".join(f"{key}={value}" for key, value in os.environ.items()))' @@ -78,6 +78,7 @@ extras = networkx lxml html + orjson commands = {envpython} --version pip freeze From 9989784dc71ca9a69f36e7a6b2c55539ff8c8f66 Mon Sep 17 00:00:00 2001 From: Ashley Sommer Date: Sat, 27 Jul 2024 14:26:27 +1000 Subject: [PATCH 02/10] Finish work bringing orjson to JSON-LD Graph Parser Add orjson to 
sparql-results-json parser and the sparql-results-json serializer. Tangential
 fixes to all the other non-JSON SPARQL-Results serializers. Added better
 typing to all SPARQL-Results serializers. Got 7 ignored tests passing for
 SPARQL-Results serializers.

---
 rdflib/plugins/parsers/hext.py               |  2 +-
 rdflib/plugins/parsers/jsonld.py             | 17 +++--
 rdflib/plugins/serializers/hext.py           | 73 ++++++++++----------
 rdflib/plugins/serializers/jsonld.py         | 31 ++++++---
 rdflib/plugins/shared/jsonld/util.py         | 10 +--
 rdflib/plugins/sparql/evaluate.py            | 19 +++--
 rdflib/plugins/sparql/results/csvresults.py  | 15 ++--
 rdflib/plugins/sparql/results/jsonresults.py | 48 ++++++++++---
 rdflib/plugins/sparql/results/txtresults.py  | 30 ++++----
 rdflib/query.py                              |  3 +-
 test/test_sparql/test_result.py              | 11 ---
 test/utils/result.py                         |  8 ++-
 12 files changed, 170 insertions(+), 97 deletions(-)

diff --git a/rdflib/plugins/parsers/hext.py b/rdflib/plugins/parsers/hext.py
index 4a043a153..a5b66cb51 100644
--- a/rdflib/plugins/parsers/hext.py
+++ b/rdflib/plugins/parsers/hext.py
@@ -20,7 +20,7 @@
 
     _HAS_ORJSON = True
 except ImportError:
-    orjson = None
+    orjson = None  # type: ignore[assignment]
     _HAS_ORJSON = False
 
 if TYPE_CHECKING:
diff --git a/rdflib/plugins/parsers/jsonld.py b/rdflib/plugins/parsers/jsonld.py
index 6a5bc1103..691a896c0 100644
--- a/rdflib/plugins/parsers/jsonld.py
+++ b/rdflib/plugins/parsers/jsonld.py
@@ -62,9 +62,11 @@
     VOCAB,
 )
 from ..shared.jsonld.util import (
+    _HAS_ORJSON,
     VOCAB_DELIMS,
     context_from_urlinputsource,
     json,
+    orjson,
     source_to_json,
 )
 
@@ -605,11 +607,18 @@ def _add_list(
 
     @staticmethod
     def _to_typed_json_value(value: Any) -> Dict[str, str]:
-        return {
-            TYPE: URIRef("%sJSON" % str(RDF)),
-            VALUE: json.dumps(
+        if _HAS_ORJSON:
+            val_string: str = orjson.dumps(
+                value,
+                option=orjson.OPT_SORT_KEYS | orjson.OPT_NON_STR_KEYS,
+            ).decode("utf-8")
+        else:
+            val_string = json.dumps(
                 value, separators=(",", ":"), sort_keys=True, ensure_ascii=False
-            ),
+            )
+        return {
+            TYPE: RDF.JSON,
+            VALUE: val_string,
         }
 
     @classmethod
diff --git a/rdflib/plugins/serializers/hext.py b/rdflib/plugins/serializers/hext.py
index 51d3b9299..d0e31a6f0 100644
--- a/rdflib/plugins/serializers/hext.py
+++ b/rdflib/plugins/serializers/hext.py
@@ -7,7 +7,7 @@
 
 import json
 import warnings
-from typing import IO, Callable, List, Optional, Type, Union
+from typing import IO, Any, Callable, List, Optional, Type, Union, cast
 
 from rdflib.graph import DATASET_DEFAULT_GRAPH_ID, ConjunctiveGraph, Dataset, Graph
 from rdflib.namespace import RDF, XSD
@@ -19,7 +19,7 @@
 
     _HAS_ORJSON = True
 except ImportError:
-    orjson = None
+    orjson = None  # type: ignore[assignment]
     _HAS_ORJSON = False
 
 __all__ = ["HextuplesSerializer"]
@@ -35,24 +35,22 @@ class HextuplesSerializer(Serializer):
 
     def __new__(cls, store: Union[Graph, Dataset, ConjunctiveGraph]):
         if _HAS_ORJSON:
-            cls.str_local_id = orjson.Fragment(b'"localId"')
-            cls.str_global_id = orjson.Fragment(b'"globalId"')
-            cls.empty = orjson.Fragment(b'""')
-            cls.newline: Union[bytes, str] = b"\n"
-            cls.lang_str = orjson.Fragment(b'"' + RDF.langString.encode("utf-8") + b'"')
-            cls.xsd_string = orjson.Fragment(b'"' + XSD.string.encode("utf-8") + b'"')
-            dumps = orjson.dumps
+            cls.str_local_id: Union[str, Any] = orjson.Fragment(b'"localId"')
+            cls.str_global_id: Union[str, Any] = orjson.Fragment(b'"globalId"')
+            cls.empty: Union[str, Any] = orjson.Fragment(b'""')
+            cls.lang_str: Union[str, Any] = orjson.Fragment(
+                b'"' + RDF.langString.encode("utf-8") + b'"'
+            )
+            cls.xsd_string: Union[str, Any] = 
orjson.Fragment( + b'"' + XSD.string.encode("utf-8") + b'"' + ) else: cls.str_local_id = "localId" cls.str_global_id = "globalId" cls.empty = "" - cls.newline = "\n" cls.lang_str = f"{RDF.langString}" cls.xsd_string = f"{XSD.string}" - dumps = json.dumps - self = super(cls, cls).__new__(cls) - self.dumps = dumps - return self + return super(cls, cls).__new__(cls) def __init__(self, store: Union[Graph, Dataset, ConjunctiveGraph]): self.default_context: Optional[Union[Graph, IdentifiedNode]] @@ -104,14 +102,17 @@ def serialize( for triple in context: # Generate context string just once, because it doesn't change # for every triple in this context - context_str = ( - self.empty - if self.graph_type is Graph - else ( - orjson.Fragment('"' + self._context_str(context) + '"') - if _HAS_ORJSON - else self._context_str(context) - ) + context_str = cast( + Union[str, bytes], + ( + self.empty + if self.graph_type is Graph + else ( + orjson.Fragment('"' + self._context_str(context) + '"') + if _HAS_ORJSON + else self._context_str(context) + ) + ), ) hl = self._hex_line(triple, context_str) if hl is not None: @@ -154,20 +155,20 @@ def _hex_line(self, triple, context_str: Union[bytes, str]): language = self.empty else: language = self.empty - - return ( - self.dumps( - [ - self._iri_or_bn(triple[0]), - triple[1], - value, - datatype, - language, - context_str, - ] - ) - + self.newline - ) + line_list = [ + self._iri_or_bn(triple[0]), + triple[1], + value, + datatype, + language, + context_str, + ] + outline: Union[str, bytes] + if _HAS_ORJSON: + outline = orjson.dumps(line_list, option=orjson.OPT_APPEND_NEWLINE) + else: + outline = json.dumps(line_list) + "\n" + return outline else: # do not return anything for non-IRIs or BNs, e.g. QuotedGraph, Subjects return None diff --git a/rdflib/plugins/serializers/jsonld.py b/rdflib/plugins/serializers/jsonld.py index ee3fe17bd..1bfb6cc02 100644 --- a/rdflib/plugins/serializers/jsonld.py +++ b/rdflib/plugins/serializers/jsonld.py @@ -47,7 +47,7 @@ from ..shared.jsonld.context import UNDEF, Context from ..shared.jsonld.keys import CONTEXT, GRAPH, ID, LANG, LIST, SET, VOCAB -from ..shared.jsonld.util import json +from ..shared.jsonld.util import _HAS_ORJSON, json, orjson __all__ = ["JsonLDSerializer", "from_rdf"] @@ -91,16 +91,25 @@ def serialize( use_rdf_type, auto_compact=auto_compact, ) - - data = json.dumps( - obj, - indent=indent, - separators=separators, - sort_keys=sort_keys, - ensure_ascii=ensure_ascii, - ) - - stream.write(data.encode(encoding, "replace")) + if _HAS_ORJSON: + option: int = orjson.OPT_NON_STR_KEYS + if indent is not None: + option |= orjson.OPT_INDENT_2 + if sort_keys: + option |= orjson.OPT_SORT_KEYS + if ensure_ascii: + warnings.warn("Cannot use ensure_ascii with orjson") + data_bytes = orjson.dumps(obj, option=option) + stream.write(data_bytes) + else: + data = json.dumps( + obj, + indent=indent, + separators=separators, + sort_keys=sort_keys, + ensure_ascii=ensure_ascii, + ) + stream.write(data.encode(encoding, "replace")) def from_rdf( diff --git a/rdflib/plugins/shared/jsonld/util.py b/rdflib/plugins/shared/jsonld/util.py index 2fc9719b6..83e80ecbd 100644 --- a/rdflib/plugins/shared/jsonld/util.py +++ b/rdflib/plugins/shared/jsonld/util.py @@ -3,9 +3,9 @@ import json import pathlib -from io import StringIO, TextIOWrapper +from io import StringIO, TextIOBase, TextIOWrapper from posixpath import normpath, sep -from typing import IO, TYPE_CHECKING, Any, Optional, TextIO, Tuple, Union +from typing import IO, TYPE_CHECKING, Any, 
Optional, TextIO, Tuple, Union, cast from urllib.parse import urljoin, urlsplit, urlunsplit try: @@ -13,7 +13,7 @@ _HAS_ORJSON = True except ImportError: - orjson = None + orjson = None # type: ignore[assignment] _HAS_ORJSON = False @@ -41,7 +41,7 @@ def source_to_json( b_stream = source.getByteStream() original_string: Optional[str] = None if isinstance(b_stream, BytesIOWrapper): - wrapped_inner = b_stream.wrapped + wrapped_inner = cast(Union[str, StringIO, TextIOBase], b_stream.wrapped) if isinstance(wrapped_inner, str): original_string = wrapped_inner elif isinstance(wrapped_inner, StringIO): @@ -193,4 +193,6 @@ def context_from_urlinputsource(source: URLInputSource) -> Optional[str]: # typ "split_iri", "norm_url", "context_from_urlinputsource", + "orjson", + "_HAS_ORJSON", ] diff --git a/rdflib/plugins/sparql/evaluate.py b/rdflib/plugins/sparql/evaluate.py index 123749026..60307a14c 100644 --- a/rdflib/plugins/sparql/evaluate.py +++ b/rdflib/plugins/sparql/evaluate.py @@ -18,7 +18,6 @@ import collections import itertools -import json as j import re from typing import ( TYPE_CHECKING, @@ -63,6 +62,15 @@ if TYPE_CHECKING: from rdflib.paths import Path +import json + +try: + import orjson + + _HAS_ORJSON = True +except ImportError: + orjson = None # type: ignore[assignment] + _HAS_ORJSON = False _Triple = Tuple[Identifier, Identifier, Identifier] @@ -365,10 +373,13 @@ def evalServiceQuery(ctx: QueryContext, part: CompValue): ) ) if response.status == 200: - json = j.loads(response.read()) - variables = res["vars_"] = json["head"]["vars"] + if _HAS_ORJSON: + json_dict = orjson.loads(response.read()) + else: + json_dict = json.loads(response.read()) + variables = res["vars_"] = json_dict["head"]["vars"] # or just return the bindings? - res = json["results"]["bindings"] + res = json_dict["results"]["bindings"] if len(res) > 0: for r in res: # type error: Argument 2 to "_yieldBindingsFromServiceCallResult" has incompatible type "str"; expected "Dict[str, Dict[str, str]]" diff --git a/rdflib/plugins/sparql/results/csvresults.py b/rdflib/plugins/sparql/results/csvresults.py index ef557c014..32b3e4212 100644 --- a/rdflib/plugins/sparql/results/csvresults.py +++ b/rdflib/plugins/sparql/results/csvresults.py @@ -11,7 +11,8 @@ import codecs import csv -from typing import IO, Dict, List, Optional, Union +from io import BufferedIOBase, TextIOBase +from typing import IO, Dict, List, Optional, Union, cast from rdflib.plugins.sparql.processor import SPARQLResult from rdflib.query import Result, ResultParser, ResultSerializer @@ -71,13 +72,19 @@ def __init__(self, result: SPARQLResult): def serialize(self, stream: IO, encoding: str = "utf-8", **kwargs) -> None: # the serialiser writes bytes in the given encoding # in py3 csv.writer is unicode aware and writes STRINGS, - # so we encode afterwards + # so we encode afterward import codecs - stream = codecs.getwriter(encoding)(stream) # type: ignore[assignment] + # TODO: Find a better solution for all this casting + writable_stream = cast(Union[TextIOBase, BufferedIOBase], stream) + if isinstance(writable_stream, TextIOBase): + string_stream: TextIOBase = writable_stream + else: + byte_stream = cast(BufferedIOBase, writable_stream) + string_stream = cast(TextIOBase, codecs.getwriter(encoding)(byte_stream)) - out = csv.writer(stream, delimiter=self.delim) + out = csv.writer(string_stream, delimiter=self.delim) vs = [self.serializeTerm(v, encoding) for v in self.result.vars] # type: ignore[union-attr] out.writerow(vs) diff --git 
a/rdflib/plugins/sparql/results/jsonresults.py b/rdflib/plugins/sparql/results/jsonresults.py index 405a3860b..8614daab7 100644 --- a/rdflib/plugins/sparql/results/jsonresults.py +++ b/rdflib/plugins/sparql/results/jsonresults.py @@ -17,14 +17,29 @@ from rdflib.query import Result, ResultException, ResultParser, ResultSerializer from rdflib.term import BNode, Identifier, Literal, URIRef, Variable +try: + import orjson + + _HAS_ORJSON = True +except ImportError: + orjson = None # type: ignore[assignment] + _HAS_ORJSON = False + class JSONResultParser(ResultParser): # type error: Signature of "parse" incompatible with supertype "ResultParser" def parse(self, source: IO, content_type: Optional[str] = None) -> Result: # type: ignore[override] inp = source.read() - if isinstance(inp, bytes): - inp = inp.decode("utf-8") - return JSONResult(json.loads(inp)) + if _HAS_ORJSON: + try: + loaded = orjson.loads(inp) + except Exception as e: + raise ResultException(f"Failed to parse result: {e}") + else: + if isinstance(inp, bytes): + inp = inp.decode("utf-8") + loaded = json.loads(inp) + return JSONResult(loaded) class JSONResultSerializer(ResultSerializer): @@ -45,12 +60,29 @@ def serialize(self, stream: IO, encoding: str = None) -> None: # type: ignore[o res["results"]["bindings"] = [ self._bindingToJSON(x) for x in self.result.bindings ] - - r = json.dumps(res, allow_nan=False, ensure_ascii=False) - if encoding is not None: - stream.write(r.encode(encoding)) + if _HAS_ORJSON: + try: + r_bytes = orjson.dumps(res, option=orjson.OPT_NON_STR_KEYS) + except Exception as e: + raise ResultException(f"Failed to serialize result: {e}") + if encoding is not None: + # Note, orjson will always write utf-8 even if + # encoding is specified as something else. + try: + stream.write(r_bytes) + except (TypeError, ValueError): + stream.write(r_bytes.decode("utf-8")) + else: + stream.write(r_bytes.decode("utf-8")) else: - stream.write(r) + r_str = json.dumps(res, allow_nan=False, ensure_ascii=False) + if encoding is not None: + try: + stream.write(r_str.encode(encoding)) + except (TypeError, ValueError): + stream.write(r_str) + else: + stream.write(r_str) def _bindingToJSON(self, b: Mapping[Variable, Identifier]) -> Dict[Variable, Any]: res = {} diff --git a/rdflib/plugins/sparql/results/txtresults.py b/rdflib/plugins/sparql/results/txtresults.py index 999daa60c..86d8933e3 100644 --- a/rdflib/plugins/sparql/results/txtresults.py +++ b/rdflib/plugins/sparql/results/txtresults.py @@ -1,5 +1,6 @@ from __future__ import annotations +from io import StringIO from typing import IO, List, Optional, Union from rdflib.namespace import NamespaceManager @@ -26,16 +27,16 @@ def _termString( class TXTResultSerializer(ResultSerializer): """ - A write only QueryResult serializer for text/ascii tables + A write-only QueryResult serializer for text/ascii tables """ - # TODO FIXME: class specific args should be keyword only. 
- # type error: Signature of "serialize" incompatible with supertype "ResultSerializer" - def serialize( # type: ignore[override] + def serialize( self, stream: IO, - encoding: str, + encoding: str = "utf-8", + *, namespace_manager: Optional[NamespaceManager] = None, + **kwargs, ) -> None: """ return a text table of query results @@ -53,10 +54,9 @@ def c(s, w): if self.result.type != "SELECT": raise Exception("Can only pretty print SELECT results!") - + string_stream = StringIO() if not self.result: - # type error: No return value expected - return "(no results)\n" # type: ignore[return-value] + string_stream.write("(no results)\n") else: keys: List[Variable] = self.result.vars # type: ignore[assignment] maxlen = [0] * len(keys) @@ -71,10 +71,16 @@ def c(s, w): for r in b: for i in range(len(keys)): maxlen[i] = max(maxlen[i], len(r[i])) - - stream.write("|".join([c(k, maxlen[i]) for i, k in enumerate(keys)]) + "\n") - stream.write("-" * (len(maxlen) + sum(maxlen)) + "\n") + string_stream.write( + "|".join([c(k, maxlen[i]) for i, k in enumerate(keys)]) + "\n" + ) + string_stream.write("-" * (len(maxlen) + sum(maxlen)) + "\n") for r in sorted(b): - stream.write( + string_stream.write( "|".join([t + " " * (i - len(t)) for i, t in zip(maxlen, r)]) + "\n" ) + text_val = string_stream.getvalue() + try: + stream.write(text_val.encode(encoding)) + except (TypeError, ValueError): + stream.write(text_val) diff --git a/rdflib/query.py b/rdflib/query.py index f42b37bb2..b3a0c43ce 100644 --- a/rdflib/query.py +++ b/rdflib/query.py @@ -316,7 +316,8 @@ def serialize( serializer = plugin.get(format, ResultSerializer)(self) if destination is None: streamb: BytesIO = BytesIO() - stream2 = EncodeOnlyUnicode(streamb) + stream2 = EncodeOnlyUnicode(streamb) # TODO: Remove the need for this + # TODO: All QueryResult serializers should write to a Bytes Stream. 
             # type error: Argument 1 to "serialize" of "ResultSerializer" has incompatible type "EncodeOnlyUnicode"; expected "IO[Any]"
             serializer.serialize(stream2, encoding=encoding, **args)  # type: ignore[arg-type]
             return streamb.getvalue()
diff --git a/test/test_sparql/test_result.py b/test/test_sparql/test_result.py
index ddf9a781c..d228bba56 100644
--- a/test/test_sparql/test_result.py
+++ b/test/test_sparql/test_result.py
@@ -182,17 +182,6 @@ def narrow_dest_param(param: DestParmType) -> ResultDestParamType:
 
 def make_select_result_serialize_parse_tests() -> Iterator[ParameterSet]:
     xfails: Dict[Tuple[str, DestinationType, str], Union[MarkDecorator, Mark]] = {
-        ("csv", DestinationType.TEXT_IO, "utf-8"): pytest.mark.xfail(raises=TypeError),
-        ("csv", DestinationType.TEXT_IO, "utf-16"): pytest.mark.xfail(raises=TypeError),
-        ("json", DestinationType.TEXT_IO, "utf-8"): pytest.mark.xfail(raises=TypeError),
-        ("json", DestinationType.TEXT_IO, "utf-16"): pytest.mark.xfail(
-            raises=TypeError
-        ),
-        ("txt", DestinationType.BINARY_IO, "utf-8"): pytest.mark.xfail(
-            raises=TypeError
-        ),
-        ("txt", DestinationType.STR_PATH, "utf-8"): pytest.mark.xfail(raises=TypeError),
-        ("txt", DestinationType.FILE_URI, "utf-8"): pytest.mark.xfail(raises=TypeError),
     }
     format_infos = [
         format_info
diff --git a/test/utils/result.py b/test/utils/result.py
index 6fb8b6e80..8765a8255 100644
--- a/test/utils/result.py
+++ b/test/utils/result.py
@@ -11,6 +11,12 @@
 
 logger = logging.getLogger(__name__)
 
+try:
+    import orjson
+    _HAS_ORJSON = True
+except ImportError:
+    orjson = None  # type: ignore[assignment]
+    _HAS_ORJSON = False
 
 ResultTypeInfoDict = Dict["ResultType", "ResultTypeInfo"]
 
@@ -222,7 +228,7 @@ def info_dict(cls) -> ResultFormatInfoDict:
                     ResultFormatTrait.HAS_SERIALIZER,
                 }
             ),
-            frozenset({"utf-8", "utf-16"}),
+            frozenset({"utf-8"} if _HAS_ORJSON else {"utf-8", "utf-16"}),
         ),
         ResultFormatInfo(
             ResultFormat.XML,

From cb71e06b541be9810d6fd06472492f8a7d0a20bf Mon Sep 17 00:00:00 2001
From: Ashley Sommer
Date: Mon, 29 Jul 2024 13:03:11 +1000
Subject: [PATCH 03/10] If it's known, get the FileInputSource encoding from
 the source itself for the JSON-LD Parser.

---
 rdflib/plugins/shared/jsonld/util.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/rdflib/plugins/shared/jsonld/util.py b/rdflib/plugins/shared/jsonld/util.py
index 2d6815a5f..6d4381526 100644
--- a/rdflib/plugins/shared/jsonld/util.py
+++ b/rdflib/plugins/shared/jsonld/util.py
@@ -123,7 +123,10 @@ def source_to_json(
         raise ValueError(
             f"Source does not have a character stream or a byte stream and cannot be used {type(source)}"
         )
-    underlying_string: Optional[str] = None
+    try:
+        b_encoding: Optional[str] = None if b_stream is None else source.getEncoding()
+    except (AttributeError, LookupError):
+        b_encoding = None underlying_string: Optional[str] = None
     if b_stream is not None and isinstance(b_stream, BytesIOWrapper):
         # Try to find an underlying wrapped Unicode string to use?
         wrapped_inner = b_stream.wrapped
@@ -141,7 +144,9 @@
         else:
             if TYPE_CHECKING:
                 assert b_stream is not None
-            html_string = TextIOWrapper(b_stream, encoding="utf-8").read()
+            if b_encoding is None:
+                b_encoding = "utf-8"
+            html_string = TextIOWrapper(b_stream, encoding=b_encoding).read()
             html_docparser.feed(html_string)
             json_dict, html_base = html_docparser.get_json(), html_docparser.get_base()
         elif _HAS_ORJSON:
@@ -169,7 +174,9 @@
             if TYPE_CHECKING:
                 assert b_stream is not None
             # b_stream is not None
-            use_stream = TextIOWrapper(b_stream, encoding="utf-8")
+            if b_encoding is None:
+                b_encoding = "utf-8"
+            use_stream = TextIOWrapper(b_stream, encoding=b_encoding)
             json_dict = json.load(use_stream)
         return json_dict, html_base
     finally:

From 5d002bdb4cb4785d3ae8d06f8c1527ca9316d6a2 Mon Sep 17 00:00:00 2001
From: Ashley Sommer
Date: Mon, 29 Jul 2024 13:04:03 +1000
Subject: [PATCH 04/10] Fix a MyPy complaint in the JSON-LD parser caused by
 reusing a variable name after it already has an associated type.

---
 rdflib/plugins/shared/jsonld/context.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rdflib/plugins/shared/jsonld/context.py b/rdflib/plugins/shared/jsonld/context.py
index f80cdf376..1eec94419 100644
--- a/rdflib/plugins/shared/jsonld/context.py
+++ b/rdflib/plugins/shared/jsonld/context.py
@@ -481,14 +481,14 @@ def _fetch_context(
             return self._context_cache[source_url]
 
         # type error: Incompatible types in assignment (expression has type "Optional[Any]", variable has type "str")
-        source, _ = source_to_json(source_url)
-        if source and CONTEXT not in source:
+        source_json, _ = source_to_json(source_url)
+        if source_json and CONTEXT not in source_json:
             raise INVALID_REMOTE_CONTEXT
 
         # type error: Invalid index type "Optional[str]" for "Dict[str, Any]"; expected type "str"
-        self._context_cache[source_url] = source  # type: ignore[index]
+        self._context_cache[source_url] = source_json  # type: ignore[index]
 
-        return source
+        return source_json
 
     def _read_source(
         self,

From 8965a4509316debba30196926eecc2b0b8c58305 Mon Sep 17 00:00:00 2001
From: Ashley Sommer
Date: Mon, 29 Jul 2024 13:04:55 +1000
Subject: [PATCH 05/10] Add orjson support to the newly merged HTMLJSONParser.

---
 rdflib/plugins/shared/jsonld/util.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/rdflib/plugins/shared/jsonld/util.py b/rdflib/plugins/shared/jsonld/util.py
index 6d4381526..1138a02c7 100644
--- a/rdflib/plugins/shared/jsonld/util.py
+++ b/rdflib/plugins/shared/jsonld/util.py
@@ -314,7 +314,13 @@ def handle_data(self, data):
             return
 
         # Try to parse the json
-        parsed = json.loads(data)
+        if _HAS_ORJSON:
+            # orjson can load a unicode string
+            # if that's the only thing we have,
+            # it's not worth encoding it to bytes
+            parsed = orjson.loads(data)
+        else:
+            parsed = json.loads(data)
 
         # Add to the result document
         if isinstance(parsed, list):

From 80dd5690c14c8b1c2dae7f53cbb818b0869c19a1 Mon Sep 17 00:00:00 2001
From: Ashley Sommer
Date: Mon, 29 Jul 2024 13:05:45 +1000
Subject: [PATCH 06/10] MyPy typing fixes, Black fixes, and Ruff linting fixes
 for the orjson branch after resolving recent merge conflicts.
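
For context, the pattern these typing fixes revolve around is the shared
orjson import guard used across the branch. A minimal sketch of its shape
(the load_json helper here is illustrative only, not code from this branch):

    import json

    try:
        import orjson

        _HAS_ORJSON = True
    except ImportError:
        orjson = None  # type: ignore[assignment]
        _HAS_ORJSON = False

    def load_json(data):
        # orjson.loads accepts str or bytes directly; fall back to the
        # stdlib json module when orjson is not installed.
        if _HAS_ORJSON:
            return orjson.loads(data)
        return json.loads(data)

Because orjson is typed as a module, mypy flags the `orjson = None` fallback,
hence the ignore[assignment] suppressions seen throughout these diffs.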
--- rdflib/plugins/shared/jsonld/util.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/rdflib/plugins/shared/jsonld/util.py b/rdflib/plugins/shared/jsonld/util.py index 1138a02c7..ba020afab 100644 --- a/rdflib/plugins/shared/jsonld/util.py +++ b/rdflib/plugins/shared/jsonld/util.py @@ -5,7 +5,7 @@ import pathlib from html.parser import HTMLParser from io import StringIO, TextIOBase, TextIOWrapper -from typing import IO, TYPE_CHECKING, Any, List, Optional, TextIO, Tuple, Union +from typing import IO, TYPE_CHECKING, Any, Dict, List, Optional, TextIO, Tuple, Union if TYPE_CHECKING: import json @@ -18,7 +18,7 @@ import simplejson as json from posixpath import normpath, sep -from typing import IO, TYPE_CHECKING, Any, Optional, TextIO, Tuple, Union, cast +from typing import TYPE_CHECKING, cast from urllib.parse import urljoin, urlsplit, urlunsplit try: @@ -46,7 +46,7 @@ def source_to_json( ], fragment_id: Optional[str] = None, extract_all_scripts: Optional[bool] = False, -) -> Tuple[Optional[Any], Any]: +) -> Tuple[Union[Dict, List[Dict]], Any]: """Extract JSON from a source document. The source document can be JSON or HTML with embedded JSON script elements (type attribute = "application/ld+json"). @@ -71,13 +71,13 @@ def source_to_json( # It's hidden in the BytesIOWrapper 'wrapped' attribute b_stream = source.getByteStream() original_string: Optional[str] = None + json_dict: Union[Dict, List[Dict]] if isinstance(b_stream, BytesIOWrapper): wrapped_inner = cast(Union[str, StringIO, TextIOBase], b_stream.wrapped) if isinstance(wrapped_inner, str): original_string = wrapped_inner elif isinstance(wrapped_inner, StringIO): original_string = wrapped_inner.getvalue() - if _HAS_ORJSON: if original_string is not None: json_dict = orjson.loads(original_string) @@ -103,8 +103,10 @@ def source_to_json( except (AttributeError, LookupError): content_type = None - is_html = content_type is not None and \ - content_type.lower() in ("text/html", "application/xhtml+xml") + is_html = content_type is not None and content_type.lower() in ( + "text/html", + "application/xhtml+xml", + ) if is_html: html_docparser: Optional[HTMLJSONParser] = HTMLJSONParser( fragment_id=fragment_id, extract_all_scripts=extract_all_scripts @@ -126,7 +128,8 @@ def source_to_json( try: b_encoding: Optional[str] = None if b_stream is None else source.getEncoding() except (AttributeError, LookupError): - b_encoding = None underlying_string: Optional[str] = None + b_encoding = None + underlying_string: Optional[str] = None if b_stream is not None and isinstance(b_stream, BytesIOWrapper): # Try to find an underlying wrapped Unicode string to use? 
wrapped_inner = b_stream.wrapped @@ -152,18 +155,18 @@ def source_to_json( elif _HAS_ORJSON: html_base = None if underlying_string is not None: - json_dict = orjson.loads(underlying_string) + json_dict = orjson.loads(underlying_string) elif ( (b_stream is not None and isinstance(b_stream, BytesIOWrapper)) or b_stream is None ) and c_stream is not None: # use the CharacterStream instead - json_dict = orjson.loads(c_stream.read()) + json_dict = orjson.loads(c_stream.read()) else: if TYPE_CHECKING: assert b_stream is not None # b_stream is not None - json_dict = orjson.loads(b_stream.read()) + json_dict = orjson.loads(b_stream.read()) else: html_base = None if underlying_string is not None: @@ -275,7 +278,7 @@ def __init__( ): super().__init__() self.fragment_id = fragment_id - self.json: List[Any] = [] + self.json: List[Dict] = [] self.contains_json = False self.fragment_id_does_not_match = False self.base = None @@ -330,7 +333,7 @@ def handle_data(self, data): self.script_count += 1 - def get_json(self): + def get_json(self) -> List[Dict]: return self.json def get_base(self): From 90815d09c422862bd19214323038a886c5527140 Mon Sep 17 00:00:00 2001 From: Ashley Sommer Date: Mon, 29 Jul 2024 13:43:22 +1000 Subject: [PATCH 07/10] Add missing updated poetry.lock lockfile that is supposed to accompany the `orjson` pyproject addition. --- poetry.lock | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 27c05a77e..8a287f016 100644 --- a/poetry.lock +++ b/poetry.lock @@ -794,6 +794,66 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx- extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] +[[package]] +name = "orjson" +version = "3.10.6" +description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" +optional = true +python-versions = ">=3.8" +files = [ + {file = "orjson-3.10.6-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:fb0ee33124db6eaa517d00890fc1a55c3bfe1cf78ba4a8899d71a06f2d6ff5c7"}, + {file = "orjson-3.10.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c1c4b53b24a4c06547ce43e5fee6ec4e0d8fe2d597f4647fc033fd205707365"}, + {file = "orjson-3.10.6-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eadc8fd310edb4bdbd333374f2c8fec6794bbbae99b592f448d8214a5e4050c0"}, + {file = "orjson-3.10.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:61272a5aec2b2661f4fa2b37c907ce9701e821b2c1285d5c3ab0207ebd358d38"}, + {file = "orjson-3.10.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57985ee7e91d6214c837936dc1608f40f330a6b88bb13f5a57ce5257807da143"}, + {file = "orjson-3.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:633a3b31d9d7c9f02d49c4ab4d0a86065c4a6f6adc297d63d272e043472acab5"}, + {file = "orjson-3.10.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:1c680b269d33ec444afe2bdc647c9eb73166fa47a16d9a75ee56a374f4a45f43"}, + {file = "orjson-3.10.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f759503a97a6ace19e55461395ab0d618b5a117e8d0fbb20e70cfd68a47327f2"}, + {file = "orjson-3.10.6-cp310-none-win32.whl", hash = "sha256:95a0cce17f969fb5391762e5719575217bd10ac5a189d1979442ee54456393f3"}, + {file = "orjson-3.10.6-cp310-none-win_amd64.whl", hash = 
"sha256:df25d9271270ba2133cc88ee83c318372bdc0f2cd6f32e7a450809a111efc45c"}, + {file = "orjson-3.10.6-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:b1ec490e10d2a77c345def52599311849fc063ae0e67cf4f84528073152bb2ba"}, + {file = "orjson-3.10.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55d43d3feb8f19d07e9f01e5b9be4f28801cf7c60d0fa0d279951b18fae1932b"}, + {file = "orjson-3.10.6-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac3045267e98fe749408eee1593a142e02357c5c99be0802185ef2170086a863"}, + {file = "orjson-3.10.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c27bc6a28ae95923350ab382c57113abd38f3928af3c80be6f2ba7eb8d8db0b0"}, + {file = "orjson-3.10.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d27456491ca79532d11e507cadca37fb8c9324a3976294f68fb1eff2dc6ced5a"}, + {file = "orjson-3.10.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05ac3d3916023745aa3b3b388e91b9166be1ca02b7c7e41045da6d12985685f0"}, + {file = "orjson-3.10.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1335d4ef59ab85cab66fe73fd7a4e881c298ee7f63ede918b7faa1b27cbe5212"}, + {file = "orjson-3.10.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4bbc6d0af24c1575edc79994c20e1b29e6fb3c6a570371306db0993ecf144dc5"}, + {file = "orjson-3.10.6-cp311-none-win32.whl", hash = "sha256:450e39ab1f7694465060a0550b3f6d328d20297bf2e06aa947b97c21e5241fbd"}, + {file = "orjson-3.10.6-cp311-none-win_amd64.whl", hash = "sha256:227df19441372610b20e05bdb906e1742ec2ad7a66ac8350dcfd29a63014a83b"}, + {file = "orjson-3.10.6-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:ea2977b21f8d5d9b758bb3f344a75e55ca78e3ff85595d248eee813ae23ecdfb"}, + {file = "orjson-3.10.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b6f3d167d13a16ed263b52dbfedff52c962bfd3d270b46b7518365bcc2121eed"}, + {file = "orjson-3.10.6-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f710f346e4c44a4e8bdf23daa974faede58f83334289df80bc9cd12fe82573c7"}, + {file = "orjson-3.10.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7275664f84e027dcb1ad5200b8b18373e9c669b2a9ec33d410c40f5ccf4b257e"}, + {file = "orjson-3.10.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0943e4c701196b23c240b3d10ed8ecd674f03089198cf503105b474a4f77f21f"}, + {file = "orjson-3.10.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:446dee5a491b5bc7d8f825d80d9637e7af43f86a331207b9c9610e2f93fee22a"}, + {file = "orjson-3.10.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:64c81456d2a050d380786413786b057983892db105516639cb5d3ee3c7fd5148"}, + {file = "orjson-3.10.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:960db0e31c4e52fa0fc3ecbaea5b2d3b58f379e32a95ae6b0ebeaa25b93dfd34"}, + {file = "orjson-3.10.6-cp312-none-win32.whl", hash = "sha256:a6ea7afb5b30b2317e0bee03c8d34c8181bc5a36f2afd4d0952f378972c4efd5"}, + {file = "orjson-3.10.6-cp312-none-win_amd64.whl", hash = "sha256:874ce88264b7e655dde4aeaacdc8fd772a7962faadfb41abe63e2a4861abc3dc"}, + {file = "orjson-3.10.6-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:66680eae4c4e7fc193d91cfc1353ad6d01b4801ae9b5314f17e11ba55e934183"}, + {file = "orjson-3.10.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:caff75b425db5ef8e8f23af93c80f072f97b4fb3afd4af44482905c9f588da28"}, + {file = "orjson-3.10.6-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3722fddb821b6036fd2a3c814f6bd9b57a89dc6337b9924ecd614ebce3271394"}, + {file = "orjson-3.10.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c2c116072a8533f2fec435fde4d134610f806bdac20188c7bd2081f3e9e0133f"}, + {file = "orjson-3.10.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6eeb13218c8cf34c61912e9df2de2853f1d009de0e46ea09ccdf3d757896af0a"}, + {file = "orjson-3.10.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:965a916373382674e323c957d560b953d81d7a8603fbeee26f7b8248638bd48b"}, + {file = "orjson-3.10.6-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:03c95484d53ed8e479cade8628c9cea00fd9d67f5554764a1110e0d5aa2de96e"}, + {file = "orjson-3.10.6-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:e060748a04cccf1e0a6f2358dffea9c080b849a4a68c28b1b907f272b5127e9b"}, + {file = "orjson-3.10.6-cp38-none-win32.whl", hash = "sha256:738dbe3ef909c4b019d69afc19caf6b5ed0e2f1c786b5d6215fbb7539246e4c6"}, + {file = "orjson-3.10.6-cp38-none-win_amd64.whl", hash = "sha256:d40f839dddf6a7d77114fe6b8a70218556408c71d4d6e29413bb5f150a692ff7"}, + {file = "orjson-3.10.6-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:697a35a083c4f834807a6232b3e62c8b280f7a44ad0b759fd4dce748951e70db"}, + {file = "orjson-3.10.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd502f96bf5ea9a61cbc0b2b5900d0dd68aa0da197179042bdd2be67e51a1e4b"}, + {file = "orjson-3.10.6-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f215789fb1667cdc874c1b8af6a84dc939fd802bf293a8334fce185c79cd359b"}, + {file = "orjson-3.10.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a2debd8ddce948a8c0938c8c93ade191d2f4ba4649a54302a7da905a81f00b56"}, + {file = "orjson-3.10.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5410111d7b6681d4b0d65e0f58a13be588d01b473822483f77f513c7f93bd3b2"}, + {file = "orjson-3.10.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bb1f28a137337fdc18384079fa5726810681055b32b92253fa15ae5656e1dddb"}, + {file = "orjson-3.10.6-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:bf2fbbce5fe7cd1aa177ea3eab2b8e6a6bc6e8592e4279ed3db2d62e57c0e1b2"}, + {file = "orjson-3.10.6-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:79b9b9e33bd4c517445a62b90ca0cc279b0f1f3970655c3df9e608bc3f91741a"}, + {file = "orjson-3.10.6-cp39-none-win32.whl", hash = "sha256:30b0a09a2014e621b1adf66a4f705f0809358350a757508ee80209b2d8dae219"}, + {file = "orjson-3.10.6-cp39-none-win_amd64.whl", hash = "sha256:49e3bc615652617d463069f91b867a4458114c5b104e13b7ae6872e5f79d0844"}, + {file = "orjson-3.10.6.tar.gz", hash = "sha256:e54b63d0a7c6c54a5f5f726bc93a2078111ef060fec4ecbf34c5db800ca3b3a7"}, +] + [[package]] name = "packaging" version = "23.1" @@ -1335,8 +1395,9 @@ berkeleydb = ["berkeleydb"] html = ["html5lib"] lxml = ["lxml"] networkx = ["networkx"] +orjson = ["orjson"] [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "7514432973368065fa5482d533d74c83b54642cd6f4e8598c0ec7af28bf2ced9" +content-hash = "1b0767f29cb55fb16f955194eb4e69f01a781c3658e04006e0dd88100c4e062b" From e632b56c39fe1773800509a69271556fea8e20ad Mon Sep 17 00:00:00 2001 From: Ashley Sommer Date: Mon, 29 Jul 2024 15:50:31 +1000 Subject: [PATCH 08/10] 
The linter ignore-assignment-type suppressions aren't used when orjson is not
 installed, which itself triggers a mypy unused-ignore error, but they are
 needed when orjson is installed to prevent different mypy errors. So add
 additional unused-ignore suppressions.

---
 rdflib/plugins/parsers/hext.py               | 2 +-
 rdflib/plugins/serializers/hext.py           | 2 +-
 rdflib/plugins/shared/jsonld/util.py         | 2 +-
 rdflib/plugins/sparql/evaluate.py            | 2 +-
 rdflib/plugins/sparql/results/jsonresults.py | 2 +-
 test/utils/result.py                         | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/rdflib/plugins/parsers/hext.py b/rdflib/plugins/parsers/hext.py
index 9039a019e..d47dcdcd7 100644
--- a/rdflib/plugins/parsers/hext.py
+++ b/rdflib/plugins/parsers/hext.py
@@ -20,7 +20,7 @@
 
     _HAS_ORJSON = True
 except ImportError:
-    orjson = None  # type: ignore[assignment]
+    orjson = None  # type: ignore[assignment, unused-ignore]
     _HAS_ORJSON = False
 
 if TYPE_CHECKING:
diff --git a/rdflib/plugins/serializers/hext.py b/rdflib/plugins/serializers/hext.py
index d0e31a6f0..9a8187c76 100644
--- a/rdflib/plugins/serializers/hext.py
+++ b/rdflib/plugins/serializers/hext.py
@@ -19,7 +19,7 @@
 
     _HAS_ORJSON = True
 except ImportError:
-    orjson = None  # type: ignore[assignment]
+    orjson = None  # type: ignore[assignment, unused-ignore]
     _HAS_ORJSON = False
 
 __all__ = ["HextuplesSerializer"]
diff --git a/rdflib/plugins/shared/jsonld/util.py b/rdflib/plugins/shared/jsonld/util.py
index ba020afab..71057c90a 100644
--- a/rdflib/plugins/shared/jsonld/util.py
+++ b/rdflib/plugins/shared/jsonld/util.py
@@ -26,7 +26,7 @@
 
     _HAS_ORJSON = True
 except ImportError:
-    orjson = None  # type: ignore[assignment]
+    orjson = None  # type: ignore[assignment, unused-ignore]
     _HAS_ORJSON = False
 
 
diff --git a/rdflib/plugins/sparql/evaluate.py b/rdflib/plugins/sparql/evaluate.py
index 60307a14c..0c487a4a6 100644
--- a/rdflib/plugins/sparql/evaluate.py
+++ b/rdflib/plugins/sparql/evaluate.py
@@ -69,7 +69,7 @@
 
     _HAS_ORJSON = True
 except ImportError:
-    orjson = None  # type: ignore[assignment]
+    orjson = None  # type: ignore[assignment, unused-ignore]
     _HAS_ORJSON = False
 
 _Triple = Tuple[Identifier, Identifier, Identifier]
diff --git a/rdflib/plugins/sparql/results/jsonresults.py b/rdflib/plugins/sparql/results/jsonresults.py
index 8614daab7..cfc2dc1e1 100644
--- a/rdflib/plugins/sparql/results/jsonresults.py
+++ b/rdflib/plugins/sparql/results/jsonresults.py
@@ -22,7 +22,7 @@
 
     _HAS_ORJSON = True
 except ImportError:
-    orjson = None  # type: ignore[assignment]
+    orjson = None  # type: ignore[assignment, unused-ignore]
     _HAS_ORJSON = False
 
 
diff --git a/test/utils/result.py b/test/utils/result.py
index 8765a8255..9e2fb378c 100644
--- a/test/utils/result.py
+++ b/test/utils/result.py
@@ -15,7 +15,7 @@
     import orjson
     _HAS_ORJSON = True
 except ImportError:
-    orjson = None  # type: ignore[assignment]
+    orjson = None  # type: ignore[assignment, unused-ignore]
     _HAS_ORJSON = False
 
 ResultTypeInfoDict = Dict["ResultType", "ResultTypeInfo"]

From cccc94d51b65b9ac30dcabbbe6751edfbfa6ce6c Mon Sep 17 00:00:00 2001
From: Ashley Sommer
Date: Mon, 29 Jul 2024 16:30:14 +1000
Subject: [PATCH 09/10] Change internal usages of the json serializer indent
 to 2, so that orjson and stdlib json outputs are the same for docs and
 comparison purposes.
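
Background: orjson only supports two-space indentation (orjson.OPT_INDENT_2),
and the JSON-LD serializer maps any non-None indent to that option, so
indent=2 is the one setting where both libraries can emit identical text.
A minimal illustration (assumes orjson is installed; the sample object is
arbitrary):

    import json

    import orjson

    obj = {"@id": "http://example.org/about"}
    pretty = orjson.dumps(obj, option=orjson.OPT_INDENT_2).decode("utf-8")
    # stdlib json with indent=2 uses the same two-space layout and ": "
    # key separator, so the two outputs match for ASCII-only data.
    assert pretty == json.dumps(obj, indent=2)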
--- rdflib/plugins/serializers/jsonld.py | 20 ++++++++++---------- test/jsonld/runner.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/rdflib/plugins/serializers/jsonld.py b/rdflib/plugins/serializers/jsonld.py index 1bfb6cc02..1cbd1899c 100644 --- a/rdflib/plugins/serializers/jsonld.py +++ b/rdflib/plugins/serializers/jsonld.py @@ -14,17 +14,17 @@ >>> g = Graph().parse(data=testrdf, format='n3') - >>> print(g.serialize(format='json-ld', indent=4)) + >>> print(g.serialize(format='json-ld', indent=2)) [ - { - "@id": "http://example.org/about", - "http://purl.org/dc/terms/title": [ - { - "@language": "en", - "@value": "Someone's Homepage" - } - ] - } + { + "@id": "http://example.org/about", + "http://purl.org/dc/terms/title": [ + { + "@language": "en", + "@value": "Someone's Homepage" + } + ] + } ] """ diff --git a/test/jsonld/runner.py b/test/jsonld/runner.py index a8237fd95..9c648d57c 100644 --- a/test/jsonld/runner.py +++ b/test/jsonld/runner.py @@ -258,7 +258,7 @@ def _ord_key(x): def _dump_json(obj): return json.dumps( - obj, indent=4, separators=(",", ": "), sort_keys=True, check_circular=True + obj, indent=2, separators=(",", ": "), sort_keys=True, check_circular=True ) From 14e4f956d896590c79d3db5ff7ee4fdd1627879c Mon Sep 17 00:00:00 2001 From: Ashley Sommer Date: Mon, 29 Jul 2024 16:46:11 +1000 Subject: [PATCH 10/10] Apply Black formatting to the test/ dir too. --- test/test_sparql/test_result.py | 3 +-- test/utils/result.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_sparql/test_result.py b/test/test_sparql/test_result.py index d228bba56..9f7defc0c 100644 --- a/test/test_sparql/test_result.py +++ b/test/test_sparql/test_result.py @@ -181,8 +181,7 @@ def narrow_dest_param(param: DestParmType) -> ResultDestParamType: def make_select_result_serialize_parse_tests() -> Iterator[ParameterSet]: - xfails: Dict[Tuple[str, DestinationType, str], Union[MarkDecorator, Mark]] = { - } + xfails: Dict[Tuple[str, DestinationType, str], Union[MarkDecorator, Mark]] = {} format_infos = [ format_info for format_info in ResultFormat.info_set() diff --git a/test/utils/result.py b/test/utils/result.py index 9e2fb378c..9d47c7c83 100644 --- a/test/utils/result.py +++ b/test/utils/result.py @@ -13,6 +13,7 @@ try: import orjson + _HAS_ORJSON = True except ImportError: orjson = None # type: ignore[assignment, unused-ignore]
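
A quick end-to-end smoke test for the whole series (a sketch; _HAS_ORJSON is
an internal flag added by these patches, imported here only for ad-hoc
verification, and the example IRIs are arbitrary):

    # Requires: pip install rdflib[orjson]
    from rdflib import Dataset
    from rdflib.plugins.serializers.hext import _HAS_ORJSON

    print("orjson fast path active:", _HAS_ORJSON)

    d = Dataset()
    # One hextuple: subject, predicate, object, datatype, language, graph.
    d.parse(
        data='["http://example.com/s01", "http://example.com/a", '
        '"http://example.com/Type1", "globalId", "", ""]',
        format="hext",
    )
    print(d.serialize(format="hext"))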