Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add skolemization support for ntriples, nquads, hextuples and json-ld at parse time #2816

Merged
merged 13 commits into from
Jul 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions rdflib/plugins/parsers/hext.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class HextuplesParser(Parser):
"""

def __init__(self):
pass
self.skolemize = False

def _load_json_line(self, line: str) -> List[Optional[Any]]:
# this complex handling is because the 'value' component is
Expand All @@ -52,6 +52,8 @@ def _parse_hextuple(
s: Union[URIRef, BNode]
if tup[0].startswith("_"):
s = BNode(value=tup[0].replace("_:", ""))
if self.skolemize:
s = s.skolemize()
else:
s = URIRef(tup[0])

Expand All @@ -64,6 +66,8 @@ def _parse_hextuple(
o = URIRef(tup[2])
elif tup[3] == "localId":
o = BNode(value=tup[2].replace("_:", ""))
if self.skolemize:
o = o.skolemize()
else: # literal
if tup[4] is None:
o = Literal(tup[2], datatype=URIRef(tup[3]))
Expand All @@ -77,13 +81,16 @@ def _parse_hextuple(
if tup[5].startswith("_:")
else URIRef(tup[5])
)
if isinstance(c, BNode) and self.skolemize:
c = c.skolemize()

# type error: Argument 1 to "add" of "ConjunctiveGraph" has incompatible type "Tuple[Union[URIRef, BNode], URIRef, Union[URIRef, BNode, Literal], URIRef]"; expected "Union[Tuple[Node, Node, Node], Tuple[Node, Node, Node, Optional[Graph]]]"
cg.add((s, p, o, c)) # type: ignore[arg-type]
else:
cg.add((s, p, o))

# type error: Signature of "parse" incompatible with supertype "Parser"
def parse(self, source: InputSource, graph: Graph, **kwargs: Any) -> None: # type: ignore[override]
def parse(self, source: InputSource, graph: Graph, skolemize: bool = False, **kwargs: Any) -> None: # type: ignore[override]
if kwargs.get("encoding") not in [None, "utf-8"]:
warnings.warn(
f"Hextuples files are always utf-8 encoded, "
Expand All @@ -95,6 +102,7 @@ def parse(self, source: InputSource, graph: Graph, **kwargs: Any) -> None: # ty
graph.store.context_aware
), "Hextuples Parser needs a context-aware store!"

self.skolemize = skolemize
cg = ConjunctiveGraph(store=graph.store, identifier=graph.identifier)
cg.default_context = graph

Expand Down
40 changes: 35 additions & 5 deletions rdflib/plugins/parsers/jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def parse(
source: InputSource,
sink: Graph,
version: float = 1.1,
skolemize: bool = False,
encoding: Optional[str] = "utf-8",
base: Optional[str] = None,
context: Optional[
Expand All @@ -95,6 +96,7 @@ def parse(
] = None,
generalized_rdf: Optional[bool] = False,
extract_all_scripts: Optional[bool] = False,
**kwargs: Any,
) -> None:
"""Parse JSON-LD from a source document.

Expand Down Expand Up @@ -164,7 +166,15 @@ def parse(
else:
conj_sink = sink

to_rdf(data, conj_sink, base, context_data, version, bool(generalized_rdf))
to_rdf(
data,
conj_sink,
base,
context_data,
version,
bool(generalized_rdf),
skolemize=skolemize,
)


def to_rdf(
Expand All @@ -181,21 +191,28 @@ def to_rdf(
version: Optional[float] = None,
generalized_rdf: bool = False,
allow_lists_of_lists: Optional[bool] = None,
skolemize: bool = False,
):
# TODO: docstring w. args and return value
context = Context(base=base, version=version)
if context_data:
context.load(context_data)
parser = Parser(
generalized_rdf=generalized_rdf, allow_lists_of_lists=allow_lists_of_lists
generalized_rdf=generalized_rdf,
allow_lists_of_lists=allow_lists_of_lists,
skolemize=skolemize,
)
return parser.parse(data, context, dataset)


class Parser:
def __init__(
self, generalized_rdf: bool = False, allow_lists_of_lists: Optional[bool] = None
self,
generalized_rdf: bool = False,
allow_lists_of_lists: Optional[bool] = None,
skolemize: bool = False,
):
self.skolemize = skolemize
self.generalized_rdf = generalized_rdf
self.allow_lists_of_lists = (
allow_lists_of_lists
Expand Down Expand Up @@ -265,6 +282,8 @@ def _add_to_graph(
subj = self._to_rdf_id(context, id_val)
else:
subj = BNode()
if self.skolemize:
subj = subj.skolemize()

if subj is None:
return None
Expand Down Expand Up @@ -415,6 +434,8 @@ def flatten(n: Iterable[Any]) -> List[Any]:
if not self.generalized_rdf:
return
pred = BNode(bid)
if self.skolemize:
pred = pred.skolemize()
else:
pred = URIRef(pred_uri)

Expand Down Expand Up @@ -598,7 +619,10 @@ def _to_object(
def _to_rdf_id(self, context: Context, id_val: str) -> Optional[IdentifiedNode]:
bid = self._get_bnodeid(id_val)
if bid:
return BNode(bid)
b = BNode(bid)
if self.skolemize:
return b.skolemize()
return b
else:
uri = context.resolve(id_val)
if not self.generalized_rdf and ":" not in uri:
Expand All @@ -623,7 +647,11 @@ def _add_list(
if not isinstance(node_list, list):
node_list = [node_list]

first_subj = BNode()
first_subj: Union[URIRef, BNode] = BNode()
if self.skolemize and isinstance(first_subj, BNode):
first_subj = first_subj.skolemize()

rest: Union[URIRef, BNode, None]
subj, rest = first_subj, None

for node in node_list:
Expand All @@ -642,6 +670,8 @@ def _add_list(

graph.add((subj, RDF.first, obj))
rest = BNode()
if self.skolemize and isinstance(rest, BNode):
rest = rest.skolemize()

if rest:
graph.add((subj, RDF.rest, RDF.nil))
Expand Down
2 changes: 2 additions & 0 deletions rdflib/plugins/parsers/nquads.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def parse( # type: ignore[override]
inputsource: InputSource,
sink: ConjunctiveGraph,
bnode_context: Optional[_BNodeContextType] = None,
skolemize: bool = False,
**kwargs: Any,
) -> ConjunctiveGraph:
"""
Expand All @@ -68,6 +69,7 @@ def parse( # type: ignore[override]
self.sink: ConjunctiveGraph = ConjunctiveGraph( # type: ignore[assignment]
store=sink.store, identifier=sink.identifier
)
self.skolemize = skolemize

source = inputsource.getCharacterStream()
if not source:
Expand Down
41 changes: 25 additions & 16 deletions rdflib/plugins/parsers/ntriples.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,13 +138,15 @@ class W3CNTriplesParser:
`W3CNTriplesParser`.
"""

__slots__ = ("_bnode_ids", "sink", "buffer", "file", "line")
__slots__ = ("_bnode_ids", "sink", "buffer", "file", "line", "skolemize")

def __init__(
self,
sink: Optional[Union[DummySink, NTGraphSink]] = None,
bnode_context: Optional[_BNodeContextType] = None,
):
self.skolemize = False

if bnode_context is not None:
self._bnode_ids = bnode_context
else:
Expand All @@ -164,6 +166,7 @@ def parse(
self,
f: Union[TextIO, IO[bytes], codecs.StreamReader],
bnode_context: Optional[_BNodeContextType] = None,
skolemize: bool = False,
) -> Union[DummySink, NTGraphSink]:
"""
Parse f as an N-Triples file.
Expand All @@ -184,6 +187,7 @@ def parse(
# someone still using a bytestream here?
f = codecs.getreader("utf-8")(f)

self.skolemize = skolemize
self.file = f # type: ignore[assignment]
self.buffer = ""
while True:
Expand Down Expand Up @@ -270,7 +274,7 @@ def subject(self, bnode_context=None) -> Union[bNode, URIRef]:
raise ParseError("Subject must be uriref or nodeID")
return subj

def predicate(self) -> URIRef:
def predicate(self) -> Union[bNode, URIRef]:
pred = self.uriref()
if not pred:
raise ParseError("Predicate must be uriref")
Expand All @@ -294,22 +298,27 @@ def uriref(self) -> Union[te.Literal[False], URI]:

def nodeid(
self, bnode_context: Optional[_BNodeContextType] = None
) -> Union[te.Literal[False], bNode]:
) -> Union[te.Literal[False], bNode, URI]:
if self.peek("_"):
# Fix for https://github.com/RDFLib/rdflib/issues/204
if bnode_context is None:
bnode_context = self._bnode_ids
bnode_id = self.eat(r_nodeid).group(1)
new_id = bnode_context.get(bnode_id, None)
if new_id is not None:
# Re-map to id specific to this doc
return bNode(new_id)
if self.skolemize:
bnode_id = self.eat(r_nodeid).group(1)
return bNode(bnode_id).skolemize()

else:
# Replace with freshly-generated document-specific BNode id
bnode = bNode()
# Store the mapping
bnode_context[bnode_id] = bnode
return bnode
# Fix for https://github.com/RDFLib/rdflib/issues/204
if bnode_context is None:
bnode_context = self._bnode_ids
bnode_id = self.eat(r_nodeid).group(1)
new_id = bnode_context.get(bnode_id, None)
if new_id is not None:
# Re-map to id specific to this doc
return bNode(new_id)
else:
# Replace with freshly-generated document-specific BNode id
bnode = bNode()
# Store the mapping
bnode_context[bnode_id] = bnode
return bnode
return False

def literal(self) -> Union[te.Literal[False], Literal]:
Expand Down
128 changes: 128 additions & 0 deletions test/test_parsers/test_parse_with_skolemize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import pytest

from rdflib import BNode, Dataset, Graph
from rdflib.compare import isomorphic


@pytest.mark.parametrize(
    "data, data_format, expected_data, expected_data_format",
    [
        (
            """
<urn:object> <urn:hasPart> _:internal-bnode-id-1 .
_:internal-bnode-id-1 <urn:value> "..." .
""",
            "ntriples",
            """
<urn:object> <urn:hasPart> <https://rdflib.github.io/.well-known/genid/rdflib/internal-bnode-id-1> .
<https://rdflib.github.io/.well-known/genid/rdflib/internal-bnode-id-1> <urn:value> "..." .
""",
            "ntriples",
        ),
    ],
)
def test_parse_with_skolemize_triples(
    data: str, data_format: str, expected_data: str, expected_data_format: str
):
    """Check triple parsing with ``skolemize=True`` against pre-skolemized data.

    ``data`` is parsed with ``skolemize=True``; ``expected_data`` is parsed
    normally and already spells its blank nodes as ``.well-known/genid`` IRIs.
    Both graphs must be non-empty and isomorphic, and they must remain
    isomorphic after each one is de-skolemized.
    """
    parsed = Graph().parse(data=data, format=data_format, skolemize=True)
    assert len(parsed)

    reference = Graph().parse(data=expected_data, format=expected_data_format)
    assert len(reference)

    # Skolemized form: the genid IRIs produced at parse time match the
    # hand-written ones.
    assert isomorphic(parsed, reference)

    # De-skolemized form: both graphs are converted back before comparing.
    assert isomorphic(parsed.de_skolemize(), reference.de_skolemize())


@pytest.mark.parametrize(
    "data, data_format, expected_data, expected_data_format, anonymous_graph_name",
    [
        (
            """
<urn:object> <urn:hasPart> _:internal-bnode-id-1 _:graph-id .
_:internal-bnode-id-1 <urn:value> "..." _:graph-id .
""",
            "nquads",
            """
<urn:object> <urn:hasPart> <https://rdflib.github.io/.well-known/genid/rdflib/internal-bnode-id-1> <https://rdflib.github.io/.well-known/genid/rdflib/graph-id> .
<https://rdflib.github.io/.well-known/genid/rdflib/internal-bnode-id-1> <urn:value> "..." <https://rdflib.github.io/.well-known/genid/rdflib/graph-id> .
""",
            "nquads",
            "graph-id",
        ),
        (
            """
["urn:object", "urn:hasPart", "_:internal-bnode-id-1", "localId", "", "_:graph-id"]
["_:internal-bnode-id-1", "urn:value", "...", "http://www.w3.org/2001/XMLSchema#string", "", "_:graph-id"]
""",
            "hext",
            """
<urn:object> <urn:hasPart> <https://rdflib.github.io/.well-known/genid/rdflib/internal-bnode-id-1> <https://rdflib.github.io/.well-known/genid/rdflib/graph-id> .
<https://rdflib.github.io/.well-known/genid/rdflib/internal-bnode-id-1> <urn:value> "..."^^<http://www.w3.org/2001/XMLSchema#string> <https://rdflib.github.io/.well-known/genid/rdflib/graph-id> .
""",
            "nquads",
            "graph-id",
        ),
        (
            """
[
{
"@id": "_:graph-id",
"@graph": [
{
"@id": "urn:object",
"urn:hasPart": {
"@id": "_:internal-bnode-id-1"
}
},
{
"@id": "_:internal-bnode-id-1",
"urn:value": "..."
}
]
}
]
""",
            "json-ld",
            """
<urn:object> <urn:hasPart> <https://rdflib.github.io/.well-known/genid/rdflib/internal-bnode-id-1> <https://rdflib.github.io/.well-known/genid/rdflib/graph-id> .
<https://rdflib.github.io/.well-known/genid/rdflib/internal-bnode-id-1> <urn:value> "..." <https://rdflib.github.io/.well-known/genid/rdflib/graph-id> .
""",
            "nquads",
            "graph-id",
        ),
    ],
)
def test_parse_with_skolemize_quads(
    data: str,
    data_format: str,
    expected_data: str,
    expected_data_format: str,
    anonymous_graph_name,
):
    """Check quad parsing with ``skolemize=True`` against pre-skolemized data.

    ``data`` (nquads, hext or json-ld) is parsed into a dataset with
    ``skolemize=True``; ``expected_data`` is parsed normally and already
    spells its blank nodes, including the graph name, as ``.well-known/genid``
    IRIs. The named graph identified by the skolemized
    ``anonymous_graph_name`` is compared in both datasets, in skolemized and
    de-skolemized form, and finally the whole datasets are compared after
    de-skolemization.
    """
    parsed = Dataset(default_union=True)
    parsed.parse(data=data, format=data_format, skolemize=True)
    assert len(parsed)

    reference = Dataset(default_union=True)
    reference.parse(data=expected_data, format=expected_data_format)
    assert len(reference)

    # The blank-node graph label is looked up under its skolem IRI.
    skolem_name = BNode(anonymous_graph_name).skolemize()

    parsed_named = parsed.graph(skolem_name)
    reference_named = reference.graph(skolem_name)
    assert len(parsed_named)
    assert len(reference_named)
    assert isomorphic(parsed_named, reference_named)
    assert isomorphic(parsed_named.de_skolemize(), reference_named.de_skolemize())

    # Note: Datasets must have default_union set to True, otherwise calling
    # de_skolemize returns an empty graph.
    assert isomorphic(parsed.de_skolemize(), reference.de_skolemize())

    # TODO: There's no way to roundtrip datasets with skolemization?
Loading