Skip to content

Commit

Permalink
new: explicit_metagraph, optimize fetch_adb_docs
Browse files Browse the repository at this point in the history
  • Loading branch information
aMahanna committed Jan 19, 2024
1 parent 4697679 commit 285f9b6
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 22 deletions.
1 change: 1 addition & 0 deletions arango_rdf/abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def arangodb_to_rdf(
name: str,
rdf_graph: RDFGraph,
metagraph: ADBMetagraph,
explicit_metagraph: bool,
list_conversion_mode: str,
infer_type_from_adb_v_col: bool,
include_adb_v_col_statements: bool,
Expand Down
78 changes: 62 additions & 16 deletions arango_rdf/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,6 @@ def __init__(
for ns in os.listdir(f"{PROJECT_DIR}/meta"):
self.__meta_graph.parse(f"{PROJECT_DIR}/meta/{ns}", format="trig")

# An instance variable that serves as a shortcut of
# the current RDF Graph. Used in ArangoDB-to-RDF & RDF-to-ArangoDB methods.
# self.__rdf_graph = RDFGraph()

# A mapping of Reified Subjects to their corresponding RDF Predicates.
self.__reified_subject_predicate_map: Dict[RDFTerm, URIRef] = {}

Expand Down Expand Up @@ -171,6 +167,7 @@ def arangodb_to_rdf(
name: str,
rdf_graph: RDFGraph,
metagraph: ADBMetagraph,
explicit_metagraph: bool = True,
list_conversion_mode: str = "static",
infer_type_from_adb_v_col: bool = False,
include_adb_v_col_statements: bool = False,
Expand All @@ -186,7 +183,25 @@ def arangodb_to_rdf(
:type rdf_graph: rdflib.graph.Graph
:param metagraph: A dictionary of dictionaries defining the ArangoDB Vertex
& Edge Collections whose entries will be inserted into the RDF Graph.
For example:
.. code-block:: python
{
"vertexCollections": {
"Person": {"name", "age"},
"Book": {"title", "author"}
},
"edgeCollections": {
"Likes": {"date"},
"Wrote": {"date"}
}
}
:type metagraph: arango_rdf.typings.ADBMetagraph
:param explicit_metagraph: If True, only keep the document attributes
specified in **metagraph** when importing to RDF. If False, all document
attributes are included. Defaults to True.
:type explicit_metagraph: bool
:param list_conversion_mode: Specify how ArangoDB JSON lists
are handled and processed into the RDF Graph. If "collection", ArangoDB
lists will be processed using the RDF Collection structure. If "container",
Expand Down Expand Up @@ -236,8 +251,6 @@ def arangodb_to_rdf(
self.__include_adb_v_col_statements = include_adb_v_col_statements
self.__include_adb_v_key_statements = include_adb_v_key_statements
self.__include_adb_e_key_statements = include_adb_e_key_statements
self.__adb_export_kwargs = adb_export_kwargs
self.__adb_export_kwargs["stream"] = True

# Maps ArangoDB Document IDs to RDFLib Terms (i.e URIRef, Literal, BNode)
self.__term_map: Dict[str, RDFTerm] = {}
Expand All @@ -261,8 +274,7 @@ def arangodb_to_rdf(
"_sub_graph_uri",
}

adb_v_cols = set(metagraph["vertexCollections"])
adb_e_cols = set(metagraph["edgeCollections"])
adb_e_cols = set(metagraph.get("edgeCollections", {}))

# PGT Scenario: Build a mapping of the RDF Predicates stored in ArangoDB
if self.db.has_collection("Property"):
Expand All @@ -275,7 +287,7 @@ def arangodb_to_rdf(
# Vertex Collections #
######################

for v_col in adb_v_cols:
for v_col, atribs in metagraph["vertexCollections"].items():
if v_col in adb_e_cols:
continue

Expand All @@ -286,7 +298,9 @@ def arangodb_to_rdf(
self.__rdf_graph.bind(v_col, f"{v_col_namespace}#")

# 1. Fetch ArangoDB vertices
v_col_cursor, v_col_size = self.__fetch_adb_docs(v_col)
v_col_cursor, v_col_size = self.__fetch_adb_docs(
v_col, False, atribs, explicit_metagraph, **adb_export_kwargs
)

# 2. Process ArangoDB vertices
self.__process_adb_cursor(
Expand All @@ -302,15 +316,17 @@ def arangodb_to_rdf(
# Edge Collections #
####################

for e_col in adb_e_cols:
for e_col, atribs in metagraph.get("edgeCollections", {}).items():
logger.debug(f"Preparing '{e_col}' edges")

e_col_namespace = f"{self.__graph_ns}/{e_col}"
e_col_uri = URIRef(e_col_namespace)
self.__rdf_graph.bind(e_col, f"{e_col_namespace}#")

# 1. Fetch ArangoDB edges
e_col_cursor, e_col_size = self.__fetch_adb_docs(e_col)
e_col_cursor, e_col_size = self.__fetch_adb_docs(
e_col, True, atribs, explicit_metagraph, **adb_export_kwargs
)

# 2. Process ArangoDB edges
self.__process_adb_cursor(
Expand Down Expand Up @@ -389,10 +405,13 @@ def arangodb_collections_to_rdf(
"edgeCollections": {col: set() for col in e_cols},
}

explicit_metagraph = False

return self.arangodb_to_rdf(
name,
rdf_graph,
metagraph,
explicit_metagraph,
list_conversion_mode,
infer_type_from_adb_v_col,
include_adb_v_col_statements,
Expand Down Expand Up @@ -1227,24 +1246,51 @@ def load_meta_ontology(self, rdf_graph: RDFGraph) -> RDFConjunctiveGraph:
# Private: ArangoDB -> RDF #
############################

def __fetch_adb_docs(
    self,
    col: str,
    is_edge: bool,
    attributes: Set[str],
    explicit_metagraph: bool,
    **adb_export_kwargs: Any,
) -> Tuple[Cursor, int]:
    """ArangoDB -> RDF: Fetches ArangoDB documents within a collection.

    :param col: The ArangoDB collection.
    :type col: str
    :param is_edge: True if **col** is an edge collection.
    :type is_edge: bool
    :param attributes: The set of document attributes.
    :type attributes: Set[str]
    :param explicit_metagraph: If True, only return the set of **attributes**
        specified when fetching the documents of the collection **col**
        (plus "_id" and "_key", and "_from"/"_to" for edge collections).
        If False, all document attributes are included.
    :type explicit_metagraph: bool
    :param adb_export_kwargs: Keyword arguments to specify AQL query options when
        fetching documents from the ArangoDB instance. The "stream" option
        is always forced to True.
    :type adb_export_kwargs: Any
    :return: The document cursor along with the total collection size.
    :rtype: Tuple[arango.cursor.Cursor, int]
    """
    bind_vars: Dict[str, Any] = {"@col": col}
    aql_return_value = "doc"

    if explicit_metagraph:
        # The system attributes are always projected so that downstream
        # processing can still identify the document (and, for edges,
        # its endpoints) regardless of what the metagraph lists.
        keep_attributes = {"_id", "_key", *attributes}
        if is_edge:
            keep_attributes.update(("_from", "_to"))

        # Pass the attribute names as a bind parameter rather than splicing
        # a Python list repr into the query text: this avoids quoting issues
        # (and query injection) for attribute names with special characters.
        # The bind parameter is only declared when the query references it,
        # since AQL rejects bind parameters that are not used.
        aql_return_value = "KEEP(doc, @attributes)"
        bind_vars["attributes"] = sorted(keep_attributes)

    col_size: int = self.__db.collection(col).count()

    with get_export_spinner_progress(f"ADB Export: '{col}' ({col_size})") as p:
        p.add_task(col)

        cursor: Cursor = self.__db.aql.execute(
            f"FOR doc IN @@col RETURN {aql_return_value}",
            bind_vars=bind_vars,
            # Caller-supplied options are honoured, but streaming is enforced.
            **{**adb_export_kwargs, "stream": True},
        )

    return cursor, col_size
Expand Down
34 changes: 28 additions & 6 deletions tests/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1809,7 +1809,7 @@ def test_rpt_meta(name: str, rdf_graph: RDFGraph) -> None:
"name, rdf_graph",
[("Case_1_PGT", get_rdf_graph("cases/1.ttl"))],
)
def test_pgt_case_1(name: str, rdf_graph: RDFGraph) -> None:
def test_pgt_case_1_a(name: str, rdf_graph: RDFGraph) -> None:
NON_LITERAL_STATEMENTS = len(rdf_graph) - len(get_literal_statements(rdf_graph))
UNIQUE_NODES = len(
get_uris(rdf_graph, include_predicates=True)
Expand Down Expand Up @@ -4462,14 +4462,14 @@ def test_adb_doc_with_dict_property(name: str) -> None:
"sub_val_2": {"sub_val_3": 3, "sub_val_4": [4]},
"sub_val_5": [{"sub_val_6": 6}, {"sub_val_7": 7}],
},
"foo": "bar",
}

db.collection("TestDoc").insert(doc)

rdf_graph = adbrdf.arangodb_graph_to_rdf(
name,
RDFGraph(),
list_conversion_mode="collection",
include_adb_v_col_statements=True,
)

Expand All @@ -4482,23 +4482,45 @@ def test_adb_doc_with_dict_property(name: str) -> None:

assert len(adb_col_statements) == 1
assert (test_doc, None, None) in rdf_graph
assert (test_doc, URIRef(f"{test_doc_namespace}#foo"), Literal("bar")) in rdf_graph
assert (test_doc, URIRef(f"{test_doc_namespace}#val"), None) in rdf_graph
assert (None, URIRef(f"{test_doc_namespace}#sub_val_1"), None) in rdf_graph
assert (None, URIRef(f"{test_doc_namespace}#sub_val_1"), Literal(1)) in rdf_graph
assert (None, URIRef(f"{test_doc_namespace}#sub_val_2"), None) in rdf_graph
assert (None, URIRef(f"{test_doc_namespace}#sub_val_3"), Literal(3)) in rdf_graph
assert (None, URIRef(f"{test_doc_namespace}#sub_val_4"), None) in rdf_graph
assert (None, RDF.first, Literal(4)) in rdf_graph
assert (None, URIRef(f"{test_doc_namespace}#sub_val_4"), Literal(4)) in rdf_graph
assert (None, URIRef(f"{test_doc_namespace}#sub_val_5"), None) in rdf_graph
assert (None, URIRef(f"{test_doc_namespace}#sub_val_6"), Literal(6)) in rdf_graph
assert (None, URIRef(f"{test_doc_namespace}#sub_val_7"), Literal(7)) in rdf_graph
# TODO: Revisit magic number
assert len(rdf_graph) == 14
assert len(rdf_graph) == 10

# TODO: RDF Graph back to ArangoDB with this monster ^
# Need to discuss...
# adb_graph = adbrdf.rdf_to_arangodb_by_pgt(f"{name}2", rdf_graph)
# db.delete_graph(f"{name}2", drop_collections=True)

rdf_graph_2 = adbrdf.arangodb_to_rdf(
name,
RDFGraph(),
metagraph={"vertexCollections": {"TestDoc": {"foo"}}},
)

assert len(rdf_graph_2) == 1
assert (
test_doc,
URIRef(f"{test_doc_namespace}#foo"),
Literal("bar"),
) in rdf_graph_2

rdf_graph_3 = adbrdf.arangodb_to_rdf(
name,
RDFGraph(),
metagraph={"vertexCollections": {"TestDoc": {"foo"}}},
explicit_metagraph=False,
)

assert len(rdf_graph_3) == len(rdf_graph)

db.delete_graph(name, drop_collections=True)


Expand Down

0 comments on commit 285f9b6

Please sign in to comment.