diff --git a/examples/graph_algebra/gla_ex0_0.ipynb b/examples/graph_algebra/gla_ex0_0.ipynb new file mode 100644 index 0000000..3a46c2d --- /dev/null +++ b/examples/graph_algebra/gla_ex0_0.ipynb @@ -0,0 +1,199 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Graph Algebra with `kglab`\n", + "\n", + "## intro\n", + "`kglab` provides tools to access graph data from multiple sources to build a `KnowledgeGraph` that can be easily used by data scientists. For a thorough explanation of how to use triples-stored data and how to load this data into `kglab` please see examples in the `examples/` directory. The examples in this directory (`examples/graph_algebra/`) introduce graph algebra capabilities to be used on the graphs the user has loaded. \n", + "\n", + "## basic load and querying\n", + "In particular, once your data is loaded in a `KnowledgeGraph` with something like:\n", + "\n", + "1. Instantiate a graph from a dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# for use in tutorial and development; do not include this `sys.path` change in production:\n", + "import sys ; sys.path.insert(0, \"../../\")\n", + "from os.path import dirname\n", + "import kglab\n", + "import os\n", + "\n", + "namespaces = {\n", + " \"foaf\": \"http://xmlns.com/foaf/0.1/\",\n", + " \"gorm\": \"http://example.org/sagas#\",\n", + " \"rel\": \"http://purl.org/vocab/relationship/\",\n", + " }\n", + "\n", + "kg = kglab.KnowledgeGraph(\n", + " name = \"Happy Vikings KG example for SKOS/OWL inference\",\n", + " namespaces=namespaces,\n", + " )\n", + "\n", + "kg.load_rdf(dirname(dirname(os.getcwd())) + \"/dat/gorm.ttl\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "2. 
It is possible to create a subgraph by providing a SPARQL query that defines a \"subject\" and an \"object\":\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"SELECT ?subject ?object\n", + "WHERE {\n", + " ?subject rdf:type gorm:Viking .\n", + " ?subject gorm:childOf ?object .\n", + "}\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## define a subgraph\n", + "In this case we are looking for the network of parent-child relations among members of the Vikings family.\n", + "\n", + "With this query we can define a **subgraph** so as to have access to **graph algebra** capabilities: " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from kglab.subg import SubgraphMatrix\n", + "\n", + "subgraph = SubgraphMatrix(kg=kg, sparql=query)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## compute Adjacency matrices\n", + "Let's compute the first basic adjacency matrix (usually denoted by `A`):" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0., 1., 1., 0., 0.],\n", + " [0., 0., 0., 1., 0.],\n", + " [0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 1.],\n", + " [0., 0., 0., 0., 0.]])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adj_matrix = subgraph.to_adjacency()\n", + "adj_matrix" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What happened here is that all the subjects and objects have been turned into integer indices from 0 to the number of nodes minus one. So we can see that the entity with index 0 is adjacent (is connected, has a directed edge) to the entity with index 1. 
This is a directed graph because the relationship `gorm:childOf` goes from child to parent. Let's turn this into an undirected graph so as to see the relation in a more symmetric way (both child-parent and parent-child)." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0., 1., 1., 0., 0.],\n", + " [1., 0., 0., 1., 0.],\n", + " [1., 0., 0., 0., 0.],\n", + " [0., 1., 0., 0., 1.],\n", + " [0., 0., 0., 1., 0.]])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "undirected_adj_mtx = subgraph.to_undirected()\n", + "undirected_adj_mtx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now see that the relationship is a generic symmetric \"parenthood\" relation, not just a child-parent directed relationship." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.10 64-bit ('.venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "de68f9b565e1e230f4433adb1a318d8f3a0dfad0917fa0c696727472c8ddadbf" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/kglab/algebra.py b/kglab/algebra.py new file mode 100644 index 0000000..2ac39d8 --- /dev/null +++ b/kglab/algebra.py @@ -0,0 +1,65 @@ +""" +Working with `SubgraphMatrix` as vectorized representation. +Additions to functionalities present in `subg.py`. +Integrate `scipy` and `scikit-learn` functionalities. 
+ +see license https://github.com/DerwenAI/kglab#license-and-copyright +""" +import typing + +import networkx as nx +from networkx import DiGraph + +class AlgebraMixin: + """ +Provides methods to work with graph algebra using `SubgraphMatrix` data. + +NOTE: provide optional Oxigraph support for fast in-memory computation + """ + nx_graph: typing.Optional[DiGraph] = None + + def to_undirected(self): + return nx.to_numpy_array(self.nx_graph.to_undirected()) + + def to_adjacency(self): + """ +Return adjacency (dense) matrix for the KG. +[Relevant NetworkX interface](https://networkx.org/documentation/stable/reference/convert.html#id2) + + returns: +`numpy.array`: the array representation in `numpy` standard + """ + self.check_attributes() + return nx.to_numpy_array(self.nx_graph) + + def to_incidence(self): + """ +Return incidence (dense) matrix for the KG. +[Relevant scipy docs](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html) + + returns: +`numpy.array`: the array representation in `numpy` standard + """ + self.check_attributes() + return nx.incidence_matrix(self.nx_graph).toarray() + + def to_laplacian(self): + """ +Return Laplacian matrix for the KG. Graph is turned into undirected. +[docs](https://networkx.org/documentation/stable/reference/generated/networkx.linalg.laplacianmatrix.laplacian_matrix.html) + + returns: +`numpy.array`: the array representation in `numpy` standard + """ + self.check_attributes() + return nx.laplacian_matrix(self.nx_graph.to_undirected()).toarray() + + def to_scipy_sparse(self): + """ +Return graph in CSR format (optimized for matrix-matrix operations). + + returns: +SciPy sparse matrix: Graph adjacency matrix. 
+ """ + self.check_attributes() + return nx.to_scipy_sparse_array(self.nx_graph) diff --git a/kglab/networks.py b/kglab/networks.py new file mode 100644 index 0000000..328a5d7 --- /dev/null +++ b/kglab/networks.py @@ -0,0 +1,43 @@ +""" +Working with `SubgraphMatrix` as vectorized representation. +Additions to functionalities present in `subg.py`. +Integrate `scikit-network` functionalities. + +see license https://github.com/DerwenAI/kglab#license-and-copyright +""" + +import sknetwork as skn + +class NetAnalysisMixin: + """ +Provides methods for network analysis tools to work with `KnowledgeGraph`. + """ + def get_distances(self, adj_mtx): + """ +Compute distances according to an adjacency matrix. + """ + self.check_attributes() + return skn.path.get_distances(adj_mtx) + + def get_shortest_path(self, adj_matx, src, dst): + """ +Return shortest path from sources to destinations according to an adjacency matrix. + + adj_matx: +numpy.array: adjacency matrix for the graph. + src: +int or iterable: indices of source nodes + dst: +int or iterable: indices of destination nodes + + returns: +list of int: a path of indices + """ + self.check_attributes() + return skn.path.get_shortest_path(adj_matx, src, dst) + + +# number of nodes, number of edges +# density +# triangles +# reciprocity \ No newline at end of file diff --git a/kglab/query/mixin.py b/kglab/query/mixin.py index a0d398d..e0f92c6 100644 --- a/kglab/query/mixin.py +++ b/kglab/query/mixin.py @@ -81,7 +81,8 @@ def query_as_df ( pythonify: bool = True, ) -> pd.DataFrame: """ -Wrapper for [`rdflib.Graph.query()`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html?highlight=query#rdflib.Graph.query) to perform a SPARQL query on the RDF graph. +Wrapper for [`rdflib.Graph.query()`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html?highlight=query#rdflib.Graph.query) +to perform a SPARQL query on the RDF graph. 
sparql: text for the SPARQL query @@ -123,7 +124,8 @@ def visualize_query ( notebook: bool = False, ) -> pyvis.network.Network: """ -Visualize the given SPARQL query as a [`pyvis.network.Network`](https://pyvis.readthedocs.io/en/latest/documentation.html#pyvis.network.Network) +Visualize the given SPARQL query as a +[`pyvis.network.Network`](https://pyvis.readthedocs.io/en/latest/documentation.html#pyvis.network.Network) sparql: input SPARQL query to be visualized @@ -144,7 +146,9 @@ def n3fy ( pythonify: bool = True, ) -> typing.Any: """ -Wrapper for RDFlib [`n3()`](https://rdflib.readthedocs.io/en/stable/utilities.html?highlight=n3#serializing-a-single-term-to-n3) and [`toPython()`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html?highlight=toPython#rdflib.Variable.toPython) to serialize a node into a human-readable representation using N3 format. +Wrapper for RDFlib [`n3()`](https://rdflib.readthedocs.io/en/stable/utilities.html?highlight=n3#serializing-a-single-term-to-n3) +and [`toPython()`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html?highlight=toPython#rdflib.Variable.toPython) +to serialize a node into a human-readable representation using N3 format. node: must be a [`rdflib.term.Node`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html?highlight=Node#rdflib.term.Node) diff --git a/kglab/subg.py b/kglab/subg.py index 479f381..9c9a8e4 100644 --- a/kglab/subg.py +++ b/kglab/subg.py @@ -1,34 +1,36 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# see license https://github.com/DerwenAI/kglab#license-and-copyright - """ Subgraph transforms for visualization, graph algorithms, etc. 
+ +see license https://github.com/DerwenAI/kglab#license-and-copyright """ import typing -from icecream import ic # type: ignore # pylint: disable=W0611,E0401 -from tqdm import tqdm # type: ignore # pylint: disable=E0401 -import pandas as pd # type: ignore # pylint: disable=E0401 -import pyvis.network # type: ignore # pylint: disable=E0401 -import networkx as nx # type: ignore # pylint: disable=E0401 +from icecream import ic # type: ignore # pylint: disable=W0611 +from tqdm import tqdm # type: ignore +import pandas as pd # type: ignore +import pyvis.network # type: ignore +import networkx as nx # type: ignore from kglab import KnowledgeGraph from kglab.topo import Measure from kglab.pkg_types import NodeLike, RDF_Node, RDF_Triple +from kglab.algebra import AlgebraMixin +from kglab.networks import NetAnalysisMixin from kglab.util import get_gpu_count if get_gpu_count() > 0: - import cudf # type: ignore # pylint: disable=E0401 - import cugraph # type: ignore # pylint: disable=E0401,W0611 # lgtm[py/unused-import] + import cudf # type: ignore + import cugraph # type: ignore # pylint: disable=W0611 -class Subgraph: +class Subgraph(AlgebraMixin, NetAnalysisMixin): """ -Base class for projection of an RDF graph into an *algebraic object* such as a *vector*, *matrix*, or *tensor* representation, to support integration with non-RDF graph libraries. -In other words, this class provides means to vectorize selected portions of a graph as a [*dimension*](https://mathworld.wolfram.com/Dimension.html). +Base class for projection of an RDF graph into an *algebraic object* such as a *vector*, +*matrix*, or *tensor* representation, to support integration with non-RDF graph libraries. +In other words, this class provides means to vectorize selected portions of a graph as a +[*dimension*](https://mathworld.wolfram.com/Dimension.html). 
See Features support several areas of use cases, including: @@ -40,9 +42,12 @@ class Subgraph: * embedding (deep learning) * probabilistic graph inference (statistical relational learning) -The base case is where a *subset* of the nodes in the source RDF graph get represented as a *vector*, in the `node_vector` member. -This provides an efficient *index* on a constructed *dimension*, solely for the context of a specific use case. +The base case is where a *subset* of the nodes in the source RDF graph get represented as +a *vector*, in the `node_vector` member. This provides an efficient *index* on a constructed +*dimension*, solely for the context of a specific use case. """ + kg: typing.Optional[KnowledgeGraph] = None + nx_graph: typing.Optional[nx.DiGraph] = None def __init__ ( self, @@ -110,7 +115,7 @@ def inverse_transform ( id: int, ) -> NodeLike: """ -Inverse transform from an intenger to a node in the RDF graph, using the identifier as an index into the node vector. +Inverse transform from an integer to a node in the RDF graph, using the identifier as an index into the node vector. id: an integer index for the `node` in the RDF graph @@ -140,13 +145,34 @@ def n3fy ( """ return self.kg.n3fy(node) + def check_attributes(self): + """ +Check if needed attributes are set. + + returns: +None + """ + if self.kg is None: + raise ValueError( + """`Subgraph`'s `kg` should be initialized: + `kglab.Subgraph(kg)`""" + ) + + # create an empty `nx.DiGraph` if none is present + if self.nx_graph is None: + # NOTE: find a way to pass `bipartite` if needed + self.nx_graph = self.build_nx_graph(nx.DiGraph()) + class SubgraphMatrix (Subgraph): """ Projection of a RDF graph to a [*matrix*](https://mathworld.wolfram.com/AdjacencyMatrix.html) representation. Typical use cases include integration with non-RDF graph libraries for *graph algorithms*. + +SPARQL query text needs to define a subgraph as: `subject -> object`. 
""" _SRC_DST_MAP: typing.List[str] = ["subject", "object"] + sparql: typing.Optional[str] = None def __init__ ( self, @@ -164,7 +190,8 @@ def __init__ ( the source RDF graph sparql: -text for a SPARQL query that yields pairs to project into the *subgraph*; by default this expects the query to return bindings for `subject` and `object` nodes in the RDF graph +text for a SPARQL query that yields pairs to project into the *subgraph*; by default this expects the query to return + bindings for `subject` and `object` nodes in the RDF graph bindings: initial variable bindings @@ -188,9 +215,11 @@ def build_df ( show_symbols: bool = False, ) -> pd.DataFrame: """ -Factory pattern to populate a [`pandas.DataFrame`](https://pandas.pydata.org/docs/reference/frame.html) object, using transforms in this subgraph. +Factory pattern to populate a [`pandas.DataFrame`](https://pandas.pydata.org/docs/reference/frame.html) object, +using transforms in this subgraph. -Note: this method is primarily intended for [`cuGraph`](https://docs.rapids.ai/api/cugraph/stable/) support. Loading via a `DataFrame` is required – in lieu of using the `nx.add_node()` approach. +Note: this method is primarily intended for [`cuGraph`](https://docs.rapids.ai/api/cugraph/stable/) support. +Loading via a `DataFrame` is required in lieu of using the `nx.add_node()` approach. Therefore the support for representing *bipartite* graphs is still pending. 
show_symbols: @@ -200,6 +229,10 @@ def build_df ( the populated `DataFrame` object; uses the [RAPIDS `cuDF` library](https://docs.rapids.ai/api/cudf/stable/) if GPUs are enabled """ col_names: typing.List[str] = [ "src", "dst", "src_sym", "dst_sym" ] + + if self.sparql is None and self.kg.use_gpus is True: + raise ValueError("""To use GPUs is necessary to provide a SPARQL query to define a subgraph: + `kglab.SubgraphMatrix(kg, sparql)` or `SubgraphTensor`""") row_iter = self.kg.query(self.sparql, bindings=self.bindings) if not show_symbols: diff --git a/requirements.txt b/requirements.txt index 1d66f14..aa8dd09 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ gcsfs >= 2022.2 icecream >= 2.1 morph-kgc >= 2.0.0 networkx >= 2.7 -numpy >= 1.21 +numpy == 1.23.0 owlrl >= 6.0.2 oxrdflib >= 0.3.1 pandas >= 1.4 @@ -20,6 +20,9 @@ python-dateutil >= 2.8 pyvis >= 0.1.9 rdflib >= 6.1 requests >= 2.27 +scikit-learn == 1.1.2 +scikit-network == 0.27.1 +scipy >= 1.8.0 statsmodels >= 0.13 tqdm >= 4.63 urlpath >= 1.2 diff --git a/tests/test_algebra_basic.py b/tests/test_algebra_basic.py new file mode 100644 index 0000000..0a8c42d --- /dev/null +++ b/tests/test_algebra_basic.py @@ -0,0 +1,94 @@ +import pytest +import numpy as np + +import kglab +from kglab.subg import SubgraphMatrix + +from .__init__ import DAT_FILES_DIR + + +@pytest.fixture() +def kg_test_data(): + namespaces = { + "nom": "http://example.org/#", + "wtm": "http://purl.org/heals/food/", + "ind": "http://purl.org/heals/ingredient/", + "skos": "http://www.w3.org/2004/02/skos/core#", + } + + kg = kglab.KnowledgeGraph( + name = "A recipe KG example based on Food.com", + base_uri = "https://www.food.com/recipe/", + namespaces = namespaces, + ) + + kg.load_rdf(DAT_FILES_DIR / "tmp.ttl") + + yield kg + + del kg + + +def get_items(s): + s_coo = s.tocoo() + return set(zip(s_coo.row, s_coo.col)) + + +QUERY1 = """ +SELECT ?subject ?object +WHERE { + ?subject rdf:type wtm:Recipe . 
+ ?subject wtm:hasIngredient ?object . +} +""" + +def test_adj_mtx(kg_test_data): + subgraph = SubgraphMatrix(kg=kg_test_data, sparql=QUERY1) + n_array = subgraph.to_adjacency() + np.testing.assert_allclose( + n_array[:3,:6], + np.array( + [[0, 1, 1, 1, 1, 1], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]] + ), + rtol=1e-5, atol=0 + ) + + +def test_incidence(kg_test_data): + subgraph = SubgraphMatrix(kg=kg_test_data, sparql=QUERY1) + n_array = subgraph.to_incidence() + np.testing.assert_allclose( + n_array[:3,:6], + np.array( + [[1, 1, 1, 1, 1, 1], + [1, 0, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0]] + ), + rtol=1e-5, atol=0 + ) + +def test_laplacian(kg_test_data): + subgraph = SubgraphMatrix(kg=kg_test_data, sparql=QUERY1) + n_array = subgraph.to_laplacian() + np.testing.assert_allclose( + n_array[:3,:6], + np.array( + [[6, -1, -1, -1, -1, -1], + [-1, 190, 0, 0, 0, 0], + [-1, 0, 147, 0, 0, 0]] + ), + rtol=1e-5, atol=0 + ) + +def test_scipy_sparse(kg_test_data): + subgraph = SubgraphMatrix(kg=kg_test_data, sparql=QUERY1) + n_array = subgraph.to_scipy_sparse() + assert n_array.getformat() == "csr" + + set_ = ((0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (12, 1), (12, 2), + (12, 3), (12, 4), (12, 8), (12, 11), (12, 13), (14, 2), ) + not_set_ = ((8, 8), (10, 6), (10, 10), (8, 1), (249, 2)) + assert all(i in get_items(n_array) for i in set_) + assert all(i not in get_items(n_array) for i in not_set_) diff --git a/tests/test_networks.py b/tests/test_networks.py new file mode 100644 index 0000000..2b83139 --- /dev/null +++ b/tests/test_networks.py @@ -0,0 +1,68 @@ +import pytest +import numpy as np + +import kglab +from kglab.subg import SubgraphMatrix, Subgraph + +from .__init__ import DAT_FILES_DIR + + +@pytest.fixture() +def kg_test_data(): + namespaces = { + "nom": "http://example.org/#", + "wtm": "http://purl.org/heals/food/", + "ind": "http://purl.org/heals/ingredient/", + "skos": "http://www.w3.org/2004/02/skos/core#", + } + + kg = kglab.KnowledgeGraph( + name = "A recipe KG 
example based on Food.com", + base_uri = "https://www.food.com/recipe/", + namespaces = namespaces, + ) + + kg.load_rdf(DAT_FILES_DIR / "tmp.ttl") + + yield kg + + del kg + + +def get_items(s): + s_coo = s.tocoo() + return set(zip(s_coo.row, s_coo.col)) + +# +# A query that defines a subgraph as subject -> object +# +QUERY1 = """ +SELECT ?subject ?object +WHERE { + ?subject rdf:type wtm:Recipe . + ?subject wtm:hasIngredient ?object . +} +""" + +def test_distances_mtx(kg_test_data): + subgraph = SubgraphMatrix(kg=kg_test_data, sparql=QUERY1) + + dist = subgraph.get_distances(subgraph.to_scipy_sparse()) + np.testing.assert_allclose( + dist[0,:6], + [0, 1, 1, 1, 1, 1] + ) + +def test_shortest_path(kg_test_data): + subgraph = SubgraphMatrix(kg=kg_test_data, sparql=QUERY1) + + dist = subgraph.get_shortest_path(subgraph.to_scipy_sparse(), 2, 6) + assert dist == [] + + dist = subgraph.get_shortest_path(subgraph.to_scipy_sparse(), 0, 2) + assert dist == [0, 2] + + dist = subgraph.get_shortest_path(subgraph.to_scipy_sparse(), 0, 7) + assert dist == [] + + \ No newline at end of file