Add RDF2vec
epoz committed Mar 27, 2024
1 parent 68766a9 commit 310e9b7
Showing 5 changed files with 143 additions and 2 deletions.
20 changes: 20 additions & 0 deletions README.md
@@ -29,6 +29,26 @@ docker run --rm -p 8000:8000 -it -v $(pwd)/databases:/data -e DATA_LOAD_PATHS=/d

This will load all .ttl files found in the specified directory and make them available under a /sparql endpoint, e.g. http://localhost:8000/sparql

## Using full-text searches in queries

```shell
docker run --rm -it -p 8000:8000 -e DEBUG=1 -e FTS_FILEPATH=/fts -e DATA_LOAD_PATHS=https://yogaontology.org/ontology.ttl ghcr.io/epoz/shmarql:latest
```

...add a link here to a SPARQL query using the magic predicate on localhost:8000
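
As an illustration, here is a minimal sketch of such a query sent from Python with `httpx`. It assumes the `/sparql` endpoint accepts standard form-encoded SPARQL protocol requests; the search term and the pattern shape (a plain literal as the object of the magic predicate) are illustrative assumptions, not taken from the repository:

```python
# A minimal sketch, not part of the repository: send a SPARQL query that uses
# the full-text-search magic predicate to the running container.
# The search term "yoga" and the pattern shape are illustrative assumptions.
import httpx

QUERY = """
SELECT ?entity WHERE {
  ?entity <http://shmarql.com/fts> "yoga" .
}
LIMIT 10
"""

response = httpx.post(
    "http://localhost:8000/sparql",
    data={"query": QUERY},
    headers={"Accept": "application/sparql-results+json"},
)
print(response.json())
```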

## RDF2vec

RDF2vec support is under development. It can be enabled by specifying the path at which to store the embeddings, like so:

```shell
docker run --rm -it -p 8000:8000 -e DEBUG=1 -e RDF2VEC_FILEPATH=/vec -e DATA_LOAD_PATHS=https://yogaontology.org/ontology.ttl ghcr.io/epoz/shmarql:latest
```

You can then query for entities similar to a given one by using the magic predicate: `<http://shmarql.com/vec>`
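
For illustration, here is a hedged sketch of such a similarity query sent from Python with `httpx`. The entity URI is a hypothetical placeholder, the pattern shape (the entity of interest as the object of the magic predicate) is an assumption, and the response parsing assumes the standard SPARQL JSON results format:

```python
# A hedged sketch, not part of the repository: ask the endpoint for entities
# similar to a given one via the RDF2vec magic predicate.
# The entity URI below is a hypothetical placeholder, and the pattern shape
# (entity of interest as the object) is an assumption.
import httpx

ENTITY = "https://example.org/SomeEntity"  # hypothetical placeholder URI

QUERY = f"""
SELECT ?similar WHERE {{
  ?similar <http://shmarql.com/vec> <{ENTITY}> .
}}
LIMIT 10
"""

response = httpx.post(
    "http://localhost:8000/sparql",
    data={"query": QUERY},
    headers={"Accept": "application/sparql-results+json"},
)
# Assuming standard SPARQL JSON results
for binding in response.json().get("results", {}).get("bindings", []):
    print(binding["similar"]["value"])
```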

☣️ This is a new, experimental feature and needs more extensive testing.

## Development instructions

If you would like to run and modify the code in this repo, there is a [Dockerfile](Dockerfile) which includes the necessary versions of the required libraries.
6 changes: 5 additions & 1 deletion requirements.txt
@@ -15,4 +15,8 @@ python3-saml==1.14.0
python-jose==3.3.0
rdflib==6.1.1
tree-sitter==0.20.1
fizzysearch==0.1
fizzysearch==0.2
igraph==0.10.4
numpy==1.22.4
voyager==2.0.6
gensim==4.3.2
21 changes: 20 additions & 1 deletion src/app/config.py
@@ -25,6 +25,7 @@
SCHPIEL_PATH = os.environ.get("SCHPIEL_PATH")
SCHPIEL_TOKEN = os.environ.get("SCHPIEL_TOKEN")
FTS_FILEPATH = os.environ.get("FTS_FILEPATH")
RDF2VEC_FILEPATH = os.environ.get("RDF2VEC_FILEPATH")

# space-separated list of sqlite3 DB files containing sources, in the format:
# CREATE TABLE source (id PRIMARY KEY, last_download, xml) as per the DDB
@@ -52,7 +53,25 @@

try:
    if PREFIXES_FILEPATH:
        PREFIXES = DEFAULT_PREFIXES | json.load(open(PREFIXES_FILEPATH))
        # also support reading the prefixes from a .ttl file for convenience
        if PREFIXES_FILEPATH.endswith(".ttl"):
            PREFIXES = DEFAULT_PREFIXES
            for line in open(PREFIXES_FILEPATH).readlines():
                if not line.lower().startswith("@prefix "):
                    continue
                if not line.lower().endswith(" .\n"):
                    continue
                line = line.strip("\n .")
                parts = line.split(":")
                if len(parts) < 2:
                    continue
                prefix = parts[0][8:] + ":"
                prefix_uri = ":".join(parts[1:]).strip("<> ")
                if prefix == ":":
                    prefix = " "
                PREFIXES[prefix] = prefix_uri
        else:
            PREFIXES = DEFAULT_PREFIXES | json.load(open(PREFIXES_FILEPATH))
    else:
        PREFIXES = DEFAULT_PREFIXES
except:
7 changes: 7 additions & 0 deletions src/app/main.py
@@ -21,6 +21,7 @@
    PREFIXES,
    ENDPOINT,
    FTS_FILEPATH,
    RDF2VEC_FILEPATH,
)
import httpx
import logging, os, json, io, time, random, sys
@@ -30,6 +31,7 @@
from .rdfer import prefixes, RDFer, Nice
from rich.traceback import install
from .fts import init_fts, search
from .rdf2vec import init_rdf2vec, rdf2vec_search
from .px_util import OxigraphSerialization, SynthQuerySolutions, results_to_triples
import rdflib
import fizzysearch
@@ -125,6 +127,11 @@
    init_fts(GRAPH.quads_for_pattern, FTS_FILEPATH)
    fizzysearch.register(["<http://shmarql.com/fts>"], search)

if RDF2VEC_FILEPATH:
    logging.debug(f"RDF2Vec filepath has been specified: {RDF2VEC_FILEPATH}")
    init_rdf2vec(GRAPH.quads_for_pattern, RDF2VEC_FILEPATH)
    fizzysearch.register(["<http://shmarql.com/vec>"], rdf2vec_search)


@app.post("/sparql")
async def sparql_post_sparql_query(
91 changes: 91 additions & 0 deletions src/app/rdf2vec.py
@@ -0,0 +1,91 @@
import logging, os, sqlite3
import igraph as ig
import gensim
import voyager
from .config import RDF2VEC_FILEPATH
import numpy as np


def rdf2vec_search(node_uri: str):
    # Look up the stored embedding for the given node URI and return the URIs
    # of its nearest neighbours in the Voyager index (literals are skipped).
    if not RDF2VEC_FILEPATH:
        return None

    DB = sqlite3.connect(RDF2VEC_FILEPATH + ".db")
    found = False
    for row in DB.execute(
        "SELECT vector FROM rdf2vec_index WHERE uri = ?", (node_uri,)
    ):
        vector = np.frombuffer(row[0], dtype=np.float32)
        found = True
    if not found:
        return []

    index = voyager.Index.load(RDF2VEC_FILEPATH)
    ids, distances = index.query(vector, 99)
    ids_as = ",".join(str(anid) for anid in ids)
    results = [
        uri.strip("<>")
        for id, uri in DB.execute(
            f"SELECT id, uri FROM rdf2vec_index WHERE id IN ({ids_as})"
        )
        if uri.startswith("<")
    ]
    return results


def init_rdf2vec(triple_func, rdf2vec_filepath):
    logging.debug(f"Init RDF2Vec {rdf2vec_filepath}")
    if os.path.exists(rdf2vec_filepath):
        logging.debug(f"RDF2Vec {rdf2vec_filepath} already exists, skipping")
        return

    # Map every node and predicate in the store to an integer id
    nodes = set()
    edges = set()
    as_ints = []
    for s, p, o, _ in triple_func(None, None, None):
        nodes.add(s)
        nodes.add(o)
        edges.add(p)
    nodemap = {n: i for i, n in enumerate(nodes)}
    nodemap_inv = {v: k for k, v in nodemap.items()}
    edgemap = {e: i for i, e in enumerate(edges)}
    edgemap_inv = {v: k for k, v in edgemap.items()}

    only_subjects = set()
    for s, p, o, _ in triple_func(None, None, None):
        as_ints.append((nodemap[s], edgemap[p], nodemap[o]))
        only_subjects.add(nodemap[s])

    graph = ig.Graph(n=len(nodes))
    graph.add_edges([(s, o) for s, p, o in as_ints])
    graph.es["p_i"] = [p for s, p, o in as_ints]

    # 100 random walks of length 15 from each subject node, deduplicated;
    # each walk (a tuple of node ids) serves as one "sentence" for Word2Vec
    data = set(
        tuple(graph.random_walk(s, 15)) for s in only_subjects for _ in range(100)
    )

    model = gensim.models.Word2Vec(
        sentences=data, vector_size=100, window=5, min_count=1, workers=4
    )
    vectors = [model.wv.get_vector(node_id) for node_id in nodemap_inv]

    index = voyager.Index(voyager.Space.Cosine, 100)
    index.add_items(vectors)
    index.save(rdf2vec_filepath)
    logging.debug(f"RDF2Vec {rdf2vec_filepath} created")

    # Store the node id -> URI/vector mapping alongside the Voyager index
    DB = sqlite3.connect(rdf2vec_filepath + ".db")
    DB_SCHEMA = """
    CREATE TABLE IF NOT EXISTS rdf2vec_index (id INTEGER PRIMARY KEY, uri TEXT, vector BLOB);
    CREATE INDEX IF NOT EXISTS rdf2vec_index_uri ON rdf2vec_index (uri);
    """
    DB.executescript(DB_SCHEMA)
    to_insert = [
        (i, str(nodemap_inv[i]), vector.tobytes())
        for i, vector in zip(nodemap_inv, vectors)
    ]
    DB.executemany("INSERT OR IGNORE INTO rdf2vec_index VALUES (?, ?, ?)", to_insert)
    DB.commit()
    logging.debug(f"RDF2Vec mapping saved in {rdf2vec_filepath}.db")
