Add RDF2vec
epoz committed Mar 27, 2024
1 parent 68766a9 commit 310e9b7
Showing 5 changed files with 143 additions and 2 deletions.
20 changes: 20 additions & 0 deletions README.md
@@ -29,6 +29,26 @@ docker run --rm -p 8000:8000 -it -v $(pwd)/databases:/data -e DATA_LOAD_PATHS=/d

This will load all .ttl files found in the specified directory and make them available under a /sparql endpoint, e.g. http://localhost:8000/sparql

## Using full-text searches in queries

```shell
docker run --rm -it -p 8000:8000 -e DEBUG=1 -e FTS_FILEPATH=/fts -e DATA_LOAD_PATHS=https://yogaontology.org/ontology.ttl ghcr.io/epoz/shmarql:latest
```

...add a link here to a SPARQL query using the magic predicate on localhost:8000
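
As an illustration, here is a minimal sketch of such a query sent from Python with `httpx`. It assumes the `/sparql` endpoint accepts standard form-encoded SPARQL protocol requests; the search term and the pattern shape (a plain literal as the object of the magic predicate) are illustrative assumptions, not taken from the repository:

```python
# A minimal sketch, not part of the repository: send a SPARQL query that uses
# the full-text-search magic predicate to the running container.
# The search term "yoga" and the pattern shape are illustrative assumptions.
import httpx

QUERY = """
SELECT ?entity WHERE {
  ?entity <http://shmarql.com/fts> "yoga" .
}
LIMIT 10
"""

response = httpx.post(
    "http://localhost:8000/sparql",
    data={"query": QUERY},
    headers={"Accept": "application/sparql-results+json"},
)
print(response.json())
```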

## RDF2vec

RDF2vec support is under development. It can be enabled by specifying the path at which to store the embeddings, like so:

```shell
docker run --rm -it -p 8000:8000 -e DEBUG=1 -e RDF2VEC_FILEPATH=/vec -e DATA_LOAD_PATHS=https://yogaontology.org/ontology.ttl ghcr.io/epoz/shmarql:latest
```

You can then query for entities similar to a given one by using the magic predicate: `<http://shmarql.com/vec>`
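
For illustration, here is a hedged sketch of such a similarity query sent from Python with `httpx`. The entity URI is a hypothetical placeholder, the pattern shape (the entity of interest as the object of the magic predicate) is an assumption, and the response parsing assumes the standard SPARQL JSON results format:

```python
# A hedged sketch, not part of the repository: ask the endpoint for entities
# similar to a given one via the RDF2vec magic predicate.
# The entity URI below is a hypothetical placeholder, and the pattern shape
# (entity of interest as the object) is an assumption.
import httpx

ENTITY = "https://example.org/SomeEntity"  # hypothetical placeholder URI

QUERY = f"""
SELECT ?similar WHERE {{
  ?similar <http://shmarql.com/vec> <{ENTITY}> .
}}
LIMIT 10
"""

response = httpx.post(
    "http://localhost:8000/sparql",
    data={"query": QUERY},
    headers={"Accept": "application/sparql-results+json"},
)
# Assuming standard SPARQL JSON results
for binding in response.json().get("results", {}).get("bindings", []):
    print(binding["similar"]["value"])
```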

☣️ This is a new, experimental feature and needs more extensive testing.

## Development instructions

If you would like to run and modify the code in this repo, there is a [Dockerfile](Dockerfile) which includes the necessary versions of the required libraries.
6 changes: 5 additions & 1 deletion requirements.txt
@@ -15,4 +15,8 @@ python3-saml==1.14.0
python-jose==3.3.0
rdflib==6.1.1
tree-sitter==0.20.1
fizzysearch==0.1
fizzysearch==0.2
igraph==0.10.4
numpy==1.22.4
voyager==2.0.6
gensim==4.3.2
21 changes: 20 additions & 1 deletion src/app/config.py
@@ -25,6 +25,7 @@
SCHPIEL_PATH = os.environ.get("SCHPIEL_PATH")
SCHPIEL_TOKEN = os.environ.get("SCHPIEL_TOKEN")
FTS_FILEPATH = os.environ.get("FTS_FILEPATH")
RDF2VEC_FILEPATH = os.environ.get("RDF2VEC_FILEPATH")

# space-separated list of sqlite3 DB files containing sources, in the format:
# CREATE TABLE source (id PRIMARY KEY, last_download, xml) as per the DDB
@@ -52,7 +53,25 @@

try:
    if PREFIXES_FILEPATH:
        PREFIXES = DEFAULT_PREFIXES | json.load(open(PREFIXES_FILEPATH))
        # also support reading the prefixes from a .ttl file for convenience
        if PREFIXES_FILEPATH.endswith(".ttl"):
            PREFIXES = DEFAULT_PREFIXES
            for line in open(PREFIXES_FILEPATH).readlines():
                if not line.lower().startswith("@prefix "):
                    continue
                if not line.lower().endswith(" .\n"):
                    continue
                line = line.strip("\n .")
                parts = line.split(":")
                if len(parts) < 2:
                    continue
                prefix = parts[0][8:] + ":"
                prefix_uri = ":".join(parts[1:]).strip("<> ")
                if prefix == ":":
                    prefix = " "
                PREFIXES[prefix] = prefix_uri
        else:
            PREFIXES = DEFAULT_PREFIXES | json.load(open(PREFIXES_FILEPATH))
    else:
        PREFIXES = DEFAULT_PREFIXES
except:
7 changes: 7 additions & 0 deletions src/app/main.py
@@ -21,6 +21,7 @@
    PREFIXES,
    ENDPOINT,
    FTS_FILEPATH,
    RDF2VEC_FILEPATH,
)
import httpx
import logging, os, json, io, time, random, sys
@@ -30,6 +31,7 @@
from .rdfer import prefixes, RDFer, Nice
from rich.traceback import install
from .fts import init_fts, search
from .rdf2vec import init_rdf2vec, rdf2vec_search
from .px_util import OxigraphSerialization, SynthQuerySolutions, results_to_triples
import rdflib
import fizzysearch
@@ -125,6 +127,11 @@
    init_fts(GRAPH.quads_for_pattern, FTS_FILEPATH)
    fizzysearch.register(["<http://shmarql.com/fts>"], search)

if RDF2VEC_FILEPATH:
    logging.debug(f"RDF2Vec filepath has been specified: {RDF2VEC_FILEPATH}")
    init_rdf2vec(GRAPH.quads_for_pattern, RDF2VEC_FILEPATH)
    fizzysearch.register(["<http://shmarql.com/vec>"], rdf2vec_search)


@app.post("/sparql")
async def sparql_post_sparql_query(
91 changes: 91 additions & 0 deletions src/app/rdf2vec.py
@@ -0,0 +1,91 @@
import logging, os, sqlite3
import igraph as ig
import gensim
import voyager
from .config import RDF2VEC_FILEPATH
import numpy as np


def rdf2vec_search(node_uri: str):
    # Look up the stored embedding for the given node URI and return the URIs
    # of its nearest neighbours in the Voyager index (literals are skipped).
    if not RDF2VEC_FILEPATH:
        return None

    DB = sqlite3.connect(RDF2VEC_FILEPATH + ".db")
    found = False
    for row in DB.execute(
        "SELECT vector FROM rdf2vec_index WHERE uri = ?", (node_uri,)
    ):
        vector = np.frombuffer(row[0], dtype=np.float32)
        found = True
    if not found:
        return []

    index = voyager.Index.load(RDF2VEC_FILEPATH)
    ids, distances = index.query(vector, 99)
    ids_as = ",".join(str(anid) for anid in ids)
    results = [
        uri.strip("<>")
        for id, uri in DB.execute(
            f"SELECT id, uri FROM rdf2vec_index WHERE id IN ({ids_as})"
        )
        if uri.startswith("<")
    ]
    return results


def init_rdf2vec(triple_func, rdf2vec_filepath):
    logging.debug(f"Init RDF2Vec {rdf2vec_filepath}")
    if os.path.exists(rdf2vec_filepath):
        logging.debug(f"RDF2Vec {rdf2vec_filepath} already exists, skipping")
        return

    # Map every node and predicate in the store to an integer id
    nodes = set()
    edges = set()
    as_ints = []
    for s, p, o, _ in triple_func(None, None, None):
        nodes.add(s)
        nodes.add(o)
        edges.add(p)
    nodemap = {n: i for i, n in enumerate(nodes)}
    nodemap_inv = {v: k for k, v in nodemap.items()}
    edgemap = {e: i for i, e in enumerate(edges)}
    edgemap_inv = {v: k for k, v in edgemap.items()}

    only_subjects = set()
    for s, p, o, _ in triple_func(None, None, None):
        as_ints.append((nodemap[s], edgemap[p], nodemap[o]))
        only_subjects.add(nodemap[s])

    graph = ig.Graph(n=len(nodes))
    graph.add_edges([(s, o) for s, p, o in as_ints])
    graph.es["p_i"] = [p for s, p, o in as_ints]

    # 100 random walks of length 15 from each subject node, deduplicated;
    # each walk (a tuple of node ids) serves as one "sentence" for Word2Vec
    data = set(
        tuple(graph.random_walk(s, 15)) for s in only_subjects for _ in range(100)
    )

    model = gensim.models.Word2Vec(
        sentences=data, vector_size=100, window=5, min_count=1, workers=4
    )
    vectors = [model.wv.get_vector(node_id) for node_id in nodemap_inv]

    index = voyager.Index(voyager.Space.Cosine, 100)
    index.add_items(vectors)
    index.save(rdf2vec_filepath)
    logging.debug(f"RDF2Vec {rdf2vec_filepath} created")

    # Store the node id -> URI/vector mapping alongside the Voyager index
    DB = sqlite3.connect(rdf2vec_filepath + ".db")
    DB_SCHEMA = """
    CREATE TABLE IF NOT EXISTS rdf2vec_index (id INTEGER PRIMARY KEY, uri TEXT, vector BLOB);
    CREATE INDEX IF NOT EXISTS rdf2vec_index_uri ON rdf2vec_index (uri);
    """
    DB.executescript(DB_SCHEMA)
    to_insert = [
        (i, str(nodemap_inv[i]), vector.tobytes())
        for i, vector in zip(nodemap_inv, vectors)
    ]
    DB.executemany("INSERT OR IGNORE INTO rdf2vec_index VALUES (?, ?, ?)", to_insert)
    DB.commit()
    logging.debug(f"RDF2Vec mapping saved in {rdf2vec_filepath}.db")
