diff --git a/README.md b/README.md index c42b5d4..c36181c 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,26 @@ docker run --rm -p 8000:8000 -it -v $(pwd)/databases:/data -e DATA_LOAD_PATHS=/d This will load all .ttl files found in the specified directory, and make it available under a /sparql endpoint, eg. http://localhost:8000/sparql +## Using full-text searches in queries + +```shell +docker run --rm -it -p 8000:8000 -e DEBUG=1 -e FTS_FILEPATH=/fts -e DATA_LOAD_PATHS=https://yogaontology.org/ontology.ttl ghcr.io/epoz/shmarql:latest +``` + +...add a link here to a sparql query using the magic predicate on localhost:8000 + +## RDF2vec + +Under development is RDF2vec support. This can be enabled by adding the path to store the embeddings like so: + +```shell +docker run --rm -it -p 8000:8000 -e DEBUG=1 -e RDF2VEC_FILEPATH=/vec -e DATA_LOAD_PATHS=https://yogaontology.org/ontology.ttl ghcr.io/epoz/shmarql:latest +``` + +Then you can query the triples for similar entities by using the magic predicate: `` + +☣️ This is a new experimental feature, and needs more extensive testing. + ## Development instructions If you would like to run and modify the code in this repo, there is a [Dockerfile](Dockerfile) which includes the necessary versions of the required libraries. 
diff --git a/requirements.txt b/requirements.txt index c97d409..a9b4410 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,8 @@ python3-saml==1.14.0 python-jose==3.3.0 rdflib==6.1.1 tree-sitter==0.20.1 -fizzysearch==0.1 \ No newline at end of file +fizzysearch==0.2 +igraph==0.10.4 +numpy==1.22.4 +voyager==2.0.6 +gensim==4.3.2 \ No newline at end of file diff --git a/src/app/config.py b/src/app/config.py index d9d7669..4b4bc2a 100644 --- a/src/app/config.py +++ b/src/app/config.py @@ -25,6 +25,7 @@ SCHPIEL_PATH = os.environ.get("SCHPIEL_PATH") SCHPIEL_TOKEN = os.environ.get("SCHPIEL_TOKEN") FTS_FILEPATH = os.environ.get("FTS_FILEPATH") +RDF2VEC_FILEPATH = os.environ.get("RDF2VEC_FILEPATH") # space-separated list of sqlite3 DB files containing sources, in the format: # CREATE TABLE source (id PRIMARY KEY, last_download, xml) as per the DDB @@ -52,7 +53,25 @@ try: if PREFIXES_FILEPATH: - PREFIXES = DEFAULT_PREFIXES | json.load(open(PREFIXES_FILEPATH)) + # also support reading the prefixes from a .ttl file for convenience + if PREFIXES_FILEPATH.endswith(".ttl"): + PREFIXES = DEFAULT_PREFIXES + for line in open(PREFIXES_FILEPATH).readlines(): + if not line.lower().startswith("@prefix "): + continue + if not line.lower().endswith(" .\n"): + continue + line = line.strip("\n .") + parts = line.split(":") + if len(parts) < 2: + continue + prefix = parts[0][8:] + ":" + prefix_uri = "".join(parts[1:]).strip("<> ") + if prefix == ":": + prefix = " " + PREFIXES[prefix] = prefix_uri + else: + PREFIXES = DEFAULT_PREFIXES | json.load(open(PREFIXES_FILEPATH)) else: PREFIXES = DEFAULT_PREFIXES except: diff --git a/src/app/main.py b/src/app/main.py index 7ad9770..fee5a0c 100644 --- a/src/app/main.py +++ b/src/app/main.py @@ -21,6 +21,7 @@ PREFIXES, ENDPOINT, FTS_FILEPATH, + RDF2VEC_FILEPATH, ) import httpx import logging, os, json, io, time, random, sys @@ -30,6 +31,7 @@ from .rdfer import prefixes, RDFer, Nice from rich.traceback import install from .fts import 
init_fts, search +from .rdf2vec import init_rdf2vec, rdf2vec_search from .px_util import OxigraphSerialization, SynthQuerySolutions, results_to_triples import rdflib import fizzysearch @@ -125,6 +127,11 @@ init_fts(GRAPH.quads_for_pattern, FTS_FILEPATH) fizzysearch.register([""], search) +if RDF2VEC_FILEPATH: + logging.debug(f"RDF2Vec filepath has been specified: {RDF2VEC_FILEPATH}") + init_rdf2vec(GRAPH.quads_for_pattern, RDF2VEC_FILEPATH) + fizzysearch.register([""], rdf2vec_search) + @app.post("/sparql") async def sparql_post_sparql_query( diff --git a/src/app/rdf2vec.py b/src/app/rdf2vec.py new file mode 100644 index 0000000..ed382a3 --- /dev/null +++ b/src/app/rdf2vec.py @@ -0,0 +1,91 @@ +import logging, os, sqlite3 +import igraph as ig +import gensim +import voyager +from .config import RDF2VEC_FILEPATH +import numpy as np + + +def rdf2vec_search(node_uri: str): + if not RDF2VEC_FILEPATH: + return None + + DB = sqlite3.connect(RDF2VEC_FILEPATH + ".db") + found = False + for row in DB.execute( + "SELECT vector FROM rdf2vec_index WHERE uri = ?", (node_uri,) + ): + vector = np.frombuffer(row[0], dtype=np.float32) + found = True + if not found: + return [] + + index = voyager.Index.load(RDF2VEC_FILEPATH) + ids, distance = index.query(vector, 99) + ids_as = ",".join(str(anid) for anid in ids) + results = [ + uri.strip("<>") + for id, uri in DB.execute( + f"SELECT id, uri FROM rdf2vec_index WHERE id IN ({ids_as})" + ) + if uri.startswith("<") + ] + return results + + +def init_rdf2vec(triple_func, rdf2vec_filepath): + logging.debug(f"Init RDF2Vec {rdf2vec_filepath}") + if os.path.exists(rdf2vec_filepath): + logging.debug(f"RDF2Vec {rdf2vec_filepath} already exists, skipping") + return + + nodes = set() + edges = set() + as_ints = [] + for s, p, o, _ in triple_func(None, None, None): + nodes.add(s) + nodes.add(o) + edges.add(p) + nodemap = {n: i for i, n in enumerate(nodes)} + nodemap_inv = {v: k for k, v in nodemap.items()} + edgemap = {e: i for i, e in 
enumerate(edges)} + edgemap_inv = {v: k for k, v in edgemap.items()} + + only_subjects = set() + for s, p, o, _ in triple_func(None, None, None): + as_ints.append((nodemap[s], edgemap[p], nodemap[o])) + only_subjects.add(nodemap[s]) + + graph = ig.Graph(n=len(nodes)) + graph.add_edges([(s, o) for s, p, o in as_ints]) + graph.es["p_i"] = [p for s, p, o in as_ints] + + data = set( + tuple( + [tuple(graph.random_walk(s, 15)) for s in only_subjects for x in range(100)] + ) + ) + + model = gensim.models.Word2Vec( + sentences=data, vector_size=100, window=5, min_count=1, workers=4 + ) + vectors = [model.wv.get_vector(node_id) for node_id in nodemap_inv] + + index = voyager.Index(voyager.Space.Cosine, 100) + index.add_items(vectors) + index.save(rdf2vec_filepath) + logging.debug(f"RDF2Vec {rdf2vec_filepath} created") + + DB = sqlite3.connect(rdf2vec_filepath + ".db") + DB_SCHEMA = """ +CREATE TABLE IF NOT EXISTS rdf2vec_index (id INTEGER PRIMARY KEY, uri TEXT, vector BLOB); +CREATE INDEX IF NOT EXISTS rdf2vec_index_uri ON rdf2vec_index (uri); +""" + DB.executescript(DB_SCHEMA) + to_insert = [ + (i, str(nodemap_inv[i]), vector.tobytes()) + for i, vector in zip(nodemap_inv, vectors) + ] + DB.executemany("INSERT OR IGNORE INTO rdf2vec_index VALUES (?, ?, ?)", to_insert) + DB.commit() + logging.debug(f"RDF2Vec mapping saved in {rdf2vec_filepath}.db")