Skip to content

Commit

Permalink
add pagination
Browse files Browse the repository at this point in the history
  • Loading branch information
tarcisio_adm committed Nov 6, 2024
1 parent 9c42681 commit 5685ccb
Showing 1 changed file with 40 additions and 17 deletions.
57 changes: 40 additions & 17 deletions src/sparql_llm/embed_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,26 @@
entities_embeddings_dir = os.path.join("data", "embeddings")
entities_embeddings_filepath = os.path.join(entities_embeddings_dir, "entities_embeddings.csv")


def retrieve_index_data(
    entity: dict,
    docs: list[Document],
    # Quoted so the union is never evaluated at definition time: the previous
    # `(int, int) | None` raised TypeError on import (tuple | None is invalid).
    pagination: "tuple[int, int] | None" = None,
) -> list:
    """Run an entity's SPARQL query and append one Document per result to *docs*.

    Args:
        entity: Entity description; the keys read here are ``"query"``,
            ``"endpoint"``, ``"label"`` and ``"uri"``.
        docs: List that is extended in place with a ``Document`` for every
            binding returned by the endpoint.
        pagination: Optional ``(limit, offset)`` pair. When given,
            ``LIMIT <limit> OFFSET <offset>`` is appended to the query so a
            single page of results is fetched. When ``None`` (the default) the
            query is sent unchanged.

    Returns:
        The result bindings fetched by this call. An empty result is falsy,
        which lets callers page with
        ``while retrieve_index_data(entity, docs, page): ...``.
    """
    query = entity["query"]
    if pagination is not None:
        limit, offset = pagination
        # NOTE(review): the paginated queries carry no ORDER BY, so consistent
        # pages depend on the endpoint returning rows in a stable order.
        # f-string formatting also converts the int limit/offset to text; the
        # previous `str + int` concatenation raised TypeError.
        query = f"{query} LIMIT {limit} OFFSET {offset}"

    entities_res = query_sparql(query, entity["endpoint"])["results"]["bindings"]
    print(f"Found {len(entities_res)} entities for {entity['label']} in {entity['endpoint']}")

    # Each binding exposes the selected ?label and ?uri variables under "value".
    docs.extend(
        Document(
            page_content=entity_res["label"]["value"],
            metadata={
                "label": entity_res["label"]["value"],
                "uri": entity_res["uri"]["value"],
                "endpoint_url": entity["endpoint"],
                "entity_type": entity["uri"],
            },
        )
        for entity_res in entities_res
    )
    return entities_res
def generate_embeddings_for_entities():
start_time = time.time()
embedding_model = get_embedding_model()
Expand All @@ -36,6 +55,7 @@ def generate_embeddings_for_entities():
"label": "species",
"description": "species scientific names",
"endpoint": "https://www.bgee.org/sparql/",
"pagination": False,
"query": """PREFIX up: <http://purl.uniprot.org/core/>
SELECT ?uri ?label
WHERE {
Expand All @@ -49,6 +69,7 @@ def generate_embeddings_for_entities():
"label": "developmental stage",
"description": "A developmental stage is spatiotemporal region encompassing some part of the life cycle of an organism, e.g. blastula stage.",
"endpoint": "https://www.bgee.org/sparql/",
"pagination": False,
"query": """PREFIX genex: <http://purl.org/genex#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?uri ?label {
Expand All @@ -60,6 +81,7 @@ def generate_embeddings_for_entities():
"label": "Gene",
"description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
"endpoint": "https://www.bgee.org/sparql/",
"pagination": False,
"query": """PREFIX orth: <http://purl.org/net/orth#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?uri ?label {
Expand All @@ -71,6 +93,7 @@ def generate_embeddings_for_entities():
"label": "Protein",
"description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.",
"endpoint": "https://sparql.omabrowser.org/sparql/",
"pagination": False,
"query": """PREFIX dc: <http://purl.org/dc/terms/>
PREFIX orth: <http://purl.org/net/orth#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
Expand All @@ -87,6 +110,7 @@ def generate_embeddings_for_entities():
"label": "Gene",
"description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
"endpoint": "https://sparql.omabrowser.org/sparql/",
"pagination": False,
"query": """PREFIX orth: <http://purl.org/net/orth#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?uri ?label {
Expand All @@ -98,10 +122,11 @@ def generate_embeddings_for_entities():
"label": "Gene",
"description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
"endpoint": "https://sparql.uniprot.org/sparql/",
"pagination": True,
"query": """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?uri ?label {
SELECT ?uri ?label {
?uri a up:Gene .
?uri skos:prefLabel ?label .}""",
},
Expand All @@ -110,9 +135,10 @@ def generate_embeddings_for_entities():
"label": "Protein",
"description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.",
"endpoint": "https://sparql.uniprot.org/sparql/",
"pagination": True,
"query": """PREFIX up: <http://purl.uniprot.org/core/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?uri ?label {
SELECT ?uri ?label {
?uri a up:Protein .
?uri rdfs:label ?label .}""",
},
Expand All @@ -121,6 +147,7 @@ def generate_embeddings_for_entities():
"label": "species",
"description": "species scientific names",
"endpoint": "https://sparql.uniprot.org/sparql/",
"pagination": False,
"query": """PREFIX up: <http://purl.uniprot.org/core/>
SELECT ?uri ?label
WHERE {
Expand All @@ -134,6 +161,7 @@ def generate_embeddings_for_entities():
"label": "species",
"description": "species scientific names",
"endpoint": "https://sparql.omabrowser.org/sparql/",
"pagination": False,
"query": """PREFIX up: <http://purl.uniprot.org/core/>
SELECT ?uri ?label
WHERE {
Expand All @@ -146,20 +174,15 @@ def generate_embeddings_for_entities():

docs: list[Document] = []
for entity in entities_list.values():
entities_res = query_sparql(entity["query"], entity["endpoint"])["results"]["bindings"]
print(f"Found {len(entities_res)} entities for {entity['label']} in {entity['endpoint']}")
for entity_res in entities_res:
docs.append(
Document(
page_content=entity_res["label"]["value"],
metadata={
"label": entity_res["label"]["value"],
"uri": entity_res["uri"]["value"],
"endpoint_url": entity["endpoint"],
"entity_type": entity["uri"],
},
)
)
if entity["pagination"]:
max_results = 100000
pagination = (max_results, 0)
while retrieve_index_data(entity, docs, pagination):
pagination = (pagination[0], pagination[1] + max_results)
else:
retrieve_index_data(entity, docs)


print(f"Generating embeddings for {len(docs)} entities")

# To test with a smaller number of entities
Expand Down

0 comments on commit 5685ccb

Please sign in to comment.