diff --git a/README.md b/README.md index a4e90ff..ef13044 100644 --- a/README.md +++ b/README.md @@ -165,39 +165,58 @@ print("\n".join(issues)) > > It can easily be adapted to use any LLM served through an OpenAI-compatible API. We plan to make configuration and deployment of complete SPARQL LLM chat system easier in the future, let us know if you are interested in the GitHub issues! -Create a `.env` file at the root of the repository to provide secrets and API keys: +1. Create a `.env` file at the root of the repository to provide secrets and API keys: -```bash -OPENAI_API_KEY=sk-proj-YYY -GLHF_API_KEY=APIKEY_FOR_glhf.chat_USED_FOR_TEST_OPEN_SOURCE_MODELS -EXPASY_API_KEY=NOT_SO_SECRET_API_KEY_USED_BY_FRONTEND_TO_AVOID_SPAM_FROM_CRAWLERS -LOGS_API_KEY=SECRET_PASSWORD_TO_EASILY_ACCESS_LOGS_THROUGH_THE_API -``` + ```sh + OPENAI_API_KEY=sk-proj-YYY + GLHF_API_KEY=APIKEY_FOR_glhf.chat_USED_FOR_TEST_OPEN_SOURCE_MODELS + EXPASY_API_KEY=NOT_SO_SECRET_API_KEY_USED_BY_FRONTEND_TO_AVOID_SPAM_FROM_CRAWLERS + LOGS_API_KEY=SECRET_PASSWORD_TO_EASILY_ACCESS_LOGS_THROUGH_THE_API + ``` -Build the chat webpage (will be better integrated in the workflow in the future): +2. Build the chat UI webpage (will be better integrated in the workflow in the future): -```bash -cd chat-with-context -npm i -npm run build:demo -cd .. -``` + ```sh + cd chat-with-context + npm i + npm run build:demo + cd .. + ``` -Start the web UI, API, and similarity search engine in production (you might need to make some changes to the `compose.yml` file to adapt it to your server/proxy setup): +3. 
Start the vector database and web server -```bash -docker compose up -``` + In production (you might need to make some changes to the `compose.yml` file to adapt it to your server/proxy setup): -Start the stack locally for development, with code from `src` folder mounted in the container and automatic API reload on changes to the code: + ```bash + docker compose up + ``` -```bash -docker compose -f compose.dev.yml up -``` + Start the stack locally for development, with code from `src` folder mounted in the container and automatic API reload on changes to the code: + + ```bash + docker compose -f compose.dev.yml up + ``` -* Chat web UI available at http://localhost:8000 -* OpenAPI Swagger UI available at http://localhost:8000/docs -* Vector database dashboard UI available at http://localhost:6333/dashboard + * Chat web UI available at http://localhost:8000 + * OpenAPI Swagger UI available at http://localhost:8000/docs + * Vector database dashboard UI available at http://localhost:6333/dashboard + +4. Run the script to index the resources (SPARQL endpoints listed in config file): + + ```sh + docker compose run api python src/sparql_llm/embed.py + ``` + +> [!WARNING] +> +> **Experimental entities indexing**: it can take a lot of time to generate embeddings for entities. 
So we recommend running the script to generate embeddings on a machine with GPU (does not need to be a powerful one, but at least with a GPU, check out [fastembed GPU docs](https://qdrant.github.io/fastembed/examples/FastEmbed_GPU/) to install the GPU drivers and dependencies) +> +> ```sh +> pip install -e ".[chat,gpu]" +> python src/sparql_llm/embed_entities.py +> ``` +> +> Then move the CSV containing the embeddings to `data/embeddings/entities_embeddings.csv` before running the `embed.py` script ## 🧑‍💻 Contributing diff --git a/prestart.sh b/prestart.sh deleted file mode 100644 index eb55087..0000000 --- a/prestart.sh +++ /dev/null @@ -1 +0,0 @@ -python src/sparql_llm/embed.py diff --git a/src/sparql_llm/api.py b/src/sparql_llm/api.py index 4c43c87..853873a 100644 --- a/src/sparql_llm/api.py +++ b/src/sparql_llm/api.py @@ -216,9 +216,18 @@ async def chat(request: ChatCompletionRequest): ] ), limit=settings.retrieved_docs_count, + # group_by="iri", + # with_payload=True, ) + # TODO: vectordb.search_groups( + # https://qdrant.tech/documentation/concepts/search/#search-groups + + # TODO: hybrid search? 
https://qdrant.github.io/fastembed/examples/Hybrid_Search/#about-qdrant + # we might want to group by iri for shex docs https://qdrant.tech/documentation/concepts/hybrid-queries/?q=hybrid+se#grouping + # https://qdrant.tech/documentation/concepts/search/#search-groups prompt_with_context += "Here is some additional information that could be useful to answer the user question:\n\n" + # for docs_hit in docs_hits.groups: for docs_hit in docs_hits: if docs_hit.payload["doc_type"] == "shex": prompt_with_context += f"ShEx shape for {docs_hit.payload['question']} in {docs_hit.payload['endpoint_url']}:\n```\n{docs_hit.payload['answer']}\n```\n\n" diff --git a/src/sparql_llm/embed.py b/src/sparql_llm/embed.py index 46a7b3a..844bfa8 100644 --- a/src/sparql_llm/embed.py +++ b/src/sparql_llm/embed.py @@ -81,6 +81,7 @@ def load_schemaorg_description(endpoint: dict[str, str]) -> list[Document]: "question": question, "answer": "\n".join(descs), "endpoint_url": endpoint["endpoint_url"], + "iri": endpoint["homepage"], "doc_type": "schemaorg_description", }, ) @@ -119,6 +120,7 @@ def load_ontology(endpoint: dict[str, str]) -> list[Document]: "question": split.page_content, "answer": "", "endpoint_url": endpoint["endpoint_url"], + "iri": endpoint["ontology"], "doc_type": "ontology", }, ) @@ -175,6 +177,7 @@ def init_vectordb(vectordb_host: str = settings.vectordb_host) -> None: The UniProt consortium is headed by Alex Bateman, Alan Bridge and Cathy Wu, supported by key staff, and receives valuable input from an independent Scientific Advisory Board. 
""", "endpoint_url": "https://sparql.uniprot.org/sparql/", + "iri": "http://www.uniprot.org/help/about", "doc_type": "schemaorg_description", }, ) @@ -201,8 +204,9 @@ def init_vectordb(vectordb_host: str = settings.vectordb_host) -> None: collection_name=settings.entities_collection_name, vectors_config=VectorParams(size=settings.embedding_dimensions, distance=Distance.COSINE), ) - if vectordb.get_collection(settings.entities_collection_name).points_count == 0: - load_entities_embeddings_to_vectordb() + # if vectordb.get_collection(settings.entities_collection_name).points_count == 0: + load_entities_embeddings_to_vectordb() + # docs = [] # # TODO: Add entities list to the vectordb diff --git a/src/sparql_llm/embed_entities.py b/src/sparql_llm/embed_entities.py index 69b67fc..74b89e0 100644 --- a/src/sparql_llm/embed_entities.py +++ b/src/sparql_llm/embed_entities.py @@ -134,32 +134,6 @@ def generate_embeddings_for_entities(): ?uri a orth:Protein . ?uri rdfs:label ?label .}""", }, - # TODO: way too many UniProt genes, should we just ignore indexing genes? - # "uniprot_gene": { - # "uri": "http://purl.uniprot.org/core/Gene", - # "label": "Gene", - # "description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.", - # "endpoint": "https://sparql.uniprot.org/sparql/", - # "pagination": True, - # "query": """PREFIX skos: - # PREFIX up: - # PREFIX rdfs: - # SELECT ?uri ?label { - # ?uri a up:Gene . 
- # ?uri skos:prefLabel ?label .}""", - # }, - # "uniprot_protein": { - # "uri": "http://purl.uniprot.org/core/Protein", - # "label": "Protein", - # "description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.", - # "endpoint": "https://sparql.uniprot.org/sparql/", - # "pagination": True, - # "query": """PREFIX up: - # PREFIX rdfs: - # SELECT ?uri ?label { - # ?uri a up:Protein . - # ?uri rdfs:label ?label .}""", - # }, "uniprot_species": { "uri": "http://purl.uniprot.org/core/Taxon", "label": "species", @@ -201,19 +175,6 @@ def generate_embeddings_for_entities(): orth:taxRange ?label . }""", }, - # "uniprot_mnemonics": { - # "uri": "http://purl.uniprot.org/core/Protein", - # "label": "mnemonic", - # "description": "uniprot mnemonic", - # "endpoint": "https://sparql.uniprot.org/sparql/", - # "pagination": True, - # "query": """PREFIX up: - # SELECT ?uri ?label - # WHERE { - # ?uri a up:Protein ; - # up:mnemonic ?label . - # }""", - # }, "uniprot_taxon": { "uri": "http://purl.uniprot.org/core/Taxon", "label": "species", @@ -241,6 +202,45 @@ def generate_embeddings_for_entities(): skos:prefLabel ?label . }""", }, + # TODO: way too many UniProt genes, should we just ignore indexing genes? + # "uniprot_gene": { + # "uri": "http://purl.uniprot.org/core/Gene", + # "label": "Gene", + # "description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.", + # "endpoint": "https://sparql.uniprot.org/sparql/", + # "pagination": True, + # "query": """PREFIX skos: + # PREFIX up: + # PREFIX rdfs: + # SELECT ?uri ?label { + # ?uri a up:Gene . 
+ # ?uri skos:prefLabel ?label .}""", + # }, + # "uniprot_protein": { + # "uri": "http://purl.uniprot.org/core/Protein", + # "label": "Protein", + # "description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.", + # "endpoint": "https://sparql.uniprot.org/sparql/", + # "pagination": True, + # "query": """PREFIX up: + # PREFIX rdfs: + # SELECT ?uri ?label { + # ?uri a up:Protein . + # ?uri rdfs:label ?label .}""", + # }, + # "uniprot_mnemonics": { + # "uri": "http://purl.uniprot.org/core/Protein", + # "label": "mnemonic", + # "description": "uniprot mnemonic", + # "endpoint": "https://sparql.uniprot.org/sparql/", + # "pagination": True, + # "query": """PREFIX up: + # SELECT ?uri ?label + # WHERE { + # ?uri a up:Protein ; + # up:mnemonic ?label . + # }""", + # }, } docs: list[Document] = [] @@ -301,7 +301,7 @@ def load_entities_embeddings_to_vectordb(): docs = [] embeddings = [] - print("Reading entities embeddings from the .csv file") + print(f"Reading entities embeddings from the .csv file at {entities_embeddings_filepath}") with open(entities_embeddings_filepath) as file: reader = csv.DictReader(file) for row in tqdm(reader, desc="Extracting embeddings from CSV file"): diff --git a/src/sparql_llm/sparql_void_shapes_loader.py b/src/sparql_llm/sparql_void_shapes_loader.py index fb88c42..fee751b 100644 --- a/src/sparql_llm/sparql_void_shapes_loader.py +++ b/src/sparql_llm/sparql_void_shapes_loader.py @@ -40,7 +40,7 @@ def load(self) -> list[Document]: metadata_dict = { "answer": shex_shape["shex"], "endpoint_url": self.endpoint_url, - "class_uri": cls_uri, + "iri": cls_uri, "doc_type": "shex", } if "label" in shex_shape: