From 99c3669d8f6c772cc0e08f738b37a2a961c3c843 Mon Sep 17 00:00:00 2001 From: Deep Nayak <59172064+deepnayak@users.noreply.github.com> Date: Sat, 15 Jun 2024 14:58:49 +0530 Subject: [PATCH] Llama3 Integration and RAG pipeline for querying (#2) * Initial commit containing Ollama and Llama3 code * Removed ngrok URL * Improved performance and integrated GoaT API entity lookup * Implemented changes requested in PR review * Minor changes * Minor changes --- .env.dist | 3 + .gitignore | 5 ++ INSTALL.md | 70 +++++++++++++++++++ app.py | 49 ++++++++++++++ index.py | 87 ++++++++++++++++++++++++ prompt.py | 49 ++++++++++++++ query_engine.py | 52 ++++++++++++++ query_reformulation.py | 75 ++++++++++++++++++++ requirements.txt | 8 +++ rich_queries/queryV1.json | 70 +++++++++++++++++++ templates/chat.html | 139 ++++++++++++++++++++++++++++++++++++++ 11 files changed, 607 insertions(+) create mode 100644 .env.dist create mode 100644 .gitignore create mode 100644 INSTALL.md create mode 100644 app.py create mode 100644 index.py create mode 100644 prompt.py create mode 100644 query_engine.py create mode 100644 query_reformulation.py create mode 100644 requirements.txt create mode 100644 rich_queries/queryV1.json create mode 100644 templates/chat.html diff --git a/.env.dist b/.env.dist new file mode 100644 index 0000000..6972c32 --- /dev/null +++ b/.env.dist @@ -0,0 +1,3 @@ +OLLAMA_HOST_URL=http://127.0.0.1:11434 +RETRY_COUNT=3 +GOAT_BASE_URL=https://goat.genomehubs.org/api/v2 \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..86aaf7c --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +prompts/__pycache__ +rich_query_index +.DS_Store +.env +__pycache__ \ No newline at end of file diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 0000000..c9b99a9 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,70 @@ + +# Installation Guide + +This guide provides step-by-step instructions to set up the project after cloning the repository. 
+ +## Step 1: Install Miniconda + +Download and install Miniconda using the following commands: + +```bash +curl https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > Miniconda3.sh +chmod +x Miniconda3.sh +./Miniconda3.sh +``` + +## Step 2: Create a Conda Environment + +Create a new Conda environment with Python 3.12 and activate it: + +```bash +conda create -y -n nlp python=3.12 +conda activate nlp +``` + +## Step 3: Clone the Repository + +Clone the repository and change into its directory: + +```bash +git clone https://github.com/genomehubs/goat-nlp +cd goat-nlp +``` + +## Step 4: Install Python Dependencies + +Install the required Python packages using pip: + +```bash +pip install -r requirements.txt +``` + +## Step 5: Install Ollama + +Install Ollama using the provided script: + +```bash +curl -fsSL https://ollama.com/install.sh | sh +``` + +## Step 6: Run Ollama + +Run the Ollama application: + +```bash +ollama run llama3 +``` + +## Step 7: Start the Flask Application + +Set the necessary environment variables and start the Flask application: + +```bash +export OLLAMA_HOST_URL=http://127.0.0.1:11434 +export RETRY_COUNT=5 +export GOAT_BASE_URL=https://goat.genomehubs.org/api/v2 +python -m flask run +``` + +The UI will be available at `http://localhost:5000/` + diff --git a/app.py b/app.py new file mode 100644 index 0000000..ed137e9 --- /dev/null +++ b/app.py @@ -0,0 +1,49 @@ +import sys +from flask import Flask, request, render_template, jsonify +from index import load_index, query_engine +from query_reformulation import fetch_related_taxons +import json +import logging + + +app = Flask("goat_nlp") + +handler = logging.StreamHandler(sys.stdout) +handler.setFormatter(logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s')) +app.logger.addHandler(handler) +app.logger.setLevel(logging.INFO) + + +def chat_bot_rag(query): + entity_taxon_map = fetch_related_taxons(query) + + return query_engine.custom_query(query, 
entity_taxon_map) + + +@app.route('/') +def home(): + return render_template('chat.html') + + +@app.route('/rebuildIndex') +def index(): + load_index(force_reload=True) + + +@app.route('/chat', methods=['POST']) +def chat(): + user_message = request.form['user_input'] + bot_message = chat_bot_rag(user_message) + + try: + bot_message = json.loads(bot_message)["url"] + except Exception: + pass + + return jsonify({'response': str(bot_message)}) + + +if __name__ == '__main__': + app.run(debug=True) diff --git a/index.py b/index.py new file mode 100644 index 0000000..79bfad1 --- /dev/null +++ b/index.py @@ -0,0 +1,87 @@ +from dotenv import load_dotenv +from llama_index.core import VectorStoreIndex, StorageContext +from llama_index.core import load_index_from_storage +from llama_index.core import get_response_synthesizer +from llama_index.core import SimpleDirectoryReader +from llama_index.core import Settings +from llama_index.llms.ollama import Ollama +from llama_index.embeddings.huggingface import HuggingFaceEmbedding +import os + +from prompt import QUERY_PROMPT +from query_engine import GoaTAPIQueryEngine + + +load_dotenv() +Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5") +Settings.llm = Ollama(model="llama3", base_url=os. + getenv("OLLAMA_HOST_URL", "http://127.0.0.1:11434"), + request_timeout=36000.0) +Settings.chunk_size = 256 + + +def build_index(documents, + save_dir="rich_query_index", + force=False): + ''' + Build the index from the given rich queries and save it in the specified + directory. + + Parameters: + - documents (list): A list of rich queries to build the index from. + - save_dir (str): The directory path where the index will be saved. + Defaults to "rich_query_index". + - force (bool): If True, forces the index to be rebuilt even if the + save directory already exists. Defaults to False. + + Returns: + - query_index (VectorStoreIndex): The built index. 
+ + Note: + - If the save directory does not exist or force is True, the index is + built from documents and persisted; otherwise it is loaded from disk. + ''' + if not os.path.exists(save_dir) or force: + query_index = VectorStoreIndex.from_documents( + documents + ) + query_index.storage_context.persist(persist_dir=save_dir) + else: + query_index = load_index_from_storage( + StorageContext.from_defaults(persist_dir=save_dir) + ) + + return query_index + + +def load_index(force_reload=False): + ''' + Load the index and query engine for the GoaT NLP system. + + Parameters: + force_reload (bool): If True, force reload the index and rebuild it. + Default is False. + + Returns: + tuple: A tuple containing the index and query engine. + + ''' + documents = SimpleDirectoryReader( + "rich_queries" + ).load_data() + + index = build_index(documents, force=force_reload) + retriever = index.as_retriever(similarity_top_k=3) + synthesizer = get_response_synthesizer(response_mode="compact") + + query_engine = GoaTAPIQueryEngine( + retriever=retriever, + response_synthesizer=synthesizer, + llm=Settings.llm, + qa_prompt=QUERY_PROMPT, + ) + + return index, query_engine + + +index, query_engine = load_index() diff --git a/prompt.py b/prompt.py new file mode 100644 index 0000000..766c7ca --- /dev/null +++ b/prompt.py @@ -0,0 +1,49 @@ +from llama_index.core import PromptTemplate + + +QUERY_PROMPT = PromptTemplate('''We need to query a database that is exposed +by an API that has its own query syntax. I am giving you the query by the user, +you need to convert it to the API counter part. Use the examples given below as +reference: + +{context_str} + +------ + +The current date and time is {time} +Use this for any time related calculation + +We have also fetched some related entities and their taxon id: +{entity_taxon_map} +Use the best matching result from this in the final output. 
+ + +Query given by the user: +{query_str} + + +Return your response in a JSON of the following pattern: +{{ + "url": "" +}} +I do not want any explanation, return ONLY the json +''') + +ENTITY_PROMPT = '''The following query is given by the user: + +{query} + +We need to make an API call using this query. +For that we need to convert all the entities in this query to their +scientific counterparts (specifically their family/species name). +For e.g. cat/fox will be translated to Felidae, elephant to Elephantidae. +Return all entities and their converted form as a single list of strings in a JSON of the following format: +{{ + "entity": ["", ""] +}} +I do not want any explanation, return ONLY the json +''' + + +def wrap_with_entity_prompt(query: str): + return ENTITY_PROMPT.format(query=query) diff --git a/query_engine.py b/query_engine.py new file mode 100644 index 0000000..5713c7e --- /dev/null +++ b/query_engine.py @@ -0,0 +1,52 @@ +import logging +from llama_index.llms.ollama import Ollama +from llama_index.core.query_engine import CustomQueryEngine +from llama_index.core.retrievers import BaseRetriever +from llama_index.core.response_synthesizers import BaseSynthesizer +from llama_index.core import PromptTemplate +from datetime import datetime + + +logger = logging.getLogger('goat_nlp.query_engine') + +class GoaTAPIQueryEngine(CustomQueryEngine): + """ + Custom query engine for the GoaT API. + + Attributes: + retriever (BaseRetriever): The retriever used to retrieve nodes. + response_synthesizer (BaseSynthesizer): The synthesizer used to + generate responses. + llm (Ollama): The language model used for completion. + qa_prompt (PromptTemplate): The template for the QA prompt. + """ + + retriever: BaseRetriever + response_synthesizer: BaseSynthesizer + llm: Ollama + qa_prompt: PromptTemplate + + def custom_query(self, query_str: str, entity_taxon_map: dict): + """ + Custom query method. + + Args: + query_str (str): The query string. 
+ entity_taxon_map (dict): The entity taxon map. + + Returns: + str: The response generated by the language model. + """ + nodes = self.retriever.retrieve(query_str) + + context_str = "\n\n".join([n.node.get_content() for n in nodes]) + current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + populated_prompt = self.qa_prompt.format(context_str=context_str, query_str=query_str, + entity_taxon_map=entity_taxon_map, + time=current_time) + logger.info(populated_prompt) + response = self.llm.complete( + populated_prompt + ) + + return str(response) diff --git a/query_reformulation.py b/query_reformulation.py new file mode 100644 index 0000000..154bae6 --- /dev/null +++ b/query_reformulation.py @@ -0,0 +1,75 @@ +from llama_index.core import Settings +from prompt import wrap_with_entity_prompt +import os +import json +import requests +import logging + +logger = logging.getLogger('goat_nlp.query_reformulation') + + +def fetch_related_taxons(query: str): + """ + Fetches related taxons for a given query. + + Args: + query (str): The query for which related taxons need to be fetched. + + Returns: + dict: A dictionary mapping entities to their corresponding taxons. + + Note: + Errors from the LLM call or JSON parsing are swallowed and retried + (RETRY_COUNT attempts); an empty dict is returned if all fail. + + Example: + >>> query = "find the number of assemblies for bat" + >>> fetch_related_taxons(query) + {'bat': ['Chiroptera', 'bat']} + + """ + entity_taxon_map = {} + for _ in range(int(os.getenv("RETRY_COUNT", 3))): + try: + llm_response = Settings.llm.complete(wrap_with_entity_prompt(query)) + entities = json.loads(llm_response.text)['entity'] + logger.info(entities) + entity_taxon_map = goat_api_call_for_taxon(entities) + break + except Exception: + pass + return entity_taxon_map + + +def goat_api_call_for_taxon(entities: list) -> dict: + """ + Makes an API call to retrieve taxons for a list of entities. + + Args: + entities (list): A list of entities for which taxons need to be + retrieved. 
+ + Returns: + dict: A dictionary mapping entities to their corresponding taxons. + + Note: + A failed lookup (request error, non-200 status or unexpected payload) + is swallowed per entity; that entity is omitted from the result. + + Example: + >>> entities = ["bat", "cat", "dog"] + >>> goat_api_call_for_taxon(entities) + {'bat': ['Chiroptera', 'bat'], 'cat': ['Felis', 'cat'], + 'dog': ['Canis', 'dog']} + """ + entity_result_map = {} + for entity in entities: + try: + response = requests.get(os.getenv('GOAT_BASE_URL', 'https://goat.genomehubs.org/api/v2') + + f"/lookup?searchTerm={entity}" + + "&result=taxon&taxonomy=ncbi") + json_data = response.json() if response and response.status_code == 200 else None + entity_result_map[entity] = [x["result"] for x in json_data['results']] + except Exception: + pass + return entity_result_map diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1c001e4 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +Flask +llama-index +llama-index-llms-ollama +llama-index-llms-replicate +llama-index-embeddings-huggingface +llama-parse +Werkzeug==2.2.2 +python-dotenv diff --git a/rich_queries/queryV1.json b/rich_queries/queryV1.json new file mode 100644 index 0000000..9e54fee --- /dev/null +++ b/rich_queries/queryV1.json @@ -0,0 +1,70 @@ +[ + { + "english_query": "What species of bats have been sequenced?", + "api_query": "https://goat.genomehubs.org/search?result=taxon&includeEstimates=true&summaryValues=count&taxonomy=ncbi&size=100&offset=0&query=tax_tree%289397%5BChiroptera%5D%29%20AND%20tax_rank%28species%29%20AND%20assembly_span&excludeMissing%5B0%5D=assembly_span&excludeAncestral%5B0%5D=assembly_span" + }, + { + "english_query": "How many assemblies have been produced this month?", + "api_query": 
"https://goat.genomehubs.org/search?result=assembly&includeEstimates=false&summaryValues=count&taxonomy=ncbi&size=100&offset=0&query=last_updated%3E%3D2024-05-01&fields=assembly_span%2Cassembly_level%2Clast_updated&names=&ranks=&excludeMissing%5B0%5D=assembly_level" + }, + { + "english_query": "What’s the sequencing status of the greater spear-nosed bat?", + "api_query": "https://goat.genomehubs.org/search?query=tax_name%289423%5BPhyllostomus%20hastatus%5D%29%20AND%20sequencing_status&result=taxon&includeEstimates=false&summaryValues=count&taxonomy=ncbi&size=100&offset=0&fields=sequencing_status&names=&ranks=" + }, + { + "english_query": "What information do we have about spiders?", + "api_query": "https://goat.genomehubs.org/record?recordId=6893&result=taxon&taxonomy=ncbi#Araneae" + }, + { + "english_query": "Does Borneo magnolia have RNA-sequencing?", + "api_query": "https://goat.genomehubs.org/search?result=taxon&includeEstimates=true&summaryValues=count&taxonomy=ncbi&size=100&offset=0&fields=total_runs%2Ctotal_reads%2Clibrary_source%2Cplatform%2Csra_accession%2Crun_accession%2Creads&names=&ranks=&query=tax_name%282933196%5BMagnolia%20borneensis%5D%29%20AND%20sra_accession&emptyColumns=true" + }, + { + "english_query": "What mushroom species have RNA seq?", + "api_query": "https://goat.genomehubs.org/search?result=taxon&includeEstimates=true&summaryValues=count&taxonomy=ncbi&size=100&offset=0&fields=total_runs%2Ctotal_reads%2Clibrary_source%2Cplatform%2Csra_accession%2Crun_accession%2Creads&names=&ranks=&query=tax_tree%285204%5BBasidiomycota%5D%29%20AND%20tax_rank%28species%29%20AND%20sra_accession" + }, + { + "english_query": "How many bird species have genome assemblies?", + "api_query": 
"https://goat.genomehubs.org/search?query=tax_tree%288782%5BAves%5D%29%20AND%20tax_rank%28species%29%20AND%20assembly_span&result=taxon&includeEstimates=true&summaryValues=count&taxonomy=ncbi&emptyColumns=false&excludeAncestral%5B0%5D=assembly_span&excludeMissing%5B0%5D=assembly_span" + }, + { + "english_query": "What sequencing platforms are used for rodent transcriptomes?", + "api_query": "https://goat.genomehubs.org/search?result=taxon&includeEstimates=true&summaryValues=count&taxonomy=ncbi&size=100&offset=0&fields=platform&names=&ranks=&query=tax_tree%289989%5BRodentia%5D%29%20AND%20platform" + }, + { + "english_query": "List all assemblies for humans", + "api_query": "https://goat.genomehubs.org/search?result=assembly&includeEstimates=false&summaryValues=count&taxonomy=ncbi&size=100&offset=0&query=tax_name%289606%5BHomo%20sapiens%5D%29%20AND%20assembly_level" + }, + { + "english_query": "Find samples with RNA-seq data for wolves", + "api_query": "https://goat.genomehubs.org/search?result=sample&includeEstimates=true&summaryValues=count&taxonomy=ncbi&size=100&offset=0&fields=sra_accession%2Clibrary_source%2Crun_accession%2Cplatform%2Creads&names=&ranks=&query=tax_name%289612%5BCanis%20lupus%5D%29%20AND%20sra_accession" + }, + { + "english_query": "How many genome assemblies are available for the genus fruit fly?", + "api_query": 
"https://goat.genomehubs.org/search?result=assembly&includeEstimates=true&summaryValues=count&taxonomy=ncbi&size=100&offset=0&query=tax_tree%287215%5BDrosophila%5D%29%20AND%20assembly_span" + }, + { + "english_query": "What are the latest assemblies for the family cat?", + "api_query": "https://goat.genomehubs.org/search?result=assembly&includeEstimates=false&summaryValues=count&taxonomy=ncbi&size=100&offset=0&query=tax_tree%289681%5BFelidae%5D%29%20AND%20last_updated%3E%3D2024-01-01&fields=assembly_span%2Cassembly_level%2Clast_updated" + }, + { + "english_query": "Which species in the order Primates have been sequenced?", + "api_query": "https://goat.genomehubs.org/search?result=taxon&includeEstimates=true&summaryValues=count&taxonomy=ncbi&size=100&offset=0&query=tax_tree%289443%5BPrimates%5D%29%20AND%20tax_rank%28species%29%20AND%20assembly_span&excludeMissing%5B0%5D=assembly_span&excludeAncestral%5B0%5D=assembly_span" + }, + { + "english_query": "Do any samples for cattle include RNA-seq data?", + "api_query": "https://goat.genomehubs.org/search?result=sample&includeEstimates=true&summaryValues=count&taxonomy=ncbi&size=100&offset=0&fields=sra_accession%2Clibrary_source%2Crun_accession%2Cplatform%2Creads&names=&ranks=&query=tax_name%289913%5BBos%20taurus%5D%29%20AND%20sra_accession" + }, + { + "english_query": "How many genome assemblies for fungi have been updated recently?", + "api_query": "https://goat.genomehubs.org/search?result=assembly&includeEstimates=false&summaryValues=count&taxonomy=ncbi&size=100&offset=0&query=tax_tree%284751%5BFungi%5D%29%20AND%20last_updated%3E%3D2024-01-01" + }, + { + "english_query": "Find all genome assemblies for reptiles.", + "api_query": 
"https://goat.genomehubs.org/search?result=assembly&includeEstimates=true&summaryValues=count&taxonomy=ncbi&size=100&offset=0&query=tax_tree%288504%5BReptilia%5D%29%20AND%20assembly_span&excludeMissing%5B0%5D=assembly_span" + }, + { + "english_query": "What are the available RNA-seq platforms for the common fruit fly?", + "api_query": "https://goat.genomehubs.org/search?result=taxon&includeEstimates=true&summaryValues=count&taxonomy=ncbi&size=100&offset=0&fields=platform&names=&ranks=&query=tax_name%287227%5BDrosophila%20melanogaster%5D%29%20AND%20sra_accession" + } +] diff --git a/templates/chat.html b/templates/chat.html new file mode 100644 index 0000000..6a50124 --- /dev/null +++ b/templates/chat.html @@ -0,0 +1,139 @@ + + + + + + GoaT NLP + + + + + + +
+

GoaT NLP Tester

+
+
+

You are connected to Llama3. Enter a query to get the API URL in the response.

+
+
+
+ + +
+
+ +