This repository has been archived by the owner on Apr 29, 2024. It is now read-only.

Commit 261c74c: Use external vector database (#21)
id-ilych authored Apr 12, 2024
1 parent 511d450 commit 261c74c
Showing 25 changed files with 552 additions and 581 deletions.
6 changes: 5 additions & 1 deletion .env.example
@@ -1,7 +1,11 @@
 # Example of .env file. Check out 1Password for the actual values.
 
 NUR_API_HOST=localhost
-NUR_API_PORT=8000
+NUR_API_PORT=8080
+
+CHROMA_HOST=localhost
+CHROMA_PORT=8000
+CHROMA_DATABASE=default_database
 
 CONFLUENCE_BASE_URL="https://<namespace>.atlassian.net/"
 CONFLUENCE_USER="[email protected]"
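The three new CHROMA_* variables point the application at a standalone Chroma server instead of an on-disk index, and the API moves off port 8000 so Chroma can take it. A minimal sketch (hypothetical, not part of the commit) of how a client could be built from these values, assuming the chromadb Python package:

import os

import chromadb

# Hypothetical wiring from the .env values above to a Chroma client.
client = chromadb.HttpClient(
    host=os.environ.get("CHROMA_HOST", "localhost"),
    port=int(os.environ.get("CHROMA_PORT", "8000")),
    database=os.environ.get("CHROMA_DATABASE", "default_database"),
)
print(client.heartbeat())  # returns a nanosecond timestamp when the server is up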
8 changes: 4 additions & 4 deletions api/endpoint.py
@@ -6,9 +6,9 @@
 from credentials import oai_api_key
 from slack.event_consumer import process_question, process_feedback
 from pydantic import BaseModel
-from vector.chroma import vectorize_document_and_store_in_db
 from configuration import api_host, api_port
-from interactions.vectorize_and_store import vectorize_interaction_and_store_in_db
+import vector.pages
+import vector.interactions
 
 processor = FastAPI()

@@ -63,7 +63,7 @@ def create_embeds(EmbedRequest: EmbedRequest):
     """
     # Using threading to process the embedding generation and storage without blocking the endpoint response
     page_id = EmbedRequest.page_id
-    thread = threading.Thread(target=vectorize_document_and_store_in_db, args=(page_id,))
+    thread = threading.Thread(target=vector.pages.generate_one_embedding_to_database, args=(page_id,))
     thread.start()
     return {"message": "Embedding generation initiated, processing in background", "page_id": page_id}

@@ -77,7 +77,7 @@ def create_interaction_embeds(InteractionEmbedRequest: InteractionEmbedRequest):
     print(f"Received interaction embed request for ID: {interaction_id}")  # Debugging line
 
     # Use threading to process the embedding generation and storage without blocking the endpoint response
-    thread = threading.Thread(target=vectorize_interaction_and_store_in_db, args=(interaction_id,))
+    thread = threading.Thread(target=vector.interactions.generate_one_embedding_to_database, args=(interaction_id,))
     thread.start()
 
     # Make sure to return a response that matches what your client expects
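Both endpoints hand the work to a background thread and respond immediately. A usage sketch for triggering one of them (hypothetical values: the /api/v1/embeds route and payload shape come from the importer helper deleted later in this commit, and 8080 is the new NUR_API_PORT from .env.example):

import requests

# Fire an embedding job for one page; the API returns before the work finishes.
response = requests.post(
    "http://localhost:8080/api/v1/embeds",
    json={"page_id": "123456"},  # hypothetical page ID
    headers={"Content-Type": "application/json"},
)
response.raise_for_status()
print(response.json())  # {"message": "Embedding generation initiated, ...", "page_id": "123456"}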
1 change: 0 additions & 1 deletion bin/manage

This file was deleted.

32 changes: 0 additions & 32 deletions bin/rename_collection.py

This file was deleted.

42 changes: 7 additions & 35 deletions compose.yml
@@ -1,40 +1,12 @@
-name: nur
+name: top-assist
 
 services:
-  web:
-    build:
-      context: .
-      dockerfile: Dockerfile
-    command:
-      - poetry
-      - run
-      - python
-      - api/endpoint.py
-    ports:
-      - "8000:8000"
-    volumes:
-      - shared_content:/app/content
-    environment:
-      - NUR_API_HOST=web
-      - NUR_API_PORT=8000
-
-  slack:
-    depends_on:
-      - web
-    build:
-      context: .
-      dockerfile: Dockerfile
-    command:
-      - poetry
-      - run
-      - python
-      - slack/bot.py
+  chroma:
+    image: ghcr.io/chroma-core/chroma:latest
     volumes:
-      - shared_content:/app/content
-    environment:
-      - NUR_API_HOST=web
-      - NUR_API_PORT=8000
+      - chroma_data:/chroma/.chroma/index
     ports:
       - 8000:8000
 
 volumes:
-  shared_content:
+  chroma_data:
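Only Chroma remains in the compose file; the API and the Slack bot now run outside Docker and reach it over the published port. A quick reachability check after running "docker compose up -d" (hypothetical snippet, assuming the chromadb client package is installed on the host):

import chromadb

# Confirm the composed Chroma service answers on the published port.
client = chromadb.HttpClient(host="localhost", port=8000)
print("heartbeat:", client.heartbeat())
print("collections:", [c.name for c in client.list_collections()])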
22 changes: 11 additions & 11 deletions configuration.py
@@ -23,8 +23,13 @@ def get_project_root() -> str:
 chart_folder_path = os.path.join(project_path, "content", "charts")
 sql_file_path = os.path.join(project_path, "content", "database", "confluence_pages_sql.db")
 db_url = 'sqlite:///' + sql_file_path
-vector_folder_path = os.path.join(project_path, "content", "vectors", "confluence_pages")
-interactions_folder_path = os.path.join(project_path, "content", "vectors", "confluence_interactions")
+
+# Chroma vector database configuration
+chroma_host = os.environ.get("CHROMA_HOST")
+chroma_port = int(os.environ.get("CHROMA_PORT"))
+chroma_database = os.environ.get("CHROMA_DATABASE")
+vector_collection_pages = "pages"
+vector_collection_interactions = "interactions"
 
 # Assistant IDs
 qa_assistant_id = os.environ.get("OPENAI_ASSISTANT_ID_QA")
@@ -45,20 +50,15 @@ def get_project_root() -> str:
 
 # page retrieval for answering questions
 # document count is recommended from 3 to 15 where 3 is minimum cost and 15 is maximum comprehensive answer
-document_count = 2
+question_context_pages_count = 2
 # interaction retrieval for identifying knowledge gaps interaction_retrieval_count is recommended from 3 to 10 where
 # 3 is minimum cost and 10 is maximum comprehensive list of questions
-interaction_retrieval_count = 5
+knowledge_gap_interaction_retrieval_count = 5
 
 # Configuration for the Nur Services API
 # get the values from the environment variables if available or use the default values
-api_host = os.environ.get("NUR_API_HOST", "localhost")
-api_port = int(os.environ.get("NUR_API_PORT", "8000"))
-
-# Name of the vector collection
-pages_collection_name = "pages"
-interactions_collection_name = "interactions"
-
+api_host = os.environ.get("NUR_API_HOST")
+api_port = int(os.environ.get("NUR_API_PORT"))
 
 # System Knowledge space name on Confluence
 system_knowledge_space_private = "Nur Documentation"
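Collection names become code constants while connection details come from the environment; note that with the fallback defaults removed, an unset CHROMA_PORT or NUR_API_PORT now fails at import time, since int(None) raises. A hypothetical consumer of the new settings (get_pages_collection is invented for illustration; the real usage lives in the vector/ package, which is not expanded on this page):

import chromadb

from configuration import (
    chroma_database,
    chroma_host,
    chroma_port,
    vector_collection_pages,
)


def get_pages_collection():
    # Invented helper: open the "pages" collection on the external Chroma server.
    client = chromadb.HttpClient(host=chroma_host, port=chroma_port, database=chroma_database)
    return client.get_or_create_collection(vector_collection_pages)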
59 changes: 4 additions & 55 deletions confluence/importer.py
@@ -1,12 +1,9 @@
 from datetime import datetime
 import logging
-import requests
-import time
 
-from configuration import api_host, api_port
 from database.page_manager import PageManager
 from database.space_manager import SpaceManager
-from vector.create_vector_db import add_embeds_to_vector_db
+import vector.pages
 
 from .client import ConfluenceClient
 from .retriever import retrieve_space
@@ -15,55 +12,6 @@
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 
-def submit_embedding_creation_request(page_id: str):
-    endpoint_url = f'http://{api_host}:{api_port}/api/v1/embeds'
-    headers = {"Content-Type": "application/json"}
-    payload = {"page_id": page_id}
-
-    try:
-        response = requests.post(endpoint_url, json=payload, headers=headers)
-        response.raise_for_status()  # This will raise for HTTP errors
-        logging.info(f"Embedding creation request successful for page ID {page_id}.")
-    except requests.exceptions.HTTPError as e:
-        logging.error(f"HTTP error occurred while submitting embedding creation request for page ID {page_id}: {e}")
-    except Exception as e:
-        logging.error(f"An error occurred while submitting embedding creation request for page ID {page_id}: {e}")
-
-
-def generate_missing_page_embeddings(retry_limit: int = 3, wait_time: int = 5) -> None:
-    for attempt in range(retry_limit):
-        # Retrieve the IDs of pages that are still missing embeddings.
-        page_ids = PageManager().get_page_ids_missing_embeds()
-        # If there are no pages missing embeddings, exit the loop and end the process.
-        if not page_ids:
-            logging.info("All pages have embeddings. Process complete.")
-            return
-
-        logging.info(f"Attempt {attempt + 1} of {retry_limit}: Processing {len(page_ids)} pages missing embeddings.")
-        for page_id in page_ids:
-            # Submit a request to generate an embedding for each page ID.
-            submit_embedding_creation_request(page_id)
-            # A brief delay between requests to manage load and potentially avoid rate limiting.
-            time.sleep(0.2)
-
-        logging.info(f"Waiting for {wait_time} seconds for embeddings to be processed...")
-        time.sleep(wait_time)
-
-        # After waiting, retrieve the list of pages still missing embeddings to see if the list has decreased.
-        # This retrieval is crucial to ensure that the loop only continues if there are still pages that need processing.
-        if page_ids := PageManager().get_page_ids_missing_embeds():
-            logging.info(f"After attempt {attempt + 1}, {len(page_ids)} pages are still missing embeds.")
-        else:
-            logging.info("All pages now have embeddings. Process complete.")
-            break  # Break out of the loop if there are no more pages missing embeddings.
-
-    # After exhausting the retry limit, check if there are still pages without embeddings.
-    if page_ids:
-        logging.info("Some pages still lack embeddings after all attempts.")
-    else:
-        logging.info("All pages now have embeddings. Process complete.")
-
-
 def tui_choose_space():
     """
     Prompt the user to choose a Confluence space from a list of available spaces.
@@ -84,12 +32,13 @@ def import_space(space_key, space_name):
 
     pages = retrieve_space(space_key)
     PageManager().store_pages_data(space_key, pages)
-    generate_missing_page_embeddings()
+
+    vector.pages.generate_missing_embeddings_to_database()
 
     SpaceManager().upsert_space_info(
         space_key=space_key,
         space_name=space_name,
         last_import_date=import_date
     )
 
-    add_embeds_to_vector_db(space_key)
+    vector.pages.import_from_database(space_key)
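The HTTP round-trip and retry loop are gone: the importer now calls the new vector.pages module in-process. That module is among the 25 changed files but is not expanded on this page, so the following reconstruction is only a guess from its three call sites in this commit:

# Hypothetical sketch of vector/pages.py, inferred from call sites only;
# the real file is part of this commit but not shown in this excerpt.
from database.page_manager import PageManager


def generate_one_embedding_to_database(page_id: str) -> None:
    """Embed a single page and persist the embedding (real body unknown)."""
    ...


def generate_missing_embeddings_to_database() -> None:
    """Embed every page still missing an embedding, without HTTP retries."""
    for page_id in PageManager().get_page_ids_missing_embeds():
        generate_one_embedding_to_database(page_id)


def import_from_database(space_key: str) -> None:
    """Load a space's stored embeddings into the external Chroma collection."""
    ...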
2 changes: 0 additions & 2 deletions content/vectors/.gitignore

This file was deleted.

78 changes: 21 additions & 57 deletions interactions/identify_knowledge_gap.py
@@ -1,65 +1,24 @@
 # ./interactions/identify_knowledge_gap.py
-import chromadb
-import logging
 import json
+import logging
 from typing import List, Tuple
-from configuration import interactions_folder_path, embedding_model_id, knowledge_gap_discussions_channel_id
-from configuration import interaction_retrieval_count, interactions_collection_name
-from configuration import quizz_assistant_id
-from open_ai.embedding.embed_manager import embed_text
+
+from configuration import (
+    knowledge_gap_discussions_channel_id,
+    knowledge_gap_interaction_retrieval_count,
+    quizz_assistant_id,
+)
+from database.interaction_manager import QAInteractionManager, QAInteraction
+from database.quiz_question_manager import QuizQuestionManager
 from open_ai.assistants.utility import extract_assistant_response, initiate_client
 from open_ai.assistants.thread_manager import ThreadManager
 from open_ai.assistants.assistant_manager import AssistantManager
-from database.interaction_manager import QAInteractionManager, QAInteraction
-from database.quiz_question_manager import QuizQuestionManager
 from slack.message_manager import post_questions_to_slack
+import vector.interactions
 
 
 logging.basicConfig(level=logging.INFO)
 
 
-def retrieve_relevant_interaction_ids(query: str) -> List[str]:
-    """
-    Retrieve the most relevant interactions for a given query using ChromaDB.
-    Args:
-        query (str): The query to retrieve relevant interactions for.
-    Returns:
-        List[str]: A list of interaction IDs of the most relevant interactions.
-    """
-
-    # Generate the query embedding using OpenAI
-    try:
-        query_embedding = embed_text(text=query, model=embedding_model_id)
-    except Exception as e:
-        logging.error(f"Error generating query embedding: {e}")
-        return []
-
-    # Initialize the ChromaDB client
-    client = chromadb.PersistentClient(path=interactions_folder_path)
-
-    collections = client.list_collections()
-    logging.info(f"Available collections: {collections}")
-
-    collection = client.get_collection(interactions_collection_name)
-
-    # Perform a similarity search in the collection
-    similar_items = collection.query(
-        query_embeddings=[query_embedding],
-        n_results=interaction_retrieval_count
-    )
-
-    # Extract and return the interaction IDs from the results
-    if 'ids' in similar_items:
-        interaction_ids = [id for sublist in similar_items['ids'] for id in sublist]
-    else:
-        logging.warning("No 'ids' key found in similar_items, no interactions retrieved.")
-        interaction_ids = []
-
-    return interaction_ids
-
-
 def format_interactions(interactions: List['QAInteraction']) -> Tuple[str, List[str]]:
     """
     Format a list of QAInteraction objects into a human-readable string and collect user IDs.
@@ -200,7 +159,7 @@ def process_and_store_questions(assistant_response_json):
     return quiz_question_dtos
 
 
-def strip_json(assistant_response):
+def strip_json(assistant_response: str):
     """
     Receives the response and extracts only the JSON from it, then returns the JSON.
     This function assumes the JSON content is wrapped in triple backticks with 'json' as a language identifier.
@@ -212,10 +171,15 @@ def strip_json(assistant_response):
         str: The extracted JSON content as a string. Returns an empty JSON array '[]' if extraction fails.
     """
     try:
-        # Attempt to find the start of the JSON content
-        start_index = assistant_response.index("```json") + len("```json")
-        end_index = assistant_response.index("```", start_index)
-        json_content = assistant_response[start_index:end_index].strip()
+        json_content = assistant_response
+
+        # Strip markdown quotes if any
+        if "```json" in assistant_response:
+            start_index = assistant_response.index("```json") + len("```json")
+            end_index = assistant_response.index("```", start_index)
+            json_content = json_content[start_index:end_index]
+
+        json_content = json_content.strip()
         # Basic validation to check if it's likely to be JSON
         if json_content.startswith("[") and json_content.endswith("]"):
             return json_content
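strip_json now passes bare JSON through and only unwraps a fenced block when one is present; previously, unfenced input raised in the index() call and, per the docstring, came back as '[]'. A usage sketch with hypothetical inputs:

# Both input shapes are accepted after this commit.
fenced = 'Intro text\n```json\n[{"question": "What do we know about X?"}]\n```'
bare = '[{"question": "What do we know about X?"}]'

assert strip_json(fenced) == '[{"question": "What do we know about X?"}]'
assert strip_json(bare) == '[{"question": "What do we know about X?"}]'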
@@ -229,7 +193,7 @@
 
 def identify_knowledge_gaps(context):
     query = f"no information in context: {context}"
-    interaction_ids = retrieve_relevant_interaction_ids(query)
+    interaction_ids = vector.interactions.retrieve_relevant_ids(query, count=knowledge_gap_interaction_retrieval_count)
     relevant_qa_interactions = QAInteractionManager().get_interactions_by_interaction_ids(interaction_ids)
     formatted_interactions, user_ids = format_interactions(relevant_qa_interactions)
     assistant_response, thread_ids = query_assistant_with_context(context, formatted_interactions)
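The deleted retrieve_relevant_interaction_ids above defines the contract for its replacement. A hypothetical reconstruction of vector.interactions.retrieve_relevant_ids, adapted from that deleted body but pointed at the external Chroma server (the real module is part of this commit and not shown on this page):

# Hypothetical sketch of vector/interactions.py::retrieve_relevant_ids.
from typing import List

import chromadb

from configuration import (
    chroma_database,
    chroma_host,
    chroma_port,
    embedding_model_id,
    vector_collection_interactions,
)
from open_ai.embedding.embed_manager import embed_text


def retrieve_relevant_ids(query: str, count: int) -> List[str]:
    # Same flow as the deleted function: embed the query, then similarity-search.
    query_embedding = embed_text(text=query, model=embedding_model_id)
    client = chromadb.HttpClient(host=chroma_host, port=chroma_port, database=chroma_database)
    collection = client.get_collection(vector_collection_interactions)
    similar_items = collection.query(query_embeddings=[query_embedding], n_results=count)
    # Chroma returns one ID list per query embedding; flatten to a single list.
    return [item_id for sublist in similar_items["ids"] for item_id in sublist]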