This repository has been archived by the owner on Apr 29, 2024. It is now read-only.

Commit 261c74c: Use external vector database (#21)
id-ilych authored Apr 12, 2024
1 parent 511d450 commit 261c74c
Showing 25 changed files with 552 additions and 581 deletions.
6 changes: 5 additions & 1 deletion .env.example
@@ -1,7 +1,11 @@
 # Example of .env file. Check out 1Password for the actual values.
 
 NUR_API_HOST=localhost
-NUR_API_PORT=8000
+NUR_API_PORT=8080
+
+CHROMA_HOST=localhost
+CHROMA_PORT=8000
+CHROMA_DATABASE=default_database
 
 CONFLUENCE_BASE_URL="https://<namespace>.atlassian.net/"
 CONFLUENCE_USER="[email protected]"
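The three new CHROMA_* variables point the application at a standalone Chroma server instead of an on-disk index, and the API moves off port 8000 so Chroma can take it. A minimal sketch (hypothetical, not part of the commit) of how a client could be built from these values, assuming the chromadb Python package:

import os

import chromadb

# Hypothetical wiring from the .env values above to a Chroma client.
client = chromadb.HttpClient(
    host=os.environ.get("CHROMA_HOST", "localhost"),
    port=int(os.environ.get("CHROMA_PORT", "8000")),
    database=os.environ.get("CHROMA_DATABASE", "default_database"),
)
print(client.heartbeat())  # returns a nanosecond timestamp when the server is up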
8 changes: 4 additions & 4 deletions api/endpoint.py
@@ -6,9 +6,9 @@
 from credentials import oai_api_key
 from slack.event_consumer import process_question, process_feedback
 from pydantic import BaseModel
-from vector.chroma import vectorize_document_and_store_in_db
 from configuration import api_host, api_port
-from interactions.vectorize_and_store import vectorize_interaction_and_store_in_db
+import vector.pages
+import vector.interactions
 
 processor = FastAPI()

@@ -63,7 +63,7 @@ def create_embeds(EmbedRequest: EmbedRequest):
     """
     # Using threading to process the embedding generation and storage without blocking the endpoint response
     page_id = EmbedRequest.page_id
-    thread = threading.Thread(target=vectorize_document_and_store_in_db, args=(page_id,))
+    thread = threading.Thread(target=vector.pages.generate_one_embedding_to_database, args=(page_id,))
     thread.start()
     return {"message": "Embedding generation initiated, processing in background", "page_id": page_id}

@@ -77,7 +77,7 @@ def create_interaction_embeds(InteractionEmbedRequest: InteractionEmbedRequest):
     print(f"Received interaction embed request for ID: {interaction_id}")  # Debugging line
 
     # Use threading to process the embedding generation and storage without blocking the endpoint response
-    thread = threading.Thread(target=vectorize_interaction_and_store_in_db, args=(interaction_id,))
+    thread = threading.Thread(target=vector.interactions.generate_one_embedding_to_database, args=(interaction_id,))
     thread.start()
 
     # Make sure to return a response that matches what your client expects
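Both endpoints hand the work to a background thread and respond immediately. A usage sketch for triggering one of them (hypothetical values: the /api/v1/embeds route and payload shape come from the importer helper deleted later in this commit, and 8080 is the new NUR_API_PORT from .env.example):

import requests

# Fire an embedding job for one page; the API returns before the work finishes.
response = requests.post(
    "http://localhost:8080/api/v1/embeds",
    json={"page_id": "123456"},  # hypothetical page ID
    headers={"Content-Type": "application/json"},
)
response.raise_for_status()
print(response.json())  # {"message": "Embedding generation initiated, ...", "page_id": "123456"}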
1 change: 0 additions & 1 deletion bin/manage

This file was deleted.

32 changes: 0 additions & 32 deletions bin/rename_collection.py

This file was deleted.

42 changes: 7 additions & 35 deletions compose.yml
@@ -1,40 +1,12 @@
-name: nur
+name: top-assist
 
 services:
-  web:
-    build:
-      context: .
-      dockerfile: Dockerfile
-    command:
-      - poetry
-      - run
-      - python
-      - api/endpoint.py
-    ports:
-      - "8000:8000"
-    volumes:
-      - shared_content:/app/content
-    environment:
-      - NUR_API_HOST=web
-      - NUR_API_PORT=8000
-
-  slack:
-    depends_on:
-      - web
-    build:
-      context: .
-      dockerfile: Dockerfile
-    command:
-      - poetry
-      - run
-      - python
-      - slack/bot.py
+  chroma:
+    image: ghcr.io/chroma-core/chroma:latest
     volumes:
-      - shared_content:/app/content
-    environment:
-      - NUR_API_HOST=web
-      - NUR_API_PORT=8000
+      - chroma_data:/chroma/.chroma/index
     ports:
       - 8000:8000
 
 volumes:
-  shared_content:
+  chroma_data:
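Only Chroma remains in the compose file; the API and the Slack bot now run outside Docker and reach it over the published port. A quick reachability check after running "docker compose up -d" (hypothetical snippet, assuming the chromadb client package is installed on the host):

import chromadb

# Confirm the composed Chroma service answers on the published port.
client = chromadb.HttpClient(host="localhost", port=8000)
print("heartbeat:", client.heartbeat())
print("collections:", [c.name for c in client.list_collections()])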
22 changes: 11 additions & 11 deletions configuration.py
@@ -23,8 +23,13 @@ def get_project_root() -> str:
 chart_folder_path = os.path.join(project_path, "content", "charts")
 sql_file_path = os.path.join(project_path, "content", "database", "confluence_pages_sql.db")
 db_url = 'sqlite:///' + sql_file_path
-vector_folder_path = os.path.join(project_path, "content", "vectors", "confluence_pages")
-interactions_folder_path = os.path.join(project_path, "content", "vectors", "confluence_interactions")
+
+# Chroma vector database configuration
+chroma_host = os.environ.get("CHROMA_HOST")
+chroma_port = int(os.environ.get("CHROMA_PORT"))
+chroma_database = os.environ.get("CHROMA_DATABASE")
+vector_collection_pages = "pages"
+vector_collection_interactions = "interactions"
 
 # Assistant IDs
 qa_assistant_id = os.environ.get("OPENAI_ASSISTANT_ID_QA")
@@ -45,20 +50,15 @@ def get_project_root() -> str:
 
 # page retrieval for answering questions
 # document count is recommended from 3 to 15 where 3 is minimum cost and 15 is maximum comprehensive answer
-document_count = 2
+question_context_pages_count = 2
 # interaction retrieval for identifying knowledge gaps interaction_retrieval_count is recommended from 3 to 10 where
 # 3 is minimum cost and 10 is maximum comprehensive list of questions
-interaction_retrieval_count = 5
+knowledge_gap_interaction_retrieval_count = 5
 
 # Configuration for the Nur Services API
 # get the values from the environment variables if available or use the default values
-api_host = os.environ.get("NUR_API_HOST", "localhost")
-api_port = int(os.environ.get("NUR_API_PORT", "8000"))
-
-# Name of the vector collection
-pages_collection_name = "pages"
-interactions_collection_name = "interactions"
-
+api_host = os.environ.get("NUR_API_HOST")
+api_port = int(os.environ.get("NUR_API_PORT"))
 
 # System Knowledge space name on Confluence
 system_knowledge_space_private = "Nur Documentation"
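Collection names become code constants while connection details come from the environment; note that with the fallback defaults removed, an unset CHROMA_PORT or NUR_API_PORT now fails at import time, since int(None) raises. A hypothetical consumer of the new settings (get_pages_collection is invented for illustration; the real usage lives in the vector/ package, which is not expanded on this page):

import chromadb

from configuration import (
    chroma_database,
    chroma_host,
    chroma_port,
    vector_collection_pages,
)


def get_pages_collection():
    # Invented helper: open the "pages" collection on the external Chroma server.
    client = chromadb.HttpClient(host=chroma_host, port=chroma_port, database=chroma_database)
    return client.get_or_create_collection(vector_collection_pages)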
59 changes: 4 additions & 55 deletions confluence/importer.py
@@ -1,12 +1,9 @@
 from datetime import datetime
 import logging
-import requests
-import time
 
-from configuration import api_host, api_port
 from database.page_manager import PageManager
 from database.space_manager import SpaceManager
-from vector.create_vector_db import add_embeds_to_vector_db
+import vector.pages
 
 from .client import ConfluenceClient
 from .retriever import retrieve_space
@@ -15,55 +12,6 @@
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
 
-def submit_embedding_creation_request(page_id: str):
-    endpoint_url = f'http://{api_host}:{api_port}/api/v1/embeds'
-    headers = {"Content-Type": "application/json"}
-    payload = {"page_id": page_id}
-
-    try:
-        response = requests.post(endpoint_url, json=payload, headers=headers)
-        response.raise_for_status()  # This will raise for HTTP errors
-        logging.info(f"Embedding creation request successful for page ID {page_id}.")
-    except requests.exceptions.HTTPError as e:
-        logging.error(f"HTTP error occurred while submitting embedding creation request for page ID {page_id}: {e}")
-    except Exception as e:
-        logging.error(f"An error occurred while submitting embedding creation request for page ID {page_id}: {e}")
-
-
-def generate_missing_page_embeddings(retry_limit: int = 3, wait_time: int = 5) -> None:
-    for attempt in range(retry_limit):
-        # Retrieve the IDs of pages that are still missing embeddings.
-        page_ids = PageManager().get_page_ids_missing_embeds()
-        # If there are no pages missing embeddings, exit the loop and end the process.
-        if not page_ids:
-            logging.info("All pages have embeddings. Process complete.")
-            return
-
-        logging.info(f"Attempt {attempt + 1} of {retry_limit}: Processing {len(page_ids)} pages missing embeddings.")
-        for page_id in page_ids:
-            # Submit a request to generate an embedding for each page ID.
-            submit_embedding_creation_request(page_id)
-            # A brief delay between requests to manage load and potentially avoid rate limiting.
-            time.sleep(0.2)
-
-        logging.info(f"Waiting for {wait_time} seconds for embeddings to be processed...")
-        time.sleep(wait_time)
-
-        # After waiting, retrieve the list of pages still missing embeddings to see if the list has decreased.
-        # This retrieval is crucial to ensure that the loop only continues if there are still pages that need processing.
-        if page_ids := PageManager().get_page_ids_missing_embeds():
-            logging.info(f"After attempt {attempt + 1}, {len(page_ids)} pages are still missing embeds.")
-        else:
-            logging.info("All pages now have embeddings. Process complete.")
-            break  # Break out of the loop if there are no more pages missing embeddings.
-
-    # After exhausting the retry limit, check if there are still pages without embeddings.
-    if page_ids:
-        logging.info("Some pages still lack embeddings after all attempts.")
-    else:
-        logging.info("All pages now have embeddings. Process complete.")
-
-
 def tui_choose_space():
     """
     Prompt the user to choose a Confluence space from a list of available spaces.
@@ -84,12 +32,13 @@ def import_space(space_key, space_name):
 
     pages = retrieve_space(space_key)
     PageManager().store_pages_data(space_key, pages)
-    generate_missing_page_embeddings()
+
+    vector.pages.generate_missing_embeddings_to_database()
 
     SpaceManager().upsert_space_info(
         space_key=space_key,
         space_name=space_name,
         last_import_date=import_date
     )
 
-    add_embeds_to_vector_db(space_key)
+    vector.pages.import_from_database(space_key)
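The HTTP round-trip and retry loop are gone: the importer now calls the new vector.pages module in-process. That module is among the 25 changed files but is not expanded on this page, so the following reconstruction is only a guess from its three call sites in this commit:

# Hypothetical sketch of vector/pages.py, inferred from call sites only;
# the real file is part of this commit but not shown in this excerpt.
from database.page_manager import PageManager


def generate_one_embedding_to_database(page_id: str) -> None:
    """Embed a single page and persist the embedding (real body unknown)."""
    ...


def generate_missing_embeddings_to_database() -> None:
    """Embed every page still missing an embedding, without HTTP retries."""
    for page_id in PageManager().get_page_ids_missing_embeds():
        generate_one_embedding_to_database(page_id)


def import_from_database(space_key: str) -> None:
    """Load a space's stored embeddings into the external Chroma collection."""
    ...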
2 changes: 0 additions & 2 deletions content/vectors/.gitignore

This file was deleted.

78 changes: 21 additions & 57 deletions interactions/identify_knowledge_gap.py
@@ -1,65 +1,24 @@
 # ./interactions/identify_knowledge_gap.py
-import chromadb
-import logging
 import json
+import logging
 from typing import List, Tuple
-from configuration import interactions_folder_path, embedding_model_id, knowledge_gap_discussions_channel_id
-from configuration import interaction_retrieval_count, interactions_collection_name
-from configuration import quizz_assistant_id
-from open_ai.embedding.embed_manager import embed_text
+
+from configuration import (
+    knowledge_gap_discussions_channel_id,
+    knowledge_gap_interaction_retrieval_count,
+    quizz_assistant_id,
+)
+from database.interaction_manager import QAInteractionManager, QAInteraction
+from database.quiz_question_manager import QuizQuestionManager
 from open_ai.assistants.utility import extract_assistant_response, initiate_client
 from open_ai.assistants.thread_manager import ThreadManager
 from open_ai.assistants.assistant_manager import AssistantManager
-from database.interaction_manager import QAInteractionManager, QAInteraction
-from database.quiz_question_manager import QuizQuestionManager
 from slack.message_manager import post_questions_to_slack
+import vector.interactions
 
 
 logging.basicConfig(level=logging.INFO)
 
 
-def retrieve_relevant_interaction_ids(query: str) -> List[str]:
-    """
-    Retrieve the most relevant interactions for a given query using ChromaDB.
-    Args:
-        query (str): The query to retrieve relevant interactions for.
-    Returns:
-        List[str]: A list of interaction IDs of the most relevant interactions.
-    """
-
-    # Generate the query embedding using OpenAI
-    try:
-        query_embedding = embed_text(text=query, model=embedding_model_id)
-    except Exception as e:
-        logging.error(f"Error generating query embedding: {e}")
-        return []
-
-    # Initialize the ChromaDB client
-    client = chromadb.PersistentClient(path=interactions_folder_path)
-
-    collections = client.list_collections()
-    logging.info(f"Available collections: {collections}")
-
-    collection = client.get_collection(interactions_collection_name)
-
-    # Perform a similarity search in the collection
-    similar_items = collection.query(
-        query_embeddings=[query_embedding],
-        n_results=interaction_retrieval_count
-    )
-
-    # Extract and return the interaction IDs from the results
-    if 'ids' in similar_items:
-        interaction_ids = [id for sublist in similar_items['ids'] for id in sublist]
-    else:
-        logging.warning("No 'ids' key found in similar_items, no interactions retrieved.")
-        interaction_ids = []
-
-    return interaction_ids
-
-
 def format_interactions(interactions: List['QAInteraction']) -> Tuple[str, List[str]]:
     """
     Format a list of QAInteraction objects into a human-readable string and collect user IDs.
@@ -200,7 +159,7 @@ def process_and_store_questions(assistant_response_json):
     return quiz_question_dtos
 
 
-def strip_json(assistant_response):
+def strip_json(assistant_response: str):
     """
     Receives the response and extracts only the JSON from it, then returns the JSON.
     This function assumes the JSON content is wrapped in triple backticks with 'json' as a language identifier.
@@ -212,10 +171,15 @@ def strip_json(assistant_response):
         str: The extracted JSON content as a string. Returns an empty JSON array '[]' if extraction fails.
     """
     try:
-        # Attempt to find the start of the JSON content
-        start_index = assistant_response.index("```json") + len("```json")
-        end_index = assistant_response.index("```", start_index)
-        json_content = assistant_response[start_index:end_index].strip()
+        json_content = assistant_response
+
+        # Strip markdown quotes if any
+        if "```json" in assistant_response:
+            start_index = assistant_response.index("```json") + len("```json")
+            end_index = assistant_response.index("```", start_index)
+            json_content = json_content[start_index:end_index]
+
+        json_content = json_content.strip()
         # Basic validation to check if it's likely to be JSON
         if json_content.startswith("[") and json_content.endswith("]"):
             return json_content
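strip_json now passes bare JSON through and only unwraps a fenced block when one is present; previously, unfenced input raised in the index() call and, per the docstring, came back as '[]'. A usage sketch with hypothetical inputs:

# Both input shapes are accepted after this commit.
fenced = 'Intro text\n```json\n[{"question": "What do we know about X?"}]\n```'
bare = '[{"question": "What do we know about X?"}]'

assert strip_json(fenced) == '[{"question": "What do we know about X?"}]'
assert strip_json(bare) == '[{"question": "What do we know about X?"}]'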
@@ -229,7 +193,7 @@
 
 def identify_knowledge_gaps(context):
     query = f"no information in context: {context}"
-    interaction_ids = retrieve_relevant_interaction_ids(query)
+    interaction_ids = vector.interactions.retrieve_relevant_ids(query, count=knowledge_gap_interaction_retrieval_count)
     relevant_qa_interactions = QAInteractionManager().get_interactions_by_interaction_ids(interaction_ids)
     formatted_interactions, user_ids = format_interactions(relevant_qa_interactions)
     assistant_response, thread_ids = query_assistant_with_context(context, formatted_interactions)
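The deleted retrieve_relevant_interaction_ids above defines the contract for its replacement. A hypothetical reconstruction of vector.interactions.retrieve_relevant_ids, adapted from that deleted body but pointed at the external Chroma server (the real module is part of this commit and not shown on this page):

# Hypothetical sketch of vector/interactions.py::retrieve_relevant_ids.
from typing import List

import chromadb

from configuration import (
    chroma_database,
    chroma_host,
    chroma_port,
    embedding_model_id,
    vector_collection_interactions,
)
from open_ai.embedding.embed_manager import embed_text


def retrieve_relevant_ids(query: str, count: int) -> List[str]:
    # Same flow as the deleted function: embed the query, then similarity-search.
    query_embedding = embed_text(text=query, model=embedding_model_id)
    client = chromadb.HttpClient(host=chroma_host, port=chroma_port, database=chroma_database)
    collection = client.get_collection(vector_collection_interactions)
    similar_items = collection.query(query_embeddings=[query_embedding], n_results=count)
    # Chroma returns one ID list per query embedding; flatten to a single list.
    return [item_id for sublist in similar_items["ids"] for item_id in sublist]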