Commit 1decee6: Step4
sasha370 committed Apr 12, 2024 (1 parent: 9ae9213)
Showing 6 changed files with 69 additions and 25 deletions.
14 changes: 14 additions & 0 deletions README.md
@@ -161,6 +161,20 @@ If the above works, you should see Nur's manager command prompt - the same thing

Bear in mind that the Dockerized version uses a shared volume named `nur_shared_content`. This is bootstrapped during the first installation, but will persist until it is manually removed.

## Add a new data model using Alembic


```warp-runnable-command
# Move to the poetry package
cd code/nur/nurai
# Import the model in /nurai/migrations/env.py
# Create a new migration
poetry run alembic revision --autogenerate -m "Initialize the db"
# Apply the migration
poetry run alembic upgrade head
```
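For context on the "Import the model" step: Alembic's `--autogenerate` only detects tables whose models are imported where `env.py` builds its `target_metadata`. A minimal sketch of what that looks like (the `Interaction` model, file paths, and `Base` wiring below are illustrative, not part of this commit):

```python
# nurai/models/interaction.py -- hypothetical model used for illustration
from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class Interaction(Base):
    __tablename__ = "interactions"

    id = Column(Integer, primary_key=True)
    question = Column(String, nullable=False)
    answer = Column(String)

# nurai/migrations/env.py (excerpt) -- the import is what makes the table
# visible to `alembic revision --autogenerate`:
#
#   from nurai.models.interaction import Interaction  # noqa: F401
#   target_metadata = Base.metadata
```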


## Network traffic

1. Outgoing to Open AI API (for Embeds and completion)
5 changes: 0 additions & 5 deletions database/page_manager.py
```diff
@@ -57,7 +57,6 @@ def get_page_ids_missing_embeds(self, session):
         Retrieve the page IDs of pages that are missing embeddings.
         :return: A list of page IDs.
         """
-        # with session as session:
         records = session.query(PageData).filter(
             (PageData.lastUpdated > PageData.last_embedded) |
             (PageData.last_embedded.is_(None))
@@ -72,7 +71,6 @@ def get_all_page_data_from_db(self, session, space_key=None):
         :param space_key: Optional; the specific space key to filter pages by.
         :return: Tuple of page_ids (list of page IDs), all_documents (list of document strings), and embeddings (list of embeddings as strings)
         """
-        # with session as session:
         if space_key:
             records = session.query(PageData).filter(PageData.space_key == space_key).all()
         else:
@@ -100,7 +98,6 @@ def add_or_update_embed_vector(self, page_id, embed_vector, session):
             page_id (str): The ID of the page to update.
             embed_vector: The embed vector data to be added or updated, expected to be a list of floats.
         """
-        # with session as session:
         page = session.query(PageData).filter_by(page_id=page_id).first()
 
         if page:
@@ -117,7 +114,6 @@ def find_page(self, page_id, session) -> Optional[PageData]:
         :param page_id: The ID of the page to find.
         :return: The page data if found, or None if not found.
         """
-        # with session as session:
         return session.query(PageData).filter_by(page_id=page_id).first()
 
     def find_pages(self, page_ids, session) -> List[PageData]:
@@ -126,7 +122,6 @@ def find_pages(self, page_ids, session) -> List[PageData]:
         :param page_ids: A list of page IDs to find.
         :return: A list of page data if found, or an empty list if not found.
         """
-        # with session as session:
         return session.query(PageData).filter(PageData.page_id.in_(page_ids)).all()
 
     def format_page_for_llm(self, page: PageData) -> str:
```
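The deleted `# with session as session:` comments were leftovers from when each method opened its own session; after this cleanup the caller owns the session and passes it in. A sketch of the resulting call pattern (`PageManager` as the enclosing class name is an assumption; `get_db_session` comes from `database.database`, as seen in the other files of this commit):

```python
# Hypothetical caller: one session is reused across several queries.
from database.database import get_db_session   # session factory used by main.py
from database.page_manager import PageManager  # assumed name of the enclosing class

manager = PageManager()
with get_db_session() as session:
    page_ids = manager.get_page_ids_missing_embeds(session)
    for page in manager.find_pages(page_ids, session):
        print(manager.format_page_for_llm(page))
```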
32 changes: 32 additions & 0 deletions docker-compose.yml
```diff
@@ -0,0 +1,32 @@
+version: '3.3'
+services:
+  db:
+    image: postgres:12
+    restart: always
+    environment:
+      POSTGRES_DB: 'top-assist'
+      POSTGRES_USER: 'toptal'
+      POSTGRES_PASSWORD: 'toptal'
+    ports:
+      - '5432:5432'
+    expose:
+      - '5432'
+    volumes:
+      - postgres-data:/var/lib/postgresql/data
+  chroma:
+    image: ghcr.io/chroma-core/chroma:latest
+    volumes:
+      - chroma-data:/chroma/.chroma/index
+    ports:
+      - 8001:8000
+    networks:
+      - net
+
+volumes:
+  postgres-data:
+  chroma-data:
+    driver: local
+
+networks:
+  net:
+    driver: bridge
```
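With the services up (`docker compose up -d`), client code reaches them as sketched below; the ports and credentials come straight from the compose file, while the client libraries (`sqlalchemy` with a Postgres driver, `chromadb`) and the snippet itself are assumptions, not code from this commit:

```python
# Connectivity check against the compose services; ports and credentials
# are taken from docker-compose.yml, everything else is illustrative.
import chromadb
from sqlalchemy import create_engine, text

# Postgres is published on host port 5432 with the credentials set above.
engine = create_engine("postgresql://toptal:toptal@localhost:5432/top-assist")
with engine.connect() as conn:
    print(conn.execute(text("SELECT 1")).scalar())  # 1 if the database answers

# Chroma: the compose file maps host port 8001 to the container's port 8000.
chroma = chromadb.HttpClient(host="localhost", port=8001)
print(chroma.heartbeat())  # nanosecond timestamp when the server is reachable
```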
15 changes: 9 additions & 6 deletions interactions/identify_knowledge_gap.py
```diff
@@ -6,7 +6,6 @@
 from configuration import interactions_folder_path, embedding_model_id, knowledge_gap_discussions_channel_id
 from configuration import interaction_retrieval_count, interactions_collection_name
 from configuration import quizz_assistant_id
-from database.database import get_db_session
 from open_ai.embedding.embed_manager import embed_text
 from open_ai.assistants.utility import extract_assistant_response, initiate_client
 from open_ai.assistants.thread_manager import ThreadManager
@@ -213,14 +212,18 @@ def strip_json(assistant_response):
         str: The extracted JSON content as a string. Returns an empty JSON array '[]' if extraction fails.
     """
     try:
-        # Attempt to find the start of the JSON content
-        if assistant_response.find("```json") == 1:
+        json_content = assistant_response
+
+        # Strip markdown quotes if any
+        if "```json" in assistant_response:
             start_index = assistant_response.index("```json") + len("```json")
             end_index = assistant_response.index("```", start_index)
-            assistant_response = assistant_response[start_index:end_index].strip()
+            json_content = json_content[start_index:end_index]
+
+        json_content = json_content.strip()
         # Basic validation to check if it's likely to be JSON
-        if assistant_response.startswith("[") and assistant_response.endswith("]"):
-            return assistant_response
+        if json_content.startswith("[") and json_content.endswith("]"):
+            return json_content
         else:
             logging.error("Extracted content does not appear to be valid JSON.")
             return "[]"
```
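This rewrite also fixes a real bug: the old guard `assistant_response.find(...) == 1` compared against index 1, so it was only true when the fence started at the second character, and fenced responses usually skipped the unwrap step entirely. With the new `in` check, the behaviour is as follows (hypothetical calls, not part of the commit):

```python
# Illustrative inputs for strip_json after this change.
fenced = 'Gaps found:\n```json\n[{"topic": "billing reminders"}]\n```'
print(strip_json(fenced))       # -> [{"topic": "billing reminders"}]
print(strip_json('[1, 2, 3]'))  # bare JSON arrays pass through unchanged
print(strip_json('no json'))    # logs an error and returns '[]'
```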
6 changes: 3 additions & 3 deletions interactions/vectorize_and_store.py
```diff
@@ -122,14 +122,14 @@ def submit_create_interaction_embeds_request(interaction_id):
     post_request(interaction_embeds_endpoint, {"interaction_id": interaction_id}, data_type='Interaction embed')
 
 
-def vectorize_interactions_and_store_in_db(db, retry_limit: int = 3, wait_time: int = 5) -> None:
+def vectorize_interactions_and_store_in_db(session, retry_limit: int = 3, wait_time: int = 5) -> None:
     """
     Vectorize all interactions without embeds and store them in the database,
     with retries for failed attempts.
     """
     for attempt in range(retry_limit):
         # Retrieve interactions that are still missing embeddings.
-        interactions = get_qna_interactions_without_embeds(db)
+        interactions = get_qna_interactions_without_embeds(session)
 
         # If there are no interactions missing embeddings, exit the loop and end the process.
         if not interactions:
@@ -151,7 +151,7 @@ def vectorize_interactions_and_store_in_db(db, retry_limit: int = 3, wait_time:
             time.sleep(wait_time)
 
         # After waiting, retrieve the list of interactions still missing embeddings to see if the list has decreased.
-        interactions = get_qna_interactions_without_embeds(db)
+        interactions = get_qna_interactions_without_embeds(session)
         if not interactions:
             print("All interactions now have embeddings. Process complete.")
             break  # Break out of the loop if there are no more interactions missing embeddings.
```
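`get_qna_interactions_without_embeds` itself is not shown in this commit; given how it is called above, it presumably looks something like this sketch (the `QAInteraction` model and its `embed` column are assumed names):

```python
# Assumed shape of the helper called in the retry loop above.
def get_qna_interactions_without_embeds(session):
    return session.query(QAInteraction).filter(QAInteraction.embed.is_(None)).all()
```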
22 changes: 11 additions & 11 deletions main.py
```diff
@@ -45,36 +45,36 @@ def main_menu():
             print("Loading new documentation space...")
             space_key, space_name = tui_choose_space()
             if space_key and space_name:
-                with get_db_session() as db_session:
-                    import_space(space_key, space_name, db_session)
+                with get_db_session() as session:
+                    import_space(space_key, space_name, session)
                 print()
                 print(f"Space '{space_name}' retrieval and indexing complete.")
 
         elif choice == "2":
             question = ask_question()
             if question:
-                with get_db_session() as db_session:
-                    answer, thread_id = answer_question_with_assistant(question, db_session)
+                with get_db_session() as session:
+                    answer, thread_id = answer_question_with_assistant(question, session)
                 print(f"\nThread ID: {thread_id}\nAnswer: {answer}")
 
         elif choice == "3":
             print("Creating vector db for interactions")
-            with get_db_session() as db_session:
-                vectorize_interactions_and_store_in_db(db_session)
-                VectorInteractionManager().add_to_vector(db_session)
+            with get_db_session() as session:
+                vectorize_interactions_and_store_in_db(session)
+                VectorInteractionManager().add_to_vector(session)
 
         elif choice == "4":
             load_manage_assistants()
 
         elif choice == "5":
             context = input("Enter the context you want to identifying knowledge gaps in\nex:(billing reminders): ")
-            with get_db_session() as db_session:
-                identify_knowledge_gaps(context, db_session)
+            with get_db_session() as session:
+                identify_knowledge_gaps(context, session)
 
         elif choice == "6":
             print("Starting 3D visualization process...")
-            with get_db_session() as db_session:
-                load_confluence_pages_spacial_distribution(db_session)
+            with get_db_session() as session:
+                load_confluence_pages_spacial_distribution(session)
 
         elif choice == "0":
             print("Exiting program.")
```
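After this commit every menu branch follows the same `with get_db_session() as session:` pattern. `database/database.py` is not part of the diff, but for the `with` form to work, `get_db_session` must behave like a context manager, roughly along these lines (a sketch under that assumption; the connection URL reuses the compose credentials):

```python
# Assumed shape of get_db_session in database/database.py: commit on
# success, roll back on error, always close the session.
from contextlib import contextmanager
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine("postgresql://toptal:toptal@localhost:5432/top-assist")
SessionLocal = sessionmaker(bind=engine)

@contextmanager
def get_db_session():
    session = SessionLocal()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
```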
