Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: adds basic concept extraction from URLs #26

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backend/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ httpx = "==0.27.0"
pymupdf = "==1.24.2"
pydantic = {extras = ["email"], version = "==2.7.1"}
python-multipart = "==0.0.9"
instructor = "*" # TODO(review): pin an exact version like the other backend dependencies

[dev-packages]

Expand Down
65 changes: 61 additions & 4 deletions backend/Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 35 additions & 0 deletions backend/app/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,18 +89,30 @@ def update_urls(self, urls: list[URL]):
def create_text_nodes(self, nodes: list[TextNode], user_id: str):
    """Bulk-persist text nodes, their chunks, and node->concept links.

    Each node is flattened via ``to_persistence()`` into one text-node row
    plus its chunk rows; every row is stamped with ``user_id`` before the
    bulk inserts. Concept links are written to the
    ``text_node_to_text_node_concepts`` join table.
    """
    text_nodes_to_persist = []
    text_node_chunks_to_persist = []
    text_nodes_to_text_node_concepts_to_persist = []
    for node in nodes:
        text_node, text_node_chunks = node.to_persistence()
        text_node["user_id"] = user_id

        for chunk in text_node_chunks:
            chunk["user_id"] = user_id

        # One join-table row per (node, concept) pair.
        for concept_id in node.concept_ids:
            text_nodes_to_text_node_concepts_to_persist.append(
                {"text_node_id": node.id, "text_node_concept_id": concept_id}
            )

        text_nodes_to_persist.append(text_node)
        text_node_chunks_to_persist.extend(text_node_chunks)

    # Guard every insert: the Supabase/PostgREST client errors on an
    # empty payload (e.g. no nodes, or nodes without concepts).
    if text_nodes_to_persist:
        self._client.table("text_nodes").insert(text_nodes_to_persist).execute()
    if text_node_chunks_to_persist:
        self._client.table("text_node_chunks").insert(
            text_node_chunks_to_persist
        ).execute()
    if text_nodes_to_text_node_concepts_to_persist:
        self._client.table("text_node_to_text_node_concepts").insert(
            text_nodes_to_text_node_concepts_to_persist
        ).execute()

def get_urls_feed(self, user_id: str):
result = (
Expand All @@ -123,3 +135,26 @@ def get_user_id_by_email_alias(self, app_email_alias: str):
if len(result.data) != 1:
return None
return result.data[0]["id"]

def get_text_node_concept_ids(self, concepts: list[str]) -> list[int]:
    """Map concept names to ``text_node_concepts`` ids, creating missing rows.

    Returns the ids of all requested concepts: rows that already exist
    first, then any newly inserted ones. Handles the empty-input and
    all-existing cases without issuing an empty insert (which the
    Supabase/PostgREST client rejects).
    """
    if not concepts:
        return []

    existing_concepts = (
        self._client.table("text_node_concepts")
        .select("id, name")
        .in_("name", concepts)
        .execute()
        .data
    )

    # Set for O(1) membership checks while building the insert payload.
    existing_concept_names = {c["name"] for c in existing_concepts}
    new_concept_rows = [
        {"name": c} for c in concepts if c not in existing_concept_names
    ]

    new_concepts = []
    if new_concept_rows:
        new_concepts = (
            self._client.table("text_node_concepts")
            .insert(new_concept_rows)
            .execute()
            .data
        )

    return [c["id"] for c in existing_concepts + new_concepts]
13 changes: 12 additions & 1 deletion backend/app/domain/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,29 @@ def __init__(self, text: str, text_node_id: str) -> None:

class TextNode:
def __init__(
    self,
    url_feed_id: str,
    url: str,
    title: str,
    text: str,
    summary: str,
    concept_ids: list[int],
) -> None:
    """A processed URL's text content, ready for chunking and embedding.

    concept_ids: ids of ``text_node_concepts`` rows linked to this node;
    exposed read-only via the ``concept_ids`` property.
    """
    # NOTE(review): the diff artifact duplicating the old one-line
    # signature has been removed; only the new signature remains.
    self.id = uuid7()
    self.url_feed_id = url_feed_id
    self.url = url
    self.title = title
    self.text = text
    self.summary = summary
    self._concept_ids = concept_ids
    self.embedding = None
    self.chunks: list[TextNodeChunk] = []
    self.create_title_if_missing()

@property
def concept_ids(self) -> list[int]:
    """Ids of the concepts attached to this node at construction time."""
    attached = self._concept_ids
    return attached

def create_chunks(self, chunker: NodeChunker) -> None:
    """Split this node's text via the given chunker, storing the result on ``self.chunks``."""
    node_chunks = chunker.chunk(self.id, self.text)
    self.chunks = node_chunks

Expand Down
54 changes: 53 additions & 1 deletion backend/app/llm.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import json
from typing import List, Generator, Any

import instructor
from openai import OpenAI
from pydantic import BaseModel

# Module-level OpenAI client shared by the plain completion helpers.
client = OpenAI()

# NOTE(review): the stale duplicate assignment ("gpt-3.5-turbo-16k", a diff
# artifact) has been dropped; gpt-3.5-turbo-0125 also has a 16k context.
MODEL_16K = "gpt-3.5-turbo-0125"

PROMPT_TEMPLATE = (
"A question and context documents are provided below."
Expand All @@ -22,6 +25,23 @@
"{question}"
)

# User-message template for concept extraction; format with `text=...`.
# Fix: "oncepts" typo corrected to "concepts".
EXTRACT_CONCEPTS_PROMPT_TEMPLATE = (
    "Please extract ONLY THE MOST IMPORTANT concepts, entities & topics from the provided text."
    "DO NOT provide more than 8 results per text article."
    "MAKE SURE the concepts, entities & topics you select are relevant to the overall article, and are not ads or examples."
    "---------------------\n"
    "TEXT:\n"
    "{text}"
)

# System prompt steering the model towards useful named entities only.
EXTRACT_CONCEPTS_SYSTEM_PROMPT_TEMPLATE = (
    "You are an information extraction system. You respond to each message with a list of useful named entities."
    "Each named entity appears as one entry in a list."
    "Ignore unimportant entities, e.g., of type formatting, citations, and references."
    "The types of entities that we are most interested in are human, artificial object, spatio-temporal entity, corporate body, concrete object, talk, geographical feature, natural object, product, system."
    "IMPORTANT: you only include entities that appear in the text."
)


def format_chunks(chunks: List[dict]) -> str:
result = ""
Expand Down Expand Up @@ -74,3 +94,35 @@ def summarise_text(text: str) -> str:
temperature=0,
)
return result.choices[0].message.content


class NodeConcepts(BaseModel):
    """
    Structured LLM response: the key concepts/entities found in a text,
    as a flat list of names.
    """

    concepts: list[str]


def extract_concepts(text: str) -> list[str]:
    """Extract the key concepts/entities from `text` via the LLM.

    Returns concept names normalised to lowercase hyphen-separated slugs
    (e.g. "Large Language Models" -> "large-language-models").
    """
    client = instructor.from_openai(OpenAI())

    node_concepts = client.chat.completions.create(
        model=MODEL_16K,
        temperature=0,
        response_model=NodeConcepts,
        messages=[
            {
                "role": "system",
                # Fix: previously the template's *name* was sent as a
                # literal string instead of the prompt text itself.
                "content": EXTRACT_CONCEPTS_SYSTEM_PROMPT_TEMPLATE,
            },
            {
                "role": "user",
                "content": EXTRACT_CONCEPTS_PROMPT_TEMPLATE.format(text=text),
            },
        ],
    )

    return [concept.lower().replace(" ", "-") for concept in node_concepts.concepts]
5 changes: 5 additions & 0 deletions backend/app/services/indexing.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from app.db import DB
from app.llm import summarise_text
from app.llm import extract_concepts
from app.utils import URLProcessor
from app.utils import URLProcessingResult
from app.utils import NodeChunker
Expand All @@ -20,12 +21,16 @@ async def index(self, urls: list[URL], user_id: str):
for idx, processed_url in enumerate(processed_urls):
try:
if isinstance(processed_url, URLProcessingResult):
concepts = extract_concepts(processed_url.text)
print(concepts)
concept_ids = db.get_text_node_concept_ids(concepts)
text_node = TextNode(
url=processed_url.url,
url_feed_id=urls[idx].id,
title=processed_url.title,
text=processed_url.text,
summary=summarise_text(processed_url.text),
concept_ids=concept_ids,
)
text_node.create_chunks(NodeChunker)
text_node.create_embeddings(NodeEmbedder)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
-- Concepts (named entities/topics) extracted from text nodes.
-- "name" is UNIQUE so repeated extraction of the same concept reuses the
-- existing row: the application looks concepts up by name and assumes at
-- most one row per name; without the constraint, concurrent indexing
-- could insert duplicates.
create table
  public.text_node_concepts (
    id serial primary key,
    "name" varchar not null unique
  );

alter table public.text_node_concepts enable row level security;

-- Join table: many-to-many relationship between text nodes and concepts.
create table
  public.text_node_to_text_node_concepts (
    text_node_id uuid not null,
    text_node_concept_id int not null,
    primary key (text_node_id, text_node_concept_id),
    foreign key (text_node_id) references public.text_nodes (id),
    foreign key (text_node_concept_id) references public.text_node_concepts (id)
  );

alter table public.text_node_to_text_node_concepts enable row level security;

Loading