Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: adds basic concept extraction from URLs #26

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backend/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ httpx = "==0.27.0"
pymupdf = "==1.24.2"
pydantic = {extras = ["email"], version = "==2.7.1"}
python-multipart = "==0.0.9"
instructor = "*" # TODO(review): pin an exact version like the other backend dependencies

[dev-packages]

Expand Down
65 changes: 61 additions & 4 deletions backend/Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 35 additions & 0 deletions backend/app/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,18 +89,30 @@ def update_urls(self, urls: list[URL]):
def create_text_nodes(self, nodes: list[TextNode], user_id: str):
    """Bulk-persist text nodes, their chunks, and node->concept links.

    Each node is flattened via ``to_persistence()`` into one text-node row
    plus its chunk rows; every row is stamped with ``user_id`` before the
    bulk inserts. Concept links are written to the
    ``text_node_to_text_node_concepts`` join table.
    """
    text_nodes_to_persist = []
    text_node_chunks_to_persist = []
    text_nodes_to_text_node_concepts_to_persist = []
    for node in nodes:
        text_node, text_node_chunks = node.to_persistence()
        text_node["user_id"] = user_id

        for chunk in text_node_chunks:
            chunk["user_id"] = user_id

        # One join-table row per (node, concept) pair.
        for concept_id in node.concept_ids:
            text_nodes_to_text_node_concepts_to_persist.append(
                {"text_node_id": node.id, "text_node_concept_id": concept_id}
            )

        text_nodes_to_persist.append(text_node)
        text_node_chunks_to_persist.extend(text_node_chunks)

    # Guard every insert: the Supabase/PostgREST client errors on an
    # empty payload (e.g. no nodes, or nodes without concepts).
    if text_nodes_to_persist:
        self._client.table("text_nodes").insert(text_nodes_to_persist).execute()
    if text_node_chunks_to_persist:
        self._client.table("text_node_chunks").insert(
            text_node_chunks_to_persist
        ).execute()
    if text_nodes_to_text_node_concepts_to_persist:
        self._client.table("text_node_to_text_node_concepts").insert(
            text_nodes_to_text_node_concepts_to_persist
        ).execute()

def get_urls_feed(self, user_id: str):
result = (
Expand All @@ -123,3 +135,26 @@ def get_user_id_by_email_alias(self, app_email_alias: str):
if len(result.data) != 1:
return None
return result.data[0]["id"]

def get_text_node_concept_ids(self, concepts: list[str]) -> list[int]:
    """Map concept names to ``text_node_concepts`` ids, creating missing rows.

    Returns the ids of all requested concepts: rows that already exist
    first, then any newly inserted ones. Handles the empty-input and
    all-existing cases without issuing an empty insert (which the
    Supabase/PostgREST client rejects).
    """
    if not concepts:
        return []

    existing_concepts = (
        self._client.table("text_node_concepts")
        .select("id, name")
        .in_("name", concepts)
        .execute()
        .data
    )

    # Set for O(1) membership checks while building the insert payload.
    existing_concept_names = {c["name"] for c in existing_concepts}
    new_concept_rows = [
        {"name": c} for c in concepts if c not in existing_concept_names
    ]

    new_concepts = []
    if new_concept_rows:
        new_concepts = (
            self._client.table("text_node_concepts")
            .insert(new_concept_rows)
            .execute()
            .data
        )

    return [c["id"] for c in existing_concepts + new_concepts]
13 changes: 12 additions & 1 deletion backend/app/domain/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,29 @@ def __init__(self, text: str, text_node_id: str) -> None:

class TextNode:
def __init__(
    self,
    url_feed_id: str,
    url: str,
    title: str,
    text: str,
    summary: str,
    concept_ids: list[int],
) -> None:
    """A processed URL's text content, ready for chunking and embedding.

    concept_ids: ids of ``text_node_concepts`` rows linked to this node;
    exposed read-only via the ``concept_ids`` property.
    """
    # NOTE(review): the diff artifact duplicating the old one-line
    # signature has been removed; only the new signature remains.
    self.id = uuid7()
    self.url_feed_id = url_feed_id
    self.url = url
    self.title = title
    self.text = text
    self.summary = summary
    self._concept_ids = concept_ids
    self.embedding = None
    self.chunks: list[TextNodeChunk] = []
    self.create_title_if_missing()

@property
def concept_ids(self) -> list[int]:
    """Ids of the concepts attached to this node at construction time."""
    attached = self._concept_ids
    return attached

def create_chunks(self, chunker: NodeChunker) -> None:
    """Split this node's text via the given chunker, storing the result on ``self.chunks``."""
    node_chunks = chunker.chunk(self.id, self.text)
    self.chunks = node_chunks

Expand Down
54 changes: 53 additions & 1 deletion backend/app/llm.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import json
from typing import List, Generator, Any

import instructor
from openai import OpenAI
from pydantic import BaseModel

# Module-level OpenAI client shared by the plain completion helpers.
client = OpenAI()

# NOTE(review): the stale duplicate assignment ("gpt-3.5-turbo-16k", a diff
# artifact) has been dropped; gpt-3.5-turbo-0125 also has a 16k context.
MODEL_16K = "gpt-3.5-turbo-0125"

PROMPT_TEMPLATE = (
"A question and context documents are provided below."
Expand All @@ -22,6 +25,23 @@
"{question}"
)

# User-message template for concept extraction; format with `text=...`.
# Fix: "oncepts" typo corrected to "concepts".
EXTRACT_CONCEPTS_PROMPT_TEMPLATE = (
    "Please extract ONLY THE MOST IMPORTANT concepts, entities & topics from the provided text."
    "DO NOT provide more than 8 results per text article."
    "MAKE SURE the concepts, entities & topics you select are relevant to the overall article, and are not ads or examples."
    "---------------------\n"
    "TEXT:\n"
    "{text}"
)

# System prompt steering the model towards useful named entities only.
EXTRACT_CONCEPTS_SYSTEM_PROMPT_TEMPLATE = (
    "You are an information extraction system. You respond to each message with a list of useful named entities."
    "Each named entity appears as one entry in a list."
    "Ignore unimportant entities, e.g., of type formatting, citations, and references."
    "The types of entities that we are most interested in are human, artificial object, spatio-temporal entity, corporate body, concrete object, talk, geographical feature, natural object, product, system."
    "IMPORTANT: you only include entities that appear in the text."
)


def format_chunks(chunks: List[dict]) -> str:
result = ""
Expand Down Expand Up @@ -74,3 +94,35 @@ def summarise_text(text: str) -> str:
temperature=0,
)
return result.choices[0].message.content


class NodeConcepts(BaseModel):
    """
    Structured LLM response: the key concepts/entities found in a text,
    as a flat list of names.
    """

    concepts: list[str]


def extract_concepts(text: str) -> list[str]:
    """Extract the key concepts/entities from `text` via the LLM.

    Returns concept names normalised to lowercase hyphen-separated slugs
    (e.g. "Large Language Models" -> "large-language-models").
    """
    client = instructor.from_openai(OpenAI())

    node_concepts = client.chat.completions.create(
        model=MODEL_16K,
        temperature=0,
        response_model=NodeConcepts,
        messages=[
            {
                "role": "system",
                # Fix: previously the template's *name* was sent as a
                # literal string instead of the prompt text itself.
                "content": EXTRACT_CONCEPTS_SYSTEM_PROMPT_TEMPLATE,
            },
            {
                "role": "user",
                "content": EXTRACT_CONCEPTS_PROMPT_TEMPLATE.format(text=text),
            },
        ],
    )

    return [concept.lower().replace(" ", "-") for concept in node_concepts.concepts]
5 changes: 5 additions & 0 deletions backend/app/services/indexing.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from app.db import DB
from app.llm import summarise_text
from app.llm import extract_concepts
from app.utils import URLProcessor
from app.utils import URLProcessingResult
from app.utils import NodeChunker
Expand All @@ -20,12 +21,16 @@ async def index(self, urls: list[URL], user_id: str):
for idx, processed_url in enumerate(processed_urls):
try:
if isinstance(processed_url, URLProcessingResult):
concepts = extract_concepts(processed_url.text)
print(concepts)
concept_ids = db.get_text_node_concept_ids(concepts)
text_node = TextNode(
url=processed_url.url,
url_feed_id=urls[idx].id,
title=processed_url.title,
text=processed_url.text,
summary=summarise_text(processed_url.text),
concept_ids=concept_ids,
)
text_node.create_chunks(NodeChunker)
text_node.create_embeddings(NodeEmbedder)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
-- Concepts (named entities/topics) extracted from text nodes.
-- "name" is UNIQUE so repeated extraction of the same concept reuses the
-- existing row: the application looks concepts up by name and assumes at
-- most one row per name; without the constraint, concurrent indexing
-- could insert duplicates.
create table
  public.text_node_concepts (
    id serial primary key,
    "name" varchar not null unique
  );

alter table public.text_node_concepts enable row level security;

-- Join table: many-to-many relationship between text nodes and concepts.
create table
  public.text_node_to_text_node_concepts (
    text_node_id uuid not null,
    text_node_concept_id int not null,
    primary key (text_node_id, text_node_concept_id),
    foreign key (text_node_id) references public.text_nodes (id),
    foreign key (text_node_concept_id) references public.text_node_concepts (id)
  );

alter table public.text_node_to_text_node_concepts enable row level security;

Loading