Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

With_guardrails #1

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed .DS_Store
Binary file not shown.
12 changes: 6 additions & 6 deletions backend/app/api/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,19 @@


delivery_days: How many days did it take for the product \
to arrive? If this information is not found, output is "not found"., do not use -1
to arrive? If this information is not found, output is "not found".,

price_value: Extract any sentences about the value or price,\
and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
and output them as a comma separated Python list. If this information is not found, output is "not found"

customer_negative_feedback: Extract any problems customers are facing with the current product \
If this information is not found, output is "not found", do not use -1
If this information is not found, output is "not found"

feature_requests: Extract any sentences about feature requests,\
and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
and output them as a comma separated Python list. If this information is not found, output is "not found"

competitor_mentions: Extract any sentences about the competition\
and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
and output them as a comma separated Python list. If this information is not found, output is "not found"


Format the output as JSON with the following keys:
Expand All @@ -33,7 +33,7 @@


final_prompt_string = """\
For the following text, distill the following information from from the key attributes and maintain spacing between words , please ignore negative or "not found" values:
For the following text, amalgamate the values from the text into a single json output, keep the attributes provided below. please ignore "not found" values:

delivery_days: How many days did it take for the product \
to arrive? If this information is not found, output is "not found", do not use -1
Expand Down
51 changes: 25 additions & 26 deletions backend/app/api/routes.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,26 @@
import asyncio
import os
import tempfile
import time
import openai

from fastapi import APIRouter, UploadFile
from fastapi.responses import HTMLResponse
from langchain.chat_models import ChatOpenAI
from concurrent.futures import ThreadPoolExecutor

from dotenv import load_dotenv, find_dotenv
from prompts import product_prompt_template, final_product_prompt_template
from .prompts import product_prompt_template, final_product_prompt_template
from utils.document_utils import extract_text_from_pdf, split_into_chunks
from rich import print

import guardrails as gd

_ = load_dotenv(find_dotenv())

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

router = APIRouter()



@router.get("/", response_class=HTMLResponse)
def root():
return """
Expand All @@ -46,40 +47,38 @@ def root():

@router.post("/analyze")
async def analyze_document(file: UploadFile) -> dict:
start = time.time()
filename = file.filename
breakpoint()
loop = asyncio.get_event_loop()

with ThreadPoolExecutor() as executor:
if filename.endswith(".pdf"):

### unable to extract
pdf_bytes = await file.read() # read file into bytes
temp_pdf_file = tempfile.NamedTemporaryFile(delete=False) # write bytes to a temporary file
temp_pdf_file.write(pdf_bytes)

extracted_text = await loop.run_in_executor(
executor, extract_text_from_pdf, temp_pdf_file.name
executor, extract_text_from_pdf, file.file
)

temp_pdf_file.close()
os.unlink(temp_pdf_file.name)

guard = gd.Guard.from_rail('/Users/Zachary_Royals/Code/zelta-challenge/backend/app/api/sales_transcript.rail')

chunks = await loop.run_in_executor(
executor, split_into_chunks, extracted_text
)
chat = ChatOpenAI(temperature=0.0, model="gpt-4")

# run tasks in parallel
tasks = [loop.run_in_executor(executor, chat, product_prompt_template.format_messages(text=chunk)) for chunk in chunks]
insights = await asyncio.gather(*tasks)

#append insights into final product prompt
summary = final_product_prompt_template.format_messages(text=insights)
chat = ChatOpenAI(temperature=0.0, model="gpt-4")
final_insights = await loop.run_in_executor(executor, chat, summary)

return final_insights
validated_outputs = []
for chunk in chunks:
_, validated_output = guard(
openai.ChatCompletion.create,
prompt_params={"sales_transcript": chunk},
model="gpt-4",
max_tokens=6000,
temperature=0.0,

)
validated_outputs.append(validated_output)

# additional prompt to distill collection of validated outputs?
execution_time = time.time() - start
print(f'Time taken: {execution_time} seconds')
return validated_output

elif file.endswith(".txt"):
return "This is a text file."
Expand Down
29 changes: 29 additions & 0 deletions backend/app/api/sales_transcript.rail
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<rail version="0.1">

<!-- Output schema: guardrails validates the LLM response against this
     structure before returning it to the caller. Each <string> is one
     expected key in the generated JSON object. -->
<output>
<object name="customer_feedback" description="customer feedback coming from speaker two">
<string name="delivery_days" description="delivery days of the product"/>
<string name="price_value" description="Mentions about the price to value"/>
<string name="feature_requests" description="Features or improvements that could be added"/>
</object>
</output>


<!-- Prompt template sent to the model. {{sales_transcript}} is supplied via
     prompt_params at call time; {{output_schema}} is injected by guardrails
     from the <output> element above. @xml_prefix_prompt / @json_suffix_prompt
     are guardrails' built-in boilerplate snippets that instruct the model to
     follow the XML schema and emit valid JSON. -->
<prompt>


Given the following document, construct a JSON that follows that correct schema.


{{sales_transcript}}

@xml_prefix_prompt


{{output_schema}}


@json_suffix_prompt
</prompt>

</rail>
7 changes: 3 additions & 4 deletions backend/app/utils/document_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@

from langchain.text_splitter import CharacterTextSplitter
import pdfplumber
import math
# from PyPDF2 import PdfReader

from nltk.tokenize import word_tokenize, sent_tokenize
Expand All @@ -12,13 +11,13 @@ def extract_text_from_pdf(pdf) -> str:
return text


def split_into_chunks(text: str) -> list[str]:
chunk_size = len(text) / 2
chunk_overlap = math.floor(chunk_size * .05)
def split_into_chunks(text: str, chunk_size: int=6000, chunk_overlap: int=400) -> list[str]:
    """Split *text* on newlines into chunks of roughly ``chunk_size`` characters.

    Consecutive chunks share ``chunk_overlap`` characters so that context is
    not lost at chunk boundaries. Reports the chunk count to stdout and
    returns the list of chunk strings.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = splitter.split_text(text)
    print(f'There are {len(chunks)} chunks with a chunk size of {chunk_size} and an overlap size of {chunk_overlap}')
    return chunks

# def split_into_chunks(text: str, max_token_count: int=7000) -> list[str]:
Expand Down