Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

With_guardrails #1

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed .DS_Store
Binary file not shown.
12 changes: 6 additions & 6 deletions backend/app/api/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,19 @@


delivery_days: How many days did it take for the product \
to arrive? If this information is not found, output is "not found"., do not use -1
to arrive? If this information is not found, output is "not found".,

price_value: Extract any sentences about the value or price,\
and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
and output them as a comma separated Python list. If this information is not found, output is "not found"

customer_negative_feedback: Extract any problems customers are facing with the current product \
If this information is not found, output is "not found", do not use -1
If this information is not found, output is "not found"

feature_requests: Extract any sentences about feature requests,\
and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
and output them as a comma separated Python list. If this information is not found, output is "not found"

competitor_mentions: Extract any sentences about the competition\
and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
and output them as a comma separated Python list. If this information is not found, output is "not found"


Format the output as JSON with the following keys:
Expand All @@ -33,7 +33,7 @@


final_prompt_string = """\
For the following text, distill the following information from from the key attributes and maintain spacing between words , please ignore negative or "not found" values:
For the following text, amalgamate the values from the text into a single json output, keep the attributes provided below. please ignore "not found" values:

delivery_days: How many days did it take for the product \
to arrive? If this information is not found, output is "not found", do not use -1
Expand Down
51 changes: 25 additions & 26 deletions backend/app/api/routes.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,26 @@
import asyncio
import os
import tempfile
import time
import openai

from fastapi import APIRouter, UploadFile
from fastapi.responses import HTMLResponse
from langchain.chat_models import ChatOpenAI
from concurrent.futures import ThreadPoolExecutor

from dotenv import load_dotenv, find_dotenv
from prompts import product_prompt_template, final_product_prompt_template
from .prompts import product_prompt_template, final_product_prompt_template
from utils.document_utils import extract_text_from_pdf, split_into_chunks
from rich import print

import guardrails as gd

_ = load_dotenv(find_dotenv())

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

router = APIRouter()



@router.get("/", response_class=HTMLResponse)
def root():
return """
Expand All @@ -46,40 +47,38 @@ def root():

@router.post("/analyze")
async def analyze_document(file: UploadFile) -> dict:
start = time.time()
filename = file.filename
breakpoint()
loop = asyncio.get_event_loop()

with ThreadPoolExecutor() as executor:
if filename.endswith(".pdf"):

### unable to extract
pdf_bytes = await file.read() # read file into bytes
temp_pdf_file = tempfile.NamedTemporaryFile(delete=False) # write bytes to a temporary file
temp_pdf_file.write(pdf_bytes)

extracted_text = await loop.run_in_executor(
executor, extract_text_from_pdf, temp_pdf_file.name
executor, extract_text_from_pdf, file.file
)

temp_pdf_file.close()
os.unlink(temp_pdf_file.name)

guard = gd.Guard.from_rail('/Users/Zachary_Royals/Code/zelta-challenge/backend/app/api/sales_transcript.rail')

chunks = await loop.run_in_executor(
executor, split_into_chunks, extracted_text
)
chat = ChatOpenAI(temperature=0.0, model="gpt-4")

# run tasks in parallel
tasks = [loop.run_in_executor(executor, chat, product_prompt_template.format_messages(text=chunk)) for chunk in chunks]
insights = await asyncio.gather(*tasks)

#append insights into final product prompt
summary = final_product_prompt_template.format_messages(text=insights)
chat = ChatOpenAI(temperature=0.0, model="gpt-4")
final_insights = await loop.run_in_executor(executor, chat, summary)

return final_insights
validated_outputs = []
for chunk in chunks:
_, validated_output = guard(
openai.ChatCompletion.create,
prompt_params={"sales_transcript": chunk},
model="gpt-4",
max_tokens=6000,
temperature=0.0,

)
validated_outputs.append(validated_output)

# additional prompt to distill collection of validated outputs?
execution_time = time.time() - start
print(f'Time taken: {execution_time} seconds')
return validated_output

elif file.endswith(".txt"):
return "This is a text file."
Expand Down
29 changes: 29 additions & 0 deletions backend/app/api/sales_transcript.rail
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<rail version="0.1">

<!-- Output schema: guardrails validates the LLM response against this
     structure before returning it to the caller. Each <string> is one
     expected key in the generated JSON object. -->
<output>
<object name="customer_feedback" description="customer feedback coming from speaker two">
<string name="delivery_days" description="delivery days of the product"/>
<string name="price_value" description="Mentions about the price to value"/>
<string name="feature_requests" description="Features or improvements that could be added"/>
</object>
</output>


<!-- Prompt template sent to the model. {{sales_transcript}} is supplied via
     prompt_params at call time; {{output_schema}} is injected by guardrails
     from the <output> element above. @xml_prefix_prompt / @json_suffix_prompt
     are guardrails' built-in boilerplate snippets that instruct the model to
     follow the XML schema and emit valid JSON. -->
<prompt>


Given the following document, construct a JSON that follows that correct schema.


{{sales_transcript}}

@xml_prefix_prompt


{{output_schema}}


@json_suffix_prompt
</prompt>

</rail>
7 changes: 3 additions & 4 deletions backend/app/utils/document_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@

from langchain.text_splitter import CharacterTextSplitter
import pdfplumber
import math
# from PyPDF2 import PdfReader

from nltk.tokenize import word_tokenize, sent_tokenize
Expand All @@ -12,13 +11,13 @@ def extract_text_from_pdf(pdf) -> str:
return text


def split_into_chunks(text: str) -> list[str]:
chunk_size = len(text) / 2
chunk_overlap = math.floor(chunk_size * .05)
def split_into_chunks(text: str, chunk_size: int=6000, chunk_overlap: int=400) -> list[str]:
    """Split *text* on newlines into chunks of roughly ``chunk_size`` characters.

    Consecutive chunks share ``chunk_overlap`` characters so that context is
    not lost at chunk boundaries. Reports the chunk count to stdout and
    returns the list of chunk strings.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = splitter.split_text(text)
    print(f'There are {len(chunks)} chunks with a chunk size of {chunk_size} and an overlap size of {chunk_overlap}')
    return chunks

# def split_into_chunks(text: str, max_token_count: int=7000) -> list[str]:
Expand Down