Modifications to frontend and backend

Luca-Blight · Jun 28, 2023 · 08c76c0 · 08c76c0
1 parent d7853b3
commit 08c76c0
Show file tree

Hide file tree

Showing 5 changed files with 105 additions and 43 deletions.
diff --git a/backend/app/api/prompts.py b/backend/app/api/prompts.py
@@ -2,23 +2,23 @@
 
 
 product_prompt_string = """\
-For the following text, extract the following information from speaker 2 and remove any spaces between letters of each word. 
+For the following text, extract the following information from speaker 2 and maintain spacing between words.
 
 
 delivery_days: How many days did it take for the product \
-to arrive? If this information is not found, output -1.
+to arrive? If this information is not found, output is "not found"., do not use -1
 
 price_value: Extract any sentences about the value or price,\
-and output them as a comma separated Python list.
+and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
 
 customer_negative_feedback: Extract any problems customers are facing with the current product \
-If this information is not found, output -1.
+If this information is not found,  output is "not found", do not use -1
 
 feature_requests: Extract any sentences about feature requests,\
-and output them as a comma separated Python list.
+and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
 
 competitor_mentions: Extract any sentences about the competition\
-and output them as a comma separated Python list.
+and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
 
 
 Format the output as JSON with the following keys:
@@ -33,22 +33,25 @@
 
 
 final_prompt_string = """\
-For the following text, distill the following information from from the text elements, please ignore negative values and remove brackets:
+For the following text, distill the following information from from the key attributes and maintain spacing between words , please ignore negative  or "not found" values:
 
 delivery_days: How many days did it take for the product \
-to arrive? If this information is not found, output -1.
+to arrive? If this information is not found, output is "not found", do not use -1
 
 price_value: Extract any sentences about the value or price,\
-and output them as a comma separated Python list.
+and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
+
 
 customer_negative_feedback: Extract any problems customers are facing with the current product \
-If this information is not found, output -1.
+If this information is not found, output is "not found", do not use -1
 
 feature_requests: Extract any sentences about feature requests,\
-and output them as a comma separated Python list.
+and output them as a comma separated Python list.  If this information is not found, output is "not found", do not use -1
+
 
 competitor_mentions: Extract any sentences about the competition\
-and output them as a comma separated Python list.
+and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1
+
 
 
 Format the output as JSON with the following keys:

diff --git a/backend/app/api/routes.py b/backend/app/api/routes.py
@@ -73,16 +73,12 @@ async def analyze_document(file: UploadFile) -> dict:
             chunks = await loop.run_in_executor(
                 executor, split_into_chunks, extracted_text
             )
-
-            insights = []
-
-            for chunk in chunks:
-                transcript = product_prompt_template.format_messages(text=chunk)
-                chat = ChatOpenAI(temperature=0.0, model="gpt-4")
-                # run blocking operations in a thread pool
-                insight = await loop.run_in_executor(executor, chat, transcript)
-                insights.append(insight)
-
+            chat = ChatOpenAI(temperature=0.0, model="gpt-4")
+
+            # run tasks in parallel
+            tasks = [loop.run_in_executor(executor, chat, product_prompt_template.format_messages(text=chunk)) for chunk in chunks]
+            insights = await asyncio.gather(*tasks)
+
             summary = final_product_prompt_template.format_messages(text=insights)
             chat = ChatOpenAI(temperature=0.0, model="gpt-4")
             # run blocking operations in a thread pool

diff --git a/backend/app/utils/document_utils.py b/backend/app/utils/document_utils.py
@@ -1,13 +1,12 @@
 
 from langchain.text_splitter import CharacterTextSplitter
-from PyPDF2 import PdfReader
+# from PyPDF2 import PdfReader
+import pdfplumber
+from nltk.tokenize import word_tokenize, sent_tokenize
 
 def extract_text_from_pdf(pdf):
-    pdf_reader = PdfReader(pdf)
-    text = ""
-    for page in pdf_reader.pages:
-        text += page.extract_text()
-
+    """Return the text of every page in *pdf*, joined with newlines.
+
+    *pdf* is passed straight to ``pdfplumber.open``.
+    """
+    with pdfplumber.open(pdf) as pdf_reader:
+        # Coalesce to "" so a page whose extract_text() yields None (seen in
+        # older pdfplumber releases for pages without text) cannot break join().
+        text = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
     return text
 
 
@@ -17,3 +16,22 @@ def split_into_chunks(text: str) -> list[str]:
     )
     chunks = text_splitter.split_text(text)
     return chunks
+# def split_into_chunks(text: str, max_token_count: int=7000) -> list[str]:
+#     sentences = sent_tokenize(text)
+#     chunks = []
+#     current_chunk = ''
+#     current_token_count = 0
+
+#     for sentence in sentences:
+#         tokens = word_tokenize(sentence)
+#         if current_token_count + len(tokens) > max_token_count:
+#             chunks.append(current_chunk)
+#             current_chunk = sentence
+#             current_token_count = len(tokens)
+#         else:
+#             current_chunk += ' ' + sentence
+#             current_token_count += len(tokens)
+
+#     if current_chunk:
+#         chunks.append(current_chunk)
+#     return chunks
diff --git a/frontend/src/App.css b/frontend/src/App.css
@@ -85,3 +85,11 @@
   min-height: 90vh; /* Adjust this as needed */
   background-color: #1f987a;
 }
+
+
+/* Text style for the insight paragraphs (className='attributes'). */
+.attributes {
+  color: white;
+  font-size: 20px;
+  text-align: center;
+  font-family: 'Times New Roman', serif;
+}
diff --git a/frontend/src/FileUpload.js b/frontend/src/FileUpload.js
@@ -1,26 +1,46 @@
 import './App.css';
-import React, { useState } from 'react';
+import React, { useState, useEffect } from 'react';
 import axios from 'axios';
 
 const FileUpload = () => {
   const [file, setFile] = useState(null);
   const [responseData, setResponseData] = useState({});
+  const [isProcessing, setIsProcessing] = useState(false);
+  const [processingText, setProcessingText] = useState('Processing');
+
+  useEffect(() => {
+    if (isProcessing) {
+      const id = setInterval(() => {
+        setProcessingText((text) => {
+          return text.length < 15 ? text + '.' : 'Processing.';
+        });
+      }, 500);
+      return () => clearInterval(id);
+    } else {
+      setProcessingText('Processing');
+    }
+  }, [isProcessing]);
 
   const submitFile = async (event) => {
     event.preventDefault();
+    // No file chosen yet: FormData would serialize null as the string "null",
+    // so skip the request instead of uploading a bogus payload.
+    if (!file) {
+      return;
+    }
     const formData = new FormData();
     formData.append('file', file);
+    setIsProcessing(true);
 
     try {
       const res = await axios.post('http://localhost:8000/analyze', formData, {
         headers: {
           'Content-Type': 'multipart/form-data',
         },
       });
-      console.log(res.data.content);
-      setResponseData(res.data.content);
+      // The backend returns the insights as a JSON string; a parse failure
+      // lands in the catch below.
+      const parsedData = JSON.parse(res.data.content);
+      console.log(parsedData);
+
+      setResponseData(parsedData);
     } catch (error) {
       console.error(error);
+    } finally {
+      // Always clear the indicator, whether the request succeeded or failed.
+      setIsProcessing(false);
     }
   };
 
@@ -48,19 +68,36 @@ const FileUpload = () => {
           onChange={handleFileUpload}
         />
         <button type='submit'>Submit</button>
+        {isProcessing && <p style={{ color: 'blue' }}>{processingText}</p>}
+
+        <p className='attributes'>
+          Here are the insights discovered from the transcript
+        </p>
+        {responseData.delivery_days && (
+          <p className='attributes'>
+            Delivery Days: {responseData.delivery_days}
+          </p>
+        )}
+        {responseData.price_value && (
+          <p className='attributes'>Price Value: {responseData.price_value}</p>
+        )}
+        {responseData.customer_negative_feedback && (
+          <p className='attributes'>
+            Customer Negative Feedback:{' '}
+            {responseData.customer_negative_feedback}
+          </p>
+        )}
+        {responseData.feature_requests && (
+          <p className='attributes'>
+            Feature Requests: {responseData.feature_requests}
+          </p>
+        )}
+        {responseData.competitor_mentions && (
+          <p className='attributes'>
+            Competitor Mentions: {responseData.competitor_mentions}
+          </p>
+        )}
       </form>
-      <div>
-        {responseData.delivery_days ? 
-          <p>Delivery Days: {responseData.delivery_days}</p> : null}
-        {responseData.price_value ?
-          <p>Price Value: {responseData.price_value}</p> : null}
-        {responseData.customer_negative_feedback ? 
-          <p>Customer Negative Feedback: {responseData.customer_negative_feedback}</p> : null}
-        {responseData.feature_requests ? 
-          <p>Feature Requests: {responseData.feature_requests}</p> : null}
-        {responseData.competitor_mentions ? 
-          <p>Competitor Mentions: {responseData.competitor_mentions}</p> : null}
-      </div>
     </div>
   );
 };