From 08c76c0b8df07b2cf57beb276eb2393b0c2ea457 Mon Sep 17 00:00:00 2001 From: zachary_royals Date: Tue, 27 Jun 2023 23:07:52 -0400 Subject: [PATCH] Modifications to frontend and backend --- backend/app/api/prompts.py | 27 ++++++------ backend/app/api/routes.py | 16 +++---- backend/app/utils/document_utils.py | 30 ++++++++++--- frontend/src/App.css | 8 ++++ frontend/src/FileUpload.js | 67 ++++++++++++++++++++++------- 5 files changed, 105 insertions(+), 43 deletions(-) diff --git a/backend/app/api/prompts.py b/backend/app/api/prompts.py index 7d73222..2aa232a 100644 --- a/backend/app/api/prompts.py +++ b/backend/app/api/prompts.py @@ -2,23 +2,23 @@ product_prompt_string = """\ -For the following text, extract the following information from speaker 2 and remove any spaces between letters of each word. +For the following text, extract the following information from speaker 2 and maintain spacing between words. delivery_days: How many days did it take for the product \ -to arrive? If this information is not found, output -1. +to arrive? If this information is not found, output is "not found"., do not use -1 price_value: Extract any sentences about the value or price,\ -and output them as a comma separated Python list. +and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1 customer_negative_feedback: Extract any problems customers are facing with the current product \ -If this information is not found, output -1. +If this information is not found, output is "not found", do not use -1 feature_requests: Extract any sentences about feature requests,\ -and output them as a comma separated Python list. +and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1 competitor_mentions: Extract any sentences about the competition\ -and output them as a comma separated Python list. +and output them as a comma separated Python list. 
If this information is not found, output is "not found", do not use -1 Format the output as JSON with the following keys: @@ -33,22 +33,25 @@ final_prompt_string = """\ -For the following text, distill the following information from from the text elements, please ignore negative values and remove brackets: +For the following text, distill the following information from the key attributes and maintain spacing between words, please ignore negative or "not found" values: delivery_days: How many days did it take for the product \ -to arrive? If this information is not found, output -1. +to arrive? If this information is not found, output is "not found", do not use -1 price_value: Extract any sentences about the value or price,\ -and output them as a comma separated Python list. +and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1 + customer_negative_feedback: Extract any problems customers are facing with the current product \ -If this information is not found, output -1. +If this information is not found, output is "not found", do not use -1 feature_requests: Extract any sentences about feature requests,\ -and output them as a comma separated Python list. +and output them as a comma separated Python list. If this information is not found, output is "not found", do not use -1 + competitor_mentions: Extract any sentences about the competition\ -and output them as a comma separated Python list. 
If this information is not found, output is "not found", do not use -1 + Format the output as JSON with the following keys: diff --git a/backend/app/api/routes.py b/backend/app/api/routes.py index 833f58f..9e492fa 100644 --- a/backend/app/api/routes.py +++ b/backend/app/api/routes.py @@ -73,16 +73,12 @@ async def analyze_document(file: UploadFile) -> dict: chunks = await loop.run_in_executor( executor, split_into_chunks, extracted_text ) - - insights = [] - - for chunk in chunks: - transcript = product_prompt_template.format_messages(text=chunk) - chat = ChatOpenAI(temperature=0.0, model="gpt-4") - # run blocking operations in a thread pool - insight = await loop.run_in_executor(executor, chat, transcript) - insights.append(insight) - + chat = ChatOpenAI(temperature=0.0, model="gpt-4") + + # run tasks in parallel + tasks = [loop.run_in_executor(executor, chat, product_prompt_template.format_messages(text=chunk)) for chunk in chunks] + insights = await asyncio.gather(*tasks) + summary = final_product_prompt_template.format_messages(text=insights) chat = ChatOpenAI(temperature=0.0, model="gpt-4") # run blocking operations in a thread pool diff --git a/backend/app/utils/document_utils.py b/backend/app/utils/document_utils.py index a0e2325..6d930d6 100644 --- a/backend/app/utils/document_utils.py +++ b/backend/app/utils/document_utils.py @@ -1,13 +1,12 @@ from langchain.text_splitter import CharacterTextSplitter -from PyPDF2 import PdfReader +# from PyPDF2 import PdfReader +import pdfplumber +from nltk.tokenize import word_tokenize, sent_tokenize def extract_text_from_pdf(pdf): - pdf_reader = PdfReader(pdf) - text = "" - for page in pdf_reader.pages: - text += page.extract_text() - + with pdfplumber.open(pdf) as pdf_reader: + text = "\n".join(page.extract_text() for page in pdf_reader.pages) return text @@ -17,3 +16,22 @@ def split_into_chunks(text: str) -> list[str]: ) chunks = text_splitter.split_text(text) return chunks +# def split_into_chunks(text: str, 
max_token_count: int=7000) -> list[str]: +# sentences = sent_tokenize(text) +# chunks = [] +# current_chunk = '' +# current_token_count = 0 + +# for sentence in sentences: +# tokens = word_tokenize(sentence) +# if current_token_count + len(tokens) > max_token_count: +# chunks.append(current_chunk) +# current_chunk = sentence +# current_token_count = len(tokens) +# else: +# current_chunk += ' ' + sentence +# current_token_count += len(tokens) + +# if current_chunk: +# chunks.append(current_chunk) +# return chunks diff --git a/frontend/src/App.css b/frontend/src/App.css index 54907e6..54108aa 100644 --- a/frontend/src/App.css +++ b/frontend/src/App.css @@ -85,3 +85,11 @@ min-height: 90vh; /* Adjust this as needed */ background-color: #1f987a; } + + +.attributes { + color: white; + font-size: '20px'; + text-align: 'center'; + font-family: 'Times New Roman'; +} diff --git a/frontend/src/FileUpload.js b/frontend/src/FileUpload.js index bab8a91..4809c12 100644 --- a/frontend/src/FileUpload.js +++ b/frontend/src/FileUpload.js @@ -1,15 +1,31 @@ import './App.css'; -import React, { useState } from 'react'; +import React, { useState, useEffect } from 'react'; import axios from 'axios'; const FileUpload = () => { const [file, setFile] = useState(null); const [responseData, setResponseData] = useState({}); + const [isProcessing, setIsProcessing] = useState(false); + const [processingText, setProcessingText] = useState('Processing'); + + useEffect(() => { + if (isProcessing) { + const id = setInterval(() => { + setProcessingText((text) => { + return text.length < 15 ? text + '.' 
: 'Processing.'; + }); + }, 500); + return () => clearInterval(id); + } else { + setProcessingText('Processing'); + } + }, [isProcessing]); const submitFile = async (event) => { event.preventDefault(); const formData = new FormData(); formData.append('file', file); + setIsProcessing(true); try { const res = await axios.post('http://localhost:8000/analyze', formData, { @@ -17,10 +33,14 @@ const FileUpload = () => { 'Content-Type': 'multipart/form-data', }, }); - console.log(res.data.content); - setResponseData(res.data.content); + const parsedData = JSON.parse(res.data.content); + console.log(parsedData); + + setResponseData(parsedData); } catch (error) { console.error(error); + } finally { + setIsProcessing(false); } }; @@ -48,19 +68,36 @@ const FileUpload = () => { onChange={handleFileUpload} /> + {isProcessing &&

{processingText}

} + +

+ Here are the insights discovered from the transcript +

+ {responseData.delivery_days && ( +

+ Delivery Days: {responseData.delivery_days} +

+ )} + {responseData.price_value && ( +

Price Value: {responseData.price_value}

+ )} + {responseData.customer_negative_feedback && ( +

+ Customer Negative Feedback:{' '} + {responseData.customer_negative_feedback} +

+ )} + {responseData.feature_requests && ( +

+ Feature Requests: {responseData.feature_requests} +

+ )} + {responseData.competitor_mentions && ( +

+ Competitor Mentions: {responseData.competitor_mentions} +

+ )} -
- {responseData.delivery_days ? -

Delivery Days: {responseData.delivery_days}

: null} - {responseData.price_value ? -

Price Value: {responseData.price_value}

: null} - {responseData.customer_negative_feedback ? -

Customer Negative Feedback: {responseData.customer_negative_feedback}

: null} - {responseData.feature_requests ? -

Feature Requests: {responseData.feature_requests}

: null} - {responseData.competitor_mentions ? -

Competitor Mentions: {responseData.competitor_mentions}

: null} -
); };