-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
150 lines (124 loc) · 7.09 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
import pandas as pd
from groq import Groq
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# Initialize the Groq client with the API key from environment variables
client = Groq(
api_key=os.environ.get("GROQ_API_KEY"),
)
def process_reviews(filepath):
"""
Process reviews from a specified file and return sentiment analysis results.
Parameters:
filepath (str): The path to the file containing reviews.
Returns:
dict: A dictionary with sentiment scores for positive, negative, and neutral sentiments.
"""
# Extract the file extension to determine the file type
file_extension = filepath.split('.')[-1]
# Read the data from the file based on its extension
if file_extension == 'csv':
df = pd.read_csv(filepath) # Read CSV file
elif file_extension == 'xlsx':
df = pd.read_excel(filepath) # Read Excel file
else:
raise Exception("Invalid file format") # Raise error if the file format is unsupported
# Find the review column in a case-insensitive manner
review_column = next((col for col in df.columns if col.strip().lower() == 'review'), None)
if review_column is None:
raise Exception("Missing 'Review' column or equivalent") # Raise error if review column is missing
# Drop rows where the review column is NaN (empty)
df = df.dropna(subset=[review_column])
reviews = df[review_column].tolist() # Convert review column to a list
# Print total number of non-empty reviews for debugging
print(f"Total number of non-empty reviews: {len(reviews)}")
# Get sentiment analysis results
sentiment_counts, unclassified_reviews = get_sentiment_analysis(reviews)
# Return the sentiment scores directly
return {
"positive": sentiment_counts["positive"],
"negative": sentiment_counts["negative"],
"neutral": sentiment_counts["neutral"],
}
def get_sentiment_analysis(reviews, batch_size=10):
"""
Analyze sentiment of a list of reviews and categorize them into positive, negative, and neutral.
Parameters:
reviews (list): List of reviews to analyze.
batch_size (int): Number of reviews to process in each API call.
Returns:
tuple: Overall sentiment counts and a list of unclassified reviews.
"""
overall_sentiment = {"positive": 0, "negative": 0, "neutral": 0} # Initialize sentiment counts
unclassified_reviews = [] # List to hold unclassified reviews
# Process reviews in batches
for i in range(0, len(reviews), batch_size):
batch = reviews[i:i+batch_size] # Create a batch of reviews
prompt = (
"Analyze the sentiment of each of the following reviews. "
"For each review, respond with ONLY 'Positive', 'Negative', or 'Neutral'. "
"After classifying all reviews, provide a summary count in the format:\n"
"Positive: <count>\nNegative: <count>\nNeutral: <count>\n\n"
"Reviews:\n"
)
prompt += "\n".join(f"{j+1}. {review}" for j, review in enumerate(batch)) # Add reviews to prompt
try:
# Send the prompt to the Groq API to classify sentiments
chat_completion = client.chat.completions.create(
messages=[{"role": "user", "content": prompt}],
model="llama3-8b-8192",
)
response_content = chat_completion.choices[0].message.content.strip() # Get the response content
print(f"Batch {i//batch_size + 1} response:\n{response_content}\n") # Print response for debugging
lines = response_content.split('\n') # Split response into lines
classifications = [] # List to hold classifications
# Extract sentiment classifications from the response
for line in lines:
if line and line[0].isdigit(): # Check if the line starts with a number (indicating a review)
parts = line.split('.', 1) # Split by the first period
if len(parts) > 1:
classification = parts[1].strip().lower() # Get the sentiment classification
if classification in ['positive', 'negative', 'neutral']:
classifications.append(classification) # Add valid classification to list
# Find and parse the summary of sentiment counts from the response
summary_start = next((i for i, line in enumerate(lines) if line.startswith("Positive:")), -1)
if summary_start != -1:
summary = lines[summary_start:] # Extract the summary lines
batch_sentiment = {"positive": 0, "negative": 0, "neutral": 0} # Initialize batch sentiment counts
for line in summary:
for key in batch_sentiment: # Check each line for sentiment counts
if line.lower().startswith(key):
batch_sentiment[key] = int(line.split(":")[1].strip()) # Parse count
# Update overall sentiment counts
for key in overall_sentiment:
overall_sentiment[key] += batch_sentiment[key]
# Check for unclassified reviews in this batch
if len(classifications) < len(batch):
unclassified_reviews.extend(batch[len(classifications):]) # Add unclassified reviews to the list
except Exception as e:
print(f"Error processing batch {i//batch_size + 1}: {str(e)}") # Log error
unclassified_reviews.extend(batch) # Add entire batch to unclassified if there's an error
# Final pass for unclassified reviews
if unclassified_reviews:
print("Processing unclassified reviews:") # Indicate unclassified review processing
for review in unclassified_reviews[:]: # Iterate over a copy of the list
prompt = f"Analyze the sentiment of the following review. Respond with ONLY 'Positive', 'Negative', or 'Neutral'.\n\nReview: {review}"
try:
# Send each unclassified review for classification
chat_completion = client.chat.completions.create(
messages=[{"role": "user", "content": prompt}],
model="llama3-8b-8192",
)
response = chat_completion.choices[0].message.content.strip().lower() # Get the response
if response in overall_sentiment:
overall_sentiment[response] += 1 # Increment the appropriate sentiment count
unclassified_reviews.remove(review) # Remove from unclassified list
print(f"Classified as {response}: {review}") # Log classification
else:
print(f"Failed to classify: {review}") # Log failure to classify
except Exception as e:
print(f"Error processing review: {str(e)}") # Log error for unclassified review
print(f"Overall sentiment counts: {overall_sentiment}") # Print final sentiment counts
return overall_sentiment, unclassified_reviews # Return overall counts and unclassified reviews