# bp_call_center_audio_analysis.py
import json
import logging
import os
from enum import Enum
from typing import Optional
import azure.functions as func
from dotenv import load_dotenv
from haystack.components.generators.chat.azure import AzureOpenAIChatGenerator
from haystack.dataclasses import ChatMessage
from haystack.utils import Secret
from openai import AsyncAzureOpenAI
from pydantic import BaseModel, Field
from src.components.speech import (
    AOAI_WHISPER_MIME_TYPE_MAPPER,
    BATCH_TRANSCRIPTION_MIME_TYPE_MAPPER,
    AzureSpeechTranscriber,
    is_phrase_start_time_match,
)
from src.components.utils import (
    InvalidFileTypeError,
    base64_file_to_buffer,
    get_file_ext_and_mime_type,
)
from src.helpers.common import MeasureRunTime
from src.result_enrichment.common import is_value_in_content
from src.schema import LLMResponseBaseModel
load_dotenv()
bp_call_center_audio_analysis = func.Blueprint()
SPEECH_REGION = os.getenv("SPEECH_REGION")
SPEECH_API_KEY = os.getenv("SPEECH_API_KEY")
AOAI_LLM_DEPLOYMENT = os.getenv("AOAI_LLM_DEPLOYMENT")
AOAI_WHISPER_DEPLOYMENT = os.getenv("AOAI_WHISPER_DEPLOYMENT")
AOAI_ENDPOINT = os.getenv("AOAI_ENDPOINT")
AOAI_API_KEY = os.getenv("AOAI_API_KEY")
# Load the API key as a Secret, so that it is not logged in any traces or saved if the Haystack component is exported.
AOAI_API_KEY_SECRET = Secret.from_token(AOAI_API_KEY)
### Setup components
aoai_whisper_async_client = AsyncAzureOpenAI(
    azure_endpoint=AOAI_ENDPOINT,
    azure_deployment=AOAI_WHISPER_DEPLOYMENT,
    api_key=AOAI_API_KEY,
    api_version="2024-06-01",
)
transcriber = AzureSpeechTranscriber(
    speech_region=SPEECH_REGION,
    speech_key=SPEECH_API_KEY,
    aoai_whisper_async_client=aoai_whisper_async_client,
)
# Define the configuration for the transcription job
# More info: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/fast-transcription-create
# And: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-transcription-create?pivots=rest-api#request-configuration-options
fast_transcription_definition = {
    "locales": ["en-US"],
    "profanityFilterMode": "Masked",
    "diarizationEnabled": False,  # Not available for fast transcription as of August 2024
    "wordLevelTimestampsEnabled": True,
}
# More info: https://platform.openai.com/docs/guides/speech-to-text
aoai_whisper_kwargs = {
    "language": "en",
    "prompt": None,
    "temperature": None,
    "timeout": 60,
}
azure_generator = AzureOpenAIChatGenerator(
    azure_endpoint=AOAI_ENDPOINT,
    azure_deployment=AOAI_LLM_DEPLOYMENT,
    api_key=AOAI_API_KEY_SECRET,
    api_version="2024-06-01",
    generation_kwargs={
        "response_format": {"type": "json_object"}
    },  # Ensure we get JSON responses
)
# Create mappers to handle different types of transcription methods
TRANSCRIPTION_METHOD_TO_MIME_MAPPER = {
    "fast": BATCH_TRANSCRIPTION_MIME_TYPE_MAPPER,
    "aoai_whisper": AOAI_WHISPER_MIME_TYPE_MAPPER,
}


### Setup Pydantic models for validation of LLM calls, and the Function response itself
# Classification fields
class CustomerSatisfactionEnum(Enum):
    Satisfied = "Satisfied"
    Dissatisfied = "Dissatisfied"


CUSTOMER_SATISFACTION_VALUES = [e.value for e in CustomerSatisfactionEnum]


class CustomerSentimentEnum(Enum):
    Positive = "Positive"
    Neutral = "Neutral"
    Negative = "Negative"


CUSTOMER_SENTIMENT_VALUES = [e.value for e in CustomerSentimentEnum]


# Setup a class for the raw keywords returned by the LLM, and then the enriched version
# (after we match the keywords to the transcription)
class RawKeyword(LLMResponseBaseModel):
    keyword: str = Field(
        description="A keyword extracted from the call. This should be a direct match to a word or phrase in the transcription without modification of the spelling or grammar.",
        examples=["credit card account"],
    )
    timestamp: str = Field(
        description="The timestamp of the sentence in which the keyword was uttered.",
        examples=["0:18"],
    )


class ProcessedKeyWord(RawKeyword):
    keyword_matched_to_transcription_sentence: bool = Field(
        description="Whether the keyword was matched to a single sentence in the transcription.",
    )
    full_sentence_text: Optional[str] = Field(
        default=None,
        description="The full text of the sentence in which the keyword was uttered.",
    )
    sentence_confidence: Optional[float] = Field(
        default=None,
        description="The confidence score of the sentence from which the keyword was extracted.",
    )
    sentence_start_time_secs: Optional[float] = Field(
        default=None,
        description="The start time of the sentence in the audio recording.",
    )
    sentence_end_time_secs: Optional[float] = Field(
        default=None,
        description="The end time of the sentence in the audio recording.",
    )


# Define the full schema for the LLM's response. We inherit from LLMResponseBaseModel
# to get the prompt generation functionality.
class LLMRawResponseModel(LLMResponseBaseModel):
    """
    Defines the required JSON schema for the LLM to adhere to. This can be used
    to validate that the LLM's raw text response can be parsed into the format
    that is expected by downstream processes (e.g. when we need to save the data
    into a database).

    This class inherits from LLMResponseBaseModel and sets a description and
    example for each field, allowing us to run `model.get_prompt_json_example()`
    to generate a prompt-friendly string representation of the expected JSON
    that we can provide to the LLM.
    """

    call_summary: str = Field(
        description="A summary of the call, including the topics and key action items. This should be no more than 20 words long.",
        examples=[
            "The customer called to close their credit card account. The agent closed the account and a confirmation email was sent."
        ],
    )
    customer_satisfaction: CustomerSatisfactionEnum = Field(
        description=f"Is the customer satisfied with the agent interaction. It must only be one of these options: {CUSTOMER_SATISFACTION_VALUES}.",
        examples=[CUSTOMER_SATISFACTION_VALUES[0]],
    )
    customer_sentiment: CustomerSentimentEnum = Field(
        description=f"The sentiment of the customer on the call. It must be one of these options: {CUSTOMER_SENTIMENT_VALUES}.",
        examples=[CUSTOMER_SENTIMENT_VALUES[-1]],
    )
    next_action: Optional[str] = Field(
        description="The next action that needs to be taken, if there is one. This should be no more than 20 words long. If no action is necessary, return null.",
        examples=["The agent will send a follow-up email to the customer."],
    )
    next_action_sentence_timestamp: Optional[str] = Field(
        description="The timestamp of the sentence where the next action was mentioned. This should be the timestamp written in the transcription.",
        examples=["6:12"],
    )
    keywords: list[RawKeyword] = Field(
        description=(
            "A list of keywords related to the purpose of the call, the products they are interested in, or any issues that have occurred. "
            "Each result should include the exact keyword and a timestamp of the sentence where it was uttered. "
            "Each keyword should match a word or phrase in the transcription without modification of the spelling or grammar."
        ),
        examples=[
            [
                {"keyword": "credit card account", "timestamp": "0:18"},
                {"keyword": "bank account", "timestamp": "0:46"},
                {"keyword": "insurance", "timestamp": "2:37"},
                {"keyword": "complaint", "timestamp": "4:52"},
            ]
        ],
    )


class ProcessedResultModel(LLMRawResponseModel):
    """
    Defines the schema for the processed result that will be returned by the
    function. This class inherits from LLMRawResponseModel but overwrites the
    `keywords` field to use the ProcessedKeyWord model instead of the
    RawKeyword. This way we can return the processed keywords with additional
    metadata.
    """

    keywords: list[ProcessedKeyWord] = Field(
        description=(
            "A list of key phrases related to the purpose of the call, the products they are interested in, or any issues that have occurred. "
            "Each item includes the keyword and timestamp of the sentence as extracted by the LLM, along with additional metadata "
            "that is merged from the Transcription result."
        ),
    )


class FunctionReponseModel(BaseModel):
    """
    Defines the schema that will be returned by the function. We'll use this to
    ensure that the response contains the correct values and structure, and
    to allow a partially filled response to be returned in case of an error.
    """

    success: bool = Field(
        default=False, description="Indicates whether the pipeline was successful."
    )
    result: Optional[ProcessedResultModel] = Field(
        default=None, description="The final result of the pipeline."
    )
    error_text: Optional[str] = Field(
        default=None,
        description="If an error occurred, this field will contain the error message.",
    )
    speech_extracted_text: Optional[str] = Field(
        default=None,
        description="The raw & formatted text content extracted by Azure AI Speech.",
    )
    speech_raw_response: Optional[list | dict] = Field(
        default=None, description="The raw API response from Azure AI Speech."
    )
    speech_time_taken_secs: Optional[float] = Field(
        default=None,
        description="The time taken to transcribe the text using Azure AI Speech.",
    )
    llm_input_messages: Optional[list[dict]] = Field(
        default=None, description="The messages that were sent to the LLM."
    )
    llm_reply_messages: Optional[list[dict]] = Field(
        default=None, description="The messages that were received from the LLM."
    )
    llm_raw_response: Optional[str] = Field(
        default=None, description="The raw text response from the LLM."
    )
    llm_time_taken_secs: Optional[float] = Field(
        default=None, description="The time taken to receive a response from the LLM."
    )
    func_time_taken_secs: Optional[float] = Field(
        default=None, description="The total time taken to process the request."
    )


# Create the system prompt for the LLM, dynamically including the JSON schema
# of the expected response so that any changes to the schema are automatically
# reflected in the prompt, and in a JSON format that is similar in structure
# to the training data on which the LLM was trained (increasing reliability of
# the result).
LLM_SYSTEM_PROMPT = (
    "You are a customer service contact center agent, and you specialize in summarizing and classifying "
    "the content of customer service call recordings.\n"
    "Your task is to review a customer service call and extract all of the key information from the call recording.\n"
    f"{LLMRawResponseModel.get_prompt_json_example(include_preceding_json_instructions=True)}"
)
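
# For reference, the JSON object the LLM is asked to return has roughly the following
# shape (illustrative only, based on the field descriptions and examples above; the
# exact prompt text is generated at runtime by `get_prompt_json_example()`):
# {
#     "call_summary": "The customer called to close their credit card account. ...",
#     "customer_satisfaction": "Satisfied",
#     "customer_sentiment": "Positive",
#     "next_action": "The agent will send a follow-up email to the customer.",
#     "next_action_sentence_timestamp": "6:12",
#     "keywords": [{"keyword": "credit card account", "timestamp": "0:18"}]
# }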


@bp_call_center_audio_analysis.route(route="call_center_audio_analysis")
async def call_center_audio_analysis(
    req: func.HttpRequest,
) -> func.HttpResponse:
    logging.info("Python HTTP trigger function processed a request.")
    try:
        func_timer = MeasureRunTime()
        func_timer.start()
        # Create the object to hold all intermediate and final values. We will progressively update
        # values as each stage of the pipeline is completed, allowing us to return a partial
        # response in case of an error at any stage.
        output_model = FunctionReponseModel(success=False)
        ## Check the request body
        # Transcription method
        request_json_content = json.loads(req.files["json"].read().decode("utf-8"))
        transcription_method = request_json_content["method"]
        if transcription_method not in TRANSCRIPTION_METHOD_TO_MIME_MAPPER:
            output_model.error_text = (
                f"Invalid transcription method `{transcription_method}`. "
                f"Please use one of {list(TRANSCRIPTION_METHOD_TO_MIME_MAPPER.keys())}"
            )
            logging.exception(output_model.error_text)
            return func.HttpResponse(
                body=output_model.model_dump_json(),
                mimetype="application/json",
                status_code=500,
            )
        valid_mime_to_filetype_mapper = TRANSCRIPTION_METHOD_TO_MIME_MAPPER[
            transcription_method
        ]
        # Audio file & type
        audio_file = req.files["audio"]
        audio_file_b64 = audio_file.read()
        try:
            audio_file_ext, _audio_file_content_type = get_file_ext_and_mime_type(
                valid_mimes_to_file_ext_mapper=valid_mime_to_filetype_mapper,
                filename=audio_file.filename,
                content_type=audio_file.content_type,
            )
            audio_filename = (
                audio_file.filename if audio_file.filename else f"file.{audio_file_ext}"
            )
        except InvalidFileTypeError as e:
            output_model.error_text = (
                "Please submit a file with a valid filename or content type. " + str(e)
            )
            logging.exception(output_model.error_text)
            return func.HttpResponse(
                body=output_model.model_dump_json(),
                mimetype="application/json",
                status_code=500,
            )
        # Get the transcription result
        try:
            with MeasureRunTime() as speech_timer:
                if transcription_method == "fast":
                    transcription, raw_transcription_api_response = (
                        await transcriber.get_fast_transcription_async(
                            audio_file=audio_file_b64,
                            definition=fast_transcription_definition,
                        )
                    )
                else:
                    audio_file = base64_file_to_buffer(
                        b64_str=audio_file_b64, name=audio_filename
                    )
                    transcription, raw_transcription_api_response = (
                        await transcriber.get_aoai_whisper_transcription_async(
                            audio_file=audio_file,
                            **aoai_whisper_kwargs,
                        )
                    )
            formatted_transcription_text = transcription.to_formatted_str(
                transcription_prefix_format="Language: {language}\nDuration: {formatted_duration} minutes\n\nConversation:\n",
                phrase_format="[{start_min}:{start_sub_sec}] {auto_phrase_source_name} {auto_phrase_source_id}: {display_text}",
            )
            output_model.speech_extracted_text = formatted_transcription_text
            output_model.speech_raw_response = raw_transcription_api_response
            output_model.speech_time_taken_secs = speech_timer.time_taken
        except Exception as _e:
            output_model.error_text = "An error occurred during audio transcription."
            output_model.func_time_taken_secs = func_timer.stop()
            logging.exception(output_model.error_text)
            return func.HttpResponse(
                body=output_model.model_dump_json(),
                mimetype="application/json",
                status_code=500,
            )
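        # For illustration, the formatted transcription passed to the LLM looks roughly
        # like the following (the speaker labels, timestamps, and text below are made up;
        # the actual labels depend on the transcription result and the format strings above):
        #
        #   Language: en-US
        #   Duration: 5.2 minutes
        #
        #   Conversation:
        #   [0:03] Speaker 1: Thank you for calling, how can I help you today?
        #   [0:07] Speaker 2: Hi, I'd like to close my credit card account.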
        # Create the messages to send to the LLM in the following order:
        # 1. System prompt
        # 2. Audio transcription, formatted in a clear way
        try:
            input_messages = [
                ChatMessage.from_system(LLM_SYSTEM_PROMPT),
                ChatMessage.from_user(formatted_transcription_text),
            ]
            output_model.llm_input_messages = [
                msg.to_openai_format() for msg in input_messages
            ]
        except Exception as _e:
            output_model.error_text = (
                "An error occurred while creating the LLM input messages."
            )
            output_model.func_time_taken_secs = func_timer.stop()
            logging.exception(output_model.error_text)
            return func.HttpResponse(
                body=output_model.model_dump_json(),
                mimetype="application/json",
                status_code=500,
            )
        # Send request to LLM
        try:
            with MeasureRunTime() as llm_timer:
                llm_result = azure_generator.run(messages=input_messages)
            output_model.llm_time_taken_secs = llm_timer.time_taken
        except Exception as _e:
            output_model.error_text = "An error occurred when sending the LLM request."
            output_model.func_time_taken_secs = func_timer.stop()
            logging.exception(output_model.error_text)
            return func.HttpResponse(
                body=output_model.model_dump_json(),
                mimetype="application/json",
                status_code=500,
            )
        # Validate that the LLM response matches the expected schema
        try:
            output_model.llm_reply_messages = [
                msg.to_openai_format() for msg in llm_result["replies"]
            ]
            if len(llm_result["replies"]) != 1:
                raise ValueError(
                    "The LLM response did not contain exactly one message."
                )
            output_model.llm_raw_response = llm_result["replies"][0].content
            llm_structured_response = LLMRawResponseModel(
                **json.loads(llm_result["replies"][0].content)
            )
        except Exception as _e:
            output_model.error_text = "An error occurred when validating the LLM's returned response into the expected schema."
            output_model.func_time_taken_secs = func_timer.stop()
            logging.exception(output_model.error_text)
            return func.HttpResponse(
                body=output_model.model_dump_json(),
                mimetype="application/json",
                status_code=500,
            )
        # Process each keyword, adding additional metadata from the transcription, and return a processed result
        try:
            processed_keywords = list()
            for keyword in llm_structured_response.keywords:
                # Find the sentence in the transcription that contains the keyword.
                # Search for all sentences with the same timestamp and where the LLM's
                # text was contained in the sentence. If not available, mark the keyword as not matched.
                keyword_sentence_start_time_secs = (
                    int(keyword.timestamp.split(":")[0]) * 60
                    + int(keyword.timestamp.split(":")[1])
                )
                matching_phrases = [
                    phrase
                    for phrase in transcription.phrases
                    if is_value_in_content(
                        keyword.keyword.lower(), phrase.display_text.lower()
                    )
                    and is_phrase_start_time_match(
                        expected_start_time_secs=keyword_sentence_start_time_secs,
                        phrase=phrase,
                        start_time_tolerance_secs=1,
                    )
                ]
                if len(matching_phrases) == 1:
                    processed_keywords.append(
                        ProcessedKeyWord(
                            **keyword.dict(),
                            keyword_matched_to_transcription_sentence=True,
                            full_sentence_text=matching_phrases[0].display_text,
                            sentence_confidence=matching_phrases[0].confidence,
                            sentence_start_time_secs=matching_phrases[0].start_secs,
                            sentence_end_time_secs=matching_phrases[0].end_secs,
                        )
                    )
                else:
                    processed_keywords.append(
                        ProcessedKeyWord(
                            **keyword.dict(),
                            keyword_matched_to_transcription_sentence=False,
                        )
                    )
            # Construct the processed model, replacing the raw keywords with the processed keywords
            llm_structured_response_dict = llm_structured_response.dict()
            llm_structured_response_dict.pop("keywords")
            output_model.result = ProcessedResultModel(
                **llm_structured_response_dict,
                keywords=processed_keywords,
            )
        except Exception as _e:
            output_model.error_text = (
                "An error occurred when post-processing the keywords."
            )
            output_model.func_time_taken_secs = func_timer.stop()
            logging.exception(output_model.error_text)
            return func.HttpResponse(
                body=output_model.model_dump_json(),
                mimetype="application/json",
                status_code=500,
            )
        # All steps completed successfully, set success=True and return the final result
        output_model.success = True
        output_model.func_time_taken_secs = func_timer.stop()
        return func.HttpResponse(
            body=output_model.model_dump_json(),
            mimetype="application/json",
            status_code=200,
        )
    except Exception as _e:
        logging.exception("An error occurred during processing.")
        return func.HttpResponse(
            "An error occurred during processing.",
            status_code=500,
        )
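

# Example client request (a minimal sketch; the host URL, function-key handling, and the
# audio file below are assumptions -- http://localhost:7071 is only the default local
# Functions host). The endpoint expects a multipart form with a "json" part containing
# {"method": "fast"} or {"method": "aoai_whisper"}, and an "audio" part with the recording:
#
#   import json
#   import requests
#
#   with open("example_call.wav", "rb") as f:
#       response = requests.post(
#           "http://localhost:7071/api/call_center_audio_analysis",
#           files={
#               "json": ("params.json", json.dumps({"method": "fast"}), "application/json"),
#               "audio": ("example_call.wav", f, "audio/wav"),
#           },
#       )
#   print(response.json())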