From 1a7bc469d376951c198695e292e45b23b36d4e2e Mon Sep 17 00:00:00 2001 From: h3110Fr13nd Date: Sat, 24 Aug 2024 19:23:28 +0530 Subject: [PATCH 1/2] feat: Replace torchaudio with pydub refactor: Removed unnecessary dependencies Removed Requirements - python-dateutil - tiktoken - torchaudio - scipy - tokenizers - huggingface-hub - sentence-transformers - optimum[onnxruntime] Major Changes in This Commit - torchaudio to pydub - bolna/helpers/utils.py - save_audio_file_to_s3 - resample - pcm_to_wav_bytes - wav_bytes_to_pcm - bolna/synthesizer/basesynthesizer - resample - sklearn to np - bolna/memory/cache/vector_cache - __get_top_cosine_similarity_doc --- bolna/helpers/analytics_helpers.py | 3 +- bolna/helpers/utils.py | 113 +++++++++----------- bolna/helpers/vad.py | 1 - bolna/memory/cache/vector_cache.py | 11 +- bolna/synthesizer/azure_synthesizer.py | 2 +- bolna/synthesizer/base_synthesizer.py | 11 +- bolna/synthesizer/elevenlabs_synthesizer.py | 2 +- bolna/synthesizer/melo_synthesizer.py | 2 +- bolna/synthesizer/openai_synthesizer.py | 2 +- bolna/synthesizer/polly_synthesizer.py | 4 +- bolna/transcriber/bodhi_transcriber.py | 4 - bolna/transcriber/deepgram_transcriber.py | 3 - bolna/transcriber/whisper_transcriber.py | 9 +- requirements.txt | 12 +-- 14 files changed, 68 insertions(+), 111 deletions(-) diff --git a/bolna/helpers/analytics_helpers.py b/bolna/helpers/analytics_helpers.py index 8790d081..9bbd1420 100644 --- a/bolna/helpers/analytics_helpers.py +++ b/bolna/helpers/analytics_helpers.py @@ -2,7 +2,6 @@ import os from datetime import datetime, timezone from dotenv import load_dotenv -from dateutil import parser import copy from .utils import format_messages from .logger_config import configure_logger @@ -80,7 +79,7 @@ def update_execution_details(current_high_level_assistant_analytics_data, run_de def update_historical_values(arr, current_run_val, last_updated_at, should_increment, multiplier = 0, interval_minutes=1440): now = datetime.now(timezone.utc) - 
last_updated_datetime = parser.isoparse(last_updated_at) + last_updated_datetime = datetime.fromisoformat(last_updated_at) difference_in_minutes = (now - last_updated_datetime).total_seconds() / 60 if not arr or len(arr) == 0: diff --git a/bolna/helpers/utils.py b/bolna/helpers/utils.py index 2fcec132..1a783d9b 100644 --- a/bolna/helpers/utils.py +++ b/bolna/helpers/utils.py @@ -1,7 +1,6 @@ import datetime import json import asyncio -import math import re import copy import hashlib @@ -11,9 +10,6 @@ import wave import numpy as np import aiofiles -import torch -import torchaudio -from scipy.io import wavfile from botocore.exceptions import BotoCoreError, ClientError from aiobotocore.session import AioSession from contextlib import AsyncExitStack @@ -90,12 +86,9 @@ def float32_to_int16(float_audio): def wav_bytes_to_pcm(wav_bytes): wav_buffer = io.BytesIO(wav_bytes) - rate, data = wavfile.read(wav_buffer) - if data.dtype == np.int16: - return data.tobytes() - if data.dtype == np.float32: - data = float32_to_int16(data) - return data.tobytes() + audio_segment = AudioSegment.from_file(wav_buffer, format="wav") + pcm_data = audio_segment.raw_data + return pcm_data # def wav_bytes_to_pcm(wav_bytes): @@ -337,15 +330,18 @@ def yield_chunks_from_memory(audio_bytes, chunk_size=512): yield audio_bytes[i:i + chunk_size] -def pcm_to_wav_bytes(pcm_data, sample_rate = 16000, num_channels = 1, sample_width = 2): - buffer = io.BytesIO() - bit_depth = 16 - if len(pcm_data)%2 == 1: +def pcm_to_wav_bytes(pcm_data, sample_rate=16000, num_channels=1, sample_width=2): + if len(pcm_data) % 2 == 1: pcm_data += b'\x00' - tensor_pcm = torch.frombuffer(pcm_data, dtype=torch.int16) - tensor_pcm = tensor_pcm.float() / (2**(bit_depth - 1)) - tensor_pcm = tensor_pcm.unsqueeze(0) - torchaudio.save(buffer, tensor_pcm, sample_rate, format='wav') + audio_segment = AudioSegment( + data=pcm_data, + sample_width=sample_width, + frame_rate=sample_rate, + channels=num_channels + ) + buffer = io.BytesIO() 
+ audio_segment.export(buffer, format="wav") + return buffer.getvalue() @@ -359,16 +355,16 @@ def convert_audio_to_wav(audio_bytes, source_format = 'flac'): return buffer.getvalue() -def resample(audio_bytes, target_sample_rate, format = "mp3"): +def resample(audio_bytes, target_sample_rate, format="mp3"): audio_buffer = io.BytesIO(audio_bytes) - waveform, orig_sample_rate = torchaudio.load(audio_buffer, format = format) + audio_segment = AudioSegment.from_file(audio_buffer, format=format) + orig_sample_rate = audio_segment.frame_rate if orig_sample_rate == target_sample_rate: return audio_bytes - resampler = torchaudio.transforms.Resample(orig_sample_rate, target_sample_rate) - audio_waveform = resampler(waveform) - audio_buffer = io.BytesIO() logger.info(f"Resampling from {orig_sample_rate} to {target_sample_rate}") - torchaudio.save(audio_buffer, audio_waveform, target_sample_rate, format="wav") + resampled_audio = audio_segment.set_frame_rate(target_sample_rate) + audio_buffer = io.BytesIO() + resampled_audio.export(audio_buffer, format="wav") return audio_buffer.getvalue() @@ -450,61 +446,52 @@ async def write_request_logs(message, run_id): else: await log_file.write(log_string) -async def save_audio_file_to_s3(conversation_recording, sampling_rate = 24000, assistant_id = None, run_id = None): +async def save_audio_file_to_s3(conversation_recording, sampling_rate=24000, assistant_id=None, run_id=None): last_frame_end_time = conversation_recording['output'][0]['start_time'] logger.info(f"LENGTH OF OUTPUT AUDIO {len(conversation_recording['output'])}") - initial_gap = (last_frame_end_time - conversation_recording["metadata"]["started"] ) *1000 + initial_gap = (last_frame_end_time - conversation_recording["metadata"]["started"]) * 1000 logger.info(f"Initial gap {initial_gap}") + combined_audio = AudioSegment.silent(duration=initial_gap, frame_rate=sampling_rate) + for i, frame in enumerate(conversation_recording['output']): - frame_start_time = 
frame['start_time'] - logger.info(f"Processing frame {i}, fram start time = {last_frame_end_time}, frame start time= {frame_start_time}") + frame_start_time = frame['start_time'] + logger.info(f"Processing frame {i}, frame start time = {last_frame_end_time}, frame start time = {frame_start_time}") + if last_frame_end_time < frame_start_time: gap_duration_samples = frame_start_time - last_frame_end_time - silence = AudioSegment.silent(duration=gap_duration_samples*1000, frame_rate=sampling_rate) + silence = AudioSegment.silent(duration=gap_duration_samples * 1000, frame_rate=sampling_rate) combined_audio += silence + last_frame_end_time = frame_start_time + frame['duration'] - frame_as = AudioSegment.from_file(io.BytesIO(frame['data']), format = "wav") - combined_audio +=frame_as - - webm_segment = AudioSegment.from_file(io.BytesIO(conversation_recording['input']["data"])) - wav_bytes = io.BytesIO() - webm_segment.export(wav_bytes, format="wav") - wav_bytes.seek(0) # Reset the pointer to the start - waveform, sample_rate = torchaudio.load(wav_bytes) - resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=sampling_rate) - downsampled_waveform = resampler(waveform) - torchaudio_wavio = io.BytesIO() - torchaudio.save(torchaudio_wavio, downsampled_waveform, sampling_rate, format= "wav") + frame_as = AudioSegment.from_file(io.BytesIO(frame['data']), format="wav") + combined_audio += frame_as + + webm_segment = AudioSegment.from_file(io.BytesIO(conversation_recording['input']["data"]), format="webm") + webm_segment = webm_segment.set_frame_rate(sampling_rate) + audio_segment_bytes = io.BytesIO() + combined_audio = combined_audio.set_frame_rate(sampling_rate) combined_audio.export(audio_segment_bytes, format="wav") audio_segment_bytes.seek(0) - waveform_audio_segment, sample_rate = torchaudio.load(audio_segment_bytes) - - if waveform_audio_segment.shape[0] > 1: - waveform_audio_segment = waveform_audio_segment[:1, :] - - # Adjust shapes to be [1, N] if 
not already - downsampled_waveform = downsampled_waveform.unsqueeze(0) if downsampled_waveform.dim() == 1 else downsampled_waveform - waveform_audio_segment = waveform_audio_segment.unsqueeze(0) if waveform_audio_segment.dim() == 1 else waveform_audio_segment - - # Ensure both waveforms have the same length - max_length = max(downsampled_waveform.size(1), waveform_audio_segment.size(1)) - downsampled_waveform_padded = torch.nn.functional.pad(downsampled_waveform, (0, max_length - downsampled_waveform.size(1))) - waveform_audio_segment_padded = torch.nn.functional.pad(waveform_audio_segment, (0, max_length - waveform_audio_segment.size(1))) - stereo_waveform = torch.cat((downsampled_waveform_padded, waveform_audio_segment_padded), 0) - - # Verify the stereo waveform shape is [2, M] - assert stereo_waveform.shape[0] == 2, "Stereo waveform should have 2 channels." - key = f'{assistant_id + run_id.split("#")[1]}.wav' - + + combined_audio_segment = AudioSegment.from_file(audio_segment_bytes, format="wav") + combined_audio_segment = combined_audio_segment.set_channels(1) + + if len(webm_segment) > len(combined_audio_segment): + combined_audio_segment = combined_audio_segment + AudioSegment.silent(duration=len(webm_segment) - len(combined_audio_segment)) + elif len(webm_segment) < len(combined_audio_segment): + webm_segment = webm_segment + AudioSegment.silent(duration=len(combined_audio_segment) - len(webm_segment)) + webm_segment = webm_segment.set_channels(1) + combined_audio_segment = combined_audio_segment.set_channels(1) + stereo_audio_segment = webm_segment.overlay(combined_audio_segment) audio_buffer = io.BytesIO() - torchaudio.save(audio_buffer, stereo_waveform, 24000, format="wav") + stereo_audio_segment.export(audio_buffer, format="wav") audio_buffer.seek(0) + key = f'{assistant_id + run_id.split("#")[1]}.wav' logger.info(f"Storing in {RECORDING_BUCKET_URL}{key}") - await store_file(bucket_name=RECORDING_BUCKET_NAME, file_key=key, file_data=audio_buffer, 
content_type="wav") - + await store_file(bucket_name=RECORDING_BUCKET_NAME, file_key=key, file_data=audio_buffer, content_type="audio/wav") return f'{RECORDING_BUCKET_URL}{key}' def list_number_of_wav_files_in_directory(directory): diff --git a/bolna/helpers/vad.py b/bolna/helpers/vad.py index 4cbb7370..7e9388a0 100644 --- a/bolna/helpers/vad.py +++ b/bolna/helpers/vad.py @@ -1,5 +1,4 @@ import os -import subprocess import requests import torch import numpy as np diff --git a/bolna/memory/cache/vector_cache.py b/bolna/memory/cache/vector_cache.py index 9f047257..4b4b10d9 100644 --- a/bolna/memory/cache/vector_cache.py +++ b/bolna/memory/cache/vector_cache.py @@ -1,12 +1,9 @@ from bolna.helpers.logger_config import configure_logger from bolna.memory.cache.base_cache import BaseCache -from typing import List import numpy as np from fastembed import TextEmbedding -from sentence_transformers import util import numpy as np -from sklearn.metrics.pairwise import cosine_similarity logger = configure_logger(__name__) @@ -23,11 +20,9 @@ def set(self, documents): ) def __get_top_cosine_similarity_doc(self, query_embedding): - #util.pytorch_cos_sim(self.embeddings, query_embedding) - # scores = np.dot(self.embeddings, query_embedding) - # sorted_scores = np.argsort(scores)[::-1] - - similarities = cosine_similarity([query_embedding], self.embeddings)[0] + query_norm = query_embedding / np.linalg.norm(query_embedding) + embeddings_norm = self.embeddings / np.linalg.norm(self.embeddings, axis=1)[:, np.newaxis] + similarities = np.dot(embeddings_norm, query_norm) most_similar_index = np.argmax(similarities) return self.documents[most_similar_index] diff --git a/bolna/synthesizer/azure_synthesizer.py b/bolna/synthesizer/azure_synthesizer.py index 59bd27ad..3c20ecf0 100644 --- a/bolna/synthesizer/azure_synthesizer.py +++ b/bolna/synthesizer/azure_synthesizer.py @@ -1,7 +1,7 @@ import os from dotenv import load_dotenv from bolna.helpers.logger_config import configure_logger -from 
bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, pcm_to_wav_bytes, resample, wav_bytes_to_pcm +from bolna.helpers.utils import create_ws_data_packet, wav_bytes_to_pcm from bolna.memory.cache.inmemory_scalar_cache import InmemoryScalarCache from .base_synthesizer import BaseSynthesizer import azure.cognitiveservices.speech as speechsdk diff --git a/bolna/synthesizer/base_synthesizer.py b/bolna/synthesizer/base_synthesizer.py index de5e0e7b..f540541c 100644 --- a/bolna/synthesizer/base_synthesizer.py +++ b/bolna/synthesizer/base_synthesizer.py @@ -1,7 +1,7 @@ import io -import torchaudio from bolna.helpers.logger_config import configure_logger import asyncio +from pydub import AudioSegment logger = configure_logger(__name__) @@ -29,12 +29,11 @@ def get_synthesized_characters(self): return 0 def resample(self, audio_bytes): - audio_buffer = io.BytesIO(audio_bytes) - waveform, orig_sample_rate = torchaudio.load(audio_buffer) - resampler = torchaudio.transforms.Resample(orig_sample_rate, 8000) - audio_waveform = resampler(waveform) + audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes)) + audio_segment = audio_segment.set_frame_rate(8000) + audio_segment = audio_segment.set_channels(1) audio_buffer = io.BytesIO() - torchaudio.save(audio_buffer, audio_waveform, 8000, format="wav") + audio_segment.export(audio_buffer, format="wav") audio_buffer.seek(0) audio_data = audio_buffer.read() return audio_data diff --git a/bolna/synthesizer/elevenlabs_synthesizer.py b/bolna/synthesizer/elevenlabs_synthesizer.py index 6814df15..0b9a16b4 100644 --- a/bolna/synthesizer/elevenlabs_synthesizer.py +++ b/bolna/synthesizer/elevenlabs_synthesizer.py @@ -11,7 +11,7 @@ from bolna.memory.cache.inmemory_scalar_cache import InmemoryScalarCache from .base_synthesizer import BaseSynthesizer from bolna.helpers.logger_config import configure_logger -from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, pcm_to_wav_bytes, resample +from 
bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, resample logger = configure_logger(__name__) diff --git a/bolna/synthesizer/melo_synthesizer.py b/bolna/synthesizer/melo_synthesizer.py index 7c2e64d2..9bf4ca2b 100644 --- a/bolna/synthesizer/melo_synthesizer.py +++ b/bolna/synthesizer/melo_synthesizer.py @@ -3,7 +3,7 @@ import os from dotenv import load_dotenv from bolna.helpers.logger_config import configure_logger -from bolna.helpers.utils import create_ws_data_packet, resample, wav_bytes_to_pcm +from bolna.helpers.utils import create_ws_data_packet, wav_bytes_to_pcm from bolna.memory.cache.inmemory_scalar_cache import InmemoryScalarCache from .base_synthesizer import BaseSynthesizer import json diff --git a/bolna/synthesizer/openai_synthesizer.py b/bolna/synthesizer/openai_synthesizer.py index 1273d95e..631ffc56 100644 --- a/bolna/synthesizer/openai_synthesizer.py +++ b/bolna/synthesizer/openai_synthesizer.py @@ -2,7 +2,7 @@ import os from dotenv import load_dotenv from bolna.helpers.logger_config import configure_logger -from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, pcm_to_wav_bytes, resample +from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, resample from .base_synthesizer import BaseSynthesizer from openai import AsyncOpenAI import io diff --git a/bolna/synthesizer/polly_synthesizer.py b/bolna/synthesizer/polly_synthesizer.py index 52e8f825..46f2022a 100644 --- a/bolna/synthesizer/polly_synthesizer.py +++ b/bolna/synthesizer/polly_synthesizer.py @@ -4,7 +4,7 @@ from aiobotocore.session import AioSession from contextlib import AsyncExitStack from bolna.helpers.logger_config import configure_logger -from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, pcm_to_wav_bytes, resample +from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet from bolna.memory.cache.inmemory_scalar_cache import InmemoryScalarCache from .base_synthesizer import 
BaseSynthesizer @@ -14,7 +14,7 @@ class PollySynthesizer(BaseSynthesizer): def __init__(self, voice, language, audio_format="pcm", sampling_rate=8000, stream=False, engine="neural", - buffer_size=400, speaking_rate = "100%", volume = "0dB", caching= True, **kwargs): + buffer_size=400, speaking_rate="100%", volume="0dB", caching=True, **kwargs): super().__init__(stream, buffer_size) self.engine = engine self.format = self.get_format(audio_format.lower()) diff --git a/bolna/transcriber/bodhi_transcriber.py b/bolna/transcriber/bodhi_transcriber.py index cf1f6c72..3a2921f0 100644 --- a/bolna/transcriber/bodhi_transcriber.py +++ b/bolna/transcriber/bodhi_transcriber.py @@ -2,8 +2,6 @@ from audioop import ulaw2lin import traceback import uuid -import numpy as np -import torch import websockets import os import json @@ -14,9 +12,7 @@ from .base_transcriber import BaseTranscriber from bolna.helpers.logger_config import configure_logger from bolna.helpers.utils import create_ws_data_packet -import ssl -torch.set_num_threads(1) logger = configure_logger(__name__) load_dotenv() diff --git a/bolna/transcriber/deepgram_transcriber.py b/bolna/transcriber/deepgram_transcriber.py index b2ad076b..a54bb9ef 100644 --- a/bolna/transcriber/deepgram_transcriber.py +++ b/bolna/transcriber/deepgram_transcriber.py @@ -1,7 +1,6 @@ import asyncio import traceback import numpy as np -import torch import websockets import os import json @@ -13,8 +12,6 @@ from bolna.helpers.logger_config import configure_logger from bolna.helpers.utils import create_ws_data_packet -torch.set_num_threads(1) - logger = configure_logger(__name__) load_dotenv() diff --git a/bolna/transcriber/whisper_transcriber.py b/bolna/transcriber/whisper_transcriber.py index 68be8fbe..ff1c530e 100644 --- a/bolna/transcriber/whisper_transcriber.py +++ b/bolna/transcriber/whisper_transcriber.py @@ -1,26 +1,19 @@ import asyncio -# from asyncio.base_tasks import tasks import traceback import numpy as np -import torch import 
websockets import os import json import time from .base_transcriber import BaseTranscriber from bolna.helpers.logger_config import configure_logger -from bolna.helpers.utils import create_ws_data_packet, int2float -from bolna.helpers.vad import VAD +from bolna.helpers.utils import create_ws_data_packet from audioop import ulaw2lin, ratecv -import json -import os -import time from queue import Queue from websockets.exceptions import * import uvloop asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) -torch.set_num_threads(1) logger = configure_logger(__name__) diff --git a/requirements.txt b/requirements.txt index 2c18dae0..afe33995 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ aiobotocore==2.9.0 aiofiles==23.2.1 -aiohttp==3.9.1 +aiohttp==3.9.5 azure-cognitiveservices-speech==1.38.0 daily-python==0.9.1 fastapi==0.108.0 @@ -10,20 +10,12 @@ numpy==1.26.1 openai>=1.10.0 pydantic==2.5.3 pydub==0.25.1 -python-dateutil==2.8.2 python-dotenv==1.0.0 redis==5.0.1 requests==2.31.0 -tiktoken>=0.6.0 -torchaudio==2.0.1 twilio==8.9.0 uvicorn==0.22.0 websockets==10.4 onnxruntime>=1.16.3 -scipy==1.11.4 uvloop==0.19.0 -tokenizers==0.15.2 -huggingface-hub==0.20.1 -semantic-router==0.0.46 -sentence-transformers==3.0.1 -optimum[onnxruntime] +semantic-router==0.0.58 From 73002ce150e84ea292c9d2cf10a7d92c0cd6633b Mon Sep 17 00:00:00 2001 From: h3110Fr13nd Date: Mon, 26 Aug 2024 01:37:46 +0530 Subject: [PATCH 2/2] Refactor audio resampling logic using soxr library --- bolna/helpers/utils.py | 19 +++++++++++-------- bolna/synthesizer/base_synthesizer.py | 20 +++++++++++++------- requirements.txt | 8 +++++--- 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/bolna/helpers/utils.py b/bolna/helpers/utils.py index 1a783d9b..f17a13d8 100644 --- a/bolna/helpers/utils.py +++ b/bolna/helpers/utils.py @@ -10,6 +10,8 @@ import wave import numpy as np import aiofiles +import soxr +import soundfile as sf from botocore.exceptions import BotoCoreError, ClientError 
 from aiobotocore.session import AioSession
 from contextlib import AsyncExitStack
 
@@ -356,15 +358,16 @@ def convert_audio_to_wav(audio_bytes, source_format = 'flac'):
 
 
 def resample(audio_bytes, target_sample_rate, format="mp3"):
-    audio_buffer = io.BytesIO(audio_bytes)
-    audio_segment = AudioSegment.from_file(audio_buffer, format=format)
-    orig_sample_rate = audio_segment.frame_rate
-    if orig_sample_rate == target_sample_rate:
-        return audio_bytes
-    logger.info(f"Resampling from {orig_sample_rate} to {target_sample_rate}")
-    resampled_audio = audio_segment.set_frame_rate(target_sample_rate)
+    audio_data, orig_sample_rate = sf.read(io.BytesIO(audio_bytes), dtype="int16")
+    resampler = soxr.resample(audio_data, orig_sample_rate, target_sample_rate, "VHQ")
     audio_buffer = io.BytesIO()
-    resampled_audio.export(audio_buffer, format="wav")
+    audio_segment = AudioSegment(
+        data=resampler.tobytes(),
+        sample_width=2,
+        frame_rate=target_sample_rate,
+        channels=1
+    )
+    audio_segment.export(audio_buffer, format="wav")
     return audio_buffer.getvalue()
 
 
diff --git a/bolna/synthesizer/base_synthesizer.py b/bolna/synthesizer/base_synthesizer.py
index f540541c..b0da2773 100644
--- a/bolna/synthesizer/base_synthesizer.py
+++ b/bolna/synthesizer/base_synthesizer.py
@@ -1,6 +1,9 @@
 import io
 from bolna.helpers.logger_config import configure_logger
 import asyncio
+import numpy as np
+import soxr
+import soundfile as sf
 from pydub import AudioSegment
 
 logger = configure_logger(__name__)
@@ -28,15 +31,18 @@ def synthesize(self, text):
     def get_synthesized_characters(self):
         return 0
 
-    def resample(self, audio_bytes):
-        audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes))
-        audio_segment = audio_segment.set_frame_rate(8000)
-        audio_segment = audio_segment.set_channels(1)
+    def resample(self, audio_bytes, target_sample_rate=8000):
+        audio_data, orig_sample_rate = sf.read(io.BytesIO(audio_bytes), dtype="int16")
+        resampler = soxr.resample(audio_data, orig_sample_rate, target_sample_rate, "VHQ")
         audio_buffer = io.BytesIO()
+        audio_segment = AudioSegment(
+            data=resampler.tobytes(),
+            sample_width=2,
+            frame_rate=target_sample_rate,
+            channels=1
+        )
         audio_segment.export(audio_buffer, format="wav")
-        audio_buffer.seek(0)
-        audio_data = audio_buffer.read()
-        return audio_data
+        return audio_buffer.getvalue()
 
     def get_engine(self):
         return "default"
diff --git a/requirements.txt b/requirements.txt
index afe33995..d3035264 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,15 +7,17 @@ fastapi==0.108.0
 fastembed==0.2.7
 litellm==1.40.20
 numpy==1.26.1
+onnxruntime>=1.16.3
 openai>=1.10.0
 pydantic==2.5.3
 pydub==0.25.1
 python-dotenv==1.0.0
 redis==5.0.1
 requests==2.31.0
+semantic-router==0.0.58
+soundfile==0.12.1
+soxr==0.4.0
 twilio==8.9.0
 uvicorn==0.22.0
-websockets==10.4
-onnxruntime>=1.16.3
 uvloop==0.19.0
-semantic-router==0.0.58
+websockets==10.4