Problem with threads and urllib #161
-
This is sort of a weird bug, and one I haven't particularly discovered it's kind for a while. I have 2 files, import time
import pyaudio
import threading
import wave
from datetime import datetime
from new_STT_exp import STTHandler
CHUNK_SIZE = 4096
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000 # Amount of bits per second
p = pyaudio.PyAudio()
ONE_SECONDS_WORTH = RATE
class AudioRecordingHandler:
"""
A class that handles the recording of audio from the microphone, compiling it as a single file (instead of frames).\n
It also sends the audio data to the Speech to Text API (Deepgram) and receives text data back.
"""
def __init__(self) -> None:
self.stream: pyaudio.Stream | None = None
self.frames: list[bytes] = [] # Each frame should be ~1 second of audio
def save_one_second(self) -> None:
"""
Runs in another thread and regularly polls the audio stream in from the microphone, \n
appends it to it's list, and sends it to the speech to text API.
"""
while True:
if self.stream is None:
return
start_time = datetime.now()
try:
frame = self.stream.read(ONE_SECONDS_WORTH)
self.frames.append(frame)
except OSError: # OSError: [Errno -9981] Input overflowed
return
end_time = datetime.now()
time_taken = (end_time - start_time).total_seconds()
if time_taken < 1:
time.sleep(1 - time_taken)
def start_recording(self) -> None:
"""Starts capturing audio data from the microphone and also starts a new thread to regularly save the audio data"""
stream: pyaudio._Stream = p.open(
format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK_SIZE,
)
self.stream = stream
thread = threading.Thread(target=self.save_one_second)
thread.start()
def stop_recording(self) -> str:
"""Stops capturing audio data from the microphone and returns the complete audio transcript as a string."""
time.sleep(0.5) # This is to make sure the chunks don't get truncated early
self.stream.stop_stream(); self.stream.close(); self.stream = None # type: ignore
complete_recording= b"".join(self.frames)
text = STTHandler(complete_recording, True).transcribe()
print(f"Stop recording function is returning: {text=}")
return text
if __name__ == "__main__":
audio_handler = AudioRecordingHandler()
#print("Start recording!")
audio_handler.start_recording()
time.sleep(3)
print("Slept 3 seconds, stop recording!")
text = audio_handler.stop_recording()
print(f"{text=}") And also, the from deepgram import Deepgram
from deepgram._types import PrerecordedOptions, BufferSource
with open("keys/deepgram_key.txt", "r") as f:
DEEPGRAM_API_KEY = f.read()
API_SETTINGS = PrerecordedOptions(punctuate=True, model='nova')
class STTHandler:
def __init__(self, audio_data: bytes) -> None:
self.deepgram = Deepgram(DEEPGRAM_API_KEY)
self.source: BufferSource = {'buffer': audio_data, 'mimetype': "audio/mp3"}
def transcribe(self) -> str:
print("Start, pre transcription")
response = self.deepgram.transcription.sync_prerecorded(self.source, option=API_SETTINGS) # type: ignore , keywords=['first:5', 'second']
raw_text = response["results"]['channels'][0]['alternatives'][0]['transcript'] # type: ignore
print(f"Transcription: {raw_text}")
return raw_text
if __name__ == "__main__":
with open("assets/audio/recent_user_speech.mp3", 'rb') as file:
audio_bytes = file.read()
text = STTHandler(audio_bytes, True).transcribe()
print(text) So what's my problem? My question is A) How can I fix this, and B) Is there anything the library can do to protect against this? Many thanks in advance for the help! N.b. I'm trying to make a script which will capture X seconds of data, then transcribe them using Deepgram, then return the response. |
Beta Was this translation helpful? Give feedback.
Replies: 3 comments 4 replies
-
@lukeocodes @jjmaldonis |
Beta Was this translation helpful? Give feedback.
-
Hey @UP929312, I was able to reproduce the issue. I also found out that the issue is (for better or worse) not related to the call to Deepgram's API. If I replace the API call to Deepgram's API with Please see below for the exact code I am using to reproduce this. Specifically, see the from deepgram._types import PrerecordedOptions, BufferSource
from deepgram import Deepgram
from datetime import datetime
import threading
import pyaudio
import time
import os
DEEPGRAM_API_KEY = os.environ["DEEPGRAM_API_KEY"] # Your Deepgram API Key
API_SETTINGS = PrerecordedOptions(punctuate=True, model="nova")
class STTHandler:
def __init__(self, audio_data: bytes) -> None:
self.deepgram = Deepgram(DEEPGRAM_API_KEY)
self.source: BufferSource = {"buffer": audio_data, "mimetype": "audio/mp3"}
def transcribe(self) -> str:
try:
print("Start, pre transcription")
time.sleep(2) # COMMENT THIS LINE OUT TO SEE THE PROGRAM COMPLETE SUCCESSFULLY
# response = self.deepgram.transcription.sync_prerecorded(self.source, option=API_SETTINGS) # type: ignore , keywords=['first:5', 'second']
# raw_text = response["results"]["channels"][0]["alternatives"][0]["transcript"] # type: ignore
raw_text = ""
print(f"Transcription: {raw_text}")
return raw_text
except Exception as e:
print(e)
return ""
CHUNK_SIZE = 4096
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000 # Amount of bits per second
p = pyaudio.PyAudio()
ONE_SECONDS_WORTH = RATE
class AudioRecordingHandler:
"""
A class that handles the recording of audio from the microphone, compiling it as a single file (instead of frames).\n
It also sends the audio data to the Speech to Text API (Deepgram) and receives text data back.
"""
def __init__(self) -> None:
self.stream: pyaudio.Stream | None = None
self.frames: list[bytes] = [] # Each frame should be ~1 second of audio
self._STOP_STREAM = False
def save_one_second(self) -> None:
"""
Runs in another thread and regularly polls the audio stream in from the microphone, \n
appends it to it's list, and sends it to the speech to text API.
"""
START = datetime.now()
max_recording_duration = 5.0 # in seconds
while True and (datetime.now() - START).seconds < max_recording_duration:
if self.stream is None:
print("Stream was stopped, exiting recording")
return
start_time = datetime.now()
try:
print("Talk to me!")
frame = self.stream.read(ONE_SECONDS_WORTH)
self.frames.append(frame)
except OSError as e: # OSError: [Errno -9981] Input overflowed
print(f"OSError: {e}")
return
end_time = datetime.now()
time_taken = (end_time - start_time).total_seconds()
if time_taken < 1:
time.sleep(1 - time_taken)
def start_recording(self) -> None:
"""Starts capturing audio data from the microphone and also starts a new thread to regularly save the audio data"""
stream: pyaudio._Stream = p.open(
format=FORMAT,
channels=CHANNELS,
rate=RATE,
input=True,
frames_per_buffer=CHUNK_SIZE,
)
self.stream = stream
thread = threading.Thread(target=self.save_one_second)
thread.start()
# self.save_one_second() # Use this line when the two `thread` lines are commented out
def stop_recording(self) -> str:
"""Stops capturing audio data from the microphone and returns the complete audio transcript as a string."""
time.sleep(0.5) # This is to make sure the chunks don't get truncated early
self.stream.stop_stream()
self.stream.close()
self.stream = None # type: ignore
print("STREAM IS CLOSED")
complete_recording = b"".join(self.frames)
print(f"Recording is {len(complete_recording)} bytes")
text = STTHandler(complete_recording).transcribe()
print(f"Stop recording function is returning: {text=}")
return text
if __name__ == "__main__":
audio_handler = AudioRecordingHandler()
# print("Start recording!")
audio_handler.start_recording()
time.sleep(3)
print("Slept 3 seconds, stop recording!")
text = audio_handler.stop_recording()
print(f"{text=}") |
Beta Was this translation helpful? Give feedback.
-
The answer was a threading issue, solved using: |
Beta Was this translation helpful? Give feedback.
The answer was a threading issue, solved using:
https://people.csail.mit.edu/hubert/pyaudio/docs/#example-callback-mode-audio-i-o