Further optimisations to continuous audio input
paulovcmedeiros committed Feb 29, 2024
1 parent a4c8160 commit 4032241
Showing 3 changed files with 119 additions and 61 deletions.
119 changes: 82 additions & 37 deletions pyrobbot/app/app_page_templates.py
@@ -160,6 +160,7 @@ def render_custom_audio_player(
# Adapted from: <https://discuss.streamlit.io/t/
# how-to-play-an-audio-file-automatically-generated-using-text-to-speech-
# in-streamlit/33201/2>

autoplay = "autoplay" if autoplay else ""
hidden = "hidden" if hidden else ""

@@ -299,7 +300,7 @@ def voice_output(self) -> bool:
"""Return the state of the voice output toggle."""
return st.session_state.get("toggle_voice_output", False)

def play_chime(self, chime_type: str = "correct-answer-tone"):
def play_chime(self, chime_type: str = "correct-answer-tone", parent_element=None):
"""Sound a chime to send notificatons to the user."""
type2filename = {
"correct-answer-tone": "mixkit-correct-answer-tone-2870.wav",
@@ -309,7 +310,9 @@ def play_chime(self, chime_type: str = "correct-answer-tone"):
chime = AudioSegment.from_file(
GeneralDefinitions.APP_DIR / "data" / type2filename[chime_type], format="wav"
)
self.render_custom_audio_player(chime, hidden=True, autoplay=True)
self.render_custom_audio_player(
chime, hidden=True, autoplay=True, parent_element=parent_element
)

def render_chat_input_widgets(self):
"""Render chat inut widgets and return the user's input."""
@@ -356,35 +359,34 @@ def _render_chatbot_page(self): # noqa: PLR0915
chat_msgs_container = st.container(height=600, border=False)
with chat_msgs_container:
status_msg_container = st.empty()
self.render_chat_history()

self.render_chat_input_widgets()

if not self.state.get("chat_started", False):
self.play_chime()

logger.debug("Waiting for user text prompt...")
with status_msg_container:
st.status("Waiting for user text prompt...")
self.play_chime()
st.status(f"{self.chat_obj.assistant_name} is listening...")
logger.debug("Waiting for user text prompt...")
while True:
with contextlib.suppress(queue.Empty):
if prompt := self.parent.text_prompt_queue.get_nowait():
break
with contextlib.suppress(queue.Empty):
if prompt := self.text_prompt_queue.get_nowait():
break
logger.trace("Still waiting for user text prompt...")
time.sleep(0.1)

if prompt:
self.parent.reply_ongoing.set()
logger.opt(colors=True).debug(
"<yellow>Got prompt from user: {}</yellow>", prompt
)
if prompt := prompt.strip():
self.play_chime("option-select")
self.parent.reply_ongoing.set()
logger.opt(colors=True).debug("<yellow>Received prompt: {}</yellow>", prompt)
status_msg_container.empty()

else:
logger.opt(colors=True).debug("<yellow>Received empty prompt</yellow>")
self.parent.reply_ongoing.clear()
logger.opt(colors=True).debug("<yellow>No prompt from user.</yellow>")

with chat_msgs_container:
self.render_chat_history()
# Process user input
if prompt:
time_now = datetime.datetime.now().replace(microsecond=0)
@@ -407,26 +409,35 @@ def _render_chatbot_page(self): # noqa: PLR0915
text_reply_container = st.empty()
audio_reply_container = st.empty()
question_answer_chunks_queue = queue.Queue()
partial_audios_queue = queue.Queue()

def answer_question(prompt):
for chunk in self.chat_obj.answer_question(prompt):
question_answer_chunks_queue.put(chunk.content)
question_answer_chunks_queue.put(None)

# Create separate threads to process text and audio replies
answer_question_thread = threading.Thread(
target=answer_question, args=(prompt,)
target=answer_question,
args=(self.chat_obj, prompt, question_answer_chunks_queue),
)
play_partial_audios_thread = threading.Thread(
target=play_partial_audios,
args=(
partial_audios_queue,
self.render_custom_audio_player,
audio_reply_container,
),
daemon=False,
)
add_script_run_ctx(answer_question_thread)
answer_question_thread.start()
for thread in (answer_question_thread, play_partial_audios_thread):
add_script_run_ctx(thread)
thread.start()

# Render text reply
# Render the reply
chunk = ""
full_response = ""
current_audio = AudioSegment.empty()
full_audio = AudioSegment.silent(duration=0)
text_reply_container.markdown("▌")
while (chunk is not None) or (current_audio is not None):
logger.trace("Waiting for text or audio chunks...")
# Render text
with contextlib.suppress(queue.Empty):
chunk = question_answer_chunks_queue.get_nowait()
if chunk is not None:
@@ -437,25 +448,22 @@ def answer_question(prompt):
# Render audio (if any)
with contextlib.suppress(queue.Empty):
current_audio = self.chat_obj.play_speech_queue.get_nowait()
if current_audio is not None:
full_audio += current_audio.speech
self.render_custom_audio_player(
current_audio.speech,
parent_element=audio_reply_container,
)
audio_reply_container.empty()
self.chat_obj.play_speech_queue.task_done()

text_reply_container.caption(
datetime.datetime.now().replace(microsecond=0)
)
text_reply_container.markdown(full_response)
if current_audio is None:
partial_audios_queue.put(None)
else:
partial_audios_queue.put(current_audio.speech)
full_audio += current_audio.speech

logger.opt(colors=True).debug(
"<yellow>Replied to user prompt '{}': {}</yellow>",
prompt,
full_response,
)
text_reply_container.caption(
datetime.datetime.now().replace(microsecond=0)
)
text_reply_container.markdown(full_response)

self.chat_history.append(
{
@@ -466,6 +474,10 @@ def answer_question(prompt):
}
)

while play_partial_audios_thread.is_alive():
logger.debug("Waiting for partial audios to finish playing...")
time.sleep(0.1)

if full_audio.duration_seconds > 0:
self.render_custom_audio_player(
full_audio,
@@ -491,7 +503,6 @@ def answer_question(prompt):
title_container.header(title, divider="rainbow")

self.parent.reply_ongoing.clear()
self.play_chime()

if not self.parent.reply_ongoing.is_set():
logger.debug("Rerunning the app")
@@ -503,3 +514,37 @@ def render(self):
self.render_cost_estimate_page()
else:
self._render_chatbot_page()


def answer_question(chat_obj, prompt, question_answer_chunks_queue):
"""Get chunks of the reply to the prompt and put them in the queue."""
for chunk in chat_obj.answer_question(prompt):
question_answer_chunks_queue.put(chunk.content)
question_answer_chunks_queue.put(None)


def play_partial_audios(
partial_audios_queue, audio_player_rendering_function, parent_element
):
"""Play queued audio segments."""
logger.debug("Playing partial audios...")
while True:
try:
partial_audio = partial_audios_queue.get()
if partial_audio is None:
partial_audios_queue.task_done()
break

logger.debug("Playing partial audio...")
audio_player_rendering_function(
partial_audio,
parent_element=parent_element,
autoplay=True,
hidden=True,
)
parent_element.empty()
partial_audios_queue.task_done()
except Exception as error: # noqa: BLE001
logger.error(error)
break
logger.debug("Partial audios finished playing.")
38 changes: 22 additions & 16 deletions pyrobbot/app/multipage.py
@@ -237,6 +237,7 @@ def handle_continuous_user_prompt():
else:
logger.debug(
'Page "{}"\'s audio input too short ({} < {} sec). Discarding.',
app_page.title,
concatenated_audio.duration_seconds,
min_audio_duration_seconds,
)
@@ -325,26 +326,31 @@ def render_continuous_audio_input_widget(self):
selected_page = None

def audio_frame_callback(frame):
logger.trace("Received raw audio frame from the stream")
try:
logger.trace("Received raw audio frame from the stream")

if selected_page is None:
logger.trace("No page selected. Discardig audio chunk")
return frame
if selected_page is None:
logger.trace("No page selected. Discardig audio chunk")
return frame

if self.reply_ongoing.is_set():
logger.trace("Reply is ongoing. Discardig audio chunk")
return frame
if self.reply_ongoing.is_set():
logger.trace("Reply is ongoing. Discardig audio chunk")
return frame

if not self.continuous_user_prompt_queue.empty():
logger.trace(
"There are still {} items in the audio input queue. Discardig chunk",
self.continuous_user_prompt_queue.qsize(),
)
return frame
if not self.continuous_user_prompt_queue.empty():
logger.trace(
"Audio input queue not empty {} items). Discardig chunk",
self.continuous_user_prompt_queue.qsize(),
)
return frame
except Exception as error: # noqa: BLE001
logger.opt(exception=True).debug(error)
logger.error(error)
else:
frame_info = {"frame": frame, "page": selected_page}
self.incoming_frame_queue.put(frame_info)
logger.trace("Raw audio frame sent to the processing queue")

frame_info = {"frame": frame, "page": selected_page}
self.incoming_frame_queue.put(frame_info)
logger.trace("Raw audio frame sent to the processing queue")
return frame

add_script_run_ctx(audio_frame_callback)
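
The audio_frame_callback change above wraps the gating checks in a try/except so that a failure can never propagate into the streaming component, and only forwards a frame to incoming_frame_queue when no reply is ongoing and the previous prompt has been consumed. A reduced sketch of that gating logic is shown below; it uses only the standard library, and the frame argument, queues, and event are stand-ins rather than the actual streamlit-webrtc objects:

import queue
import threading

incoming_frame_queue = queue.Queue()
continuous_user_prompt_queue = queue.Queue()
reply_ongoing = threading.Event()


def audio_frame_callback(frame, selected_page=None):
    """Gate incoming audio frames before forwarding them for processing."""
    try:
        if selected_page is None:
            return frame  # no page selected: discard the chunk
        if reply_ongoing.is_set():
            return frame  # assistant is replying: discard the chunk
        if not continuous_user_prompt_queue.empty():
            return frame  # previous prompt not yet consumed: discard the chunk
    except Exception as error:  # never let the callback raise into the stream
        print(f"Error in audio frame callback: {error}")
    else:
        # All checks passed (and no early return): enqueue the frame
        incoming_frame_queue.put({"frame": frame, "page": selected_page})
    return frame

A separate worker thread would then drain incoming_frame_queue, much as the surrounding frame-processing code appears to do.
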
23 changes: 15 additions & 8 deletions pyrobbot/voice_chat.py
@@ -194,6 +194,9 @@ def answer_question(self, question: str):
if sentence_for_tts and not self.reply_only_as_text:
self.tts_conversion_queue.put(sentence_for_tts)

# Signal that the current answer is finished
self.tts_conversion_queue.put(None)

def handle_update_audio_history(self, current_answer_audios_queue: queue.Queue):
"""Handle updating the chat history with the latest reply's audio file path."""
# Merge all AudioSegments in self.current_answer_audios_queue into a single one
@@ -203,9 +206,12 @@ def handle_update_audio_history(self, current_answer_audios_queue: queue.Queue):
try:
new_audio = current_answer_audios_queue.get()
if new_audio is not None:
# Reply not yet finished
merged_audio += new_audio
current_answer_audios_queue.task_done()
continue

# Now the reply has finished
if merged_audio.duration_seconds <= min_audio_duration_seconds:
current_answer_audios_queue.task_done()
continue
@@ -413,26 +419,27 @@ def handle_tts_queue(self, text_queue: queue.Queue):
while not self.exit_chat.is_set():
try:
text = text_queue.get()
if text is None:
# Signal that the current answer is finished
self.current_answer_audios_queue.put(None)
self.play_speech_queue.put(None)
continue

text = text.strip()
if text and not self.interrupt_reply.is_set():
tts = self.tts(text)
logger.debug("Received text '{}' for TTS", text)

# Trigger the TTS conversion
_ = tts.speech

logger.debug("Sending TTS for '{}' to the playing queue", text)
# Keep track of audios for the current answer (for the history db)
self.current_answer_audios_queue.put(tts.speech)

# Dispatch the audio to be played
logger.debug("Sending audio for text '{}' to the playing queue", text)
self.play_speech_queue.put(tts)

if text_queue.empty():
# Signal that the current answer is finished
self.current_answer_audios_queue.put(None)
self.play_speech_queue.put(None)

except Exception as error: # noqa: PERF203, BLE001
except Exception as error: # noqa: BLE001
logger.opt(exception=True).debug(error)
logger.error(error)
finally:
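
handle_update_audio_history and handle_tts_queue above rely on a None sentinel to mark the end of an answer, accumulating pydub AudioSegment pieces until the sentinel arrives and keeping the merged audio only if it is long enough. A minimal sketch of that accumulation step follows; it requires pydub, and the threshold, generated tones, and variable names are illustrative only:

import queue

from pydub import AudioSegment
from pydub.generators import Sine

min_audio_duration_seconds = 0.5

audios_queue = queue.Queue()
# Simulate two partial reply audios followed by the end-of-reply sentinel
audios_queue.put(Sine(440).to_audio_segment(duration=400))  # 0.4 s tone
audios_queue.put(Sine(660).to_audio_segment(duration=400))  # 0.4 s tone
audios_queue.put(None)

merged_audio = AudioSegment.empty()
while True:
    new_audio = audios_queue.get()
    audios_queue.task_done()
    if new_audio is not None:
        merged_audio += new_audio  # reply still streaming: keep accumulating
        continue
    break  # sentinel received: the reply is finished

if merged_audio.duration_seconds > min_audio_duration_seconds:
    print(f"Keeping {merged_audio.duration_seconds:.1f} s of audio for the history")
else:
    print("Merged audio too short; discarding")
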
