Backup of some rudimentary ideas on auto-voice
paulovcmedeiros committed Feb 27, 2024
1 parent c4c41df commit a8d5347
Showing 5 changed files with 62 additions and 50 deletions.
11 changes: 7 additions & 4 deletions pyrobbot/__init__.py
@@ -1,6 +1,5 @@
#!/usr/bin/env python3
"""Unnoficial OpenAI API UI and CLI tool."""
import contextlib
import os
import sys
import tempfile
@@ -47,7 +46,11 @@ class GeneralDefinitions:

# Location info
IPINFO = defaultdict(lambda: "unknown")
with contextlib.suppress(
requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError
):
try:
IPINFO = ipinfo.getHandler().getDetails().all
except (
requests.exceptions.ReadTimeout,
requests.exceptions.ConnectionError,
ipinfo.exceptions.RequestQuotaExceededError,
) as error:
logger.warning("Cannot get current location info. {}", error)
2 changes: 1 addition & 1 deletion pyrobbot/app/.streamlit/config.toml
@@ -4,7 +4,7 @@
gatherUsageStats = false

[runner]
fastReruns = false
fastReruns = true

[server]
runOnSave = true
96 changes: 52 additions & 44 deletions pyrobbot/app/app_page_templates.py
@@ -18,6 +18,7 @@
from PIL import Image
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError
from pydub.playback import play
from streamlit.runtime.scriptrunner import add_script_run_ctx
from streamlit_mic_recorder import mic_recorder
from streamlit_webrtc import RTCConfiguration, WebRtcMode, webrtc_streamer
@@ -70,6 +71,7 @@ def __init__(
self.page_id = str(uuid.uuid4())
self.parent = parent
self.page_number = self.parent.state.get("n_created_pages", 0) + 1
self.reply_ongoing = threading.Event()

chat_number_for_title = f"Chat #{self.page_number}"
if page_title is _RecoveredChat:
@@ -157,24 +159,29 @@ def render_custom_audio_player(
audio: AudioSegment,
parent_element=None,
autoplay: bool = True,
hidden=False,
start_sec: int = 0,
):
"""Autoplay an audio segment in the streamlit app."""
# Adapted from: <https://discuss.streamlit.io/t/
# how-to-play-an-audio-file-automatically-generated-using-text-to-speech-
# in-streamlit/33201/2>
autoplay = "autoplay" if autoplay else ""
hidden = "hidden" if hidden else ""

data = audio.export(format="mp3").read()
b64 = base64.b64encode(data).decode()
md = f"""
<audio controls {autoplay} preload="auto">
<audio controls {autoplay} {hidden} preload="metadata">
<source src="data:audio/mp3;base64,{b64}#{start_sec}" type="audio/mp3">
</audio>
"""
parent_element = parent_element or st
parent_element.markdown(md, unsafe_allow_html=True)
if autoplay:
self.reply_ongoing.set()
time.sleep(audio.duration_seconds)
self.reply_ongoing.clear()
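The player itself is plain HTML injected via st.markdown; the standalone snippet below sketches the trick (an assumption-laden toy: run it with streamlit run and provide a local chime.mp3, which is a made-up filename; the real method additionally sets self.reply_ongoing and sleeps for the clip's duration when autoplaying):

# Standalone sketch of the embedding trick: base64-encode the clip and inject an
# <audio> tag so the browser, not the Python process, handles playback.
import base64

import streamlit as st
from pydub import AudioSegment

audio = AudioSegment.from_file("chime.mp3", format="mp3")  # hypothetical file
b64 = base64.b64encode(audio.export(format="mp3").read()).decode()
st.markdown(
    f"""
    <audio controls autoplay hidden preload="metadata">
        <source src="data:audio/mp3;base64,{b64}" type="audio/mp3">
    </audio>
    """,
    unsafe_allow_html=True,
)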


class ChatBotPage(AppPage):
@@ -217,27 +224,12 @@ def __init__(
self.text_prompt_queue = queue.Queue()
self.possible_speech_chunks_queue = queue.Queue()
self.audio_playing_chunks_queue = queue.Queue()
self.reply_ongoing = threading.Event()
self.incoming_frame_queue = queue.Queue()

self.chat_for_async = self.chat_obj

self.listen_thread = threading.Thread(
target=self.listen,
args=(self.chat_for_async, self.possible_speech_chunks_queue),
daemon=True,
)

self.listen_thread = threading.Thread(target=self.listen, daemon=True)
self.continuous_user_prompt_thread = threading.Thread(
target=self.handle_continuous_user_prompt,
args=(
self.possible_speech_chunks_queue,
self.audio_playing_chunks_queue,
self.continuous_user_prompt_queue,
),
daemon=True,
target=self.handle_continuous_user_prompt, daemon=True
)

# See <https://github.com/streamlit/streamlit/issues/1326#issuecomment-1597918085>
add_script_run_ctx(self.listen_thread)
add_script_run_ctx(self.continuous_user_prompt_thread)
@@ -268,7 +260,7 @@ def audio_frame_callback(frame):

return self.stream_audio_context

def listen(self, chat_obj, possible_speech_chunks_queue):
def listen(self):
"""Listen for speech from the browser."""
# This deque will be employed to keep a moving window of audio chunks to monitor
# voice activity. The length of the deque is calculated such that the concatenated
@@ -279,6 +271,7 @@ def listen(self, chat_obj, possible_speech_chunks_queue):
# changing-session-state-not-reflecting-in-active-python-thread/37683

logger.debug("Listening for speech from the browser...")
chat_obj = self.chat_obj
moving_window_speech_likelihood = 0.0
user_has_been_speaking = False
audio_chunks_moving_window = deque(
@@ -344,39 +337,33 @@ def listen(self, chat_obj, possible_speech_chunks_queue):
>= chat_obj.speech_likelihood_threshold
)
if user_has_been_speaking:
if user_speaking_now:
possible_speech_chunks_queue.put(audio_chunk)
else:
logger.info("User has stopped speaking.")
self.possible_speech_chunks_queue.put(audio_chunk)
if not user_speaking_now:
logger.info("No more voice activity detected")
user_has_been_speaking = False
possible_speech_chunks_queue.put(None)
self.possible_speech_chunks_queue.put(None)
continue
elif user_speaking_now:
logger.info("User has started speaking.")
logger.info("Voice activity detected")
user_has_been_speaking = True
for past_audio_chunk in audio_chunks_moving_window:
possible_speech_chunks_queue.put(past_audio_chunk["audio"])
self.possible_speech_chunks_queue.put(past_audio_chunk["audio"])

except Exception as error: # noqa: BLE001
logger.error(error)
finally:
self.incoming_frame_queue.task_done()

def handle_continuous_user_prompt(
self,
possible_speech_chunks_queue,
audio_playing_chunks_queue,
continuous_user_prompt_queue,
):
def handle_continuous_user_prompt(self):
"""Play audio."""
logger.debug("Handling continuous user prompt...")
while True:
try:
logger.trace("Waiting for new speech chunk...")
new_audio_chunk = possible_speech_chunks_queue.get()
new_audio_chunk = self.possible_speech_chunks_queue.get()
if self.reply_ongoing.is_set():
logger.debug("Reply is ongoing. Discardig audio chunk")
possible_speech_chunks_queue.task_done()
self.possible_speech_chunks_queue.task_done()
continue

logger.trace("Processing new speech chunk")
@@ -385,15 +372,15 @@ def handle_continuous_user_prompt(
# play_audio_queue and send the result to be played
logger.debug("Preparing audio to send as user input...")
concatenated_audio = AudioSegment.empty()
while not audio_playing_chunks_queue.empty():
concatenated_audio += audio_playing_chunks_queue.get()
audio_playing_chunks_queue.task_done()
continuous_user_prompt_queue.put(trim_silence(concatenated_audio))
while not self.audio_playing_chunks_queue.empty():
concatenated_audio += self.audio_playing_chunks_queue.get()
self.audio_playing_chunks_queue.task_done()
self.continuous_user_prompt_queue.put(concatenated_audio)
logger.debug("Done preparing audio to send as user input.")
else:
audio_playing_chunks_queue.put(new_audio_chunk)
self.audio_playing_chunks_queue.put(new_audio_chunk)

possible_speech_chunks_queue.task_done()
self.possible_speech_chunks_queue.task_done()
except Exception as error: # noqa: BLE001
logger.error(error)
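Taken together, listen() acts as the producer and this method as the consumer: speech chunks flow through possible_speech_chunks_queue, with None marking the end of an utterance, and each finished utterance lands in continuous_user_prompt_queue. A self-contained toy version of that handoff (strings stand in for pydub AudioSegment chunks, and the reply_ongoing bookkeeping is left out):

# Toy sketch of the producer/consumer handoff used above.
import queue
import threading

possible_speech_chunks_queue: queue.Queue = queue.Queue()
continuous_user_prompt_queue: queue.Queue = queue.Queue()


def consume_speech_chunks():
    """Accumulate chunks until the None sentinel, then emit one full utterance."""
    buffered = []
    while True:
        chunk = possible_speech_chunks_queue.get()
        if chunk is None:  # the listener signals that the user stopped speaking
            continuous_user_prompt_queue.put("".join(buffered))
            buffered = []
        else:
            buffered.append(chunk)
        possible_speech_chunks_queue.task_done()


threading.Thread(target=consume_speech_chunks, daemon=True).start()

# Producer side: push the chunks of one utterance, then the end-of-utterance marker
for chunk in ("hello ", "assistant"):
    possible_speech_chunks_queue.put(chunk)
possible_speech_chunks_queue.put(None)

print(continuous_user_prompt_queue.get())  # -> "hello assistant"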

@@ -482,6 +469,18 @@ def voice_output(self) -> bool:
"""Return the state of the voice output toggle."""
return st.session_state.get("toggle_voice_output", False)

def play_chime(self, type: str = "correct-answer-tone"):
"""Sound a chime to send notificatons to the user."""
type2filename = {
"correct-answer-tone": "mixkit-correct-answer-tone-2870.wav",
"option-select": "mixkit-interface-option-select-2573.wav",
}

chime = AudioSegment.from_file(
GeneralDefinitions.APP_DIR / "data" / type2filename[type], format="wav"
)
self.render_custom_audio_player(chime, hidden=True, autoplay=True)

def get_chat_input(self):
"""Render chat inut widgets and return the user's input."""
placeholder = (
@@ -508,7 +507,9 @@ def get_chat_input(self):
raise ValueError("The listen thread is not alive")

self.render_continuous_audio_input_widget()
if not self.stream_audio_context.state.playing:
if self.stream_audio_context.state.playing:
self.play_chime()
else:
logger.debug("Waiting for the audio stream to start...")
time.sleep(0.5)
if not text_prompt:
@@ -528,6 +529,7 @@ def get_chat_input(self):

recorded_prompt = ""
if audio.duration_seconds > min_audio_duration_seconds:
# play(audio)
recorded_prompt = self.chat_obj.stt(audio).text
self.text_prompt_queue.put(recorded_prompt)

Expand All @@ -547,7 +549,9 @@ def _render_chatbot_page(self): # noqa: PLR0915
self.get_chat_input()
logger.debug("Waiting for user text prompt...")
try:
prompt = self.text_prompt_queue.get(timeout=2)
prompt = self.text_prompt_queue.get(
timeout=self.chat_obj.inactivity_timeout_seconds
)
self.text_prompt_queue.task_done()
except queue.Empty:
prompt = None
@@ -566,6 +570,7 @@ def _render_chatbot_page(self): # noqa: PLR0915
self.render_chat_history()
# Process user input
if prompt:
self.play_chime("option-select")
time_now = datetime.datetime.now().replace(microsecond=0)
self.state.update({"chat_started": True})
# Display user message in chat message container
@@ -641,8 +646,11 @@ def _render_chatbot_page(self): # noqa: PLR0915
self.sidebar_title = title
title_container.header(title, divider="rainbow")

self.reply_ongoing.clear()
st.rerun()
self.reply_ongoing.clear()

if prompt is None or not self.reply_ongoing.is_set():
logger.debug("Rerunning the app")
st.rerun()

def render(self):
"""Render the app's chatbot or costs page, depending on user choice."""
1 change: 1 addition & 0 deletions pyrobbot/app/multipage.py
@@ -270,6 +270,7 @@ def render(self, **kwargs):
key="toggle_voice_output",
label=speaking_head_in_silhouette,
help="Toggle voice output",
value=True,
)
with right:
# Add button to toggle continuous voice input
2 changes: 1 addition & 1 deletion pyrobbot/chat_configs.py
@@ -187,7 +187,7 @@ class VoiceAssistantConfigs(BaseConfigModel):
)

inactivity_timeout_seconds: int = Field(
default=1,
default=2,
gt=0,
description="How much time user should be inactive "
"for the assistant to stop listening",
