nyunAI · Abhrant · Oct 23, 2024 · Oct 28, 2024 · Nov 1, 2024 · Nov 1, 2024
diff --git a/examples/experimentals/voice_engine/main.py b/examples/experimentals/voice_engine/main.py
@@ -13,9 +13,12 @@
 import requests
 import logging
 import time
+import os
+import re
+import psutil
 
 LOGGER = None
-DEFAULT_CONFIG = "recipe/default.yaml"
+DEFAULT_CONFIG = os.path.join(os.path.dirname(os.path.abspath(__file__)), "recipe", "rpi5.yaml")  # relative to this file, not the cwd
 
 
 def set_logger(*args, **kwargs):
@@ -200,6 +203,11 @@ def parse_args():
 ##################################################
 
 
+class EnvironmentTypes(StrEnum):  # String identifiers for the two environment kinds
+    STT = "stt"  # speech-to-text environment
+    LLM = "llm"  # large-language-model environment
+
+
 @dataclass
 class STTInput:
     environment_config: STTEnvironmentConfig
@@ -356,14 +364,24 @@ def call(self, input: STTInput) -> EngineResponse:
         llm_input = LLMInput.from_stt_response(self.config.llm, stt_response)
         if llm_input.stream:
             # implement stream response handling
+            tts_processing_queue = queue.Queue()
             decoded_streams = queue.Queue()
             stream_queue = queue.Queue()
             stop_event = threading.Event()
+
             decode_thread = threading.Thread(
                 target=decode_stream,
-                args=(stop_event, stream_queue, decoded_streams, True),
+                args=(stop_event, stream_queue, decoded_streams, tts_processing_queue, True),
             )
             decode_thread.start()
+
+            tts_processing_thread = threading.Thread(
+                target=create_tts_wav,
+                args=(stop_event, tts_processing_queue)
+            )
+            tts_processing_thread.start()
+
+
             llm_input.data["stream"] = True
             ttfs = None
             response = call_llm_environment(llm_input)
@@ -379,7 +397,10 @@ def call(self, input: STTInput) -> EngineResponse:
                     stream_queue.put(line)
             tock = time.time()
             stop_event.set()
+
             decode_thread.join()
+            tts_processing_thread.join()
+
             llm_response = LLMResponse(
                 text=decoded_streams_to_text(list(decoded_streams.queue)),
                 streams=list(decoded_streams.queue),
@@ -436,6 +457,7 @@ def initialize_environment(config: EnvironmentConfig):
             + config.get_options().split()
             + config.get_model_option().split()
         )
+
         LOGGER.info(f"Initializing environment with command: {' '.join(cmd)}")
         return subprocess.Popen(cmd)
 
@@ -473,7 +495,9 @@ def decode_stream(
     stop_event: threading.Event,
     stream_queue: queue.Queue,
     decoded_streams: queue.Queue,
+    tts_processing_queue: queue.Queue,
     decode_and_print: bool = False,
+    decode_and_talk: bool = True
 ):
     while not stop_event.is_set() or not stream_queue.empty():
         try:
@@ -486,9 +510,120 @@ def decode_stream(
                 # if decode_and_print:
                 #     # print decoded stream continously with flush
                 #     print(json_response["content"], end="", flush=True)
+                if decode_and_talk:
+                    # Queue the decoded fragment for the TTS worker, which speaks it through Piper
+                    tts_processing_queue.put(json_response["content"])
+
         except queue.Empty:
             pass  # No data to process yet, continue
 
+def create_tts_wav(
+    stop_event: threading.Event,
+    tts_processing_queue: queue.Queue,
+    output_dir: str = "/home/piuser/voice/core/test-output"
+):
+    """Feed streamed LLM text to Piper and serve the speech through FFmpeg.
+
+    Fragments from ``tts_processing_queue`` are accumulated; complete sentences
+    are written to Piper's stdin, and Piper's raw PCM output is piped into
+    FFmpeg, which serves it as AAC on ``http://0.0.0.0:8090/feed.aac``. Text
+    still buffered when the stream ends is spoken as well.
+
+    Args:
+        stop_event: Set by the caller when no more text is expected; the worker
+            returns once it is set and the queue is drained.
+        tts_processing_queue: Queue of decoded text fragments (``str``).
+        output_dir: Directory created on start-up; nothing is written to it
+            while the WAV output is disabled.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+
+    piper_process = subprocess.Popen(
+        ["piper", "--model", "en_US-lessac-medium", "--length-scale", "1.5", "--output_raw"],
+        stdin=subprocess.PIPE,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        universal_newlines=True,
+    )
+    piper_proc = psutil.Process(piper_process.pid)
+
+    # FFmpeg encodes Piper's raw PCM and serves it over HTTP
+    ffmpeg_command = [
+        "ffmpeg",
+        "-f", "s16le",  # Input format (16-bit PCM, little-endian)
+        "-ar", "22050",  # Sample rate
+        "-ac", "1",  # Number of audio channels (mono)
+        "-i", "-",  # Input from stdin (output from Piper)
+        "-acodec", "aac",  # Audio codec (AAC)
+        "-ab", "128k",  # Audio bitrate
+        "-f", "adts",  # Output format
+        "-content_type", "audio/aac",  # Content type for the HTTP stream
+        "-listen", "1",  # Make FFmpeg act as a server
+        "http://0.0.0.0:8090/feed.aac",  # Output URL
+        "-acodec", "pcm_s16le",  # NOTE: trailing option; no WAV output follows it
+    ]
+    ffmpeg_process = subprocess.Popen(
+        ffmpeg_command,
+        stdin=piper_process.stdout,  # Take input from Piper process
+        # Nothing reads FFmpeg's output; an unread PIPE could fill up and stall it
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
+    # Drop the parent's copy of the pipe so Piper sees a broken pipe if FFmpeg exits
+    piper_process.stdout.close()
+
+    def speak(text: str) -> None:
+        print("Sending ---->", text)
+        piper_process.stdin.write(f"{text}\n")
+        piper_process.stdin.flush()
+
+    buffer = ""
+    try:
+        while True:
+            try:
+                # Block briefly instead of spinning; every fragment, including
+                # the last one, has to be consumed for the loop to finish.
+                buffer += tts_processing_queue.get(timeout=0.05)
+            except queue.Empty:
+                if stop_event.is_set():
+                    break
+                continue
+            print("BUFFER : ", buffer)
+            if any(delimiter in buffer for delimiter in ".!?:;"):
+                sentences = re.split(r'(?<=[.!?])\s+', buffer)
+                # Hold back the last piece only when it is an unfinished sentence
+                keep_tail = len(sentences) > 1 and not re.search(r'[.!?]$', buffer)
+                buffer = sentences.pop() if keep_tail else ""
+                text = " ".join(sentences)
+                if text:
+                    speak(text)
+        # The stream may end without a closing delimiter: speak the rest too
+        if buffer.strip():
+            speak(buffer)
+    except BrokenPipeError:
+        LOGGER.error("BrokenPipeError: Piper process terminated unexpectedly.")
+    finally:
+        # NOTE: this samples the RSS at shutdown, which is not a true peak
+        try:
+            peak_memory = piper_proc.memory_info().rss / (1024 * 1024)  # Convert to MB
+            print(f"Peak memory usage of Piper process: {peak_memory:.2f} MB")
+        except psutil.NoSuchProcess:
+            LOGGER.error("Piper process not found for memory measurement.")
+
+        # Closing stdin lets Piper finish its pending input and exit
+        if piper_process.poll() is None:
+            piper_process.stdin.close()
+            piper_process.wait()
+        # Stop FFmpeg even when Piper has already died, so it is never leaked
+        if ffmpeg_process.poll() is None:
+            ffmpeg_process.terminate()
+        ffmpeg_process.wait()
+
+        # Print any errors from Piper
+        stderr_output = piper_process.stderr.read()
+        if stderr_output:
+            print("Piper stderr:", stderr_output)
+
 
 def decoded_streams_to_text(decoded_streams: tp.List[tp.Dict[str, tp.Any]]) -> str:
     return " ".join([stream["content"] for stream in decoded_streams])
@@ -539,4 +674,4 @@ def print_dict(d: dict, indent: int = 0):
             print(f"-" * 50)
     except Exception as e:
         engine.terminate()
-        raise e
+        raise e