also save audio transcriptions as files

h2oai · Sep 19, 2024 · c2fb6c3 · c2fb6c3
1 parent 7ff6764
commit c2fb6c3
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 3 deletions.
diff --git a/openai_server/agent_prompting.py b/openai_server/agent_prompting.py
@@ -504,7 +504,6 @@ def get_mermaid_renderer_helper():
 def get_image_generation_helper():
     imagegen_url = os.getenv("IMAGEGEN_OPENAI_BASE_URL", None)
     if imagegen_url:
-        # TODO: When available, get the model from the url
         if not os.getenv("IMAGEGEN_OPENAI_MODEL"):
             os.environ["IMAGEGEN_OPENAI_MODEL"] = "flux.1-schnell"
 
@@ -531,7 +530,10 @@ def get_image_generation_helper():
 def get_audio_transcription_helper():
     stt_url = os.getenv("STT_OPENAI_BASE_URL", None)
     if stt_url:
+        if not os.getenv("STT_OPENAI_MODEL"):
+            os.environ["STT_OPENAI_MODEL"] = "whisper-1"
         cwd = os.path.abspath(os.getcwd())
+        base_path = os.getenv("H2OGPT_OPENAI_BASE_FILE_PATH", "./openai_files/")
         audio_transcription = f"""\n* Audio transcription using python. Use for transcribing audio files to text.
     * For an audio transcription, you are recommended to use the existing pre-built python code, E.g.:
     ```sh
@@ -541,6 +543,7 @@ def get_audio_transcription_helper():
     ```
     * usage: python {cwd}/openai_server/agent_tools/audio_transcription.py [-h] --file_path FILE_PATH
     * If you make an audio transcription, ensure you use python or shell code properly to generate the text file.
+    * By default the text file will be saved in the base directory: {base_path}, you can read the text file from there.
     """
     else:
         audio_transcription = (

diff --git a/openai_server/agent_tools/audio_transcription.py b/openai_server/agent_tools/audio_transcription.py
@@ -5,7 +5,8 @@
 
 def main():
     parser = argparse.ArgumentParser(description="Get transcription of an audio file")
-    parser.add_argument("--model", type=str, default="whisper-1", help="Model name")
+    # Model
+    parser.add_argument("--model", type=str, required=False, help="Model name")
     # File name
     parser.add_argument("--file_path", type=str, required=True, help="Path to the audio file")
     args = parser.parse_args()
@@ -14,14 +15,30 @@ def main():
     assert stt_url is not None, "STT_OPENAI_BASE_URL environment variable is not set"
     stt_api_key = os.getenv('STT_OPENAI_API_KEY', 'EMPTY')
 
+    if not args.model:
+        stt_model = os.getenv('STT_OPENAI_MODEL')
+        assert stt_model is not None, "STT_OPENAI_MODEL environment variable is not set"
+        args.model = stt_model
+
     # Read the audio file
     audio_file = open(args.file_path, "rb")
     client = OpenAI(base_url=stt_url, api_key=stt_api_key)
     transcription = client.audio.transcriptions.create(
     model=args.model, 
     file=audio_file
     )
-    print(f"Audio file successfully transcribed: '{transcription.text}'")
+    # Save the transcription to a file
+    base_path = os.getenv("H2OGPT_OPENAI_BASE_FILE_PATH", "./openai_files/")
+    # Create the directory if it doesn't exist
+    if not os.path.exists(base_path):
+        os.makedirs(base_path)
+    # Get full path with base_path and audio file name. Note file_path includes the full path and the audio name at the end.
+    full_path = os.path.join(base_path, os.path.basename(args.file_path) + ".txt")
+    # Write the transcription to a file
+    with open(full_path, "w") as txt_file:
+        txt_file.write(transcription.text)
+    print(f"Transcription successfully saved to the path: {full_path}")
+    print(f"Audio file successfully transcribed as: '{transcription.text}'")
 
 
 if __name__ == "__main__":