transcribe_and_ollama.py
import os
import subprocess
import mimetypes
import whisper
import yaml
import logging
import requests
import warnings

# Suppress FutureWarnings and UserWarnings to reduce the noise
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Set up logging
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("echomind.log"),
        logging.StreamHandler()  # Prints to the console (stderr by default)
    ]
)
# Step 1: Load config.yml for Ollama API details
def load_config(config_file):
    logging.debug(f"Loading config from {config_file}")
    try:
        with open(config_file, 'r') as file:
            config = yaml.safe_load(file)
        logging.debug(f"Config loaded successfully: {config}")
        return config
    except Exception as e:
        logging.error(f"Failed to load config: {e}")
        raise
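
# A minimal config.yml for this script might look like the following. The keys
# below are the ones read in main(); the values are illustrative assumptions
# (Ollama's generate endpoint typically listens at /api/generate on port 11434):
#
#   model_name: "smollm:1.7b"
#   ollama_api_url: "http://localhost:11434/api/generate"
#   ollama_api_token: "replace-with-your-token"
#   whisper_model: "base"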

# Step 2: Extract audio using ffmpeg (only if input is a video file)
def extract_audio(video_file, audio_file):
    logging.debug(f"Extracting audio from video file: {video_file}")
    try:
        # Pass arguments as a list (no shell) so paths containing spaces or
        # shell metacharacters are handled safely
        command = ["ffmpeg", "-i", video_file, "-q:a", "0", "-map", "a", audio_file, "-y"]
        result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        logging.info(f"Audio extracted to {audio_file}")
        logging.debug(f"ffmpeg output: {result.stdout}")
    except subprocess.CalledProcessError as e:
        logging.error(f"Failed to extract audio: {e.stderr}")
        raise
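
# For reference, the ffmpeg flags used above: -q:a 0 selects the encoder's
# highest variable-bitrate audio quality, -map a picks only the audio
# stream(s), and -y overwrites the output file if it already exists.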

# Step 3: Transcribe audio using Whisper, with optional model size
def transcribe_audio(audio_file, transcription_file, whisper_model="base"):
    logging.debug(f"Transcribing audio from file: {audio_file} using model: {whisper_model}")
    try:
        model = whisper.load_model(whisper_model)  # Load the specified Whisper model
        logging.debug(f"Whisper model '{whisper_model}' loaded successfully")
        result = model.transcribe(audio_file)
        logging.info(f"Transcription completed. Saving to {transcription_file}")
        with open(transcription_file, 'w', encoding='utf-8') as f:
            f.write(result['text'])
        logging.debug(f"Transcription saved: {result['text'][:100]}...")  # Log first 100 characters
        return result['text']
    except Exception as e:
        logging.error(f"Failed to transcribe audio: {e}")
        raise
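
# Note: openai-whisper ships several model sizes ("tiny", "base", "small",
# "medium", "large"); larger models are more accurate but slower and need
# more memory. The default here stays "base" unless overridden via the
# config file, the WHISPER_MODEL environment variable, or the fourth CLI
# argument (see the __main__ block at the bottom).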

# Step 4: Send transcription and prompt to Ollama LLM using the requests library
def send_to_ollama(model_name, prompt_file, transcription, ollama_api_url, ollama_api_token):
    logging.debug(f"Preparing request to Ollama LLM using the '{model_name}' model at '{ollama_api_url}'")
    try:
        # Read the prompt from the provided file
        with open(prompt_file, 'r', encoding='utf-8') as file:
            prompt = file.read()
        logging.debug(f"Prompt loaded: {prompt[:100]}...")  # Log first 100 characters

        # Combine the prompt and transcription
        combined_prompt = prompt + "\n\n" + transcription

        # Set up the headers (the token is sent but deliberately not logged)
        headers = {
            "Authorization": f"Bearer {ollama_api_token}",
            "Content-Type": "application/json"
        }

        # Create the payload (data) for the request
        data = {
            "model": model_name,
            "prompt": combined_prompt,
            "stream": False
        }

        # Perform the POST request to the Ollama server
        response = requests.post(ollama_api_url, headers=headers, json=data)

        # Check for errors in the response
        if response.status_code == 200:
            logging.info(f"Received response from Ollama: {response.json()}")
            return response.json().get('response', 'No text found in response')
        else:
            logging.error(f"Failed to send request to Ollama LLM: {response.status_code} - {response.text}")
            raise Exception(f"Request failed: {response.status_code} - {response.text}")
    except Exception as e:
        logging.error(f"Failed to send request to Ollama LLM: {e}")
        raise
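
# Assuming ollama_api_url points at Ollama's /api/generate endpoint: with
# "stream": False the server returns one JSON object, roughly of the form
#
#   {"model": "smollm:1.7b", "response": "<generated text>", "done": true}
#
# Only the "response" field is extracted above; the full body is logged.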

# Helper function: Determine if the file is video or audio based on MIME type
def is_video_file(file_path):
    logging.debug(f"Checking if {file_path} is a video file")
    mime_type, _ = mimetypes.guess_type(file_path)
    if mime_type and mime_type.startswith('video'):
        logging.debug(f"{file_path} is a video file")
        return True
    else:
        logging.debug(f"{file_path} is not a video file")
        return False
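
# mimetypes.guess_type infers the type from the file extension alone (e.g.
# .mp4 -> video/mp4, .mp3 -> audio/mpeg), so a misnamed file will be
# misclassified; anything not recognized as video is treated as audio.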

# Step 5: Main function to handle everything
def main(input_file, prompt_file, config_file, whisper_model=None):
    logging.info(f"Starting processing for {input_file} {prompt_file} {config_file}")

    # Step 1: Load config
    config = load_config(config_file)

    # Get the base filename without the extension
    base_filename = os.path.splitext(os.path.basename(input_file))[0]

    # Define file names based on the input filename
    audio_file = f"{base_filename}_audio.mp3"
    transcription_file = f"{base_filename}_transcription.txt"
    output_file = f"{base_filename}_llm_output.txt"

    model_name = config.get('model_name', 'smollm:1.7b')  # Example default; replace with the model you use
    ollama_api_token = config.get('ollama_api_token')
    ollama_api_url = config.get('ollama_api_url')

    # Get the Whisper model from config, or override via CLI
    whisper_model = whisper_model or config.get('whisper_model', 'base')  # Default to "base"

    # Step 2: Determine whether the input file is a video or an audio file
    if is_video_file(input_file):
        logging.info(f"{input_file} is a video file. Extracting audio...")
        extract_audio(input_file, audio_file)  # Extract audio from video
    else:
        logging.info(f"{input_file} is an audio file. Skipping audio extraction...")
        audio_file = input_file  # Use the audio file directly

    # Step 3: Transcribe audio with the specified Whisper model
    transcription = transcribe_audio(audio_file, transcription_file, whisper_model)

    # Step 4: Send transcription and prompt to Ollama LLM
    llm_output = send_to_ollama(model_name, prompt_file, transcription, ollama_api_url, ollama_api_token)

    # Step 5: Save LLM output to a file
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(llm_output)
        logging.info(f"LLM output saved to {output_file}")
    except Exception as e:
        logging.error(f"Failed to save LLM output: {e}")
        raise

if __name__ == "__main__":
    import sys

    # Check for environment variables first, then fall back to command-line arguments
    input_file = os.getenv("INPUT_FILE", sys.argv[1] if len(sys.argv) > 1 else "file.mp4")
    prompt_file = os.getenv("PROMPT_FILE", sys.argv[2] if len(sys.argv) > 2 else "prompt.txt")
    config_file = os.getenv("CONFIG_FILE", sys.argv[3] if len(sys.argv) > 3 else "config.yml")
    whisper_model = os.getenv("WHISPER_MODEL", sys.argv[4] if len(sys.argv) > 4 else None)  # Optional override

    main(input_file, prompt_file, config_file, whisper_model)
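
# Example invocations (file names here are placeholders):
#
#   python transcribe_and_ollama.py meeting.mp4 prompt.txt config.yml small
#
# or, since environment variables take precedence over positional arguments:
#
#   INPUT_FILE=meeting.mp3 PROMPT_FILE=prompt.txt CONFIG_FILE=config.yml \
#       python transcribe_and_ollama.py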