"""direct_openai_test.py

Standalone test of the OpenAI TTS and Whisper APIs: generates speech,
transcribes it with word-level timestamps, and writes a cache file in
the word-boundary format used by manim-voiceover.
"""
import json
from pathlib import Path
from dotenv import load_dotenv
import openai
# Load environment variables from .env file
load_dotenv()
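# The openai SDK reads OPENAI_API_KEY from the environment, so the .env
# file is expected to define it (no api_key is passed explicitly below).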
# Create a temporary directory for audio files
temp_dir = Path("./temp_direct_test")
temp_dir.mkdir(exist_ok=True)
# Constants for audio offset resolution (same as in manim-voiceover)
AUDIO_OFFSET_RESOLUTION = 1000 # 1000 = milliseconds
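# e.g. a word starting at 1.234 s becomes an audio_offset of 1234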
print("=== Direct OpenAI API Test ===")
# First, generate speech using OpenAI TTS
print("\nGenerating speech from text...")
text = "This is a test of the cloud-based Whisper feature."
response = openai.audio.speech.create(
    model="tts-1",
    voice="alloy",
    input=text,
)
audio_path = temp_dir / "direct_test.mp3"
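# Note: stream_to_file() is deprecated in openai>=1.0 in favor of the
# with_streaming_response API, but it still works for a one-off test.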
response.stream_to_file(str(audio_path))
print(f"Speech generated and saved to {audio_path}")
# Now, transcribe the audio using the OpenAI Whisper API
print("\nTranscribing audio with word-level timestamps...")
with open(audio_path, "rb") as audio_file:
    transcription = openai.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
        response_format="verbose_json",
        timestamp_granularities=["word"],
    )
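# Word-level timestamps are only returned when response_format is
# "verbose_json" and timestamp_granularities includes "word".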
# Print the raw response structure
print("\nRaw API Response Structure:")
print(f"Response type: {type(transcription)}")
print(f"Response attributes: {dir(transcription)}")
print(f"Has 'words' attribute: {hasattr(transcription, 'words')}")
if hasattr(transcription, 'words'):
    print(f"Words type: {type(transcription.words)}")
    print(f"Words count: {len(transcription.words)}")
    # Try to access the first word
    if len(transcription.words) > 0:
        first_word = transcription.words[0]
        print(f"First word type: {type(first_word)}")
        print(f"First word attributes: {dir(first_word)}")
        print(f"First word: {first_word.word if hasattr(first_word, 'word') else 'No word attribute'}")
        print(f"First word start: {first_word.start if hasattr(first_word, 'start') else 'No start attribute'}")
# Convert to word boundaries format used by manim-voiceover
print("\nConverting to word boundaries format...")
word_boundaries = []
current_text_offset = 0
if hasattr(transcription, 'words'):
    for word_obj in transcription.words:
        try:
            word = word_obj.word
            start_time = word_obj.start
            # Create a word boundary entry
            word_boundary = {
                "audio_offset": int(start_time * AUDIO_OFFSET_RESOLUTION),
                "text_offset": current_text_offset,
                "word_length": len(word),
                "text": word,
                "boundary_type": "Word",
            }
            word_boundaries.append(word_boundary)
            current_text_offset += len(word) + 1  # +1 for the trailing space
            print(f"Added word boundary: {word} at {start_time}s")
        except Exception as e:
            print(f"Error processing word: {e}")
print(f"\nCreated {len(word_boundaries)} word boundaries")
# Create a cache file that manim-voiceover can use
cache_data = {
    "input_text": text,
    "input_data": {"input_text": text, "service": "openai"},
    "original_audio": audio_path.name,
    "word_boundaries": word_boundaries,
    "transcribed_text": transcription.text,
    "final_audio": audio_path.name,
}
cache_file = temp_dir / "cache.json"
with open(cache_file, "w") as f:
    json.dump([cache_data], f, indent=2)
print(f"\nCreated cache file at {cache_file}")
print("\nTest completed!")