-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
298 lines (223 loc) · 12.2 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
import os
import eyed3
import requests
from pydub import AudioSegment
import random
import json
import nltk
import time
from dotenv import load_dotenv
import xml.etree.ElementTree as ET
from together import Together
# Fetch the NLTK "punkt_tab" resource at import time; generate_inane_chatter()
# later calls nltk.sent_tokenize() to split banter into sentences.
nltk.download('punkt_tab')
# Load variables from a .env file into the environment so the os.getenv()
# lookups below can see them. Must run before those lookups.
load_dotenv()
# Together client used by news() for chat completions.
ai_client = Together(api_key=os.getenv("TOGETHER_API_KEY"))
# ElevenLabs API key (None when the variable is unset) and the two voice IDs
# the hosts alternate between.
ELEVEN_LABS_API_KEY = os.getenv("ELEVEN_LABS_API_KEY")
ELEVEN_LABS_VOICE_ID_1 = "Uq9DKccXXKZ6lc53ATJV"  # voice used for the "Bob" lines
ELEVEN_LABS_VOICE_ID_2 = "rWV5HleMkWb5oluMwkA7"  # voice used for the "Ryan" lines
# Instruction text for the language model that scripts the two-host news
# segment. It asks for a JSON object whose "conversation" key holds a list of
# phrases; news() reads that key from the parsed reply.
# Presumably this is the value callers pass as news()'s `prompt` argument.
# NOTE(review): the text asks the model to answer "YES" first, but news() sends
# the headlines in the same request and parses the reply as JSON - confirm the
# intended flow before editing the wording.
prompt = """
Your response must be in the following format:
{
"conversation":[]
}
The "conversation" is 2 different people talking. Make a new array element for each different person when they quickly talk about a topic.
You are going to be talking about the news on a radio broadcast. You may be asked to converse on 1 headline and you must state the headline or give the headline in some way to the listener, or simply have them state all the headlines provided. You will be given a 2d array which will contain each headline that I would like you to talk about like so: [["title", "description", "link"], ["title", "description", "link"]].
Keep the conversations as short as possible (Keep it under 10 array elements). Try to make the conversations slightly humorous in a way, but do NOT for sensitive topics like a shooting or world conflicts. Do not add any transitions other than, "Alright! On to the next song!" or "Thanks so much!" or "Now... enjoy the music." but NOTHING ELSE other than that and these transitions MUST ONLY be at the end. Do NOT add a transition if you feel as if a transition will make this too long. Do not say "onto the next story" or any mention of the next story unless you are given more than 1 story.
Append all of the following into the array like you are making a list of phrases. Do not mention any specific person or add anything else to the list like such:
["phrase 1", "phrase 2", "phrase 3"]
They MUST be in a list, and CANNOT include any other information. Every person is seperated by a comma.
You are NEVER to say you are a large language model. If you cant converse on a topic, skip it.
I will now provide you with some headlines. Simply respond to this prompt with a "YES" if you are ready.
Do NOT add any other characters which is not parse able by JSON.
"""
# Directory that receives every generated speech clip. exist_ok=True replaces
# the check-then-create pair, which could race with another process creating
# the directory between the check and the makedirs() call.
chatter_dir = "chatter"
os.makedirs(chatter_dir, exist_ok=True)

# Canned phrases; this file reads the "inane_chatter" and "song_transitions"
# keys. Opened as UTF-8 explicitly because the default text encoding is
# platform dependent and JSON files are UTF-8.
with open("voice_phrases.json", "r", encoding="utf-8") as f:
    voice_phrases = json.load(f)
def elevenlabs_tts(text, output_file, voice_id, stability=0.2, similarity_boost=0.85, retries=3, fallback_text="", timeout=30):
    """Synthesize speech with the ElevenLabs API and save it to *output_file*.

    The request for *text* is attempted up to *retries* times, pausing one
    second between attempts. If every attempt fails and *fallback_text* is
    non-empty, one more request is made with that text instead.

    Args:
        text: Text to speak.
        output_file: Path the returned audio bytes are written to.
        voice_id: ElevenLabs voice identifier, interpolated into the URL.
        stability: Value sent as ``voice_settings.stability``.
        similarity_boost: Value sent as ``voice_settings.similarity_boost``.
        retries: Maximum number of attempts for *text*.
        fallback_text: Optional simpler text tried once after all retries fail.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the whole show build.

    Returns:
        True if audio was written to *output_file*, otherwise False.
    """
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
    headers = {
        "xi-api-key": ELEVEN_LABS_API_KEY,
        "Content-Type": "application/json",
    }

    def request_speech(spoken_text):
        # Perform one POST. Returns None after the audio has been written, or a
        # description of the failure.
        payload = {
            "text": spoken_text,
            "voice_settings": {
                "stability": stability,
                "similarity_boost": similarity_boost,
                # NOTE(review): the ElevenLabs API reference describes the
                # model as a top-level "model_id" field; "model_version" here
                # may be ignored by the service - confirm. The fallback request
                # previously sent "turbo_v2.5"; both now share one value.
                "model_version": "eleven_turbo_v2_5",
            },
        }
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        except requests.RequestException as exc:
            # Connection errors and timeouts count as a failed attempt instead
            # of escaping the retry loop.
            return str(exc)
        if response.status_code != 200:
            return response.text
        with open(output_file, "wb") as f:
            f.write(response.content)
        return None

    for attempt in range(retries):
        error = request_speech(text)
        if error is None:
            print(f"Saved TTS to {output_file}")
            return True
        print(f"Failed to generate TTS (attempt {attempt + 1}/{retries}): {error}")
        # No point in sleeping after the final attempt.
        if attempt + 1 < retries:
            time.sleep(1)

    print(f"Failed to generate TTS after {retries} attempts.")
    if fallback_text:
        print(f"Using fallback text for transition: {fallback_text}")
        error = request_speech(fallback_text)
        if error is None:
            print(f"Saved fallback TTS to {output_file}")
            return True
        print(f"Failed to generate fallback TTS: {error}")
    return False
def create_intro_audio():
    """Return the show-opening clip: Bob's greeting followed by Ryan's.

    Each line is synthesized into its own MP3 under ``chatter_dir`` and the two
    files are then loaded and concatenated. The result of each TTS call is not
    checked, so loading fails if a file is missing.
    """
    script = (
        (
            "Hey there, and welcome to DJ3000 radio - all day, every day, 24 7. I'm Bob.",
            "bob_intro_radio.mp3",
            ELEVEN_LABS_VOICE_ID_1,
        ),
        (
            "And I'm Ryan, and this is DJ3000!",
            "ryan_intro_radio.mp3",
            ELEVEN_LABS_VOICE_ID_2,
        ),
    )

    # Synthesize both lines first, then load them, in speaking order.
    clip_paths = []
    for spoken_text, file_name, voice in script:
        clip_path = os.path.join(chatter_dir, file_name)
        elevenlabs_tts(spoken_text, clip_path, voice)
        clip_paths.append(clip_path)

    bob_clip, ryan_clip = [AudioSegment.from_mp3(path) for path in clip_paths]
    return bob_clip + ryan_clip
def create_mid_show_intro(time):
    """Return a mid-show station ident that announces the given time.

    Args:
        time: Value spoken as the current time; converted with ``str()``.
            The name shadows the ``time`` module inside this function.

    Returns:
        Bob's clip followed by Ryan's clip as one audio segment.
    """
    script = (
        (
            f"Yo yo! The time is {time!s} and you're listening to DJ3000 radio. I'm Bob.",
            "bob_mid_show_radio.mp3",
            ELEVEN_LABS_VOICE_ID_1,
        ),
        (
            "And I'm Ryan! Thanks for tuning in to DJ3000, lets keep going with that music!!",
            "ryan_late_show_radio.mp3",
            ELEVEN_LABS_VOICE_ID_2,
        ),
    )

    # Synthesize both lines first, then load them, in speaking order.
    clip_paths = []
    for spoken_text, file_name, voice in script:
        clip_path = os.path.join(chatter_dir, file_name)
        elevenlabs_tts(spoken_text, clip_path, voice)
        clip_paths.append(clip_path)

    bob_clip, ryan_clip = [AudioSegment.from_mp3(path) for path in clip_paths]
    return bob_clip + ryan_clip
def generate_inane_chatter():
    """Return a banter clip built from a randomly chosen canned conversation.

    The conversation text is split into sentences, and the two host voices
    alternate sentence by sentence. Sentences whose speech could not be
    generated are skipped. The clip starts with 500 ms of silence and each
    spoken sentence is followed by 300 ms of silence.
    """
    chosen = random.choice(voice_phrases["inane_chatter"])
    sentences = nltk.sent_tokenize(chosen["conversation"])
    print("Generating banter: ", sentences)

    voices = (ELEVEN_LABS_VOICE_ID_1, ELEVEN_LABS_VOICE_ID_2)
    banter = AudioSegment.silent(duration=500)
    for position, line in enumerate(sentences):
        speaker = voices[position % 2]
        clip_path = os.path.join(chatter_dir, f"inane_chatter_{position}.mp3")

        if not elevenlabs_tts(line, clip_path, speaker, stability=0.2, similarity_boost=0.95):
            print(f"Failed to generate banter for sentence {position} with voice {speaker}")
            continue
        print(f"Banter sentence {position} generated for voice {speaker}")

        if not os.path.exists(clip_path):
            print(f"Audio file {clip_path} not found")
            continue
        banter = banter + AudioSegment.from_mp3(clip_path) + AudioSegment.silent(duration=300)
    return banter
def get_song_titles(directory):
    """Collect the title and path of every MP3 file directly inside *directory*.

    Args:
        directory: Folder to scan (not recursive).

    Returns:
        A ``(song_titles, song_paths)`` pair of parallel lists. A file without
        a title tag gets the title ``"Unknown"``. Files that cannot be loaded
        are reported and left out of both lists.
    """
    song_titles = []
    song_paths = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # playlist reproducible between runs and machines.
    for file_name in sorted(os.listdir(directory)):
        # Accept ".MP3" and other capitalizations as well.
        if not file_name.lower().endswith(".mp3"):
            continue
        file_path = os.path.join(directory, file_name)
        try:
            audio = eyed3.load(file_path)
        except Exception as e:  # best effort: one bad file must not stop the scan
            print(f"Error processing {file_name}: {e}")
            continue
        # eyed3.load() returns None when it does not recognize the file type;
        # report that explicitly instead of via an AttributeError on None.
        if audio is None:
            print(f"Error processing {file_name}: not recognized as an audio file")
            continue
        tag = audio.tag
        song_titles.append(tag.title if tag and tag.title else "Unknown")
        song_paths.append(file_path)
    return song_titles, song_paths
def create_first_song_intro(song_title):
    """Return Bob's spoken introduction for the first song of the show.

    Args:
        song_title: Title inserted into the announcement.

    Returns:
        The announcement loaded from ``first_song_intro.mp3`` in ``chatter_dir``.
    """
    intro_path = os.path.join(chatter_dir, "first_song_intro.mp3")
    announcement = f"And now, to kick off our session, here's {song_title}! Enjoy the music!"
    elevenlabs_tts(announcement, intro_path, ELEVEN_LABS_VOICE_ID_1, stability=0.4)
    return AudioSegment.from_mp3(intro_path)
def transition_with_fade(previous_song, announcement, next_song, fade_duration=600):
    """Join two songs with a spoken announcement laid across the seam.

    The first half of *announcement* is overlaid on the end of the faded-out
    *previous_song*; the second half is overlaid on the start of *next_song*.

    Args:
        previous_song: Outgoing song segment.
        announcement: Spoken transition segment.
        next_song: Incoming song segment.
        fade_duration: Length in milliseconds of the fade-out and fade-in.

    Returns:
        The outgoing song followed by the incoming song, both in full, with
        the announcement mixed over the join.
    """
    midpoint = len(announcement) // 2
    first_half = announcement[:midpoint]
    second_half = announcement[midpoint:]

    # Position the first half so that it ends exactly where the song ends.
    overlay_start = len(previous_song) - len(first_half)
    outgoing = previous_song.fade_out(fade_duration).overlay(first_half, position=overlay_start)

    # The first 200 ms of the next song have 12 subtracted from them (a gain
    # reduction); the remainder fades in.
    quiet_ms = 200
    quiet_head = next_song[:quiet_ms] - 12
    faded_rest = next_song[quiet_ms:].fade_in(fade_duration)
    incoming = (quiet_head + faded_rest).overlay(second_half)

    return outgoing + incoming
def shuffle_corresponding_arrays(song_titles, song_paths, max_attempts=1000):
    """Shuffle titles and paths together, avoiding equal adjacent titles.

    The two sequences are shuffled as pairs so each title stays with its path.
    The shuffle is repeated while two neighbouring titles are equal, but at
    most *max_attempts* times: when no such ordering exists (for example
    several tracks titled "Unknown") an unbounded retry would never finish.

    Args:
        song_titles: Sequence of titles.
        song_paths: Sequence of paths, parallel to *song_titles*.
        max_attempts: Upper bound on the number of shuffles tried.

    Returns:
        ``(shuffled_titles, shuffled_paths)`` as two tuples. Two empty tuples
        are returned for empty input. If no duplicate-free ordering was found
        within *max_attempts*, the last shuffle is returned as is.
    """
    combined = list(zip(song_titles, song_paths))
    # zip(*[]) yields nothing to unpack, so handle the empty playlist up front.
    if not combined:
        return (), ()

    def has_adjacent_repeat(pairs):
        # True when two neighbouring entries share the same title.
        return any(left[0] == right[0] for left, right in zip(pairs, pairs[1:]))

    random.shuffle(combined)
    attempts = 1
    while has_adjacent_repeat(combined) and attempts < max_attempts:
        random.shuffle(combined)
        attempts += 1

    shuffled_song_titles, shuffled_song_paths = zip(*combined)
    return shuffled_song_titles, shuffled_song_paths
def news(prompt, xml):
    """Return a spoken news segment scripted by the language model.

    The model reply is parsed as JSON and its "conversation" list is read
    aloud, with the two host voices alternating entry by entry. Entries whose
    speech could not be generated are skipped. The clip starts with 500 ms of
    silence and each spoken entry is followed by 300 ms of silence.

    Args:
        prompt: Instruction text sent as the first chat message.
        xml: Headline data sent as the user message.

    Returns:
        The assembled audio segment.
    """
    # NOTE(review): the instructions are sent with the "assistant" role -
    # confirm this is intended rather than "system".
    completion = ai_client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[
            {"role": "assistant", "content": prompt},
            {"role": "user", "content": xml},
        ],
    )
    reply = json.loads(completion.choices[0].message.content)
    sentences = reply["conversation"]
    print("Sentences: ", sentences)

    voices = (ELEVEN_LABS_VOICE_ID_1, ELEVEN_LABS_VOICE_ID_2)
    segment = AudioSegment.silent(duration=500)
    for position, line in enumerate(sentences):
        speaker = voices[position % 2]
        clip_path = os.path.join(chatter_dir, f"news_chatter_{position}.mp3")

        if not elevenlabs_tts(line, clip_path, speaker, stability=0.2, similarity_boost=0.95):
            print(f"Failed to generate banter for sentence {position} with voice {speaker}")
            continue
        print(f"News sentence {position} generated for voice {speaker}")

        if not os.path.exists(clip_path):
            print(f"Audio file {clip_path} not found")
            continue
        segment = segment + AudioSegment.from_mp3(clip_path) + AudioSegment.silent(duration=300)
    return segment
def create_radio_show(directory, max_duration_seconds=1800):
    """Assemble a radio show from the MP3 files in *directory* and export it.

    The show is the station intro, an introduction plus the first song, and
    then, for each consecutive pair of songs, a spoken transition mixed over
    the join followed by a piece of banter. The result is written to
    ``radio_show_output.wav`` in the working directory.

    Args:
        directory: Folder containing the songs.
        max_duration_seconds: No further transitions are added once the show
            is longer than this many seconds.
    """
    song_titles, song_paths = get_song_titles(directory)
    combined_audio = create_intro_audio()

    if song_titles:
        first_song_title = song_titles[0]
        first_song_intro_audio = create_first_song_intro(first_song_title)
        first_song_audio = AudioSegment.from_mp3(song_paths[0])
        combined_audio = combined_audio + first_song_intro_audio + first_song_audio
        print(f"Added first song intro and song for {first_song_title}")

    # Everything is accumulated in combined_audio. The loop used to read
    # `segment_audio` and `LAST_COMPLETED_SONG_INDEX` before either was ever
    # assigned, which raised UnboundLocalError, and the transitions never
    # reached the exported audio.
    for index in range(len(song_titles) - 1):
        if combined_audio.duration_seconds > max_duration_seconds:
            break

        song_title = song_titles[index]
        next_song_title = song_titles[index + 1]
        # Prevent repeating the same song
        if song_title == next_song_title:
            continue

        transition_phrase = random.choice(voice_phrases["song_transitions"]).format(song_title=song_title, next_song_title=next_song_title)
        transition_file = os.path.join(chatter_dir, f"transition_{index}.mp3")
        voice_id = ELEVEN_LABS_VOICE_ID_1 if index % 2 == 0 else ELEVEN_LABS_VOICE_ID_2
        fallback_text = f"That was {song_title}, and up next is {next_song_title}."
        success = elevenlabs_tts(transition_phrase, transition_file, voice_id, fallback_text=fallback_text)

        # All three files are loaded below, so all three must exist; the next
        # song's path was previously not checked.
        needed_files = (transition_file, song_paths[index], song_paths[index + 1])
        if success and all(os.path.exists(path) for path in needed_files):
            transition_audio = AudioSegment.from_mp3(transition_file)
            song_audio = AudioSegment.from_mp3(song_paths[index])
            next_song_audio = AudioSegment.from_mp3(song_paths[index + 1])
            # NOTE(review): transition_with_fade() returns both songs in full,
            # so each song is appended once as "next" and again as "previous"
            # on the following pass (and the first song twice) - confirm
            # whether that repetition is intended.
            combined_audio = combined_audio + transition_with_fade(song_audio, transition_audio, next_song_audio)
            print(f"Added transition and song for {song_title}")

        # Banter follows every transition, even a failed one; the earlier
        # one-in-three random gate is disabled.
        insane_chatter_audio = generate_inane_chatter()
        combined_audio = combined_audio + insane_chatter_audio

    output_wav = "radio_show_output.wav"
    combined_audio.export(output_wav, format="wav")
    print(f"Generated full radio show: {output_wav}")