diff --git a/requirements.txt b/requirements.txt
index 3eb8e84..38c5ecd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,4 +17,5 @@ bitsandbytes
 peft
 accelerate
 huggingface_hub[hf_transfer]
-huggingface-hub
\ No newline at end of file
+huggingface-hub
+whisperx
\ No newline at end of file
diff --git a/servers/whisperx/download_mp3.py b/servers/whisperx/download_mp3.py
index 9d3a2f6..cd604c8 100644
--- a/servers/whisperx/download_mp3.py
+++ b/servers/whisperx/download_mp3.py
@@ -1,6 +1,7 @@
 import os
 from pytube import YouTube
 
+
 def download_youtube_audio(video_url, output_folder):
     try:
         yt = YouTube(video_url)
@@ -8,7 +9,7 @@ def download_youtube_audio(video_url, output_folder):
         if audio_stream:
             output_file = audio_stream.download(output_path=output_folder)
             # Rename the file to have the .mp3 extension
-            mp3_file = output_file.split('.')[0] + '.mp3'
+            mp3_file = os.path.splitext(output_file)[0] + ".mp3"
             os.rename(output_file, mp3_file)
             print("Audio downloaded successfully:", mp3_file)
         else:
@@ -16,6 +17,7 @@ def download_youtube_audio(video_url, output_folder):
     except Exception as e:
         print("Error:", e)
 
+
 # Example usage
 video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # Example YouTube video URL
 output_folder = "youtube_downloads"  # Output folder where the MP3 will be saved
diff --git a/servers/whisperx/example_api.py b/servers/whisperx/example_api.py
index cf8643e..eec2f0c 100644
--- a/servers/whisperx/example_api.py
+++ b/servers/whisperx/example_api.py
@@ -15,9 +15,7 @@ class WhisperTranscription(BaseModel):
 
 
 # Construct the request data
-request_data = WhisperTranscription(
-    file = "song.mp3"
-)
+request_data = WhisperTranscription(file="song.mp3")
 
 # Specify the URL of your FastAPI application
 url = "https://localhost:8000/v1/audio/transcriptions"
diff --git a/servers/whisperx/whisper_model.py b/servers/whisperx/whisper_model.py
new file mode 100644
index 0000000..6d05c4b
--- /dev/null
+++ b/servers/whisperx/whisper_model.py
@@ -0,0 +1,74 @@
+import logging
+from faster_whisper import WhisperModel
+
+
+class FasterWhisperTranscriber:
+    def __init__(
+        self,
+        model_size="large-v3",
+        device="cuda",
+        compute_type="float16",
+        model_type="faster-whisper",
+        **kwargs,
+    ):
+        """
+        Initialize the WhisperModel with the specified configuration.
+
+        :param model_size: Size of the Whisper model (e.g., 'large-v3', 'distil-large-v2')
+        :param device: Computation device ('cuda' or 'cpu')
+        :param compute_type: Type of computation ('float16', 'int8_float16', 'int8')
+        :param model_type: Type of model ('faster-whisper' or 'faster-distil-whisper')
+        :param kwargs: Additional arguments for the WhisperModel transcribe method
+        """
+        self.model_size = model_size
+        self.device = device
+        self.compute_type = compute_type
+        self.model_type = model_type
+        self.transcribe_options = kwargs
+        self.model = WhisperModel(
+            self.model_size, device=self.device, compute_type=self.compute_type
+        )
+
+    def run(self, audio_file_path: str, *args, **kwargs):
+        """
+        Transcribe the given audio file using the Whisper model.
+
+        :param audio_file_path: Path to the audio file to be transcribed
+        :return: Transcription results
+        """
+        segments, info = self.model.transcribe(
+            audio_file_path, **self.transcribe_options
+        )
+
+        # Print language detection information
+        print(
+            f"Detected language '{info.language}' with probability {info.language_probability:.2f}"
+        )
+
+        # Handle transcription output based on the model type
+        if self.model_type == "faster-whisper":
+            for segment in segments:
+                print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
+        elif (
+            self.model_type == "faster-distil-whisper"
+            and "word_timestamps" in self.transcribe_options
+            and self.transcribe_options["word_timestamps"]
+        ):
+            for segment in segments:
+                for word in segment.words:
+                    print(f"[{word.start:.2f}s -> {word.end:.2f}s] {word.word}")
+        else:
+            for segment in segments:
+                print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
+
+
+# Example usage
+if __name__ == "__main__":
+    logging.basicConfig()
+    logging.getLogger("faster_whisper").setLevel(logging.DEBUG)
+
+    # Example for faster-whisper with GPU and FP16
+    transcriber = FasterWhisperTranscriber(
+        model_size="large-v3", device="cuda", compute_type="float16", beam_size=5
+    )
+    transcriber.run("song.mp3")
diff --git a/servers/whisperx/whisperx.py b/servers/whisperx/whisperx.py
index eca6a71..9c68d7e 100644
--- a/servers/whisperx/whisperx.py
+++ b/servers/whisperx/whisperx.py
@@ -17,7 +17,6 @@ hf_token = os.getenv("HF_TOKEN")
 
 
 
-
 class WhisperTranscriber:
     """
     A class for transcribing audio using the Whisper ASR system.
@@ -285,7 +284,7 @@ async def create_audio_completion(request: WhisperTranscription):
 
     # Log the entry into supabase
     transcriber = WhisperTranscriber(
-        device = "cuda" if torch.cuda.is_available() else "cpu",
+        device="cuda" if torch.cuda.is_available() else "cpu",
    )
 
     # Run the audio processing pipeline
diff --git a/servers/whisperx/whisperx_no_api.py b/servers/whisperx/whisperx_no_api.py
index 4808e48..41416ba 100644
--- a/servers/whisperx/whisperx_no_api.py
+++ b/servers/whisperx/whisperx_no_api.py
@@ -4,7 +4,7 @@
 
 import torch
 import whisperx
-import os 
+import os
 from dotenv import load_dotenv
 
 
@@ -157,8 +157,7 @@ def run(self, audio_file: str):
 
 
 # Instantiate the WhisperTranscriber
-model = WhisperTranscriber(
-)
+model = WhisperTranscriber()
 
 # Run the audio processing pipeline
 result = model.run("song.mp3")
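Usage note: the new FasterWhisperTranscriber forwards any extra keyword arguments straight to WhisperModel.transcribe, so the word-level branch in run() is only reached when the transcriber is constructed with model_type="faster-distil-whisper" and word_timestamps=True (word_timestamps is an existing faster-whisper transcribe option). A minimal sketch, not part of the diff, assuming it runs from servers/whisperx/ so that whisper_model is importable and that a song.mp3 file exists:

from whisper_model import FasterWhisperTranscriber

# Assumed setup: a distil model with word timestamps enabled, which
# selects the per-word print branch in run().
transcriber = FasterWhisperTranscriber(
    model_size="distil-large-v2",
    device="cuda",
    compute_type="float16",
    model_type="faster-distil-whisper",
    beam_size=5,
    word_timestamps=True,
)
transcriber.run("song.mp3")  # prints one "[start]s -> [end]s word" line per word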