diff --git a/notebook/demo.ipynb b/notebook/demo.ipynb
index ce201c1..3246ab5 100644
--- a/notebook/demo.ipynb
+++ b/notebook/demo.ipynb
@@ -33,6 +33,17 @@
     "nest_asyncio.apply()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9b6cf901",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install whisperplus git+https://github.com/huggingface/transformers\n",
+    "!pip install flash-attn --no-build-isolation"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "1d0b2e40",
@@ -50,16 +61,100 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from whisperplus import SpeechToTextPipeline, download_and_convert_to_mp3\n",
+    "from whisperplus import SpeechToTextPipeline, download_youtube_to_mp3\n",
+    "from transformers import BitsAndBytesConfig, HqqConfig\n",
+    "import torch\n",
     "\n",
     "url = \"https://www.youtube.com/watch?v=di3rHkEZuUw\"\n",
-    "audio_path = download_and_convert_to_mp3(url)\n",
-    "pipeline = SpeechToTextPipeline(model_id=\"openai/whisper-large-v3\")\n",
-    "transcript = pipeline(audio_path, \"openai/whisper-large-v3\", \"english\")\n",
+    "audio_path = download_youtube_to_mp3(url, output_dir=\"downloads\", filename=\"test\")\n",
+    "\n",
+    "hqq_config = HqqConfig(\n",
+    "    nbits=4,\n",
+    "    group_size=64,\n",
+    "    quant_zero=False,\n",
+    "    quant_scale=False,\n",
+    "    axis=0,\n",
+    "    offload_meta=False,\n",
+    ") # axis=0 is used by default\n",
+    "\n",
+    "bnb_config = BitsAndBytesConfig(\n",
+    "    load_in_4bit=True,\n",
+    "    bnb_4bit_quant_type=\"nf4\",\n",
+    "    bnb_4bit_compute_dtype=torch.bfloat16,\n",
+    "    bnb_4bit_use_double_quant=True,\n",
+    ")\n",
+    "\n",
+    "pipeline = SpeechToTextPipeline(\n",
+    "    model_id=\"distil-whisper/distil-large-v3\",\n",
+    "    quant_config=hqq_config,\n",
+    "    flash_attention_2=True,\n",
+    ")\n",
+    "\n",
+    "transcript = pipeline(\n",
+    "    audio_path=audio_path,\n",
+    "    chunk_length_s=30,\n",
+    "    stride_length_s=5,\n",
+    "    max_new_tokens=128,\n",
+    "    batch_size=100,\n",
+    "    language=\"english\",\n",
+    "    return_timestamps=False,\n",
+    ")\n",
     "\n",
     "print(transcript)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "8d6282f7",
+   "metadata": {},
+   "source": [
+    "### 🍎 Apple MLX"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "97b42087",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from whisperplus.pipelines import mlx_whisper\n",
+    "from whisperplus import download_youtube_to_mp3\n",
+    "\n",
+    "url = \"https://www.youtube.com/watch?v=1__CAdTJ5JU\"\n",
+    "audio_path = download_youtube_to_mp3(url)\n",
+    "\n",
+    "text = mlx_whisper.transcribe(\n",
+    "    audio_path, path_or_hf_repo=\"mlx-community/whisper-large-v3-mlx\"\n",
+    ")[\"text\"]\n",
+    "print(text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ca528ba7",
+   "metadata": {},
+   "source": [
+    "### 🍏 Lightning MLX Whisper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d99b8bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from whisperplus.pipelines.lightning_whisper_mlx import LightningWhisperMLX\n",
+    "from whisperplus import download_youtube_to_mp3\n",
+    "\n",
+    "url = \"https://www.youtube.com/watch?v=1__CAdTJ5JU\"\n",
+    "audio_path = download_youtube_to_mp3(url)\n",
+    "\n",
+    "whisper = LightningWhisperMLX(model=\"distil-large-v3\", batch_size=12, quant=None)\n",
+    "output = whisper.transcribe(audio_path=audio_path)[\"text\"]"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "1a99131f",
@@ -77,7 +172,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from whisperplus import TextSummarizationPipeline\n",
+    "from whisperplus.pipelines.summarization import TextSummarizationPipeline\n",
     "\n",
     "summarizer = TextSummarizationPipeline(model_id=\"facebook/bart-large-cnn\")\n",
     "summary = summarizer.summarize(transcript)\n",
@@ -101,7 +196,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from whisperplus import LongTextSummarizationPipeline\n",
+    "from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline\n",
     "\n",
     "summarizer = LongTextSummarizationPipeline(model_id=\"facebook/bart-large-cnn\")\n",
     "summary_text = summarizer.summarize(transcript)\n",
@@ -115,7 +210,25 @@
    "source": [
     "### 💬 Speaker Diarization\n",
     "\n",
-    "In this section, we demonstrate the use of Speaker Diarization. This feature helps in distinguishing between different speakers in an audio clip."
+    "You must confirm the licensing permissions of these two models.\n",
+    "\n",
+    "- https://huggingface.co/pyannote/speaker-diarization-3.1\n",
+    "- https://huggingface.co/pyannote/segmentation-3.0"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0066de25",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import notebook_login\n",
+    "\n",
+    "!pip install -r speaker_diarization.txt\n",
+    "!pip install -U \"huggingface_hub[cli]\"\n",
+    "\n",
+    "notebook_login()"
+   ]
+  },
   {
@@ -125,18 +238,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from whisperplus import (\n",
-    "    ASRDiarizationPipeline,\n",
-    "    download_and_convert_to_mp3,\n",
-    "    format_speech_to_dialogue,\n",
-    ")\n",
+    "from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline\n",
+    "from whisperplus import download_youtube_to_mp3, format_speech_to_dialogue\n",
     "\n",
-    "audio_path = download_and_convert_to_mp3(\"https://www.youtube.com/watch?v=mRB14sFHw2E\")\n",
+    "audio_path = download_youtube_to_mp3(\"https://www.youtube.com/watch?v=mRB14sFHw2E\")\n",
     "\n",
     "device = \"cuda\" # cpu or mps\n",
     "pipeline = ASRDiarizationPipeline.from_pretrained(\n",
     "    asr_model=\"openai/whisper-large-v3\",\n",
-    "    diarizer_model=\"pyannote/speaker-diarization\",\n",
+    "    diarizer_model=\"pyannote/speaker-diarization-3.1\",\n",
     "    use_auth_token=False,\n",
     "    chunk_length_s=30,\n",
     "    device=device,\n",
@@ -157,6 +267,16 @@
     "This part covers the 'Chat with Video' feature using LanceDB. It demonstrates how to interact with a video transcript using a chat interface."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0714c378",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install sentence-transformers ctransformers langchain"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -164,7 +284,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from whisperplus import ChatWithVideo\n",
+    "from whisperplus.pipelines.chatbot import ChatWithVideo\n",
     "\n",
     "chat = ChatWithVideo(\n",
     "    input_file=\"trascript.txt\",\n",
@@ -175,7 +295,8 @@
     ")\n",
     "\n",
     "query = \"what is this video about ?\"\n",
-    "response = chat.run_query(query)"
+    "response = chat.run_query(query)\n",
+    "print(response)"
    ]
   },
   {
@@ -188,6 +309,16 @@
     "This section demonstrates the 'Chat with Video' feature using AutoLLM. It enables querying a video's content through a chat interface, utilizing advanced language models."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "38be611e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install 'autollm>=0.1.9'"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -195,7 +326,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from whisperplus import AutoLLMChatWithVideo\n",
+    "from whisperplus.pipelines.autollm_chatbot import AutoLLMChatWithVideo\n",
     "\n",
     "# service_context_params\n",
     "system_prompt = \"\"\"\n",
@@ -215,15 +346,15 @@
     "\"\"\"\n",
     "\n",
     "chat = AutoLLMChatWithVideo(\n",
-    "    input_file=\"audio.mp3\",\n",
-    "    openai_key=\"YOUR_OPENAI_KEY\",\n",
-    "    huggingface_key=\"YOUR_HUGGINGFACE_KEY\",\n",
+    "    input_file=\"input_dir\", # path of mp3 file\n",
+    "    openai_key=\"YOUR_OPENAI_KEY\", # optional\n",
+    "    huggingface_key=\"YOUR_HUGGINGFACE_KEY\", # optional\n",
     "    llm_model=\"gpt-3.5-turbo\",\n",
     "    llm_max_tokens=\"256\",\n",
     "    llm_temperature=\"0.1\",\n",
     "    system_prompt=system_prompt,\n",
     "    query_wrapper_prompt=query_wrapper_prompt,\n",
-    "    embed_model=\"huggingface/BAAI/bge-large-zh\",\n",
+    "    embed_model=\"huggingface/BAAI/bge-large-zh\", # \"text-embedding-ada-002\"\n",
     ")\n",
     "\n",
     "query = \"what is this video about ?\"\n",
@@ -236,7 +367,7 @@
    "id": "223ed48e",
    "metadata": {},
    "source": [
-    "### 🎙️ Speech to Text\n",
+    "### 🎙️ Text to Speech\n",
     "\n",
     "Finally, this section covers converting text to speech using WhisperPlus, demonstrating how to generate spoken audio from text."
    ]
   },
@@ -248,7 +379,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from whisperplus import TextToSpeechPipeline\n",
+    "from whisperplus.pipelines.text2speech import TextToSpeechPipeline\n",
     "\n",
     "tts = TextToSpeechPipeline(model_id=\"suno/bark\")\n",
     "audio = tts(text=\"Hello World\", voice_preset=\"v2/en_speaker_6\")"
diff --git a/whisperplus/app.py b/whisperplus/app.py
index afe0d25..17278bf 100644
--- a/whisperplus/app.py
+++ b/whisperplus/app.py
@@ -1,9 +1,18 @@
 import gradio as gr
-
-from whisperplus.pipelines.whisper import SpeechToTextPipeline
+import torch
+from transformers import BitsAndBytesConfig, HqqConfig
+
+from whisperplus import (
+    SpeechToTextPipeline,
+    download_youtube_to_mp3,
+    download_youtube_to_mp4,
+    format_speech_to_dialogue,
+)
+from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline
+from whisperplus.pipelines.summarization import TextSummarizationPipeline
+from whisperplus.pipelines.text2speech import TextToSpeechPipeline
+from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline
 from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
-from whisperplus.utils.download_utils import download_youtube_to_mp3
-from whisperplus.utils.text_utils import format_speech_to_dialogue
 
 
 def youtube_url_to_text(url, model_id, language_choice):
@@ -18,13 +27,68 @@ def youtube_url_to_text(url, model_id, language_choice):
     Returns:
         transcript (str): The transcript of the speech-to-text conversion.
-        video_path (str): The path of the downloaded video.
     """
-    video_path = download_youtube_to_mp3(url)
-    pipeline = SpeechToTextPipeline(model_id)
-    transcript = pipeline(audio_path=video_path, model_id=model_id, language=language_choice)
+    audio_path = download_youtube_to_mp3(url, output_dir="downloads", filename="test")
+
+    hqq_config = HqqConfig(
+        nbits=4,
+        group_size=64,
+        quant_zero=False,
+        quant_scale=False,
+        axis=0,
+        offload_meta=False,
+    ) # axis=0 is used by default
+
+    pipeline = SpeechToTextPipeline(
+        model_id=model_id,
+        quant_config=hqq_config,
+        flash_attention_2=True,
+    )
+
+    transcript = pipeline(
+        audio_path=audio_path,
+        chunk_length_s=30,
+        stride_length_s=5,
+        max_new_tokens=128,
+        batch_size=100,
+        language=language_choice,
+        return_timestamps=False,
+    )
+    return transcript
+
+
+def summarization(text, model_id="facebook/bart-large-cnn"):
+    """
+    Main function that performs summarization using a specified model and returns the summary.
+
+    Args:
+        text (str): The text to summarize.
+        model_id (str): The ID of the summarization model to use.
+
+    Returns:
+        summary (str): The summary of the text.
+    """
+    summarizer = TextSummarizationPipeline(model_id=model_id)
+    summary = summarizer.summarize(text)
+
+    return summary[0]["summary_text"]
+
+
+def long_text_summarization(text, model_id="facebook/bart-large-cnn"):
+    """
+    Main function that performs summarization using a specified model and returns the summary.
+
+    Args:
+        text (str): The text to summarize.
+        model_id (str): The ID of the summarization model to use.
+
+    Returns:
+        summary (str): The summary of the text.
+    """
+    summarizer = LongTextSummarizationPipeline(model_id=model_id)
+    summary_text = summarizer.summarize(text)
 
-    return transcript, video_path
+    return summary_text
 
 
 def speaker_diarization(url, model_id, device, num_speakers, min_speaker, max_speaker):
@@ -57,161 +121,92 @@ def speaker_diarization(url, model_id, device, num_speakers, min_speaker, max_sp
     return dialogue, audio_path
 
 
-def youtube_url_to_text_app():
-    with gr.Blocks():
+def text2spech_bark(text, model_id="suno/bark", voice_preset="v2/en_speaker_6"):
+    tts = TextToSpeechPipeline(model_id=model_id)
+    audio = tts(text=text, voice_preset=voice_preset)
+    return audio
+
+
+def whisper_autocaption(url, language, model_id="openai/whisper-large-v3"):
+    video_path = download_youtube_to_mp4(url)
+
+    caption = WhisperAutoCaptionPipeline(model_id=model_id)
+    output = caption(video_path=video_path, output_path="output.mp4", language=language)
+    return output
+
+
+with gr.Blocks() as demo:
+    with gr.Tab("YouTube URL to Text"):
+        with gr.Row():
+            with gr.Column():
+                url_input = gr.Textbox(label="Enter YouTube URL")
+                model_id_input = gr.Textbox(label="Enter Model ID", value="openai/whisper-medium")
+                language_input = gr.Textbox(label="Enter Language", value="en")
+                submit_btn1 = gr.Button("Submit")
+            with gr.Column():
+                output1 = gr.Textbox(label="Transcript")
+        submit_btn1.click(
+            youtube_url_to_text, inputs=[url_input, model_id_input, language_input], outputs=output1)
+
+    with gr.Tab("Text Summarization"):
+        with gr.Row():
+            with gr.Column():
+                text_input = gr.Textbox(label="Enter Text", lines=5)
+                model_id_input2 = gr.Textbox(label="Enter Model ID", value="facebook/bart-large-cnn")
+                submit_btn2 = gr.Button("Summarize")
+            with gr.Column():
+                output2 = gr.Textbox(label="Summary")
+        submit_btn2.click(summarization, inputs=[text_input, model_id_input2], outputs=output2)
+
+    with gr.Tab("Long Text Summarization"):
         with gr.Row():
             with gr.Column():
-                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")
-
-                language_choice = gr.Dropdown(
-                    choices=[
-                        "English",
-                        "Turkish",
-                        "Spanish",
-                        "French",
-                        "Chinese",
-                        "Japanese",
-                        "Korean",
-                    ],
-                    value="Turkish",
-                    label="Language",
-                )
-                whisper_model_id = gr.Dropdown(
-                    choices=[
-                        "openai/whisper-large-v3",
-                        "openai/whisper-large",
-                        "openai/whisper-medium",
-                        "openai/whisper-base",
-                        "openai/whisper-small",
-                        "openai/whisper-tiny",
-                    ],
-                    value="openai/whisper-large-v3",
-                    label="Whisper Model",
-                )
-                whisperplus_in_predict = gr.Button(value="Generator")
+                long_text_input = gr.Textbox(label="Enter Long Text", lines=10)
+                model_id_input3 = gr.Textbox(label="Enter Model ID", value="facebook/bart-large-cnn")
+                submit_btn3 = gr.Button("Summarize Long Text")
+            with gr.Column():
+                output3 = gr.Textbox(label="Long Text Summary")
+        submit_btn3.click(long_text_summarization, inputs=[long_text_input, model_id_input3], outputs=output3)
+
+    with gr.Tab("Speaker Diarization"):
+        with gr.Row():
+            with gr.Column():
+                url_input2 = gr.Textbox(label="Enter YouTube URL")
+                model_id_input4 = gr.Textbox(label="Enter Model ID")
+                num_speakers = gr.Number(label="Number of Speakers", value=2)
+                min_speakers = gr.Number(label="Min Speakers", value=1)
+                max_speakers = gr.Number(label="Max Speakers", value=4)
+                device = gr.Textbox(label="Device", value="cpu")
+                submit_btn4 = gr.Button("Diarize")
             with gr.Column():
-                output_text = gr.Textbox(label="Output Text")
-                output_audio = gr.Audio(label="Output Audio")
-
-            whisperplus_in_predict.click(
-                fn=youtube_url_to_text,
-                inputs=[
-                    youtube_url_path,
-                    whisper_model_id,
-                    language_choice,
-                ],
-                outputs=[output_text, output_audio],
-            )
-            gr.Examples(
-                examples=[
-                    [
-                        "https://www.youtube.com/watch?v=di3rHkEZuUw",
-                        "openai/whisper-large-v3",
-                        "English",
-                    ],
-                ],
-                fn=youtube_url_to_text,
-                inputs=[
-                    youtube_url_path,
-                    whisper_model_id,
-                    language_choice,
-                ],
-                outputs=[output_text, output_audio],
-                cache_examples=False,
-            )
-
-
-def speaker_diarization_app():
-    with gr.Blocks():
+                output4 = gr.DataFrame(headers=["Speaker", "Text"], datatype=["str", "str"])
+        submit_btn4.click(
+            speaker_diarization,
+            inputs=[url_input2, model_id_input4, device, num_speakers, min_speakers, max_speakers],
+            outputs=output4)
+
+    with gr.Tab("Text to Speech"):
         with gr.Row():
             with gr.Column():
-                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")
-
-                whisper_model_id = gr.Dropdown(
-                    choices=[
-                        "openai/whisper-large-v3",
-                        "openai/whisper-large",
-                        "openai/whisper-medium",
-                        "openai/whisper-base",
-                        "openai/whisper-small",
-                        "openai/whisper-tiny",
-                    ],
-                    value="openai/whisper-large-v3",
-                    label="Whisper Model",
-                )
-                device = gr.Dropdown(
-                    choices=["cpu", "cuda", "mps"],
-                    value="cuda",
-                    label="Device",
-                )
-                num_speakers = gr.Number(value=2, label="Number of Speakers")
-                min_speaker = gr.Number(value=1, label="Minimum Number of Speakers")
-                max_speaker = gr.Number(value=2, label="Maximum Number of Speakers")
-                whisperplus_in_predict = gr.Button(value="Generator")
+                text_input2 = gr.Textbox(label="Enter Text", lines=3)
+                model_id_input5 = gr.Textbox(label="Enter Model ID", value="suno/bark")
+                voice_preset = gr.Textbox(label="Voice Preset", value="v2/en_speaker_6")
+                submit_btn5 = gr.Button("Generate Audio")
+            with gr.Column():
+                output5 = gr.Audio(label="Generated Audio")
+        submit_btn5.click(
+            text2spech_bark, inputs=[text_input2, model_id_input5, voice_preset], outputs=output5)
+
+    with gr.Tab("Whisper Autocaption"):
+        with gr.Row():
             with gr.Column():
-                output_text = gr.Textbox(label="Output Text")
-                output_audio = gr.Audio(label="Output Audio")
-
-            whisperplus_in_predict.click(
-                fn=speaker_diarization,
-                inputs=[
-                    youtube_url_path,
-                    whisper_model_id,
-                    device,
-                    num_speakers,
-                    min_speaker,
-                    max_speaker,
-                ],
-                outputs=[output_text, output_audio],
-            )
-            gr.Examples(
-                examples=[
-                    [
-                        "https://www.youtube.com/shorts/o8PgLUgte2k",
-                        "openai/whisper-large-v3",
-                        "mps",
-                        2,
-                        1,
-                        2,
-                    ],
-                ],
-                fn=speaker_diarization,
-                inputs=[
-                    youtube_url_path,
-                    whisper_model_id,
-                    device,
-                    num_speakers,
-                    min_speaker,
-                    max_speaker,
-                ],
-                outputs=[output_text, output_audio],
-                cache_examples=False,
-            )
-
-
-gradio_app = gr.Blocks()
-with gradio_app:
-    gr.HTML(
-        """
-    WhisperPlus: Advancing Speech-to-Text Processing 🚀
-    """)
-    gr.HTML(
-        """
-    Follow me for more!
-    Twitter | Github | Linkedin | HuggingFace
-    """)
-    with gr.Row():
-        with gr.Column():
-            with gr.Tab(label="Youtube URL to Text"):
-                youtube_url_to_text_app()
-            with gr.Tab(label="Speaker Diarization"):
-                speaker_diarization_app()
-
-gradio_app.queue()
-gradio_app.launch(debug=True)
+                url_input3 = gr.Textbox(label="Enter YouTube URL")
+                language = gr.Textbox(label="Language", value="en")
+                model_id_input6 = gr.Textbox(label="Enter Model ID", value="openai/whisper-large-v2")
+                submit_btn6 = gr.Button("Generate Captions")
+            with gr.Column():
+                output6 = gr.Video(label="Captioned Video")
+        submit_btn6.click(
+            whisper_autocaption, inputs=[url_input3, language, model_id_input6], outputs=output6)
+
+demo.launch()