From 60c0ccca0fdc6712db128d1b069ec8b34393ba8a Mon Sep 17 00:00:00 2001 From: kadirnar Date: Tue, 7 May 2024 11:37:53 +0000 Subject: [PATCH 1/3] Add yt-dlp instead of pytube to download youtube video --- README.md | 27 +++++------ requirements.txt | 2 +- test.py | 42 ++++++++++++++++++ whisperplus/__init__.py | 14 +++--- whisperplus/app.py | 6 +-- whisperplus/test.py | 4 +- whisperplus/utils/download_utils.py | 69 ++++++++++++++++------------- 7 files changed, 106 insertions(+), 58 deletions(-) create mode 100644 test.py diff --git a/README.md b/README.md index c5ceb52..d072eda 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ ## 🛠️ Installation ```bash -pip install whisperplus git+https://github.com/huggingface/transformers +pip install git+https://github.com/huggingface/transformers pip install flash-attn --no-build-isolation ``` @@ -32,15 +32,15 @@ To use the whisperplus library, follow the steps below for different tasks: ### 🎵 Youtube URL to Audio ```python -from whisperplus import SpeechToTextPipeline, download_and_convert_to_mp3 +from whisperplus import SpeechToTextPipeline, download_youtube_to_mp3 from transformers import BitsAndBytesConfig, HqqConfig import torch url = "https://www.youtube.com/watch?v=di3rHkEZuUw" -audio_path = download_and_convert_to_mp3(url) +audio_path = download_youtube_to_mp3(url, output_dir="downloads", filename="test") hqq_config = HqqConfig( - nbits=1, + nbits=4, group_size=64, quant_zero=False, quant_scale=False, @@ -78,7 +78,7 @@ print(transcript) ### 📰 Summarization ```python -from whisperplus import TextSummarizationPipeline +from whisperplus.pipelines.summarization import TextSummarizationPipeline summarizer = TextSummarizationPipeline(model_id="facebook/bart-large-cnn") summary = summarizer.summarize(transcript) @@ -88,7 +88,7 @@ print(summary[0]["summary_text"]) ### 📰 Long Text Support Summarization ```python -from whisperplus import LongTextSummarizationPipeline +from whisperplus.pipelines.long_text_summarization 
import LongTextSummarizationPipeline summarizer = LongTextSummarizationPipeline(model_id="facebook/bart-large-cnn") summary_text = summarizer.summarize(transcript) @@ -102,13 +102,10 @@ pip install pyannote.audio>=3.1.0 pyannote.core>=5.0.0 pyannote.database>=5.0.1 ``` ```python -from whisperplus import ( - ASRDiarizationPipeline, - download_and_convert_to_mp3, - format_speech_to_dialogue, -) +from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline +from whisperplus import download_youtube_to_mp3, format_speech_to_dialogue -audio_path = download_and_convert_to_mp3("https://www.youtube.com/watch?v=mRB14sFHw2E") +audio_path = download_youtube_to_mp3("https://www.youtube.com/watch?v=mRB14sFHw2E") device = "cuda" # cpu or mps pipeline = ASRDiarizationPipeline.from_pretrained( @@ -153,7 +150,7 @@ pip install autollm>=0.1.9 ``` ```python -from whisperplus import AutoLLMChatWithVideo +from whisperplus.pipelines.autollm_chatbot import AutoLLMChatWithVideo # service_context_params system_prompt = """ @@ -192,7 +189,7 @@ print(response) ### 🎙️ Speech to Text ```python -from whisperplus import TextToSpeechPipeline +from whisperplus.pipelines.text2speech import TextToSpeechPipeline tts = TextToSpeechPipeline(model_id="suno/bark") audio = tts(text="Hello World", voice_preset="v2/en_speaker_6") @@ -201,7 +198,7 @@ audio = tts(text="Hello World", voice_preset="v2/en_speaker_6") ### 🎥 AutoCaption ```python -from whisperplus import WhisperAutoCaptionPipeline +from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline caption = WhisperAutoCaptionPipeline(model_id="openai/whisper-large-v3") caption(video_path="test.mp4", output_path="output.mp4", language="turkish") diff --git a/requirements.txt b/requirements.txt index 6fae836..591e28b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ torch>=2.0.0 torchvision>=0.15.0 moviepy>=1.0.3 -pytube>=15.0.0 +yt_dlp Requests>=2.31.0 accelerate bitsandbytes diff --git a/test.py b/test.py 
new file mode 100644 index 0000000..edf87ff --- /dev/null +++ b/test.py @@ -0,0 +1,42 @@ +import torch +from transformers import BitsAndBytesConfig, HqqConfig + +from whisperplus import SpeechToTextPipeline, download_youtube_to_mp3 + +url = "https://www.youtube.com/watch?v=di3rHkEZuUw" +audio_path = download_youtube_to_mp3(url, output_dir='downloads', filename="test") + +hqq_config = HqqConfig( + nbits=1, + group_size=64, + quant_zero=False, + quant_scale=False, + axis=0, + offload_meta=False, +) # axis=0 is used by default + +bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=True, +) + +pipeline = SpeechToTextPipeline( + model_id="distil-whisper/distil-large-v3", + quant_config=hqq_config, + hqq=True, + flash_attention_2=True, +) + +transcript = pipeline( + audio_path=audio_path, + chunk_length_s=30, + stride_length_s=5, + max_new_tokens=128, + batch_size=100, + language="english", + return_timestamps=False, +) + +print(transcript) diff --git a/whisperplus/__init__.py b/whisperplus/__init__.py index 0b9c36e..6c36c33 100644 --- a/whisperplus/__init__.py +++ b/whisperplus/__init__.py @@ -1,11 +1,11 @@ -from whisperplus.pipelines.autollm_chatbot import AutoLLMChatWithVideo -from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline -from whisperplus.pipelines.summarization import TextSummarizationPipeline -from whisperplus.pipelines.text2speech import TextToSpeechPipeline +# from whisperplus.pipelines.autollm_chatbot import AutoLLMChatWithVideo +# from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline +# from whisperplus.pipelines.summarization import TextSummarizationPipeline +# from whisperplus.pipelines.text2speech import TextToSpeechPipeline from whisperplus.pipelines.whisper import SpeechToTextPipeline -from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline -from 
whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline -from whisperplus.utils.download_utils import download_and_convert_to_mp3 +# from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline +# from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline +from whisperplus.utils.download_utils import download_youtube_to_mp3 from whisperplus.utils.text_utils import format_speech_to_dialogue __version__ = '0.3.3' diff --git a/whisperplus/app.py b/whisperplus/app.py index 34d4f9d..afe0d25 100644 --- a/whisperplus/app.py +++ b/whisperplus/app.py @@ -2,7 +2,7 @@ from whisperplus.pipelines.whisper import SpeechToTextPipeline from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline -from whisperplus.utils.download_utils import download_and_convert_to_mp3 +from whisperplus.utils.download_utils import download_youtube_to_mp3 from whisperplus.utils.text_utils import format_speech_to_dialogue @@ -20,7 +20,7 @@ def youtube_url_to_text(url, model_id, language_choice): transcript (str): The transcript of the speech-to-text conversion. video_path (str): The path of the downloaded video. 
""" - video_path = download_and_convert_to_mp3(url) + video_path = download_youtube_to_mp3(url) pipeline = SpeechToTextPipeline(model_id) transcript = pipeline(audio_path=video_path, model_id=model_id, language=language_choice) @@ -50,7 +50,7 @@ def speaker_diarization(url, model_id, device, num_speakers, min_speaker, max_sp device=device, ) - audio_path = download_and_convert_to_mp3(url) + audio_path = download_youtube_to_mp3(url) output_text = pipeline( audio_path, num_speakers=num_speakers, min_speaker=min_speaker, max_speaker=max_speaker) dialogue = format_speech_to_dialogue(output_text) diff --git a/whisperplus/test.py b/whisperplus/test.py index f5f3622..5d42b0f 100644 --- a/whisperplus/test.py +++ b/whisperplus/test.py @@ -2,10 +2,10 @@ from hqq.utils.patching import prepare_for_inference from pipelines.whisper import SpeechToTextPipeline from transformers import BitsAndBytesConfig, HqqConfig -from utils.download_utils import download_and_convert_to_mp3 +from utils.download_utils import download_youtube_to_mp3 url = "https://www.youtube.com/watch?v=BpN4hEAvDBg" -audio_path = download_and_convert_to_mp3(url) +audio_path = download_youtube_to_mp3(url) hqq_config = HqqConfig( nbits=1, diff --git a/whisperplus/utils/download_utils.py b/whisperplus/utils/download_utils.py index 57cb4e8..b6f86e1 100644 --- a/whisperplus/utils/download_utils.py +++ b/whisperplus/utils/download_utils.py @@ -1,42 +1,51 @@ import logging -import os from pathlib import Path -from typing import Optional -from moviepy.editor import AudioFileClip -from pytube import YouTube +import yt_dlp -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s') -def download_and_convert_to_mp3(url: str, - output_path: str = "output", - filename: str = "test") -> Optional[str]: - try: - yt = YouTube(url) - audio_stream = yt.streams.filter(only_audio=True).first() - - if 
audio_stream is None: - logging.warning("No audio streams found") - return None - - Path(output_path).mkdir(parents=True, exist_ok=True) +def download_youtube_to_mp3(url, output_dir='./', filename="test"): + """ + Downloads a YouTube video as an MP3 file. - mp3_file_path = os.path.join(output_path, filename + ".mp3") - logging.info(f"Downloading started... {mp3_file_path}") + Parameters: + url (str): The URL of the YouTube video to download. + output_dir (str, optional): The directory to save the MP3 file. Defaults to the current working directory. + filename (str, optional): The filename for the MP3 file, without extension. Defaults to "test"; if None, the video title is used. - downloaded_file_path = audio_stream.download(output_path) + Returns: + str: The path to the downloaded MP3 file, or None if the download fails. + """ + # Create the output directory if it doesn't exist + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) - audio_clip = AudioFileClip(downloaded_file_path) - audio_clip.write_audiofile(mp3_file_path, codec="libmp3lame", verbose=False, logger=None) - audio_clip.close() + # Set the output filename + if filename is None: + filename = "%(title)s.%(ext)s" + else: + filename = f"{filename}.%(ext)s" - if Path(downloaded_file_path).suffix != ".mp3": - os.remove(downloaded_file_path) + # Download the video using yt_dlp + ydl_opts = { + 'outtmpl': str(output_dir / filename), + 'format': 'bestaudio/best', + 'postprocessors': [{ + 'key': 'FFmpegExtractAudio', + 'preferredcodec': 'mp3', + 'preferredquality': '192', + }], + } - logging.info(f"Download and conversion successful. 
File saved at: {mp3_file_path}") - return str(mp3_file_path) - - except Exception as e: - logging.error(f"An error occurred: {e}") + try: + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + ydl.download([url]) + except yt_dlp.utils.DownloadError as e: + logging.error(f"Error downloading video: {e}") return None + + output_path = output_dir / filename.replace('%(ext)s', 'mp3') + logging.info(f"Downloaded {output_path} as MP3") + return str(output_path) From 73c6e217f4bc4e0b1d9f9c0691e772beec79c3a6 Mon Sep 17 00:00:00 2001 From: kadirnar Date: Tue, 7 May 2024 12:07:58 +0000 Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=90=9B=20Resolve=20imagemagic=20error?= =?UTF-8?q?=20for=20autocaption=20pipeline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 15 ++++++++++- requirements.txt | 2 ++ test.py | 42 ----------------------------- whisperplus/__init__.py | 2 +- whisperplus/utils/download_utils.py | 40 +++++++++++++++++++++++++++ 5 files changed, 57 insertions(+), 44 deletions(-) delete mode 100644 test.py diff --git a/README.md b/README.md index d072eda..1635973 100644 --- a/README.md +++ b/README.md @@ -197,11 +197,24 @@ audio = tts(text="Hello World", voice_preset="v2/en_speaker_6") ### 🎥 AutoCaption +```bash +pip install moviepy +apt install imagemagick libmagick++-dev +sed -i 's/none/read,write/g' /etc/ImageMagick-6/policy.xml +``` + ```python from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline +from whisperplus import download_youtube_to_mp4 + +video_path = download_youtube_to_mp4( + "https://www.youtube.com/watch?v=di3rHkEZuUw", + output_dir="downloads", + filename="test", +) # Optional caption = WhisperAutoCaptionPipeline(model_id="openai/whisper-large-v3") -caption(video_path="test.mp4", output_path="output.mp4", language="turkish") +caption(video_path=video_path, output_path="output.mp4", language="english") ``` ## 😍 Contributing diff --git 
a/requirements.txt b/requirements.txt index 591e28b..bb69929 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,5 @@ Requests>=2.31.0 accelerate bitsandbytes hqq +ffmpeg +ffmpeg-python diff --git a/test.py b/test.py deleted file mode 100644 index edf87ff..0000000 --- a/test.py +++ /dev/null @@ -1,42 +0,0 @@ -import torch -from transformers import BitsAndBytesConfig, HqqConfig - -from whisperplus import SpeechToTextPipeline, download_youtube_to_mp3 - -url = "https://www.youtube.com/watch?v=di3rHkEZuUw" -audio_path = download_youtube_to_mp3(url, output_dir='downloads', filename="test") - -hqq_config = HqqConfig( - nbits=1, - group_size=64, - quant_zero=False, - quant_scale=False, - axis=0, - offload_meta=False, -) # axis=0 is used by default - -bnb_config = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_quant_type="nf4", - bnb_4bit_compute_dtype=torch.bfloat16, - bnb_4bit_use_double_quant=True, -) - -pipeline = SpeechToTextPipeline( - model_id="distil-whisper/distil-large-v3", - quant_config=hqq_config, - hqq=True, - flash_attention_2=True, -) - -transcript = pipeline( - audio_path=audio_path, - chunk_length_s=30, - stride_length_s=5, - max_new_tokens=128, - batch_size=100, - language="english", - return_timestamps=False, -) - -print(transcript) diff --git a/whisperplus/__init__.py b/whisperplus/__init__.py index 6c36c33..2056b47 100644 --- a/whisperplus/__init__.py +++ b/whisperplus/__init__.py @@ -5,7 +5,7 @@ from whisperplus.pipelines.whisper import SpeechToTextPipeline # from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline # from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline -from whisperplus.utils.download_utils import download_youtube_to_mp3 +from whisperplus.utils.download_utils import download_youtube_to_mp3, download_youtube_to_mp4 from whisperplus.utils.text_utils import format_speech_to_dialogue __version__ = '0.3.3' diff --git a/whisperplus/utils/download_utils.py 
b/whisperplus/utils/download_utils.py index b6f86e1..7da4701 100644 --- a/whisperplus/utils/download_utils.py +++ b/whisperplus/utils/download_utils.py @@ -49,3 +49,43 @@ def download_youtube_to_mp3(url, output_dir='./', filename="test"): output_path = output_dir / filename.replace('%(ext)s', 'mp3') logging.info(f"Downloaded {output_path} as MP3") return str(output_path) + + +def download_youtube_to_mp4(url, output_dir='./', filename="test"): + """ + Downloads a YouTube video as an MP4 file. + + Parameters: + url (str): The URL of the YouTube video to download. + output_dir (str, optional): The directory to save the MP4 file. Defaults to the current working directory. + filename (str, optional): The filename for the MP4 file, without extension. Defaults to "test"; if None, the video title is used. + + Returns: + str: The path to the downloaded MP4 file, or None if the download fails. + """ + # Create the output directory if it doesn't exist + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Set the output filename + if filename is None: + filename = "%(title)s.%(ext)s" + else: + filename = f"{filename}.%(ext)s" + + # Download the video using yt_dlp + ydl_opts = { + 'outtmpl': str(output_dir / filename), + 'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best', + } + + try: + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + ydl.download([url]) + except yt_dlp.utils.DownloadError as e: + logging.error(f"Error downloading video: {e}") + return None + + output_path = output_dir / filename.replace('%(ext)s', 'mp4') + logging.info(f"Downloaded {output_path} as MP4") + return str(output_path) From 47823c26a3381332687db9ecad4c73de3a7bc383 Mon Sep 17 00:00:00 2001 From: kadirnar Date: Tue, 7 May 2024 12:11:45 +0000 Subject: [PATCH 3/3] Add pre-commit library --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index bb69929..b469769 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ bitsandbytes hqq ffmpeg 
ffmpeg-python +pre-commit