From 60c0ccca0fdc6712db128d1b069ec8b34393ba8a Mon Sep 17 00:00:00 2001
From: kadirnar <kadir.nar@hotmail.com>
Date: Tue, 7 May 2024 11:37:53 +0000
Subject: [PATCH 1/3] Add yt-dlp instead of pytube to download youtube video

---
 README.md                           | 27 +++++------
 requirements.txt                    |  2 +-
 test.py                             | 42 ++++++++++++++++++
 whisperplus/__init__.py             | 14 +++---
 whisperplus/app.py                  |  6 +--
 whisperplus/test.py                 |  4 +-
 whisperplus/utils/download_utils.py | 69 ++++++++++++++++-------------
 7 files changed, 106 insertions(+), 58 deletions(-)
 create mode 100644 test.py

diff --git a/README.md b/README.md
index c5ceb52..d072eda 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@
 ## 🛠️ Installation
 
 ```bash
-pip install whisperplus git+https://github.com/huggingface/transformers
+pip install git+https://github.com/huggingface/transformers
 pip install flash-attn --no-build-isolation
 ```
 
@@ -32,15 +32,15 @@ To use the whisperplus library, follow the steps below for different tasks:
 ### 🎵 Youtube URL to Audio
 
 ```python
-from whisperplus import SpeechToTextPipeline, download_and_convert_to_mp3
+from whisperplus import SpeechToTextPipeline, download_youtube_to_mp3
 from transformers import BitsAndBytesConfig, HqqConfig
 import torch
 
 url = "https://www.youtube.com/watch?v=di3rHkEZuUw"
-audio_path = download_and_convert_to_mp3(url)
+audio_path = download_youtube_to_mp3(url, output_dir="downloads", filename="test")
 
 hqq_config = HqqConfig(
-    nbits=1,
+    nbits=4,
     group_size=64,
     quant_zero=False,
     quant_scale=False,
@@ -78,7 +78,7 @@ print(transcript)
 ### 📰 Summarization
 
 ```python
-from whisperplus import TextSummarizationPipeline
+from whisperplus.pipelines.summarization import TextSummarizationPipeline
 
 summarizer = TextSummarizationPipeline(model_id="facebook/bart-large-cnn")
 summary = summarizer.summarize(transcript)
@@ -88,7 +88,7 @@ print(summary[0]["summary_text"])
 ### 📰 Long Text Support Summarization
 
 ```python
-from whisperplus import LongTextSummarizationPipeline
+from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline
 
 summarizer = LongTextSummarizationPipeline(model_id="facebook/bart-large-cnn")
 summary_text = summarizer.summarize(transcript)
@@ -102,13 +102,10 @@ pip install pyannote.audio>=3.1.0 pyannote.core>=5.0.0 pyannote.database>=5.0.1
 ```
 
 ```python
-from whisperplus import (
-    ASRDiarizationPipeline,
-    download_and_convert_to_mp3,
-    format_speech_to_dialogue,
-)
+from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
+from whisperplus import download_youtube_to_mp3, format_speech_to_dialogue
 
-audio_path = download_and_convert_to_mp3("https://www.youtube.com/watch?v=mRB14sFHw2E")
+audio_path = download_youtube_to_mp3("https://www.youtube.com/watch?v=mRB14sFHw2E")
 
 device = "cuda"  # cpu or mps
 pipeline = ASRDiarizationPipeline.from_pretrained(
@@ -153,7 +150,7 @@ pip install autollm>=0.1.9
 ```
 
 ```python
-from whisperplus import AutoLLMChatWithVideo
+from whisperplus.pipelines.autollm_chatbot import AutoLLMChatWithVideo
 
 # service_context_params
 system_prompt = """
@@ -192,7 +189,7 @@ print(response)
 ### 🎙️ Speech to Text
 
 ```python
-from whisperplus import TextToSpeechPipeline
+from whisperplus.pipelines.text2speech import TextToSpeechPipeline
 
 tts = TextToSpeechPipeline(model_id="suno/bark")
 audio = tts(text="Hello World", voice_preset="v2/en_speaker_6")
@@ -201,7 +198,7 @@ audio = tts(text="Hello World", voice_preset="v2/en_speaker_6")
 ### 🎥 AutoCaption
 
 ```python
-from whisperplus import WhisperAutoCaptionPipeline
+from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline
 
 caption = WhisperAutoCaptionPipeline(model_id="openai/whisper-large-v3")
 caption(video_path="test.mp4", output_path="output.mp4", language="turkish")
diff --git a/requirements.txt b/requirements.txt
index 6fae836..591e28b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 torch>=2.0.0
 torchvision>=0.15.0
 moviepy>=1.0.3
-pytube>=15.0.0
+yt_dlp
 Requests>=2.31.0
 accelerate
 bitsandbytes
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..edf87ff
--- /dev/null
+++ b/test.py
@@ -0,0 +1,42 @@
+import torch
+from transformers import BitsAndBytesConfig, HqqConfig
+
+from whisperplus import SpeechToTextPipeline, download_youtube_to_mp3
+
+url = "https://www.youtube.com/watch?v=di3rHkEZuUw"
+audio_path = download_youtube_to_mp3(url, output_dir='downloads', filename="test")
+
+hqq_config = HqqConfig(
+    nbits=1,
+    group_size=64,
+    quant_zero=False,
+    quant_scale=False,
+    axis=0,
+    offload_meta=False,
+)  # axis=0 is used by default
+
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=True,
+)
+
+pipeline = SpeechToTextPipeline(
+    model_id="distil-whisper/distil-large-v3",
+    quant_config=hqq_config,
+    hqq=True,
+    flash_attention_2=True,
+)
+
+transcript = pipeline(
+    audio_path=audio_path,
+    chunk_length_s=30,
+    stride_length_s=5,
+    max_new_tokens=128,
+    batch_size=100,
+    language="english",
+    return_timestamps=False,
+)
+
+print(transcript)
diff --git a/whisperplus/__init__.py b/whisperplus/__init__.py
index 0b9c36e..6c36c33 100644
--- a/whisperplus/__init__.py
+++ b/whisperplus/__init__.py
@@ -1,11 +1,11 @@
-from whisperplus.pipelines.autollm_chatbot import AutoLLMChatWithVideo
-from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline
-from whisperplus.pipelines.summarization import TextSummarizationPipeline
-from whisperplus.pipelines.text2speech import TextToSpeechPipeline
+# from whisperplus.pipelines.autollm_chatbot import AutoLLMChatWithVideo
+# from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline
+# from whisperplus.pipelines.summarization import TextSummarizationPipeline
+# from whisperplus.pipelines.text2speech import TextToSpeechPipeline
 from whisperplus.pipelines.whisper import SpeechToTextPipeline
-from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline
-from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
-from whisperplus.utils.download_utils import download_and_convert_to_mp3
+# from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline
+# from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
+from whisperplus.utils.download_utils import download_youtube_to_mp3
 from whisperplus.utils.text_utils import format_speech_to_dialogue
 
 __version__ = '0.3.3'
diff --git a/whisperplus/app.py b/whisperplus/app.py
index 34d4f9d..afe0d25 100644
--- a/whisperplus/app.py
+++ b/whisperplus/app.py
@@ -2,7 +2,7 @@
 
 from whisperplus.pipelines.whisper import SpeechToTextPipeline
 from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
-from whisperplus.utils.download_utils import download_and_convert_to_mp3
+from whisperplus.utils.download_utils import download_youtube_to_mp3
 from whisperplus.utils.text_utils import format_speech_to_dialogue
 
 
@@ -20,7 +20,7 @@ def youtube_url_to_text(url, model_id, language_choice):
         transcript (str): The transcript of the speech-to-text conversion.
         video_path (str): The path of the downloaded video.
     """
-    video_path = download_and_convert_to_mp3(url)
+    video_path = download_youtube_to_mp3(url)
     pipeline = SpeechToTextPipeline(model_id)
     transcript = pipeline(audio_path=video_path, model_id=model_id, language=language_choice)
 
@@ -50,7 +50,7 @@ def speaker_diarization(url, model_id, device, num_speakers, min_speaker, max_sp
         device=device,
     )
 
-    audio_path = download_and_convert_to_mp3(url)
+    audio_path = download_youtube_to_mp3(url)
     output_text = pipeline(
         audio_path, num_speakers=num_speakers, min_speaker=min_speaker, max_speaker=max_speaker)
     dialogue = format_speech_to_dialogue(output_text)
diff --git a/whisperplus/test.py b/whisperplus/test.py
index f5f3622..5d42b0f 100644
--- a/whisperplus/test.py
+++ b/whisperplus/test.py
@@ -2,10 +2,10 @@
 from hqq.utils.patching import prepare_for_inference
 from pipelines.whisper import SpeechToTextPipeline
 from transformers import BitsAndBytesConfig, HqqConfig
-from utils.download_utils import download_and_convert_to_mp3
+from utils.download_utils import download_youtube_to_mp3
 
 url = "https://www.youtube.com/watch?v=BpN4hEAvDBg"
-audio_path = download_and_convert_to_mp3(url)
+audio_path = download_youtube_to_mp3(url)
 
 hqq_config = HqqConfig(
     nbits=1,
diff --git a/whisperplus/utils/download_utils.py b/whisperplus/utils/download_utils.py
index 57cb4e8..b6f86e1 100644
--- a/whisperplus/utils/download_utils.py
+++ b/whisperplus/utils/download_utils.py
@@ -1,42 +1,51 @@
 import logging
-import os
 from pathlib import Path
-from typing import Optional
 
-from moviepy.editor import AudioFileClip
-from pytube import YouTube
+import yt_dlp
 
-logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')
 
 
-def download_and_convert_to_mp3(url: str,
-                                output_path: str = "output",
-                                filename: str = "test") -> Optional[str]:
-    try:
-        yt = YouTube(url)
-        audio_stream = yt.streams.filter(only_audio=True).first()
-
-        if audio_stream is None:
-            logging.warning("No audio streams found")
-            return None
-
-        Path(output_path).mkdir(parents=True, exist_ok=True)
+def download_youtube_to_mp3(url, output_dir='./', filename="test"):
+    """
+    Downloads a YouTube video as an MP3 file.
 
-        mp3_file_path = os.path.join(output_path, filename + ".mp3")
-        logging.info(f"Downloading started... {mp3_file_path}")
+    Parameters:
+    url (str): The URL of the YouTube video to download.
+    output_dir (str, optional): The directory to save the MP3 file. Defaults to the current working directory.
+    filename (str, optional): Base filename (without extension) for the MP3 file. Defaults to "test"; pass None to use the video title.
 
-        downloaded_file_path = audio_stream.download(output_path)
+    Returns:
+    str or None: The path to the downloaded MP3 file, or None if the download failed.
+    """
+    # Create the output directory if it doesn't exist
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
 
-        audio_clip = AudioFileClip(downloaded_file_path)
-        audio_clip.write_audiofile(mp3_file_path, codec="libmp3lame", verbose=False, logger=None)
-        audio_clip.close()
+    # Set the output filename
+    if filename is None:
+        filename = "%(title)s.%(ext)s"
+    else:
+        filename = f"{filename}.%(ext)s"
 
-        if Path(downloaded_file_path).suffix != ".mp3":
-            os.remove(downloaded_file_path)
+    # Download the video using yt_dlp
+    ydl_opts = {
+        'outtmpl': str(output_dir / filename),
+        'format': 'bestaudio/best',
+        'postprocessors': [{
+            'key': 'FFmpegExtractAudio',
+            'preferredcodec': 'mp3',
+            'preferredquality': '192',
+        }],
+    }
 
-        logging.info(f"Download and conversion successful. File saved at: {mp3_file_path}")
-        return str(mp3_file_path)
-
-    except Exception as e:
-        logging.error(f"An error occurred: {e}")
+    try:
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            ydl.download([url])
+    except yt_dlp.utils.DownloadError as e:
+        logging.error(f"Error downloading video: {e}")
         return None
+
+    output_path = output_dir / filename.replace('%(ext)s', 'mp3')
+    logging.info(f"Downloaded {output_path} as MP3")
+    return str(output_path)

From 73c6e217f4bc4e0b1d9f9c0691e772beec79c3a6 Mon Sep 17 00:00:00 2001
From: kadirnar <kadir.nar@hotmail.com>
Date: Tue, 7 May 2024 12:07:58 +0000
Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=90=9B=20Resolve=20imagemagic=20error?=
 =?UTF-8?q?=20for=20autocaption=20pipeline?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                           | 15 ++++++++++-
 requirements.txt                    |  2 ++
 test.py                             | 42 -----------------------------
 whisperplus/__init__.py             |  2 +-
 whisperplus/utils/download_utils.py | 40 +++++++++++++++++++++++++++
 5 files changed, 57 insertions(+), 44 deletions(-)
 delete mode 100644 test.py

diff --git a/README.md b/README.md
index d072eda..1635973 100644
--- a/README.md
+++ b/README.md
@@ -197,11 +197,24 @@ audio = tts(text="Hello World", voice_preset="v2/en_speaker_6")
 
 ### 🎥 AutoCaption
 
+```bash
+pip install moviepy
+apt install imagemagick libmagick++-dev
+sed -i 's/none/read,write/g' /etc/ImageMagick-6/policy.xml
+```
+
 ```python
 from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline
+from whisperplus import download_youtube_to_mp4
+
+video_path = download_youtube_to_mp4(
+    "https://www.youtube.com/watch?v=di3rHkEZuUw",
+    output_dir="downloads",
+    filename="test",
+)  # Optional
 
 caption = WhisperAutoCaptionPipeline(model_id="openai/whisper-large-v3")
-caption(video_path="test.mp4", output_path="output.mp4", language="turkish")
+caption(video_path=video_path, output_path="output.mp4", language="english")
 ```
 
 ## 😍 Contributing
diff --git a/requirements.txt b/requirements.txt
index 591e28b..bb69929 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,5 @@ Requests>=2.31.0
 accelerate
 bitsandbytes
 hqq
+ffmpeg
+ffmpeg-python
diff --git a/test.py b/test.py
deleted file mode 100644
index edf87ff..0000000
--- a/test.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import torch
-from transformers import BitsAndBytesConfig, HqqConfig
-
-from whisperplus import SpeechToTextPipeline, download_youtube_to_mp3
-
-url = "https://www.youtube.com/watch?v=di3rHkEZuUw"
-audio_path = download_youtube_to_mp3(url, output_dir='downloads', filename="test")
-
-hqq_config = HqqConfig(
-    nbits=1,
-    group_size=64,
-    quant_zero=False,
-    quant_scale=False,
-    axis=0,
-    offload_meta=False,
-)  # axis=0 is used by default
-
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_use_double_quant=True,
-)
-
-pipeline = SpeechToTextPipeline(
-    model_id="distil-whisper/distil-large-v3",
-    quant_config=hqq_config,
-    hqq=True,
-    flash_attention_2=True,
-)
-
-transcript = pipeline(
-    audio_path=audio_path,
-    chunk_length_s=30,
-    stride_length_s=5,
-    max_new_tokens=128,
-    batch_size=100,
-    language="english",
-    return_timestamps=False,
-)
-
-print(transcript)
diff --git a/whisperplus/__init__.py b/whisperplus/__init__.py
index 6c36c33..2056b47 100644
--- a/whisperplus/__init__.py
+++ b/whisperplus/__init__.py
@@ -5,7 +5,7 @@
 from whisperplus.pipelines.whisper import SpeechToTextPipeline
 # from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline
 # from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
-from whisperplus.utils.download_utils import download_youtube_to_mp3
+from whisperplus.utils.download_utils import download_youtube_to_mp3, download_youtube_to_mp4
 from whisperplus.utils.text_utils import format_speech_to_dialogue
 
 __version__ = '0.3.3'
diff --git a/whisperplus/utils/download_utils.py b/whisperplus/utils/download_utils.py
index b6f86e1..7da4701 100644
--- a/whisperplus/utils/download_utils.py
+++ b/whisperplus/utils/download_utils.py
@@ -49,3 +49,43 @@ def download_youtube_to_mp3(url, output_dir='./', filename="test"):
     output_path = output_dir / filename.replace('%(ext)s', 'mp3')
     logging.info(f"Downloaded {output_path} as MP3")
     return str(output_path)
+
+
+def download_youtube_to_mp4(url, output_dir='./', filename="test"):
+    """
+    Downloads a YouTube video as an MP4 file.
+
+    Parameters:
+    url (str): The URL of the YouTube video to download.
+    output_dir (str, optional): The directory to save the MP4 file. Defaults to the current working directory.
+    filename (str, optional): Base filename (without extension) for the MP4 file. Defaults to "test"; pass None to use the video title.
+
+    Returns:
+    str or None: The path to the downloaded MP4 file, or None if the download failed.
+    """
+    # Create the output directory if it doesn't exist
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Set the output filename
+    if filename is None:
+        filename = "%(title)s.%(ext)s"
+    else:
+        filename = f"{filename}.%(ext)s"
+
+    # Download the video using yt_dlp
+    ydl_opts = {
+        'outtmpl': str(output_dir / filename),
+        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
+    }
+
+    try:
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            ydl.download([url])
+    except yt_dlp.utils.DownloadError as e:
+        logging.error(f"Error downloading video: {e}")
+        return None
+
+    output_path = output_dir / filename.replace('%(ext)s', 'mp4')
+    logging.info(f"Downloaded {output_path} as MP4")
+    return str(output_path)

From 47823c26a3381332687db9ecad4c73de3a7bc383 Mon Sep 17 00:00:00 2001
From: kadirnar <kadir.nar@hotmail.com>
Date: Tue, 7 May 2024 12:11:45 +0000
Subject: [PATCH 3/3] Add pre-commit library

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index bb69929..b469769 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,4 @@ bitsandbytes
 hqq
 ffmpeg
 ffmpeg-python
+pre-commit