🦸 Update notebook and gradio code #111

Merged · 1 commit · Jun 8, 2024
177 changes: 154 additions & 23 deletions notebook/demo.ipynb
@@ -33,6 +33,17 @@
"nest_asyncio.apply()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b6cf901",
"metadata": {},
"outputs": [],
"source": [
"!pip install whisperplus git+https://github.com/huggingface/transformers\n",
"!pip install flash-attn --no-build-isolation"
]
},
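Note: `flash-attn` only builds and runs on recent CUDA GPUs, so the second install line is optional. A minimal check (not part of this PR) for whether FlashAttention-2 is worth enabling:

```python
import torch

# FlashAttention-2 needs an NVIDIA GPU with compute capability >= 8.0
# (Ampere or newer); on CPU/MPS machines, skip the flash-attn install
# and pass flash_attention_2=False to SpeechToTextPipeline below.
has_cuda = torch.cuda.is_available()
use_flash = has_cuda and torch.cuda.get_device_capability()[0] >= 8
print(f"CUDA: {has_cuda}, enable flash_attention_2: {use_flash}")
```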
{
"cell_type": "markdown",
"id": "1d0b2e40",
@@ -50,16 +61,100 @@
"metadata": {},
"outputs": [],
"source": [
"from whisperplus import SpeechToTextPipeline, download_and_convert_to_mp3\n",
"from whisperplus import SpeechToTextPipeline, download_youtube_to_mp3\n",
"from transformers import BitsAndBytesConfig, HqqConfig\n",
"import torch\n",
"\n",
"url = \"https://www.youtube.com/watch?v=di3rHkEZuUw\"\n",
"audio_path = download_and_convert_to_mp3(url)\n",
"pipeline = SpeechToTextPipeline(model_id=\"openai/whisper-large-v3\")\n",
"transcript = pipeline(audio_path, \"openai/whisper-large-v3\", \"english\")\n",
"audio_path = download_youtube_to_mp3(url, output_dir=\"downloads\", filename=\"test\")\n",
"\n",
"hqq_config = HqqConfig(\n",
" nbits=4,\n",
" group_size=64,\n",
" quant_zero=False,\n",
" quant_scale=False,\n",
" axis=0,\n",
" offload_meta=False,\n",
") # axis=0 is used by default\n",
"\n",
"bnb_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_quant_type=\"nf4\",\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_use_double_quant=True,\n",
")\n",
"\n",
"pipeline = SpeechToTextPipeline(\n",
" model_id=\"distil-whisper/distil-large-v3\",\n",
" quant_config=hqq_config,\n",
" flash_attention_2=True,\n",
")\n",
"\n",
"transcript = pipeline(\n",
" audio_path=audio_path,\n",
" chunk_length_s=30,\n",
" stride_length_s=5,\n",
" max_new_tokens=128,\n",
" batch_size=100,\n",
" language=\"english\",\n",
" return_timestamps=False,\n",
")\n",
"\n",
"print(transcript)"
]
},
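The cell above defines both `hqq_config` and `bnb_config` but only passes `hqq_config` to the pipeline. Presumably the bitsandbytes config can be swapped in through the same `quant_config` parameter; a sketch under that assumption:

```python
# Hypothetical variant: reuse the bnb_config defined above instead of HQQ
# (assumes quant_config accepts a HqqConfig or a BitsAndBytesConfig alike).
pipeline_bnb = SpeechToTextPipeline(
    model_id="distil-whisper/distil-large-v3",
    quant_config=bnb_config,
    flash_attention_2=True,
)
```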
{
"cell_type": "markdown",
"id": "8d6282f7",
"metadata": {},
"source": [
"### 🍎 Apple MLX"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97b42087",
"metadata": {},
"outputs": [],
"source": [
"from whisperplus.pipelines import mlx_whisper\n",
"from whisperplus import download_youtube_to_mp3\n",
"\n",
"url = \"https://www.youtube.com/watch?v=1__CAdTJ5JU\"\n",
"audio_path = download_youtube_to_mp3(url)\n",
"\n",
"text = mlx_whisper.transcribe(\n",
" audio_path, path_or_hf_repo=\"mlx-community/whisper-large-v3-mlx\"\n",
")[\"text\"]\n",
"print(text)"
]
},
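The MLX path runs only on Apple Silicon. `whisperplus.pipelines.mlx_whisper` appears to follow the openai-whisper `transcribe()` interface, so decode options such as a language hint should pass through; that forwarding is an assumption, not something this PR shows:

```python
# Hypothetical: pin the decode language instead of relying on detection
# (assumes this wrapper forwards whisper-style decode options).
result = mlx_whisper.transcribe(
    audio_path,
    path_or_hf_repo="mlx-community/whisper-large-v3-mlx",
    language="en",
)
print(result["text"])
```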
{
"cell_type": "markdown",
"id": "ca528ba7",
"metadata": {},
"source": [
"### 🍏 Lightning Mlx Whisper"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d99b8bb",
"metadata": {},
"outputs": [],
"source": [
"from whisperplus.pipelines.lightning_whisper_mlx import LightningWhisperMLX\n",
"from whisperplus import download_youtube_to_mp3\n",
"\n",
"url = \"https://www.youtube.com/watch?v=1__CAdTJ5JU\"\n",
"audio_path = download_youtube_to_mp3(url)\n",
"\n",
"whisper = LightningWhisperMLX(model=\"distil-large-v3\", batch_size=12, quant=None)\n",
"output = whisper.transcribe(audio_path=audio_path)[\"text\"]"
]
},
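The Lightning cell computes `output` but never prints it. A short follow-up, including a quantized variant (`quant="4bit"` mirrors the lightning-whisper-mlx README and is an assumption about this wrapper):

```python
print(output)

# Hypothetical: trade some accuracy for speed/memory with a quantized model.
whisper_4bit = LightningWhisperMLX(model="distil-large-v3", batch_size=12, quant="4bit")
print(whisper_4bit.transcribe(audio_path=audio_path)["text"])
```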
{
"cell_type": "markdown",
"id": "1a99131f",
@@ -77,7 +172,7 @@
"metadata": {},
"outputs": [],
"source": [
"from whisperplus import TextSummarizationPipeline\n",
"from whisperplus.pipelines.summarization import TextSummarizationPipeline\n",
"\n",
"summarizer = TextSummarizationPipeline(model_id=\"facebook/bart-large-cnn\")\n",
"summary = summarizer.summarize(transcript)\n",
@@ -101,7 +196,7 @@
"metadata": {},
"outputs": [],
"source": [
"from whisperplus import LongTextSummarizationPipeline\n",
"from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline\n",
"\n",
"summarizer = LongTextSummarizationPipeline(model_id=\"facebook/bart-large-cnn\")\n",
"summary_text = summarizer.summarize(transcript)\n",
@@ -115,7 +210,25 @@
"source": [
"### 💬 Speaker Diarization\n",
"\n",
"In this section, we demonstrate the use of Speaker Diarization. This feature helps in distinguishing between different speakers in an audio clip."
"You must confirm the licensing permissions of these two models.\n",
"\n",
"- https://huggingface.co/pyannote/speaker-diarization-3.1\n",
"- https://huggingface.co/pyannote/segmentation-3.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0066de25",
"metadata": {},
"outputs": [],
"source": [
"huggingface_hub import notebook_login\n",
"\n",
"!pip install -r speaker_diarization.txt\n",
"!pip install -U \"huggingface_hub[cli]\"\n",
"\n",
"notebook_login()"
]
},
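`notebook_login()` is interactive; in scripts or CI, `huggingface_hub.login()` with a token does the same job. The token below is a placeholder:

```python
from huggingface_hub import login

# Needs a User Access Token with read access to the gated pyannote models.
login(token="hf_xxxxxxxxxxxxxxxx")  # placeholder, not a real token
```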
{
@@ -125,18 +238,15 @@
"metadata": {},
"outputs": [],
"source": [
"from whisperplus import (\n",
" ASRDiarizationPipeline,\n",
" download_and_convert_to_mp3,\n",
" format_speech_to_dialogue,\n",
")\n",
"from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline\n",
"from whisperplus import download_youtube_to_mp3, format_speech_to_dialogue\n",
"\n",
"audio_path = download_and_convert_to_mp3(\"https://www.youtube.com/watch?v=mRB14sFHw2E\")\n",
"audio_path = download_youtube_to_mp3(\"https://www.youtube.com/watch?v=mRB14sFHw2E\")\n",
"\n",
"device = \"cuda\" # cpu or mps\n",
"pipeline = ASRDiarizationPipeline.from_pretrained(\n",
" asr_model=\"openai/whisper-large-v3\",\n",
" diarizer_model=\"pyannote/speaker-diarization\",\n",
" diarizer_model=\"pyannote/speaker-diarization-3.1\",\n",
" use_auth_token=False,\n",
" chunk_length_s=30,\n",
" device=device,\n",
@@ -157,14 +267,24 @@
"This part covers the 'Chat with Video' feature using LanceDB. It demonstrates how to interact with a video transcript using a chat interface."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0714c378",
"metadata": {},
"outputs": [],
"source": [
"!pip install sentence-transformers ctransformers langchain"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c108aee",
"metadata": {},
"outputs": [],
"source": [
"from whisperplus import ChatWithVideo\n",
"from whisperplus.pipelines.chatbot import ChatWithVideo\n",
"\n",
"chat = ChatWithVideo(\n",
" input_file=\"trascript.txt\",\n",
@@ -175,7 +295,8 @@
")\n",
"\n",
"query = \"what is this video about ?\"\n",
"response = chat.run_query(query)"
"response = chat.run_query(query)\n",
"print(response)"
]
},
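`ChatWithVideo` reads the transcript from a text file. One way to produce that file (a sketch, assuming `transcript` from the speech-to-text cell is, or stringifies to, plain text):

```python
# Dump the earlier transcript so the chatbot cell can pick it up.
with open("transcript.txt", "w", encoding="utf-8") as f:
    f.write(str(transcript))
```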
{
@@ -188,14 +309,24 @@
"This section demonstrates the 'Chat with Video' feature using AutoLLM. It enables querying a video's content through a chat interface, utilizing advanced language models."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "38be611e",
"metadata": {},
"outputs": [],
"source": [
"!pip install autollm>=0.1.9"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ddfd23fd",
"metadata": {},
"outputs": [],
"source": [
"from whisperplus import AutoLLMChatWithVideo\n",
"from whisperplus.pipelines.autollm_chatbot import AutoLLMChatWithVideo\n",
"\n",
"# service_context_params\n",
"system_prompt = \"\"\"\n",
@@ -215,15 +346,15 @@
"\"\"\"\n",
"\n",
"chat = AutoLLMChatWithVideo(\n",
" input_file=\"audio.mp3\",\n",
" openai_key=\"YOUR_OPENAI_KEY\",\n",
" huggingface_key=\"YOUR_HUGGINGFACE_KEY\",\n",
" input_file=\"input_dir\", # path of mp3 file\n",
" openai_key=\"YOUR_OPENAI_KEY\", # optional\n",
" huggingface_key=\"YOUR_HUGGINGFACE_KEY\", # optional\n",
" llm_model=\"gpt-3.5-turbo\",\n",
" llm_max_tokens=\"256\",\n",
" llm_temperature=\"0.1\",\n",
" system_prompt=system_prompt,\n",
" query_wrapper_prompt=query_wrapper_prompt,\n",
" embed_model=\"huggingface/BAAI/bge-large-zh\",\n",
" embed_model=\"huggingface/BAAI/bge-large-zh\", # \"text-embedding-ada-002\"\n",
")\n",
"\n",
"query = \"what is this video about ?\"\n",
@@ -236,7 +367,7 @@
"id": "223ed48e",
"metadata": {},
"source": [
"### 🎙️ Speech to Text\n",
"### 🎙️ Text to Speech\n",
"\n",
"Finally, this section covers converting text to speech using WhisperPlus, demonstrating how to generate spoken audio from text."
]
@@ -248,7 +379,7 @@
"metadata": {},
"outputs": [],
"source": [
"from whisperplus import TextToSpeechPipeline\n",
"from whisperplus.pipelines.text2speech import TextToSpeechPipeline\n",
"\n",
"tts = TextToSpeechPipeline(model_id=\"suno/bark\")\n",
"audio = tts(text=\"Hello World\", voice_preset=\"v2/en_speaker_6\")"