diff --git a/notebook/demo.ipynb b/notebook/demo.ipynb
index ce201c1..3246ab5 100644
--- a/notebook/demo.ipynb
+++ b/notebook/demo.ipynb
@@ -33,6 +33,17 @@
     "nest_asyncio.apply()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9b6cf901",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install whisperplus git+https://github.com/huggingface/transformers\n",
+    "!pip install flash-attn --no-build-isolation"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "1d0b2e40",
@@ -50,16 +61,100 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from whisperplus import SpeechToTextPipeline, download_and_convert_to_mp3\n",
+    "from whisperplus import SpeechToTextPipeline, download_youtube_to_mp3\n",
+    "from transformers import BitsAndBytesConfig, HqqConfig\n",
+    "import torch\n",
     "\n",
     "url = \"https://www.youtube.com/watch?v=di3rHkEZuUw\"\n",
-    "audio_path = download_and_convert_to_mp3(url)\n",
-    "pipeline = SpeechToTextPipeline(model_id=\"openai/whisper-large-v3\")\n",
-    "transcript = pipeline(audio_path, \"openai/whisper-large-v3\", \"english\")\n",
+    "audio_path = download_youtube_to_mp3(url, output_dir=\"downloads\", filename=\"test\")\n",
+    "\n",
+    "hqq_config = HqqConfig(\n",
+    "    nbits=4,\n",
+    "    group_size=64,\n",
+    "    quant_zero=False,\n",
+    "    quant_scale=False,\n",
+    "    axis=0,\n",
+    "    offload_meta=False,\n",
+    ") # axis=0 is used by default\n",
+    "\n",
+    "bnb_config = BitsAndBytesConfig(\n",
+    "    load_in_4bit=True,\n",
+    "    bnb_4bit_quant_type=\"nf4\",\n",
+    "    bnb_4bit_compute_dtype=torch.bfloat16,\n",
+    "    bnb_4bit_use_double_quant=True,\n",
+    ")\n",
+    "\n",
+    "pipeline = SpeechToTextPipeline(\n",
+    "    model_id=\"distil-whisper/distil-large-v3\",\n",
+    "    quant_config=hqq_config,\n",
+    "    flash_attention_2=True,\n",
+    ")\n",
+    "\n",
+    "transcript = pipeline(\n",
+    "    audio_path=audio_path,\n",
+    "    chunk_length_s=30,\n",
+    "    stride_length_s=5,\n",
+    "    max_new_tokens=128,\n",
+    "    batch_size=100,\n",
+    "    language=\"english\",\n",
+    "    return_timestamps=False,\n",
+    ")\n",
     "\n",
     "print(transcript)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "8d6282f7",
+   "metadata": {},
+   "source": [
+    "### 🍎 Apple MLX"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "97b42087",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from whisperplus.pipelines import mlx_whisper\n",
+    "from whisperplus import download_youtube_to_mp3\n",
+    "\n",
+    "url = \"https://www.youtube.com/watch?v=1__CAdTJ5JU\"\n",
+    "audio_path = download_youtube_to_mp3(url)\n",
+    "\n",
+    "text = mlx_whisper.transcribe(\n",
+    "    audio_path, path_or_hf_repo=\"mlx-community/whisper-large-v3-mlx\"\n",
+    ")[\"text\"]\n",
+    "print(text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ca528ba7",
+   "metadata": {},
+   "source": [
+    "### 🍏 Lightning MLX Whisper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d99b8bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from whisperplus.pipelines.lightning_whisper_mlx import LightningWhisperMLX\n",
+    "from whisperplus import download_youtube_to_mp3\n",
+    "\n",
+    "url = \"https://www.youtube.com/watch?v=1__CAdTJ5JU\"\n",
+    "audio_path = download_youtube_to_mp3(url)\n",
+    "\n",
+    "whisper = LightningWhisperMLX(model=\"distil-large-v3\", batch_size=12, quant=None)\n",
+    "output = whisper.transcribe(audio_path=audio_path)[\"text\"]"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "1a99131f",
@@ -77,7 +172,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from whisperplus import TextSummarizationPipeline\n",
+    "from whisperplus.pipelines.summarization import TextSummarizationPipeline\n",
     "\n",
     "summarizer = TextSummarizationPipeline(model_id=\"facebook/bart-large-cnn\")\n",
     "summary = summarizer.summarize(transcript)\n",
@@ -101,7 +196,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from whisperplus import LongTextSummarizationPipeline\n",
+    "from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline\n",
     "\n",
     "summarizer = LongTextSummarizationPipeline(model_id=\"facebook/bart-large-cnn\")\n",
     "summary_text = summarizer.summarize(transcript)\n",
@@ -115,7 +210,25 @@
    "source": [
     "### 💬 Speaker Diarization\n",
     "\n",
-    "In this section, we demonstrate the use of Speaker Diarization. This feature helps in distinguishing between different speakers in an audio clip."
+    "You must confirm the licensing permissions of these two models.\n",
+    "\n",
+    "- https://huggingface.co/pyannote/speaker-diarization-3.1\n",
+    "- https://huggingface.co/pyannote/segmentation-3.0"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0066de25",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import notebook_login\n",
+    "\n",
+    "!pip install -r speaker_diarization.txt\n",
+    "!pip install -U \"huggingface_hub[cli]\"\n",
+    "\n",
+    "notebook_login()"
+   ]
+  },
   {
@@ -125,18 +238,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from whisperplus import (\n",
-    "    ASRDiarizationPipeline,\n",
-    "    download_and_convert_to_mp3,\n",
-    "    format_speech_to_dialogue,\n",
-    ")\n",
+    "from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline\n",
+    "from whisperplus import download_youtube_to_mp3, format_speech_to_dialogue\n",
     "\n",
-    "audio_path = download_and_convert_to_mp3(\"https://www.youtube.com/watch?v=mRB14sFHw2E\")\n",
+    "audio_path = download_youtube_to_mp3(\"https://www.youtube.com/watch?v=mRB14sFHw2E\")\n",
     "\n",
     "device = \"cuda\" # cpu or mps\n",
     "pipeline = ASRDiarizationPipeline.from_pretrained(\n",
     "    asr_model=\"openai/whisper-large-v3\",\n",
-    "    diarizer_model=\"pyannote/speaker-diarization\",\n",
+    "    diarizer_model=\"pyannote/speaker-diarization-3.1\",\n",
     "    use_auth_token=False,\n",
     "    chunk_length_s=30,\n",
     "    device=device,\n",
@@ -157,6 +267,16 @@
     "This part covers the 'Chat with Video' feature using LanceDB. It demonstrates how to interact with a video transcript using a chat interface."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0714c378",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install sentence-transformers ctransformers langchain"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -164,7 +284,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from whisperplus import ChatWithVideo\n",
+    "from whisperplus.pipelines.chatbot import ChatWithVideo\n",
     "\n",
     "chat = ChatWithVideo(\n",
     "    input_file=\"trascript.txt\",\n",
@@ -175,7 +295,8 @@
     ")\n",
     "\n",
     "query = \"what is this video about ?\"\n",
-    "response = chat.run_query(query)"
+    "response = chat.run_query(query)\n",
+    "print(response)"
    ]
   },
   {
@@ -188,6 +309,16 @@
     "This section demonstrates the 'Chat with Video' feature using AutoLLM. It enables querying a video's content through a chat interface, utilizing advanced language models."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "38be611e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install 'autollm>=0.1.9'"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -195,7 +326,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from whisperplus import AutoLLMChatWithVideo\n",
+    "from whisperplus.pipelines.autollm_chatbot import AutoLLMChatWithVideo\n",
     "\n",
     "# service_context_params\n",
     "system_prompt = \"\"\"\n",
@@ -215,15 +346,15 @@
     "\"\"\"\n",
     "\n",
     "chat = AutoLLMChatWithVideo(\n",
-    "    input_file=\"audio.mp3\",\n",
-    "    openai_key=\"YOUR_OPENAI_KEY\",\n",
-    "    huggingface_key=\"YOUR_HUGGINGFACE_KEY\",\n",
+    "    input_file=\"input_dir\", # path of mp3 file\n",
+    "    openai_key=\"YOUR_OPENAI_KEY\", # optional\n",
+    "    huggingface_key=\"YOUR_HUGGINGFACE_KEY\", # optional\n",
     "    llm_model=\"gpt-3.5-turbo\",\n",
     "    llm_max_tokens=\"256\",\n",
     "    llm_temperature=\"0.1\",\n",
     "    system_prompt=system_prompt,\n",
     "    query_wrapper_prompt=query_wrapper_prompt,\n",
-    "    embed_model=\"huggingface/BAAI/bge-large-zh\",\n",
+    "    embed_model=\"huggingface/BAAI/bge-large-zh\", # \"text-embedding-ada-002\"\n",
     ")\n",
     "\n",
     "query = \"what is this video about ?\"\n",
@@ -236,7 +367,7 @@
    "id": "223ed48e",
    "metadata": {},
    "source": [
-    "### 🎙️ Speech to Text\n",
+    "### 🎙️ Text to Speech\n",
     "\n",
     "Finally, this section covers converting text to speech using WhisperPlus, demonstrating how to generate spoken audio from text."
    ]
   },
@@ -248,7 +379,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from whisperplus import TextToSpeechPipeline\n",
+    "from whisperplus.pipelines.text2speech import TextToSpeechPipeline\n",
     "\n",
     "tts = TextToSpeechPipeline(model_id=\"suno/bark\")\n",
     "audio = tts(text=\"Hello World\", voice_preset=\"v2/en_speaker_6\")"
diff --git a/whisperplus/app.py b/whisperplus/app.py
index afe0d25..17278bf 100644
--- a/whisperplus/app.py
+++ b/whisperplus/app.py
@@ -1,9 +1,18 @@
 import gradio as gr
-
-from whisperplus.pipelines.whisper import SpeechToTextPipeline
+import torch
+from transformers import BitsAndBytesConfig, HqqConfig
+
+from whisperplus import (
+    SpeechToTextPipeline,
+    download_youtube_to_mp3,
+    download_youtube_to_mp4,
+    format_speech_to_dialogue,
+)
+from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline
+from whisperplus.pipelines.summarization import TextSummarizationPipeline
+from whisperplus.pipelines.text2speech import TextToSpeechPipeline
+from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline
 from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
-from whisperplus.utils.download_utils import download_youtube_to_mp3
-from whisperplus.utils.text_utils import format_speech_to_dialogue
 
 
 def youtube_url_to_text(url, model_id, language_choice):
@@ -18,13 +27,68 @@ def youtube_url_to_text(url, model_id, language_choice):
     Returns:
         transcript (str): The transcript of the speech-to-text conversion.
-        video_path (str): The path of the downloaded video.
     """
-    video_path = download_youtube_to_mp3(url)
-    pipeline = SpeechToTextPipeline(model_id)
-    transcript = pipeline(audio_path=video_path, model_id=model_id, language=language_choice)
+    audio_path = download_youtube_to_mp3(url, output_dir="downloads", filename="test")
+
+    hqq_config = HqqConfig(
+        nbits=4,
+        group_size=64,
+        quant_zero=False,
+        quant_scale=False,
+        axis=0,
+        offload_meta=False,
+    ) # axis=0 is used by default
+
+    pipeline = SpeechToTextPipeline(
+        model_id=model_id,
+        quant_config=hqq_config,
+        flash_attention_2=True,
+    )
+
+    transcript = pipeline(
+        audio_path=audio_path,
+        chunk_length_s=30,
+        stride_length_s=5,
+        max_new_tokens=128,
+        batch_size=100,
+        language=language_choice,
+        return_timestamps=False,
+    )
+    return transcript
+
+
+def summarization(text, model_id="facebook/bart-large-cnn"):
+    """
+    Main function that performs summarization using a specified model and returns the summary.
+
+    Args:
+        text (str): The text to summarize.
+        model_id (str): The ID of the summarization model to use.
+
+    Returns:
+        summary (str): The summary of the text.
+    """
+    summarizer = TextSummarizationPipeline(model_id=model_id)
+    summary = summarizer.summarize(text)
+
+    return summary[0]["summary_text"]
+
+
+def long_text_summarization(text, model_id="facebook/bart-large-cnn"):
+    """
+    Main function that performs summarization using a specified model and returns the summary.
+
+    Args:
+        text (str): The text to summarize.
+        model_id (str): The ID of the summarization model to use.
+
+    Returns:
+        summary (str): The summary of the text.
+    """
+    summarizer = LongTextSummarizationPipeline(model_id=model_id)
+    summary_text = summarizer.summarize(text)
 
-    return transcript, video_path
+    return summary_text
 
 
 def speaker_diarization(url, model_id, device, num_speakers, min_speaker, max_speaker):
@@ -57,161 +121,92 @@ def speaker_diarization(url, model_id, device, num_speakers, min_speaker, max_sp
     return dialogue, audio_path
 
 
-def youtube_url_to_text_app():
-    with gr.Blocks():
+def text2spech_bark(text, model_id="suno/bark", voice_preset="v2/en_speaker_6"):
+    tts = TextToSpeechPipeline(model_id=model_id)
+    audio = tts(text=text, voice_preset=voice_preset)
+    return audio
+
+
+def whisper_autocaption(url, language, model_id="openai/whisper-large-v3"):
+    video_path = download_youtube_to_mp4(url)
+
+    caption = WhisperAutoCaptionPipeline(model_id=model_id)
+    output = caption(video_path=video_path, output_path="output.mp4", language=language)
+    return output
+
+
+with gr.Blocks() as demo:
+    with gr.Tab("YouTube URL to Text"):
+        with gr.Row():
+            with gr.Column():
+                url_input = gr.Textbox(label="Enter YouTube URL")
+                model_id_input = gr.Textbox(label="Enter Model ID", value="openai/whisper-medium")
+                language_input = gr.Textbox(label="Enter Language", value="en")
+                submit_btn1 = gr.Button("Submit")
+            with gr.Column():
+                output1 = gr.Textbox(label="Transcript")
+        submit_btn1.click(
+            youtube_url_to_text, inputs=[url_input, model_id_input, language_input], outputs=output1)
+
+    with gr.Tab("Text Summarization"):
+        with gr.Row():
+            with gr.Column():
+                text_input = gr.Textbox(label="Enter Text", lines=5)
+                model_id_input2 = gr.Textbox(label="Enter Model ID", value="facebook/bart-large-cnn")
+                submit_btn2 = gr.Button("Summarize")
+            with gr.Column():
+                output2 = gr.Textbox(label="Summary")
+        submit_btn2.click(summarization, inputs=[text_input, model_id_input2], outputs=output2)
+
+    with gr.Tab("Long Text Summarization"):
         with gr.Row():
             with gr.Column():
-                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")
-
-                language_choice = gr.Dropdown(
-                    choices=[
-                        "English",
-                        "Turkish",
-                        "Spanish",
-                        "French",
-                        "Chinese",
-                        "Japanese",
-                        "Korean",
-                    ],
-                    value="Turkish",
-                    label="Language",
-                )
-                whisper_model_id = gr.Dropdown(
-                    choices=[
-                        "openai/whisper-large-v3",
-                        "openai/whisper-large",
-                        "openai/whisper-medium",
-                        "openai/whisper-base",
-                        "openai/whisper-small",
-                        "openai/whisper-tiny",
-                    ],
-                    value="openai/whisper-large-v3",
-                    label="Whisper Model",
-                )
-                whisperplus_in_predict = gr.Button(value="Generator")
+                long_text_input = gr.Textbox(label="Enter Long Text", lines=10)
+                model_id_input3 = gr.Textbox(label="Enter Model ID", value="facebook/bart-large-cnn")
+                submit_btn3 = gr.Button("Summarize Long Text")
+            with gr.Column():
+                output3 = gr.Textbox(label="Long Text Summary")
+        submit_btn3.click(long_text_summarization, inputs=[long_text_input, model_id_input3], outputs=output3)
+
+    with gr.Tab("Speaker Diarization"):
+        with gr.Row():
+            with gr.Column():
+                url_input2 = gr.Textbox(label="Enter YouTube URL")
+                model_id_input4 = gr.Textbox(label="Enter Model ID")
+                num_speakers = gr.Number(label="Number of Speakers", value=2)
+                min_speakers = gr.Number(label="Min Speakers", value=1)
+                max_speakers = gr.Number(label="Max Speakers", value=4)
+                device = gr.Textbox(label="Device", value="cpu")
+                submit_btn4 = gr.Button("Diarize")
             with gr.Column():
-                output_text = gr.Textbox(label="Output Text")
-                output_audio = gr.Audio(label="Output Audio")
-
-            whisperplus_in_predict.click(
-                fn=youtube_url_to_text,
-                inputs=[
-                    youtube_url_path,
-                    whisper_model_id,
-                    language_choice,
-                ],
-                outputs=[output_text, output_audio],
-            )
-            gr.Examples(
-                examples=[
-                    [
-                        "https://www.youtube.com/watch?v=di3rHkEZuUw",
-                        "openai/whisper-large-v3",
-                        "English",
-                    ],
-                ],
-                fn=youtube_url_to_text,
-                inputs=[
-                    youtube_url_path,
-                    whisper_model_id,
-                    language_choice,
-                ],
-                outputs=[output_text, output_audio],
-                cache_examples=False,
-            )
-
-
-def speaker_diarization_app():
-    with gr.Blocks():
+                output4 = gr.DataFrame(headers=["Speaker", "Text"], datatype=["str", "str"])
+        submit_btn4.click(
+            speaker_diarization,
+            inputs=[url_input2, model_id_input4, device, num_speakers, min_speakers, max_speakers],
+            outputs=output4)
+
+    with gr.Tab("Text to Speech"):
         with gr.Row():
             with gr.Column():
-                youtube_url_path = gr.Text(placeholder="Enter Youtube URL", label="Youtube URL")
-
-                whisper_model_id = gr.Dropdown(
-                    choices=[
-                        "openai/whisper-large-v3",
-                        "openai/whisper-large",
-                        "openai/whisper-medium",
-                        "openai/whisper-base",
-                        "openai/whisper-small",
-                        "openai/whisper-tiny",
-                    ],
-                    value="openai/whisper-large-v3",
-                    label="Whisper Model",
-                )
-                device = gr.Dropdown(
-                    choices=["cpu", "cuda", "mps"],
-                    value="cuda",
-                    label="Device",
-                )
-                num_speakers = gr.Number(value=2, label="Number of Speakers")
-                min_speaker = gr.Number(value=1, label="Minimum Number of Speakers")
-                max_speaker = gr.Number(value=2, label="Maximum Number of Speakers")
-                whisperplus_in_predict = gr.Button(value="Generator")
+                text_input2 = gr.Textbox(label="Enter Text", lines=3)
+                model_id_input5 = gr.Textbox(label="Enter Model ID", value="suno/bark")
+                voice_preset = gr.Textbox(label="Voice Preset", value="v2/en_speaker_6")
+                submit_btn5 = gr.Button("Generate Audio")
+            with gr.Column():
+                output5 = gr.Audio(label="Generated Audio")
+        submit_btn5.click(
+            text2spech_bark, inputs=[text_input2, model_id_input5, voice_preset], outputs=output5)
+
+    with gr.Tab("Whisper Autocaption"):
+        with gr.Row():
             with gr.Column():
-                output_text = gr.Textbox(label="Output Text")
-                output_audio = gr.Audio(label="Output Audio")
-
-            whisperplus_in_predict.click(
-                fn=speaker_diarization,
-                inputs=[
-                    youtube_url_path,
-                    whisper_model_id,
-                    device,
-                    num_speakers,
-                    min_speaker,
-                    max_speaker,
-                ],
-                outputs=[output_text, output_audio],
-            )
-            gr.Examples(
-                examples=[
-                    [
-                        "https://www.youtube.com/shorts/o8PgLUgte2k",
-                        "openai/whisper-large-v3",
-                        "mps",
-                        2,
-                        1,
-                        2,
-                    ],
-                ],
-                fn=speaker_diarization,
-                inputs=[
-                    youtube_url_path,
-                    whisper_model_id,
-                    device,
-                    num_speakers,
-                    min_speaker,
-                    max_speaker,
-                ],
-                outputs=[output_text, output_audio],
-                cache_examples=False,
-            )
-
-
-gradio_app = gr.Blocks()
-with gradio_app:
-    gr.HTML(
-        """
-    WhisperPlus: Advancing Speech-to-Text Processing 🚀
-    """)
-    gr.HTML(
-        """
-    Follow me for more!
-    Twitter | Github | Linkedin | HuggingFace
-    """)
-    with gr.Row():
-        with gr.Column():
-            with gr.Tab(label="Youtube URL to Text"):
-                youtube_url_to_text_app()
-            with gr.Tab(label="Speaker Diarization"):
-                speaker_diarization_app()
-
-gradio_app.queue()
-gradio_app.launch(debug=True)
+                url_input3 = gr.Textbox(label="Enter YouTube URL")
+                language = gr.Textbox(label="Language", value="en")
+                model_id_input6 = gr.Textbox(label="Enter Model ID", value="openai/whisper-large-v2")
+                submit_btn6 = gr.Button("Generate Captions")
+            with gr.Column():
+                output6 = gr.Video(label="Captioned Video")
+        submit_btn6.click(
+            whisper_autocaption, inputs=[url_input3, language, model_id_input6], outputs=output6)
+
+demo.launch()