From 9caeaf5d1184453b21028e37335e861fb8854f5b Mon Sep 17 00:00:00 2001
From: Evan Cosgrove
Date: Mon, 28 Oct 2024 03:54:27 -0400
Subject: [PATCH 1/3] add support for ingesting content from websites, audio files, YouTube links, and more.

---
 README.md                                         | 139 +++++++++++
 recipes/quickstart/NotebookLlama/README.md        | 114 +++++----
 recipes/quickstart/NotebookLlama/ingestion.py     | 224 ++++++++++++++++++
 .../quickstart/NotebookLlama/requirements.txt     |  18 +-
 4 files changed, 436 insertions(+), 59 deletions(-)
 create mode 100644 recipes/quickstart/NotebookLlama/ingestion.py

diff --git a/README.md b/README.md
index 38aaf5846..5915af9ad 100644
--- a/README.md
+++ b/README.md
@@ -180,3 +180,142 @@ See the License file for Meta Llama 3 [here](https://github.com/meta-llama/llama
 See the License file for Meta Llama 2 [here](https://github.com/meta-llama/llama-models/blob/main/models/llama2/LICENSE) and Acceptable Use Policy [here](https://github.com/meta-llama/llama-models/blob/main/models/llama2/USE_POLICY.md)
+
+## Supported Input Formats
+
+- **PDF Documents**: Ingest and process text from PDF files.
+- **Websites**: Extract and process text content from web URLs.
+- **YouTube Videos**: Retrieve and transcribe audio from YouTube video URLs.
+- **Audio Files**: Transcribe audio files into text using Whisper.
+
+## Usage Examples
+
+### Ingest from a PDF
+
+```python
+from ingestion import ingest_content
+
+input_type = "pdf"
+pdf_path = './resources/2402.13116v3.pdf'
+extracted_text = ingest_content(input_type, pdf_path)
+if extracted_text:
+    with open('extracted_text.txt', 'w', encoding='utf-8') as f:
+        f.write(extracted_text)
+    print("Extracted text has been saved to extracted_text.txt")
+```
+
+### Ingest from a Website
+
+```python
+from ingestion import ingest_content
+
+input_type = "website"
+website_url = "https://www.example.com"
+website_text = ingest_content(input_type, website_url)
+if website_text:
+    with open('website_extracted_text.txt', 'w', encoding='utf-8') as f:
+        f.write(website_text)
+    print("Extracted website text has been saved to website_extracted_text.txt")
+```
+
+### Ingest from a YouTube Video
+
+```python
+from ingestion import ingest_content
+
+input_type = "youtube"
+youtube_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
+youtube_transcript = ingest_content(input_type, youtube_url)
+if youtube_transcript:
+    with open('youtube_transcript.txt', 'w', encoding='utf-8') as f:
+        f.write(youtube_transcript)
+    print("YouTube transcript has been saved to youtube_transcript.txt")
+```
+
+### Ingest from an Audio File
+
+```python
+from ingestion import ingest_content
+
+input_type = "audio"
+audio_file = './resources/sample_audio.mp3'
+audio_transcription = ingest_content(input_type, audio_file, model_type="base")
+if audio_transcription:
+    with open('audio_transcription.txt', 'w', encoding='utf-8') as f:
+        f.write(audio_transcription)
+    print("Audio transcription has been saved to audio_transcription.txt")
+```
+
+## Step 4: Testing
+
+Ensure that each ingestor works as expected by testing with sample inputs.
+
+### 4.1. Create Test Cases
+
+```python
+# test_ingestion.py
+
+import unittest
+from ingestion import IngestorFactory
+
+class TestIngestion(unittest.TestCase):
+
+    def test_pdf_ingestion(self):
+        pdf_path = "./resources/sample.pdf"
+        ingestor = IngestorFactory.get_ingestor("pdf")
+        text = ingestor.extract_text(pdf_path)
+        self.assertIsInstance(text, str)
+        self.assertTrue(len(text) > 0)
+
+    def test_website_ingestion(self):
+        website_url = "https://www.example.com"
+        ingestor = IngestorFactory.get_ingestor("website")
+        text = ingestor.extract_text(website_url)
+        self.assertIsInstance(text, str)
+        self.assertTrue(len(text) > 0)
+
+    def test_youtube_ingestion(self):
+        youtube_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
+        ingestor = IngestorFactory.get_ingestor("youtube")
+        transcript = ingestor.extract_text(youtube_url)
+        self.assertIsInstance(transcript, str)
+        self.assertTrue(len(transcript) > 0)
+
+    def test_audio_ingestion(self):
+        audio_file = "./resources/sample_audio.mp3"
+        ingestor = IngestorFactory.get_ingestor("audio", model_type="base")
+        transcription = ingestor.extract_text(audio_file)
+        self.assertIsInstance(transcription, str)
+        self.assertTrue(len(transcription) > 0)
+
+    def test_unsupported_type(self):
+        ingestor = IngestorFactory.get_ingestor("unsupported")
+        self.assertIsNone(ingestor)
+
+if __name__ == "__main__":
+    unittest.main()
+```
+
+### 4.2. Run Tests
+
+Execute the tests to verify all ingestion methods function correctly.
+
+```bash
+python test_ingestion.py
+```
+
+Ensure all tests pass and handle any exceptions or errors that arise.
+
+## Conclusion
+
+By following these steps, you've successfully **extended your `ingestion.py` module** to support multiple input formats—**websites, YouTube links, and audio files**—in addition to PDFs. This enhancement broadens the usability of your `NotebookLlama` pipeline, making it more versatile and valuable.
+
+### Next Steps
+
+1. **Handle Edge Cases**: Enhance each ingestor to manage various edge cases, such as unsupported formats, network issues, or transcription errors.
+2. **Asynchronous Processing**: Implement asynchronous ingestion to improve pipeline efficiency, especially for time-consuming tasks like audio transcription; a minimal sketch follows this list.
+3. **Logging and Error Reporting**: Integrate comprehensive logging to monitor ingestion processes and facilitate troubleshooting.
+4. **User Interface Enhancements**: Improve the interactive widgets in your notebook to provide better feedback and progress indicators during ingestion.
+5. **Documentation**: Continue to refine your documentation with detailed explanations, troubleshooting tips, and advanced usage examples.
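+
+A minimal sketch of asynchronous ingestion, using the synchronous `ingest_content` above as the building block (the `ingest_many` helper below is illustrative, not part of the module): blocking calls are pushed onto worker threads so slow jobs such as audio transcription can overlap.
+
+```python
+import asyncio
+
+from ingestion import ingest_content
+
+async def ingest_many(sources):
+    # Each (input_type, source) pair runs in its own worker thread,
+    # so one long transcription no longer blocks the other ingestors.
+    tasks = [
+        asyncio.to_thread(ingest_content, input_type, source)
+        for input_type, source in sources
+    ]
+    return await asyncio.gather(*tasks)
+
+results = asyncio.run(ingest_many([
+    ("pdf", "./resources/2402.13116v3.pdf"),
+    ("website", "https://www.example.com"),
+]))
+```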
diff --git a/recipes/quickstart/NotebookLlama/README.md b/recipes/quickstart/NotebookLlama/README.md
index 70293c7f5..5f41a7578 100644
--- a/recipes/quickstart/NotebookLlama/README.md
+++ b/recipes/quickstart/NotebookLlama/README.md
@@ -6,90 +6,100 @@
 This is a guided series of tutorials/notebooks that can be taken as a reference or course to build a PDF to Podcast workflow.
 
-You will also learn from the experiments of using Text to Speech Models.
+You will also learn from the experiments of using Text to Speech Models.
 
-It assumes zero knowledge of LLMs, prompting and audio models, everything is covered in their respective notebooks.
+It assumes zero knowledge of LLMs, prompting, and audio models; everything is covered in their respective notebooks.
 
 ### Outline:
 
-Here is step by step thought (pun intended) for the task:
+Here is a step-by-step guide for the task:
 
-- Step 1: Pre-process PDF: Use `Llama-3.2-1B-Instruct` to pre-process the PDF and save it in a `.txt` file.
-- Step 2: Transcript Writer: Use `Llama-3.1-70B-Instruct` model to write a podcast transcript from the text
-- Step 3: Dramatic Re-Writer: Use `Llama-3.1-8B-Instruct` model to make the transcript more dramatic
-- Step 4: Text-To-Speech Workflow: Use `parler-tts/parler-tts-mini-v1` and `bark/suno` to generate a conversational podcast
+- **Step 1: Pre-process PDF**: Use [`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) to pre-process the PDF and save it in a `.txt` file.
+- **Step 2: Transcript Writer**: Use the [`Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct) model to write a podcast transcript from the text.
+- **Step 3: Dramatic Re-Writer**: Use the [`Llama-3.1-8B-Instruct`](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model to make the transcript more dramatic.
+- **Step 4: Text-To-Speech Workflow**: Use `parler-tts/parler-tts-mini-v1` and `bark/suno` to generate a conversational podcast.
 
-Note 1: In Step 1, we prompt the 1B model to not modify the text or summarize it, strictly clean up extra characters or garbage characters that might get picked due to encoding from PDF. Please see the prompt in Notebook 1 for more details.
+**Note 1**: In Step 1, we prompt the `Llama-3.2-1B-Instruct` model not to modify or summarize the text, but strictly to clean up extra characters or garbage characters that might get picked up due to encoding from the PDF. Please see the prompt in [Notebook 1: Pre-process PDF](Notebook1_PreprocessPDF.ipynb) for more details.
 
-Note 2: For Step 2, you can also use `Llama-3.1-8B-Instruct` model, we recommend experimenting and trying if you see any differences. The 70B model was used here because it gave slightly more creative podcast transcripts for the tested examples.
+**Note 2**: For Step 2, you can also use the `Llama-3.1-8B-Instruct` model. We recommend experimenting to see if you observe any differences. The 70B model was used here because it provided slightly more creative podcast transcripts in our tests.
 
-Note 3: For Step 4, please try to extend the approach with other models. These models were chosen based on a sample prompt and worked best, newer models might sound better. Please see [Notes](./TTS_Notes.md) for some of the sample tests.
+**Note 3**: For Step 4, please try to extend the approach with other models. These models were chosen based on sample prompts and worked best. Newer models might sound better. Please see [Notes](./TTS_Notes.md) for some sample tests.
 
 ### Detailed steps on running the notebook:
 
-Requirements: GPU server or an API provider for using 70B, 8B and 1B Llama models.
-For running the 70B model, you will need a GPU with aggregated memory around 140GB to infer in bfloat-16 precision.
+**Requirements**:
 
-Note: For our GPU Poor friends, you can also use the 8B and lower models for the entire pipeline. There is no strong recommendation. The pipeline below is what worked best on first few tests. You should try and see what works best for you!
+- **GPU Server or API Provider**: Required for using the 70B, 8B, and 1B Llama models.
+- **70B Model**: Requires a GPU with approximately 140GB of aggregated memory to infer in bfloat-16 precision.
-- Before getting started, please make sure to login using the `huggingface cli` and then launch your jupyter notebook server to make sure you are able to download the Llama models.
+**Note**: If you do not have access to high-memory GPUs, you can use the 8B and lower models for the entire pipeline without significant loss in functionality.
 
-You'll need your Hugging Face access token, which you can get at your Settings page [here](https://huggingface.co/settings/tokens). Then run `huggingface-cli login` and copy and paste your Hugging Face access token to complete the login to make sure the scripts can download Hugging Face models if needed.
+- **Login to Hugging Face**: Make sure to log in using the `huggingface-cli` and then launch your Jupyter notebook server to ensure you can download the Llama models.
-- First, please Install the requirements from [here]() by running inside the folder:
+
+  You'll need your Hugging Face access token, which you can obtain from your [Settings page](https://huggingface.co/settings/tokens). Then run `huggingface-cli login` and paste your Hugging Face access token to complete the login, ensuring the scripts can download Hugging Face models as needed.
-```
-git clone https://github.com/meta-llama/llama-recipes
-cd llama-recipes/recipes/quickstart/NotebookLlama/
-pip install -r requirements.txt
-```
+- **Install Requirements**:
-- Notebook 1:
+
+  Clone the repository and install dependencies by running the following commands inside the folder:
-This notebook is used for processing the PDF and processing it using the new Feather light model into a `.txt` file.
+
+  ```bash
+  git clone https://github.com/meta-llama/llama-recipes
+  cd llama-recipes/recipes/quickstart/NotebookLlama/
+  pip install -r requirements.txt
+  ```
-Update the first cell with a PDF link that you would like to use. Please decide on a PDF to use for Notebook 1, it can be any link but please remember to update the first cell of the notebook with the right link.
+
+- **Notebook 1: Pre-process PDF** (`Notebook1_PreprocessPDF.ipynb`):
-Please try changing the prompts for the `Llama-3.2-1B-Instruct` model and see if you can improve results.
+
+  This notebook processes the PDF and converts it into a `.txt` file using the new Feather light model.
+
+  - Update the first cell with a PDF link that you would like to use. Ensure the link is correct before running the notebook.
+  - Experiment with the prompts for the `Llama-3.2-1B-Instruct` model to improve results.
-- Notebook 2:
+
+- **Notebook 2: Transcript Writer** (`Notebook2_TranscriptWriter.ipynb`):
-This notebook will take in the processed output from Notebook 1 and creatively convert it into a podcast transcript using the `Llama-3.1-70B-Instruct` model. If you are GPU rich, please feel free to test with the 405B model!
+
+  This notebook takes the processed output from Notebook 1 and generates a podcast transcript using the `Llama-3.1-70B-Instruct` model. If you have ample GPU resources, feel free to test with the 405B model!
+
+  - Experiment with system prompts to improve results.
+  - Try using the 8B model to compare differences.
-Please try experimenting with the System prompts for the model and see if you can improve the results and try the 8B model as well here to see if there is a huge difference!
+
+- **Notebook 3: Dramatic Re-Writer** (`Notebook3_DramaticReWriter.ipynb`):
-- Notebook 3:
+
+  This notebook enhances the transcript by adding dramatization and interruptions using the `Llama-3.1-8B-Instruct` model.
+
+  - The notebook returns a tuple of conversations, simplifying subsequent steps.
+  - Experiment with system prompts to further improve results.
+  - Consider testing with the feather light 3B and 1B models.
-This notebook takes the transcript from earlier and prompts `Llama-3.1-8B-Instruct` to add more dramatization and interruptions in the conversations.
+
+- **Notebook 4: Text-To-Speech Workflow** (`Notebook4_TextToSpeechWorkflow.ipynb`):
-There is also a key factor here: we return a tuple of conversation which makes our lives easier later. Yes, studying Data Structures 101 was actually useful for once!
+
+  Convert the enhanced transcript into a podcast using the `parler-tts/parler-tts-mini-v1` and `bark/suno` models.
+
+  - The speakers and prompts for the parler model were chosen based on experimentation and suggestions from the model authors.
+  - Experiment with different TTS models and prompts to improve the natural sound of the podcast.
-For our TTS logic, we use two different models that behave differently with certain prompts. So we prompt the model to add specifics for each speaker accordingly.
+
+#### Note: Currently, there is an issue where Parler requires `transformers` version 4.43.3 or earlier, conflicting with steps 1-3. In Notebook 4, we switch the `transformers` version to accommodate Parler. Ensure you follow the notebook's instructions carefully to avoid dependency conflicts; a sketch of the version switch follows.
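+
+A minimal sketch of that switch, assuming it is done from inside the notebook (the real notebook may use a `%pip` magic instead; the exact pin is only what the note above implies):
+
+```python
+import subprocess
+import sys
+
+# Downgrade transformers right before loading the Parler TTS models;
+# restart the kernel afterwards so the older version is actually imported.
+subprocess.check_call([
+    sys.executable, "-m", "pip", "install", "transformers==4.43.3",
+])
+```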
-Please again try changing the system prompt and see if you can improve the results. We encourage testing the feather light 3B and 1B models as well at this stage
+
+### Next Improvements & Further Ideas:
-- Notebook 4:
+
+- **Speech Model Experimentation**: Improve the naturalness of the podcast by experimenting with different TTS models.
+- **LLM vs. LLM Debate**: Utilize two agents to debate the topic of interest and generate the podcast outline.
+- **Testing 405B Model**: Assess performance differences when using the 405B model for writing transcripts.
+- **Enhanced Prompting**: Refine system prompts for improved results.
+- **Support for Additional Input Sources**: Enable ingestion of websites, audio files, YouTube links, etc. Community contributions are welcome!
-Finally, we take the results from last notebook and convert them into a podcast. We use the `parler-tts/parler-tts-mini-v1` and `bark/suno` models for a conversation.
+
+### Resources for Further Learning:
-The speakers and the prompt for parler model were decided based on experimentation and suggestions from the model authors. Please try experimenting, you can find more details in the resources section.
+
+- [Text to Audio Generation with Bark - Clearly Explained](https://betterprogramming.pub/text-to-audio-generation-with-bark-clearly-explained-4ee300a3713a)
+- [Colab Notebook for Text Processing](https://colab.research.google.com/drive/1dWWkZzvu7L9Bunq9zvD-W02RFUXoW-Pd?usp=sharing)
+- [Replicate: Bark Model](https://replicate.com/suno-ai/bark?prediction=zh8j6yddxxrge0cjp9asgzd534)
+- [Suno AI Notion Page](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c)
+
+### Supported Input Sources:
-#### Note: Right now there is one issue: Parler needs transformers 4.43.3 or earlier and for steps 1 to 3 of the pipeline you need latest, so we just switch versions in the last notebook.
+
+NotebookLlama supports multiple input formats:
-### Next-Improvements/Further ideas:
-
-- Speech Model experimentation: The TTS model is the limitation of how natural this will sound. This probably be improved with a better pipeline and with the help of someone more knowledgable-PRs are welcome! :)
-- LLM vs LLM Debate: Another approach of writing the podcast would be having two agents debate the topic of interest and write the podcast outline. Right now we use a single LLM (70B) to write the podcast outline
-- Testing 405B for writing the transcripts
-- Better prompting
-- Support for ingesting a website, audio file, YouTube links and more. Again, we welcome community PRs!
-
-### Resources for further learning:
-
-- https://betterprogramming.pub/text-to-audio-generation-with-bark-clearly-explained-4ee300a3713a
-- https://colab.research.google.com/drive/1dWWkZzvu7L9Bunq9zvD-W02RFUXoW-Pd?usp=sharing
-- https://colab.research.google.com/drive/1eJfA2XUa-mXwdMy7DoYKVYHI1iTd9Vkt?usp=sharing#scrollTo=NyYQ--3YksJY
-- https://replicate.com/suno-ai/bark?prediction=zh8j6yddxxrge0cjp9asgzd534
-- https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c
+
+- **PDF files** (`*.pdf`)
+- **Web pages** (`http://`, `https://`)
+- **YouTube videos** (`youtube.com`, `youtu.be`)
+- **Audio files** (`*.mp3`, `*.wav`, `*.flac`, `*.m4a`)
+
+To use a different input source, simply provide the appropriate path or URL when running the notebooks.

diff --git a/recipes/quickstart/NotebookLlama/ingestion.py b/recipes/quickstart/NotebookLlama/ingestion.py
new file mode 100644
index 000000000..ac631587a
--- /dev/null
+++ b/recipes/quickstart/NotebookLlama/ingestion.py
@@ -0,0 +1,224 @@
+"""Ingestor module for NotebookLlama supporting multiple input formats"""
+
+from abc import ABC, abstractmethod
+from typing import Optional
+import os
+import warnings
+import requests
+
+# Core PDF support - required
+import PyPDF2
+
+# Optional format support
+try:
+    from langchain.document_loaders import WebBaseLoader, YoutubeLoader
+except ImportError:
+    WebBaseLoader = YoutubeLoader = None
+
+import whisper
+
+class BaseIngestor(ABC):
+    """Base class for all ingestors"""
+
+    @abstractmethod
+    def validate(self, source: str) -> bool:
+        """Validate if source is valid"""
+        pass
+
+    @abstractmethod
+    def extract_text(self, source: str, max_chars: int = 100000) -> Optional[str]:
+        """Extract text from source"""
+        pass
+
+class PDFIngestor(BaseIngestor):
+    """PDF ingestion - core functionality"""
+
+    def validate(self, file_path: str) -> bool:
+        if not os.path.exists(file_path):
+            print(f"Error: File not found at path: {file_path}")
+            return False
+        if not file_path.lower().endswith('.pdf'):
+            print("Error: File is not a PDF")
+            return False
+        return True
+
+    def extract_text(self, file_path: str, max_chars: int = 100000) -> Optional[str]:
+        if not self.validate(file_path):
+            return None
+
+        try:
+            with open(file_path, 'rb') as file:
+                pdf_reader = PyPDF2.PdfReader(file)
+                num_pages = len(pdf_reader.pages)
+                print(f"Processing PDF with {num_pages} pages...")
+
+                extracted_text = []
+                total_chars = 0
+
+                for page_num in range(num_pages):
+                    page_text = pdf_reader.pages[page_num].extract_text()
+                    if page_text:
+                        # Check the limit before appending so the slice length stays non-negative.
+                        if total_chars + len(page_text) > max_chars:
+                            remaining_chars = max_chars - total_chars
+                            extracted_text.append(page_text[:remaining_chars])
+                            print(f"Reached {max_chars} character limit at page {page_num + 1}")
+                            break
+                        extracted_text.append(page_text)
+                        total_chars += len(page_text)
+                    print(f"Processed page {page_num + 1}/{num_pages}")
+
+                return "\n".join(extracted_text)
+
+        except PyPDF2.errors.PdfReadError:
+            print("Error: Invalid or corrupted PDF file")
+            return None
+        except Exception as e:
+            print(f"An unexpected error occurred: {str(e)}")
+            return None
+
+class WebsiteIngestor(BaseIngestor):
+    """Website ingestion using LangChain's WebBaseLoader"""
+
+    def validate(self, url: str) -> bool:
+        if WebBaseLoader is None:
+            print("Error: langchain is not installed. Please install it to use WebsiteIngestor.")
+            return False
+        if not url.startswith(('http://', 'https://')):
+            print("Error: Invalid URL format")
+            return False
+        return True
+
+    def extract_text(self, url: str, max_chars: int = 100000) -> Optional[str]:
+        if not self.validate(url):
+            return None
+
+        try:
+            loader = WebBaseLoader(url)
+            documents = loader.load()
+            extracted_text = "\n".join([doc.page_content for doc in documents])
+            if len(extracted_text) > max_chars:
+                extracted_text = extracted_text[:max_chars]
+                print(f"Truncated extracted text to {max_chars} characters")
+            print(f"Extracted text from website: {url}")
+            return extracted_text
+        except Exception as e:
+            print(f"An error occurred while extracting from website: {str(e)}")
+            return None
+
+class YouTubeIngestor(BaseIngestor):
+    """YouTube ingestion using LangChain's YoutubeLoader"""
+
+    def validate(self, youtube_url: str) -> bool:
+        if YoutubeLoader is None:
+            print("Error: langchain is not installed. Please install it to use YouTubeIngestor.")
+            return False
+        if not youtube_url.startswith(('http://', 'https://')):
+            print("Error: Invalid URL format")
+            return False
+        return True
+
+    def extract_text(self, youtube_url: str, max_chars: int = 100000) -> Optional[str]:
+        if not self.validate(youtube_url):
+            return None
+
+        try:
+            loader = YoutubeLoader.from_youtube_url(youtube_url, add_video_info=False)
+            documents = loader.load()
+            transcript = "\n".join([doc.page_content for doc in documents])
+            if len(transcript) > max_chars:
+                transcript = transcript[:max_chars]
+                print(f"Truncated transcript to {max_chars} characters")
+            print(f"Extracted transcript from YouTube video: {youtube_url}")
+            return transcript
+        except Exception as e:
+            print(f"An error occurred while extracting from YouTube: {str(e)}")
+            return None
+
+class AudioIngestor(BaseIngestor):
+    """Audio ingestion using OpenAI's Whisper model"""
+
+    def __init__(self, model_type: str = "base"):
+        self.model_type = model_type
+        self.model = whisper.load_model(self.model_type)
+
+    def validate(self, audio_file: str) -> bool:
+        if 20240930not os.path.exists(audio_file):
+            print(f"Error: Audio file not found at path: {audio_file}")
+            return False
+        if not audio_file.lower().endswith(('.mp3', '.wav', '.flac', '.m4a')):
+            print("Error: Unsupported audio format. Supported formats are .mp3, .wav, .flac, .m4a")
+            return False
+        return True
+
+    def extract_text(self, audio_file: str, max_chars: int = 100000) -> Optional[str]:
+        if not self.validate(audio_file):
+            return None
+
+        try:
+            result = self.model.transcribe(audio_file)
+            transcription = result["text"]
+            if len(transcription) > max_chars:
+                transcription = transcription[:max_chars]
+                print(f"Truncated transcription to {max_chars} characters")
+            print(f"Transcribed audio file: {audio_file}")
+            return transcription
+        except Exception as e:
+            print(f"An error occurred during audio transcription: {str(e)}")
+            return None
+
+class IngestorFactory:
+    """Factory to create appropriate ingestor based on input type"""
+
+    @staticmethod
+    def get_ingestor(input_type: str, **kwargs) -> Optional[BaseIngestor]:
+        """
+        Retrieve the appropriate ingestor based on input type.
+
+        Args:
+            input_type (str): The type of input ('pdf', 'website', 'youtube', 'audio').
+
+        Returns:
+            BaseIngestor: An instance of a concrete Ingestor or None if unsupported.
+        """
+        input_type = input_type.lower()
+        if input_type == "pdf":
+            return PDFIngestor()
+        elif input_type == "website":
+            return WebsiteIngestor()
+        elif input_type == "youtube":
+            return YouTubeIngestor()
+        elif input_type == "audio":
+            return AudioIngestor(**kwargs)
+        else:
+            print(f"Unsupported input type: {input_type}")
+            return None
+
+def get_ingestor(source: str) -> BaseIngestor:
+    """Factory function to get appropriate ingestor based on source type"""
+    if source.lower().endswith('.pdf'):
+        return PDFIngestor()
+    elif source.startswith('http'):
+        if 'youtube.com' in source or 'youtu.be' in source:
+            return YouTubeIngestor()
+        return WebsiteIngestor()
+    raise ValueError(f"Unsupported source type: {source}")
+
+def ingest_content(input_type: str, source: str, max_chars: int = 100000, **kwargs) -> Optional[str]:
+    """
+    Ingest content from various sources based on the input type.
+
+    Args:
+        input_type (str): Type of the input ('pdf', 'website', 'youtube', 'audio').
+        source (str): Path to the file or URL.
+        max_chars (int, optional): Maximum number of characters to extract. Defaults to 100000.
+        **kwargs: Additional arguments for specific ingestors.
+
+    Returns:
+        Optional[str]: Extracted text or None if ingestion fails.
+    """
+    ingestor = IngestorFactory.get_ingestor(input_type, **kwargs)
+    if not ingestor:
+        print(f"Failed to get ingestor for input type: {input_type}")
+        return None
+    return ingestor.extract_text(source, max_chars)
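+
+if __name__ == "__main__":
+    # Illustrative smoke test of the source-based factory above; the path
+    # and URLs are placeholders, and nothing is downloaded or parsed here.
+    print(type(get_ingestor("./resources/2402.13116v3.pdf")).__name__)   # PDFIngestor
+    print(type(get_ingestor("https://youtu.be/dQw4w9WgXcQ")).__name__)   # YouTubeIngestor
+    print(type(get_ingestor("https://www.example.com")).__name__)        # WebsiteIngestor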
+
diff --git a/recipes/quickstart/NotebookLlama/requirements.txt b/recipes/quickstart/NotebookLlama/requirements.txt
index 34a27dc81..c65975654 100644
--- a/recipes/quickstart/NotebookLlama/requirements.txt
+++ b/recipes/quickstart/NotebookLlama/requirements.txt
@@ -1,15 +1,19 @@
 # Core dependencies
-PyPDF2>=3.0.0
-torch>=2.0.0
+PyPDF2>=3.0.1
+torch>=2.1.0
 transformers>=4.46.0
 accelerate>=0.27.0
-rich>=13.0.0
-ipywidgets>=8.0.0
-tqdm>=4.66.0
+rich>=13.6.0
+ipywidgets>=8.1.5
+tqdm>=4.68.0
+langchain>=0.1.25
+openai-whisper>=20240930
+requests>=2.31.0
+youtube-dl>=2021.12.17
 
 # Optional but recommended
 jupyter>=1.0.0
-ipykernel>=6.0.0
+ipykernel>=6.24.0
 
 # Warning handling
-warnings>=0.1.0
\ No newline at end of file
+warnings>=0.1.0

From ca503cb2abcf7ef28c91cdda1bc044d77239f9a7 Mon Sep 17 00:00:00 2001
From: Evan Cosgrove <91965277+evanjcosgrove@users.noreply.github.com>
Date: Mon, 28 Oct 2024 14:44:54 -0400
Subject: [PATCH 2/3] Update ingestion.py

---
 recipes/quickstart/NotebookLlama/ingestion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/quickstart/NotebookLlama/ingestion.py b/recipes/quickstart/NotebookLlama/ingestion.py
index ac631587a..077629efe 100644
--- a/recipes/quickstart/NotebookLlama/ingestion.py
+++ b/recipes/quickstart/NotebookLlama/ingestion.py
@@ -142,7 +142,7 @@ def __init__(self, model_type: str = "base"):
         self.model = whisper.load_model(self.model_type)
 
     def validate(self, audio_file: str) -> bool:
-        if 20240930not os.path.exists(audio_file):
+        if not os.path.exists(audio_file):
             print(f"Error: Audio file not found at path: {audio_file}")
             return False
         if not audio_file.lower().endswith(('.mp3', '.wav', '.flac', '.m4a')):

From 8e7337bca09e780944983e313c04a2f7b6372256 Mon Sep 17 00:00:00 2001
From: Evan Cosgrove <91965277+evanjcosgrove@users.noreply.github.com>
Date: Mon, 28 Oct 2024 14:48:41 -0400
Subject: [PATCH 3/3] Update requirements.txt

---
 recipes/quickstart/NotebookLlama/requirements.txt | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/recipes/quickstart/NotebookLlama/requirements.txt b/recipes/quickstart/NotebookLlama/requirements.txt
index c65975654..ffe09f330 100644
--- a/recipes/quickstart/NotebookLlama/requirements.txt
+++ b/recipes/quickstart/NotebookLlama/requirements.txt
@@ -1,11 +1,11 @@
 # Core dependencies
-PyPDF2>=3.0.1
-torch>=2.1.0
+PyPDF2>=3.0.0
+torch>=2.0.0
 transformers>=4.46.0
 accelerate>=0.27.0
-rich>=13.6.0
-ipywidgets>=8.1.5
-tqdm>=4.68.0
+rich>=13.0.0
+ipywidgets>=8.0.0
+tqdm>=4.66.0
 langchain>=0.1.25
 openai-whisper>=20240930
 requests>=2.31.0
@@ -13,7 +13,7 @@ youtube-dl>=2021.12.17
 
 # Optional but recommended
 jupyter>=1.0.0
-ipykernel>=6.24.0
+ipykernel>=6.0.0
 
 # Warning handling
 warnings>=0.1.0
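
After applying all three patches and installing `requirements.txt`, a quick way to confirm the ingestion module works end to end is to run the test cases from the README above (a sketch; it assumes the `test_ingestion.py` shown earlier is saved next to `ingestion.py`, along with the sample resources it references):

```python
import unittest

# Discover and run the ingestion tests with verbose output.
suite = unittest.defaultTestLoader.discover(".", pattern="test_ingestion.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```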