diff --git a/Deep_Learning/Image Caption Generation with Audio Output/README.md b/Deep_Learning/Image Caption Generation with Audio Output/README.md new file mode 100644 index 000000000..c4379a3a5 --- /dev/null +++ b/Deep_Learning/Image Caption Generation with Audio Output/README.md @@ -0,0 +1,91 @@ +# Image Caption Generator with TTS + +This project is a web application that allows users to upload images and generate captions using a pre-trained model. The generated captions can also be converted to speech using Google Text-to-Speech (gTTS), which can be played or downloaded directly from the webpage. + +## Features +- Upload an image file and generate a caption using the `Salesforce/blip-image-captioning-base` model. +- Converts the generated caption into audio using Google Text-to-Speech (gTTS). +- Displays the uploaded image along with the generated caption and an audio player to listen to the caption. + + +## Project Structure + +``` +project/ +│ +├── app.py # Main Flask app +├── static/ # Static files (uploads and audio) +│ ├── uploads/ # Folder for uploaded images +│ └── audio/ # Folder for audio files generated by gTTS +├── templates/ +│ └── index.html # HTML file for rendering the webpage +├── requirements.txt # Python dependencies +└── README.md # Project documentation +``` + +## Installation and Setup + +1. **Clone the repository:** + + ```bash + git clone https://github.com/payal83/image-caption-generator.git + cd image-caption-generator + ``` + +2. **Create a virtual environment:** + + ```bash + python3 -m venv venv + source venv/bin/activate # On Windows: venv\Scripts\activate + ``` + +3. **Install dependencies:** + + ```bash + pip install -r requirements.txt + ``` + +4. **Run the Flask application:** + + ```bash + python app.py + ``` + +5. **Open your browser and navigate to:** + + ``` + http://127.0.0.1:5000/ + ``` + +## Dependencies + +This project relies on the following libraries: + +- **Flask**: Web framework used to create the application. 
+- **Pillow**: For image processing. +- **transformers**: Hugging Face transformers library for loading the image captioning model. +- **gTTS**: Google Text-to-Speech library for converting text into audio. +- **Werkzeug**: Used for securing file uploads. + +To install the dependencies, use: + +```bash +pip install -r requirements.txt +``` + +## Usage + +1. **Upload an Image**: + Upload any image file (e.g., `.jpg`, `.png`) through the web interface. + +2. **Generate Caption**: + Once uploaded, the model will generate a caption based on the content of the image. + +3. **Play Caption as Audio**: + The caption will also be converted to speech using Google Text-to-Speech (gTTS). An audio player will appear, allowing you to listen to the caption. + + +## License + +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. + diff --git a/Deep_Learning/Image Caption Generation with Audio Output/app.py b/Deep_Learning/Image Caption Generation with Audio Output/app.py new file mode 100644 index 000000000..395b8dea1 --- /dev/null +++ b/Deep_Learning/Image Caption Generation with Audio Output/app.py @@ -0,0 +1,105 @@
+"""Flask app that captions an uploaded image and reads the caption aloud."""
+import os
+import uuid
+
+from flask import Flask, render_template, request, url_for
+from gtts import gTTS, gTTSError
+from PIL import Image
+from transformers import pipeline
+from werkzeug.utils import secure_filename
+
+app = Flask(__name__)
+
+# Configure upload folder
+app.config['UPLOAD_FOLDER'] = 'static/uploads'
+app.config['AUDIO_FOLDER'] = 'static/audio'
+app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # Limit to 16 MB
+
+# Create uploads and audio directories if they don't exist
+os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+os.makedirs(app.config['AUDIO_FOLDER'], exist_ok=True)
+
+# Initialize the image-to-text pipeline
+image_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+
+
+def _save_upload(photo):
+    """Save *photo* under a unique, sanitized name.
+
+    Returns the stored filename, or '' when no usable filename was sent.
+    """
+    # secure_filename() yields '' for an empty or fully unsafe name; saving
+    # to that path would target the upload directory itself and raise.
+    safe_name = secure_filename(photo.filename or '')
+    if not safe_name:
+        return ''
+    # A random prefix keeps same-named uploads from overwriting each other.
+    filename = f"{uuid.uuid4().hex}_{safe_name}"
+    photo.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
+    return filename
+
+
+def _synthesize_audio(caption, stem):
+    """Save *caption* as speech in ``<stem>.mp3``.
+
+    Returns the audio filename, or '' when text-to-speech failed.
+    """
+    audio_filename = f"{stem}.mp3"
+    audio_filepath = os.path.join(app.config['AUDIO_FOLDER'], audio_filename)
+    try:
+        gTTS(text=caption, lang='en').save(audio_filepath)
+    except gTTSError:
+        # gTTS calls a remote service; still show the caption if it fails.
+        app.logger.exception('Text-to-speech conversion failed')
+        return ''
+    return audio_filename
+
+
+@app.route('/', methods=['GET', 'POST'])
+def index():
+    """Render the upload page; on POST, caption the photo and voice it.
+
+    The template always receives ``caption``, ``image_url``, ``audio_url``
+    and ``error`` (each '' when not applicable). Rejected uploads are
+    answered with HTTP 400.
+    """
+    context = {'caption': '', 'image_url': '', 'audio_url': '', 'error': ''}
+
+    photo = request.files.get('photo') if request.method == 'POST' else None
+    if photo is None:
+        return render_template('index.html', **context)
+
+    filename = _save_upload(photo)
+    if not filename:
+        context['error'] = 'Please choose an image file to upload.'
+        return render_template('index.html', **context), 400
+
+    filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+    try:
+        # Close the file handle promptly; convert() returns a loaded copy.
+        with Image.open(filepath) as uploaded:
+            image = uploaded.convert('RGB')
+    except (OSError, Image.DecompressionBombError):
+        # Pillow could not decode the file; do not keep it under static/.
+        os.remove(filepath)
+        context['error'] = 'The uploaded file is not a readable image.'
+        return render_template('index.html', **context), 400
+
+    # Generate caption
+    captions = image_to_text(image)
+    caption = captions[0]['generated_text']
+    context['caption'] = caption
+    context['image_url'] = url_for('static', filename=f'uploads/{filename}')
+
+    # Convert caption to audio using gtts
+    if caption:
+        audio_filename = _synthesize_audio(caption, os.path.splitext(filename)[0])
+        if audio_filename:
+            context['audio_url'] = url_for('static', filename=f'audio/{audio_filename}')
+
+    return render_template('index.html', **context)
+
+
+if __name__ == '__main__':
+    # The Werkzeug debugger can execute arbitrary code; keep it opt-in.
+    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')
diff --git a/Deep_Learning/Image Caption Generation with Audio Output/requirements.txt b/Deep_Learning/Image Caption Generation with Audio Output/requirements.txt new file mode 100644 index 000000000..326ca1e47 --- /dev/null +++ b/Deep_Learning/Image Caption Generation with Audio Output/requirements.txt @@ -0,0 +1,7 @@ +Flask==2.3.2 +Pillow==10.0.0 +transformers==4.31.0 +torch==2.0.1 +gTTS==2.3.2 +Werkzeug==2.3.6 +gunicorn diff --git a/Deep_Learning/Image Caption Generation with Audio Output/static/ignore b/Deep_Learning/Image Caption Generation with Audio Output/static/ignore new file mode 100644 index 000000000..d3f5a12fa --- /dev/null +++ b/Deep_Learning/Image Caption Generation with Audio Output/static/ignore @@ -0,0 +1 @@ + diff --git a/Deep_Learning/Image Caption Generation with Audio Output/templates/index.html b/Deep_Learning/Image Caption Generation with Audio Output/templates/index.html new file mode
100644 index 000000000..c55346223 --- /dev/null +++ b/Deep_Learning/Image Caption Generation with Audio Output/templates/index.html @@ -0,0 +1,122 @@ + + + + + + Image Caption Generator + + + +
+

Image Caption Generator

+
+ + + +
+ + {% if caption %} +

Generated Caption:

+
{{ caption }}
+ {% if image_url %} + Uploaded Image + {% endif %} + {% if audio_url %} +

Audio:

+ + {% endif %} + {% endif %} +
+ +