From 9ad070e84f46dca18b9b401a3b9ea6cdcf7f9cd6 Mon Sep 17 00:00:00 2001 From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com> Date: Sun, 27 Oct 2024 16:29:55 +0530 Subject: [PATCH 1/9] Create Readme.md --- .../Image Caption Generation with Audio Output/Readme.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 Deep_Learning/Image Caption Generation with Audio Output/Readme.md diff --git a/Deep_Learning/Image Caption Generation with Audio Output/Readme.md b/Deep_Learning/Image Caption Generation with Audio Output/Readme.md new file mode 100644 index 000000000..0a36dfb17 --- /dev/null +++ b/Deep_Learning/Image Caption Generation with Audio Output/Readme.md @@ -0,0 +1 @@ +Image Caption Generation with Audio Output From 158a994d274794e72299815721ceeab09e022246 Mon Sep 17 00:00:00 2001 From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com> Date: Sun, 27 Oct 2024 16:34:11 +0530 Subject: [PATCH 2/9] Add files via upload --- .../README.md | 91 +++++++++++++++++++ .../app.py | 56 ++++++++++++ .../requirements.txt | 7 ++ 3 files changed, 154 insertions(+) create mode 100644 Deep_Learning/Image Caption Generation with Audio Output/README.md create mode 100644 Deep_Learning/Image Caption Generation with Audio Output/app.py create mode 100644 Deep_Learning/Image Caption Generation with Audio Output/requirements.txt diff --git a/Deep_Learning/Image Caption Generation with Audio Output/README.md b/Deep_Learning/Image Caption Generation with Audio Output/README.md new file mode 100644 index 000000000..c4379a3a5 --- /dev/null +++ b/Deep_Learning/Image Caption Generation with Audio Output/README.md @@ -0,0 +1,91 @@ +# Image Caption Generator with TTS + +This project is a web application that allows users to upload images and generate captions using a pre-trained model. The generated captions can also be converted to speech using Google Text-to-Speech (gTTS), which can be played or downloaded directly from the webpage. 
+ +## Features +- Upload an image file and generate a caption using the `Salesforce/blip-image-captioning-base` model. +- Converts the generated caption into audio using Google Text-to-Speech (gTTS). +- Displays the uploaded image along with the generated caption and an audio player to listen to the caption. + + +## Project Structure + +``` +project/ +│ +├── app.py # Main Flask app +├── static/ # Static files (uploads and audio) +│ ├── uploads/ # Folder for uploaded images +│ └── audio/ # Folder for audio files generated by gTTS +├── templates/ +│ └── index.html # HTML file for rendering the webpage +├── requirements.txt # Python dependencies +└── README.md # Project documentation +``` + +## Installation and Setup + +1. **Clone the repository:** + + ```bash + git clone https://github.com/payal83/image-caption-generator.git + cd image-caption-generator + ``` + +2. **Create a virtual environment:** + + ```bash + python3 -m venv venv + source venv/bin/activate # On Windows: venv\Scripts\activate + ``` + +3. **Install dependencies:** + + ```bash + pip install -r requirements.txt + ``` + +4. **Run the Flask application:** + + ```bash + python app.py + ``` + +5. **Open your browser and navigate to:** + + ``` + http://127.0.0.1:5000/ + ``` + +## Dependencies + +This project relies on the following libraries: + +- **Flask**: Web framework used to create the application. +- **Pillow**: For image processing. +- **transformers**: Hugging Face transformers library for loading the image captioning model. +- **gTTS**: Google Text-to-Speech library for converting text into audio. +- **Werkzeug**: Used for securing file uploads. + +To install the dependencies, use: + +```bash +pip install -r requirements.txt +``` + +## Usage + +1. **Upload an Image**: + Upload any image file (e.g., `.jpg`, `.png`) through the web interface. + +2. **Generate Caption**: + Once uploaded, the model will generate a caption based on the content of the image. + +3. 
"""Flask app: caption an uploaded image with BLIP and read the caption aloud.

A POST to ``/`` with a ``photo`` file field stores the image under the app's
static folder, generates a caption with the
``Salesforce/blip-image-captioning-base`` pipeline, converts the caption to an
MP3 with gTTS, and renders ``index.html`` with the caption, image URL and
audio URL.
"""

import os
import uuid

from flask import Flask, render_template, request, url_for
from gtts import gTTS, gTTSError
from PIL import Image
from transformers import pipeline
from werkzeug.utils import secure_filename

app = Flask(__name__)

# Anchor both folders to the app's static folder so that the files we write
# are the ones url_for('static', ...) serves, whatever the current working
# directory is when the app is started.
app.config['UPLOAD_FOLDER'] = os.path.join(app.static_folder, 'uploads')
app.config['AUDIO_FOLDER'] = os.path.join(app.static_folder, 'audio')
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # Limit to 16 MB

# Uploads are stored inside the publicly served static tree, so only accept
# extensions that browsers treat as images.
ALLOWED_EXTENSIONS = frozenset(
    {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp', 'tif', 'tiff'}
)

# Create uploads and audio directories if they don't exist
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['AUDIO_FOLDER'], exist_ok=True)

# Initialize the image-to-text pipeline once at import time; it is reused by
# every request.
image_to_text = pipeline(
    "image-to-text", model="Salesforce/blip-image-captioning-base"
)


def _stored_name(original_name):
    """Return a unique, sanitized file name for an upload, or None if rejected.

    The name is rejected when it is empty after sanitizing (no file selected,
    or a name made only of unsafe characters) or when its extension is not in
    ALLOWED_EXTENSIONS. A random prefix keeps concurrent uploads that share a
    name from overwriting each other.
    """
    safe = secure_filename(original_name or '')
    stem, ext = os.path.splitext(safe)
    if not stem or ext[1:].lower() not in ALLOWED_EXTENSIONS:
        return None
    return f"{uuid.uuid4().hex}_{stem}{ext.lower()}"


def _load_rgb(filepath):
    """Open the image at filepath and return an RGB copy, closing the file."""
    with Image.open(filepath) as img:
        # convert() returns a new, fully loaded image, so it stays usable
        # after the underlying file is closed.
        return img.convert('RGB')


def _generate_caption(image):
    """Run the captioning pipeline; return '' if it yields no result."""
    captions = image_to_text(image)
    if not captions:
        return ''
    return captions[0]['generated_text']


def _synthesize_audio(caption, stored_name):
    """Save the caption as an MP3 and return its static URL ('' on failure).

    Speech synthesis is best-effort: a gTTS failure is logged and the caption
    is still shown without audio.
    """
    audio_filename = f"{os.path.splitext(stored_name)[0]}.mp3"
    audio_filepath = os.path.join(app.config['AUDIO_FOLDER'], audio_filename)
    try:
        gTTS(text=caption, lang='en').save(audio_filepath)
    except gTTSError:
        app.logger.exception("Text-to-speech failed for %s", stored_name)
        # gTTS opens the output file before contacting the service, so a
        # failed call can leave an empty file behind.
        if os.path.exists(audio_filepath):
            os.remove(audio_filepath)
        return ''
    return url_for('static', filename=f'audio/{audio_filename}')


@app.route('/', methods=['GET', 'POST'])
def index():
    """Render the upload page; on POST, caption the photo and voice it.

    The template receives ``caption``, ``image_url`` and ``audio_url`` (each
    '' when unavailable) plus ``error``, a user-facing message that is ''
    when the request succeeded.
    """
    caption = ''
    image_url = ''
    audio_url = ''
    error = ''

    photo = request.files.get('photo') if request.method == 'POST' else None
    if photo is None:
        return render_template('index.html', caption=caption,
                               image_url=image_url, audio_url=audio_url,
                               error=error)

    stored_name = _stored_name(photo.filename)
    if stored_name is None:
        error = 'Please choose a supported image file.'
    else:
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], stored_name)
        photo.save(filepath)
        try:
            image = _load_rgb(filepath)
        except (OSError, Image.DecompressionBombError):
            # Not a readable image: do not leave it in the served folder.
            os.remove(filepath)
            error = 'The uploaded file could not be read as an image.'
        else:
            caption = _generate_caption(image)
            image_url = url_for('static', filename=f'uploads/{stored_name}')
            # gTTS rejects empty text, so skip synthesis for blank captions.
            if caption.strip():
                audio_url = _synthesize_audio(caption, stored_name)

    return render_template('index.html', caption=caption,
                           image_url=image_url, audio_url=audio_url,
                           error=error)


if __name__ == '__main__':
    # The Werkzeug debugger allows arbitrary code execution, so enable it
    # only when explicitly requested via FLASK_DEBUG=1.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')
Audio Output/static/s diff --git a/Deep_Learning/Image Caption Generation with Audio Output/static/s b/Deep_Learning/Image Caption Generation with Audio Output/static/s new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/Deep_Learning/Image Caption Generation with Audio Output/static/s @@ -0,0 +1 @@ + From 429e166b6484afe4dbbdac98a39ba92054eff007 Mon Sep 17 00:00:00 2001 From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com> Date: Sun, 27 Oct 2024 16:35:47 +0530 Subject: [PATCH 5/9] Add files via upload --- .../Image Caption Generation with Audio Output/static/ignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 Deep_Learning/Image Caption Generation with Audio Output/static/ignore diff --git a/Deep_Learning/Image Caption Generation with Audio Output/static/ignore b/Deep_Learning/Image Caption Generation with Audio Output/static/ignore new file mode 100644 index 000000000..d3f5a12fa --- /dev/null +++ b/Deep_Learning/Image Caption Generation with Audio Output/static/ignore @@ -0,0 +1 @@ + From dd6cb8a4c00cd75ba30b459c0312ceeed1fa48ca Mon Sep 17 00:00:00 2001 From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com> Date: Sun, 27 Oct 2024 16:36:47 +0530 Subject: [PATCH 6/9] Add files via upload --- .../templates/index.html | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 Deep_Learning/Image Caption Generation with Audio Output/templates/index.html diff --git a/Deep_Learning/Image Caption Generation with Audio Output/templates/index.html b/Deep_Learning/Image Caption Generation with Audio Output/templates/index.html new file mode 100644 index 000000000..c55346223 --- /dev/null +++ b/Deep_Learning/Image Caption Generation with Audio Output/templates/index.html @@ -0,0 +1,122 @@ + + + + + + Image Caption Generator + + + +
+

Image Caption Generator

+
+ + + +
+ + {% if caption %} +

Generated Caption:

+
{{ caption }}
+ {% if image_url %} + Uploaded Image + {% endif %} + {% if audio_url %} +

Audio:

+ + {% endif %} + {% endif %} +
+ + From 35a29f0af7d21f38a4ed3947eff13be2482d5918 Mon Sep 17 00:00:00 2001 From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com> Date: Sun, 27 Oct 2024 16:37:38 +0530 Subject: [PATCH 7/9] Delete Deep_Learning/Image Caption Generation with Audio Output/Readme.md --- .../Image Caption Generation with Audio Output/Readme.md | 1 - 1 file changed, 1 deletion(-) delete mode 100644 Deep_Learning/Image Caption Generation with Audio Output/Readme.md diff --git a/Deep_Learning/Image Caption Generation with Audio Output/Readme.md b/Deep_Learning/Image Caption Generation with Audio Output/Readme.md deleted file mode 100644 index 0a36dfb17..000000000 --- a/Deep_Learning/Image Caption Generation with Audio Output/Readme.md +++ /dev/null @@ -1 +0,0 @@ -Image Caption Generation with Audio Output From d0bf09d643a312c5ce94baf954b42b444d8d2eb1 Mon Sep 17 00:00:00 2001 From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com> Date: Sun, 27 Oct 2024 16:38:00 +0530 Subject: [PATCH 8/9] Delete Deep_Learning/Image Caption Generation with Audio Output/static/s --- .../Image Caption Generation with Audio Output/static/s | 1 - 1 file changed, 1 deletion(-) delete mode 100644 Deep_Learning/Image Caption Generation with Audio Output/static/s diff --git a/Deep_Learning/Image Caption Generation with Audio Output/static/s b/Deep_Learning/Image Caption Generation with Audio Output/static/s deleted file mode 100644 index 8b1378917..000000000 --- a/Deep_Learning/Image Caption Generation with Audio Output/static/s +++ /dev/null @@ -1 +0,0 @@ - From a678d2c684414fbb052ab86f0b447f54aa780178 Mon Sep 17 00:00:00 2001 From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com> Date: Sun, 27 Oct 2024 16:38:37 +0530 Subject: [PATCH 9/9] Delete Deep_Learning/Image Caption Generation with Audio Output/templates/t --- .../Image Caption Generation with Audio Output/templates/t | 1 - 1 file changed, 1 deletion(-) delete mode 100644 Deep_Learning/Image Caption Generation with 
Audio Output/templates/t diff --git a/Deep_Learning/Image Caption Generation with Audio Output/templates/t b/Deep_Learning/Image Caption Generation with Audio Output/templates/t deleted file mode 100644 index f8a268712..000000000 --- a/Deep_Learning/Image Caption Generation with Audio Output/templates/t +++ /dev/null @@ -1 +0,0 @@ -templates