From 9ad070e84f46dca18b9b401a3b9ea6cdcf7f9cd6 Mon Sep 17 00:00:00 2001
From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com>
Date: Sun, 27 Oct 2024 16:29:55 +0530
Subject: [PATCH 1/9] Create Readme.md

---
 .../Image Caption Generation with Audio Output/Readme.md         | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Deep_Learning/Image Caption Generation with Audio Output/Readme.md

diff --git a/Deep_Learning/Image Caption Generation with Audio Output/Readme.md b/Deep_Learning/Image Caption Generation with Audio Output/Readme.md
new file mode 100644
index 000000000..0a36dfb17
--- /dev/null
+++ b/Deep_Learning/Image Caption Generation with Audio Output/Readme.md	
@@ -0,0 +1 @@
+Image Caption Generation with Audio Output

From 158a994d274794e72299815721ceeab09e022246 Mon Sep 17 00:00:00 2001
From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com>
Date: Sun, 27 Oct 2024 16:34:11 +0530
Subject: [PATCH 2/9] Add files via upload

---
 .../README.md                                 | 91 +++++++++++++++++++
 .../app.py                                    | 56 ++++++++++++
 .../requirements.txt                          |  7 ++
 3 files changed, 154 insertions(+)
 create mode 100644 Deep_Learning/Image Caption Generation with Audio Output/README.md
 create mode 100644 Deep_Learning/Image Caption Generation with Audio Output/app.py
 create mode 100644 Deep_Learning/Image Caption Generation with Audio Output/requirements.txt

diff --git a/Deep_Learning/Image Caption Generation with Audio Output/README.md b/Deep_Learning/Image Caption Generation with Audio Output/README.md
new file mode 100644
index 000000000..c4379a3a5
--- /dev/null
+++ b/Deep_Learning/Image Caption Generation with Audio Output/README.md	
@@ -0,0 +1,91 @@
+# Image Caption Generator with TTS
+
+This project is a web application that allows users to upload images and generate captions using a pre-trained model. The generated captions can also be converted to speech using Google Text-to-Speech (gTTS), which can be played or downloaded directly from the webpage.
+
+## Features
+- Upload an image file and generate a caption using the `Salesforce/blip-image-captioning-base` model.
+- Converts the generated caption into audio using Google Text-to-Speech (gTTS).
+- Displays the uploaded image along with the generated caption and an audio player to listen to the caption.
+
+
+## Project Structure
+
+```
+project/
+│
+├── app.py                      # Main Flask app
+├── static/                     # Static files (uploads and audio)
+│   ├── uploads/                # Folder for uploaded images
+│   └── audio/                  # Folder for audio files generated by gTTS
+├── templates/
+│   └── index.html              # HTML file for rendering the webpage
+├── requirements.txt            # Python dependencies
+└── README.md                   # Project documentation
+```
+
+## Installation and Setup
+
+1. **Clone the repository:**
+
+   ```bash
+   git clone https://github.com/payal83/image-caption-generator.git
+   cd image-caption-generator
+   ```
+
+2. **Create a virtual environment:**
+
+   ```bash
+   python3 -m venv venv
+   source venv/bin/activate   # On Windows: venv\Scripts\activate
+   ```
+
+3. **Install dependencies:**
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+4. **Run the Flask application:**
+
+   ```bash
+   python app.py
+   ```
+
+5. **Open your browser and navigate to:**
+
+   ```
+   http://127.0.0.1:5000/
+   ```
+
+## Dependencies
+
+This project relies on the following libraries:
+
+- **Flask**: Web framework used to create the application.
+- **Pillow**: For image processing.
+- **transformers**: Hugging Face transformers library for loading the image captioning model; it runs on **torch** (PyTorch), which is also pinned in `requirements.txt`.
+- **gTTS**: Google Text-to-Speech library for converting text into audio.
+- **Werkzeug**: Provides `secure_filename`, which sanitizes the names of uploaded files before they are saved.
+
+To install the dependencies, use:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Usage
+
+1. **Upload an Image**: 
+   Upload any image file (e.g., `.jpg`, `.png`) through the web interface.
+   
+2. **Generate Caption**: 
+   Once uploaded, the model will generate a caption based on the content of the image.
+
+3. **Play Caption as Audio**: 
+   The caption will also be converted to speech using Google Text-to-Speech (gTTS). An audio player will appear, allowing you to listen to the caption.
+
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
diff --git a/Deep_Learning/Image Caption Generation with Audio Output/app.py b/Deep_Learning/Image Caption Generation with Audio Output/app.py
new file mode 100644
index 000000000..395b8dea1
--- /dev/null
+++ b/Deep_Learning/Image Caption Generation with Audio Output/app.py	
@@ -0,0 +1,88 @@
+from flask import Flask, abort, render_template, request, url_for
+from werkzeug.utils import secure_filename
+import os
+from PIL import Image
+from transformers import pipeline
+from gtts import gTTS, gTTSError
+
+app = Flask(__name__)
+
+# Configure upload folder
+app.config['UPLOAD_FOLDER'] = 'static/uploads'
+app.config['AUDIO_FOLDER'] = 'static/audio'
+app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # Limit to 16 MB
+
+# Uploads are saved under static/ and served back as-is, so only accept
+# extensions that browsers treat as images.
+ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp'}
+
+# Create uploads and audio directories if they don't exist
+os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+os.makedirs(app.config['AUDIO_FOLDER'], exist_ok=True)
+
+# Initialize the image-to-text pipeline
+image_to_text = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+
+
+def _has_allowed_extension(filename):
+    """Return True if *filename* ends in one of ALLOWED_EXTENSIONS."""
+    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
+
+
+@app.route('/', methods=['GET', 'POST'])
+def index():
+    """Render the upload form; on POST, caption the uploaded photo and voice it.
+
+    Returns the rendered ``index.html`` with ``caption``, ``image_url`` and
+    ``audio_url`` (empty strings when no photo was processed, or for the audio
+    if speech synthesis fails). Aborts with HTTP 400 if the upload has no
+    usable image file name or cannot be decoded as an image.
+    """
+    caption = ''
+    image_url = ''
+    audio_url = ''
+
+    if request.method == 'POST' and 'photo' in request.files:
+        photo = request.files['photo']
+        # secure_filename() returns '' for an empty or fully unsafe name; joining
+        # that onto the upload folder would make save() target the directory itself.
+        filename = secure_filename(photo.filename or '')
+        if not _has_allowed_extension(filename):
+            abort(400, description='Please upload an image file (png, jpg, jpeg, gif, bmp or webp).')
+        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+        photo.save(filepath)
+
+        try:
+            # Convert the image to RGB; the context manager closes the file handle
+            with Image.open(filepath) as uploaded:
+                image = uploaded.convert('RGB')
+        except OSError:
+            # Not a decodable image: do not leave it in the public static folder
+            os.remove(filepath)
+            abort(400, description='The uploaded file could not be read as an image.')
+
+        # Generate caption
+        captions = image_to_text(image)
+        caption = captions[0]['generated_text']
+
+        # Set image URL for display
+        image_url = url_for('static', filename=f'uploads/{filename}')
+
+        # Convert caption to audio using gtts
+        if caption:
+            audio_filename = f"{filename.rsplit('.', 1)[0]}.mp3"  # Same name but with .mp3 extension
+            audio_filepath = os.path.join(app.config['AUDIO_FOLDER'], audio_filename)
+            try:
+                gTTS(text=caption, lang='en').save(audio_filepath)
+            except gTTSError:
+                # gTTS calls an online service; keep showing the caption if it fails
+                app.logger.warning('Text-to-speech failed for %s', filename, exc_info=True)
+            else:
+                audio_url = url_for('static', filename=f'audio/{audio_filename}')
+
+    return render_template('index.html', caption=caption, image_url=image_url, audio_url=audio_url)
+
+
+if __name__ == '__main__':
+    # The Werkzeug debugger can execute arbitrary code, so debug mode is opt-in via FLASK_DEBUG=1.
+    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')
\ No newline at end of file
diff --git a/Deep_Learning/Image Caption Generation with Audio Output/requirements.txt b/Deep_Learning/Image Caption Generation with Audio Output/requirements.txt
new file mode 100644
index 000000000..326ca1e47
--- /dev/null
+++ b/Deep_Learning/Image Caption Generation with Audio Output/requirements.txt	
@@ -0,0 +1,7 @@
+Flask==2.3.2
+Pillow==10.0.0
+transformers==4.31.0
+torch==2.0.1
+gTTS==2.3.2
+Werkzeug==2.3.6
+gunicorn

From 23bf20c770051b96f6bceaeff07148058d58439e Mon Sep 17 00:00:00 2001
From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com>
Date: Sun, 27 Oct 2024 16:34:52 +0530
Subject: [PATCH 3/9] Create t

---
 .../Image Caption Generation with Audio Output/templates/t       | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Deep_Learning/Image Caption Generation with Audio Output/templates/t

diff --git a/Deep_Learning/Image Caption Generation with Audio Output/templates/t b/Deep_Learning/Image Caption Generation with Audio Output/templates/t
new file mode 100644
index 000000000..f8a268712
--- /dev/null
+++ b/Deep_Learning/Image Caption Generation with Audio Output/templates/t	
@@ -0,0 +1 @@
+templates

From 3292975d4b861eeba38d2d177cb7f8401ca17eb4 Mon Sep 17 00:00:00 2001
From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com>
Date: Sun, 27 Oct 2024 16:35:11 +0530
Subject: [PATCH 4/9] Create s

---
 .../Image Caption Generation with Audio Output/static/s          | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Deep_Learning/Image Caption Generation with Audio Output/static/s

diff --git a/Deep_Learning/Image Caption Generation with Audio Output/static/s b/Deep_Learning/Image Caption Generation with Audio Output/static/s
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/Deep_Learning/Image Caption Generation with Audio Output/static/s	
@@ -0,0 +1 @@
+

From 429e166b6484afe4dbbdac98a39ba92054eff007 Mon Sep 17 00:00:00 2001
From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com>
Date: Sun, 27 Oct 2024 16:35:47 +0530
Subject: [PATCH 5/9] Add files via upload

---
 .../Image Caption Generation with Audio Output/static/ignore     | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Deep_Learning/Image Caption Generation with Audio Output/static/ignore

diff --git a/Deep_Learning/Image Caption Generation with Audio Output/static/ignore b/Deep_Learning/Image Caption Generation with Audio Output/static/ignore
new file mode 100644
index 000000000..d3f5a12fa
--- /dev/null
+++ b/Deep_Learning/Image Caption Generation with Audio Output/static/ignore	
@@ -0,0 +1 @@
+

From dd6cb8a4c00cd75ba30b459c0312ceeed1fa48ca Mon Sep 17 00:00:00 2001
From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com>
Date: Sun, 27 Oct 2024 16:36:47 +0530
Subject: [PATCH 6/9] Add files via upload

---
 .../templates/index.html                      | 122 ++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 Deep_Learning/Image Caption Generation with Audio Output/templates/index.html

diff --git a/Deep_Learning/Image Caption Generation with Audio Output/templates/index.html b/Deep_Learning/Image Caption Generation with Audio Output/templates/index.html
new file mode 100644
index 000000000..c55346223
--- /dev/null
+++ b/Deep_Learning/Image Caption Generation with Audio Output/templates/index.html	
@@ -0,0 +1,122 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Image Caption Generator</title>
+    <style>
+        * {
+            box-sizing: border-box;
+            margin: 0;
+            padding: 0;
+        }
+
+        body {
+            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+            background-color: #f4f4f4;
+            color: #333;
+            display: flex;
+            justify-content: center;
+            align-items: center;
+            min-height: 100vh;
+            padding: 20px;
+        }
+
+        .container {
+            background-color: white;
+            box-shadow: 0px 4px 10px rgba(0, 0, 0, 0.1);
+            border-radius: 10px;
+            padding: 30px;
+            width: 100%;
+            max-width: 600px;
+        }
+
+        h1 {
+            font-size: 2.5em;
+            color: #333;
+            text-align: center;
+            margin-bottom: 20px;
+        }
+
+        form {
+            display: flex;
+            flex-direction: column;
+            gap: 15px;
+        }
+
+        input[type="file"] {
+            border: 1px solid #ddd;
+            padding: 10px;
+            border-radius: 5px;
+            font-size: 1em;
+            cursor: pointer;
+        }
+
+        button {
+            background-color: #4CAF50;
+            color: white;
+            padding: 15px 20px;
+            border: none;
+            border-radius: 5px;
+            font-size: 1.2em;
+            cursor: pointer;
+            transition: background-color 0.3s ease;
+        }
+
+        button:hover {
+            background-color: #45a049;
+        }
+
+        h2 {
+            font-size: 1.8em;
+            margin-top: 30px;
+            color: #333;
+        }
+
+        img {
+            max-width: 100%;
+            border-radius: 10px;
+            margin-top: 15px;
+        }
+
+        .caption {
+            font-size: 1.2em;
+            margin-top: 10px;
+            padding: 15px;
+            background-color: #f9f9f9;
+            border-left: 4px solid #4CAF50;
+            border-radius: 5px;
+        }
+
+        audio {
+            margin-top: 20px;
+            width: 100%;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>Image Caption Generator</h1>
+        <form action="/" method="post" enctype="multipart/form-data">
+            <label for="photo">Upload an image:</label> {# "for" must match the input's id below #}
+            <input type="file" id="photo" name="photo" accept="image/*" required>
+            <button type="submit">Generate Caption</button>
+        </form>
+        
+        {% if caption %}
+            <h2>Generated Caption:</h2>
+            <div class="caption">{{ caption }}</div>
+            {% if image_url %}
+                <img src="{{ image_url }}" alt="Uploaded Image">
+            {% endif %}
+            {% if audio_url %}
+                <h3>Audio:</h3>
+                <audio controls>
+                    <source src="{{ audio_url }}" type="audio/mpeg">
+                    Your browser does not support the audio element.
+                </audio>
+            {% endif %}
+        {% endif %}
+    </div>
+</body>
+</html>

From 35a29f0af7d21f38a4ed3947eff13be2482d5918 Mon Sep 17 00:00:00 2001
From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com>
Date: Sun, 27 Oct 2024 16:37:38 +0530
Subject: [PATCH 7/9] Delete Deep_Learning/Image Caption Generation with Audio
 Output/Readme.md

---
 .../Image Caption Generation with Audio Output/Readme.md         | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 Deep_Learning/Image Caption Generation with Audio Output/Readme.md

diff --git a/Deep_Learning/Image Caption Generation with Audio Output/Readme.md b/Deep_Learning/Image Caption Generation with Audio Output/Readme.md
deleted file mode 100644
index 0a36dfb17..000000000
--- a/Deep_Learning/Image Caption Generation with Audio Output/Readme.md	
+++ /dev/null
@@ -1 +0,0 @@
-Image Caption Generation with Audio Output

From d0bf09d643a312c5ce94baf954b42b444d8d2eb1 Mon Sep 17 00:00:00 2001
From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com>
Date: Sun, 27 Oct 2024 16:38:00 +0530
Subject: [PATCH 8/9] Delete Deep_Learning/Image Caption Generation with Audio
 Output/static/s

---
 .../Image Caption Generation with Audio Output/static/s          | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 Deep_Learning/Image Caption Generation with Audio Output/static/s

diff --git a/Deep_Learning/Image Caption Generation with Audio Output/static/s b/Deep_Learning/Image Caption Generation with Audio Output/static/s
deleted file mode 100644
index 8b1378917..000000000
--- a/Deep_Learning/Image Caption Generation with Audio Output/static/s	
+++ /dev/null
@@ -1 +0,0 @@
-

From a678d2c684414fbb052ab86f0b447f54aa780178 Mon Sep 17 00:00:00 2001
From: Anushka Pote <101658241+Anushka-Pote@users.noreply.github.com>
Date: Sun, 27 Oct 2024 16:38:37 +0530
Subject: [PATCH 9/9] Delete Deep_Learning/Image Caption Generation with Audio
 Output/templates/t

---
 .../Image Caption Generation with Audio Output/templates/t       | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 Deep_Learning/Image Caption Generation with Audio Output/templates/t

diff --git a/Deep_Learning/Image Caption Generation with Audio Output/templates/t b/Deep_Learning/Image Caption Generation with Audio Output/templates/t
deleted file mode 100644
index f8a268712..000000000
--- a/Deep_Learning/Image Caption Generation with Audio Output/templates/t	
+++ /dev/null
@@ -1 +0,0 @@
-templates