Add video demo and HF Space
Aasthaengg committed Oct 21, 2022
1 parent cc2f075 commit f44673c
Showing 2 changed files with 86 additions and 23 deletions.
18 changes: 18 additions & 0 deletions README.md
@@ -1,5 +1,16 @@
# Vision-Language Object Detection and Visual Question Answering
This repository contains an ensembled demo of Microsoft's GLIP and Salesforce's BLIP for text-prompt-based object detection and Visual Question Answering.
<br />


## Updates
The demo is now integrated into [Hugging Face Spaces 🤗](https://huggingface.co/spaces/Pinwheel/GLIP-BLIP-Object-Detection-VQA)

<p align="left">
<a href="#">
<img src="https://img.shields.io/badge/🤗 Hugging Face-Demo-FFFF00?style=flat-square&logo=huggingface&logoColor=white">
</a>
</p>

<br />

@@ -14,6 +25,7 @@ This repository includes Microsoft's GLIP and Salesforce's BLIP ensembled demo f

> A new model architecture that enables a wider range of downstream tasks than existing methods, and a new dataset bootstrapping method for learning from noisy web data.

<br />

## Installation and Setup
@@ -62,10 +74,16 @@ python3 app_local.py

> After the checkpoints load, click the local URL printed in the terminal to run inference.
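If you prefer a predictable address over the port Gradio picks automatically, the `launch()` call at the bottom of `app_local.py` can be pinned explicitly. This is only a minimal sketch, assuming the `demo` Blocks object from `app_local.py`; `server_name`, `server_port`, and `share` are standard Gradio `launch()` options:

```python
# Hypothetical variation of the launch call in app_local.py, binding the demo
# to a fixed host and port so the local URL is always http://localhost:7860.
demo.queue()                          # keeps the streaming video handler working
demo.launch(server_name="0.0.0.0",    # listen on all interfaces
            server_port=7860,
            share=False)              # set True for a temporary public link
```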
### Video I/O
![Video demo](resources/demo.gif)

### Image I/O
![Image demo](resources/gradio.png)

## Future Work

- [x] Frame-based Visual Question Answering
- [x] Video-based Visual Question Answering
- [ ] Per-object Visual Question Answering (see the sketch below)
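As a rough illustration of the remaining item, per-object VQA could crop each detected box and query BLIP once per crop. This is only a sketch built on the objects defined in `app_local.py`; it assumes the second value returned by `glip_demo.run_on_web_image` is a prediction object exposing pixel-space boxes via `.bbox` (as in GLIP's `BoxList`), and the helper name is hypothetical:

```python
# Hypothetical per-object VQA: ask the same question about every detected box.
# Assumes predictions.bbox holds [x1, y1, x2, y2] in pixel coordinates.
def per_object_vqa(image_bgr, objects, question):
    result, predictions = glip_demo.run_on_web_image(image_bgr[:, :, [2, 1, 0]], objects, 0.5)
    answers = []
    for box in predictions.bbox.tolist():
        x1, y1, x2, y2 = map(int, box)
        crop = image_bgr[y1:y2, x1:x2]                  # one detected object
        answers.append(blip_demo.vqa_demo(crop, question))
    return result, answers
```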


91 changes: 68 additions & 23 deletions app_local.py
@@ -1,4 +1,6 @@
import os

import gradio as gr
import warnings

@@ -7,6 +9,9 @@
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo
import vqa
import cv2
from PIL import Image
import numpy as np

# Use this config to evaluate the GLIP-T model
config_file = "configs/glip_Swin_T_O365_GoldG.yaml"
@@ -26,29 +31,69 @@
    show_mask_heatmaps=False
)
blip_demo = vqa.VQA(
-    model_path = 'checkpoints/model_base_vqa_capfilt_large.pth'
-)
+    model_path = 'checkpoints/model_base_vqa_capfilt_large.pth')

-def predict(image, object, question):
+def predict_image(image, object, question):
    result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], object, 0.5)
    result = result[:, :, [2, 1, 0]]
    answer = blip_demo.vqa_demo(image, question)
-    return result[:, :, [2, 1, 0]], answer
-
-image = gr.inputs.Image()
-
-gr.Interface(
-    description="GLIP + BLIP VQA Demo.",
-    fn=predict,
-    inputs=[
-        "image",
-        gr.Textbox(label='Objects', lines=1, placeholder="Objects here.."),
-        gr.Textbox(label='Question', lines=1, placeholder="Question here..")],
-
-    outputs=[
-        gr.outputs.Image(
-            type="pil",
-            label="grounding results"
-        ),
-        gr.Textbox(label="Answer")
-    ],
-).launch()
+    return result, answer
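A note on the repeated `[:, :, [2, 1, 0]]` indexing above: it reverses the channel axis, converting between the RGB arrays Gradio provides and OpenCV's BGR ordering, and applying it twice restores the original image. A standalone illustration, independent of the repository code:

```python
import numpy as np

rgb = np.zeros((2, 2, 3), dtype=np.uint8)
rgb[..., 0] = 255                                   # pure red in RGB order
bgr = rgb[:, :, [2, 1, 0]]                          # reorder channels: RGB -> BGR
assert bgr[0, 0].tolist() == [0, 0, 255]            # red now sits in the last channel
assert np.array_equal(bgr[:, :, [2, 1, 0]], rgb)    # the swap is its own inverse
```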

def predict_video(video, object, question, frame_drop_value):
    # Run GLIP + BLIP on every `frame_drop_value`-th frame and stream the results.
    vid = cv2.VideoCapture(video)
    frame_drop_value = max(int(frame_drop_value), 1)  # guard against a 0 value from the slider
    count = 0
    while True:
        ret, frame = vid.read()
        if ret:
            count += 1
            if count % frame_drop_value == 0:
                # image = Image.fromarray(frame)
                image = frame
                # Stamp the frame index onto the image for reference in the output.
                cv2.putText(
                    img=image,
                    text=str(count),
                    org=(20, 20),
                    fontFace=cv2.FONT_HERSHEY_DUPLEX,
                    fontScale=0.5,
                    color=(125, 246, 55),
                    thickness=1)
                result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], object, 0.5)
                answer = blip_demo.vqa_demo(image, question)
                yield result, answer
        else:
            break

    # Emit the last processed frame and answer once more after the video ends.
    yield result, answer
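Because `predict_video` is a generator, Gradio (with queuing enabled via `demo.queue()` below) streams every yielded `(frame, answer)` pair to the outputs as it is produced instead of waiting for the whole video. A stripped-down illustration of that pattern, independent of this repository:

```python
import time
import gradio as gr

def counter(n):
    # A generator handler: each yield updates the output component immediately.
    for i in range(int(n)):
        time.sleep(0.5)
        yield f"step {i + 1} of {int(n)}"

with gr.Blocks() as streaming_demo:
    steps = gr.Number(value=5, label="Steps")
    status = gr.Textbox(label="Status")
    gr.Button("Run").click(counter, inputs=steps, outputs=status)

streaming_demo.queue()       # queuing is required for generator (streaming) handlers
# streaming_demo.launch()
```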

with gr.Blocks() as demo:
    gr.Markdown("Text-Based Object Detection and Visual Question Answering")
    with gr.Tab("Image"):
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(label='input image')
                obj_input = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
                vqa_input = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
                image_button = gr.Button("Submit")

            with gr.Column():
                image_output = gr.outputs.Image(type="pil", label="grounding results")
                vqa_output = gr.Textbox(label="Answer")

    with gr.Tab("Video"):
        with gr.Row():
            with gr.Column():
                video_input = gr.PlayableVideo(label='input video', mirror_webcam=False)
                obj_input_video = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
                vqa_input_video = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
                frame_drop_input = gr.Slider(label='Frames drop value', minimum=0, maximum=30, step=1, value=5)
                video_button = gr.Button("Submit")

            with gr.Column():
                video_output = gr.outputs.Image(type="pil", label="grounding results")
                vqa_output_video = gr.Textbox(label="Answer")

    # Wire the buttons to their handlers; predict_video is a generator, so the queue
    # must be enabled below for its yielded frames to stream to the outputs.
    image_button.click(predict_image, inputs=[image_input, obj_input, vqa_input], outputs=[image_output, vqa_output])
    video_button.click(predict_video, inputs=[video_input, obj_input_video, vqa_input_video, frame_drop_input], outputs=[video_output, vqa_output_video])

demo.queue()
demo.launch()
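Once the app is up, it can also be called without the browser. Below is a minimal sketch using `gradio_client` (not part of this repository); the endpoint index and argument formats vary across Gradio versions, so `fn_index=0` and the plain file-path input are assumptions to verify locally:

```python
from gradio_client import Client

# Hypothetical programmatic call against the locally running demo.
client = Client("http://127.0.0.1:7860")
result, answer = client.predict(
    "sample.jpg",                    # path to an input image
    "person. car.",                  # objects prompt
    "How many people are there?",    # question
    fn_index=0,                      # assumed: the Image tab's Submit handler
)
print(answer)                        # result is typically a path to the rendered image
```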
