Add video demo and HF Space
Aasthaengg committed Oct 21, 2022
1 parent cc2f075 commit f44673c
Showing 2 changed files with 86 additions and 23 deletions.
18 changes: 18 additions & 0 deletions README.md
@@ -1,5 +1,16 @@
# Vision-Language Object Detection and Visual Question Answering
This repository contains an ensembled demo of Microsoft's GLIP and Salesforce's BLIP for text-prompt-based object detection and Visual Question Answering.
<br />


## Updates
The demo is now integrated into [Hugging Face Spaces 🤗](https://huggingface.co/spaces/Pinwheel/GLIP-BLIP-Object-Detection-VQA)

<p align="left">
<a href="#">
<img src="https://img.shields.io/badge/🤗 Hugging Face-Demo-FFFF00?style=flat-square&logo=huggingface&logoColor=white">
</a>
</p>

<br />

@@ -14,6 +25,7 @@ This repository includes Microsoft's GLIP and Salesforce's BLIP ensembled demo f

> A new model architecture that enables a wider range of downstream tasks than existing methods, and a new dataset bootstrapping method for learning from noisy web data.

<br />

## Installation and Setup
@@ -62,10 +74,16 @@ python3 app_local.py

> After the checkpoints load, click the local URL printed in the terminal to run inference.
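If you prefer a predictable address over the port Gradio picks automatically, the `launch()` call at the bottom of `app_local.py` can be pinned explicitly. This is only a minimal sketch, assuming the `demo` Blocks object from `app_local.py`; `server_name`, `server_port`, and `share` are standard Gradio `launch()` options:

```python
# Hypothetical variation of the launch call in app_local.py, binding the demo
# to a fixed host and port so the local URL is always http://localhost:7860.
demo.queue()                          # keeps the streaming video handler working
demo.launch(server_name="0.0.0.0",    # listen on all interfaces
            server_port=7860,
            share=False)              # set True for a temporary public link
```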
### Video I/O
![Video demo](resources/demo.gif)

### Image I/O
![Image demo](resources/gradio.png)

## Future Work

- [x] Frame-based Visual Question Answering
- [x] Video-based Visual Question Answering
- [ ] Per-object Visual Question Answering (see the sketch below)
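As a rough illustration of the remaining item, per-object VQA could crop each detected box and query BLIP once per crop. This is only a sketch built on the objects defined in `app_local.py`; it assumes the second value returned by `glip_demo.run_on_web_image` is a prediction object exposing pixel-space boxes via `.bbox` (as in GLIP's `BoxList`), and the helper name is hypothetical:

```python
# Hypothetical per-object VQA: ask the same question about every detected box.
# Assumes predictions.bbox holds [x1, y1, x2, y2] in pixel coordinates.
def per_object_vqa(image_bgr, objects, question):
    result, predictions = glip_demo.run_on_web_image(image_bgr[:, :, [2, 1, 0]], objects, 0.5)
    answers = []
    for box in predictions.bbox.tolist():
        x1, y1, x2, y2 = map(int, box)
        crop = image_bgr[y1:y2, x1:x2]                  # one detected object
        answers.append(blip_demo.vqa_demo(crop, question))
    return result, answers
```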


91 changes: 68 additions & 23 deletions app_local.py
@@ -1,4 +1,6 @@
import os

import gradio as gr
import warnings

@@ -7,6 +9,9 @@
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo
import vqa
import cv2
from PIL import Image
import numpy as np

# Use this config to evaluate the GLIP-T model
config_file = "configs/glip_Swin_T_O365_GoldG.yaml"
@@ -26,29 +31,69 @@
    show_mask_heatmaps=False
)
blip_demo = vqa.VQA(
-    model_path = 'checkpoints/model_base_vqa_capfilt_large.pth'
-)
+    model_path = 'checkpoints/model_base_vqa_capfilt_large.pth')

-def predict(image, object, question):
+def predict_image(image, object, question):
    result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], object, 0.5)
    result = result[:, :, [2, 1, 0]]
    answer = blip_demo.vqa_demo(image, question)
-    return result[:, :, [2, 1, 0]], answer
-
-image = gr.inputs.Image()
-
-gr.Interface(
-    description="GLIP + BLIP VQA Demo.",
-    fn=predict,
-    inputs=[
-        "image",
-        gr.Textbox(label='Objects', lines=1, placeholder="Objects here.."),
-        gr.Textbox(label='Question', lines=1, placeholder="Question here..")],
-
-    outputs=[
-        gr.outputs.Image(
-            type="pil",
-            label="grounding results"
-        ),
-        gr.Textbox(label="Answer")
-    ],
-).launch()
+    return result, answer
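A note on the repeated `[:, :, [2, 1, 0]]` indexing above: it reverses the channel axis, converting between the RGB arrays Gradio provides and OpenCV's BGR ordering, and applying it twice restores the original image. A standalone illustration, independent of the repository code:

```python
import numpy as np

rgb = np.zeros((2, 2, 3), dtype=np.uint8)
rgb[..., 0] = 255                                   # pure red in RGB order
bgr = rgb[:, :, [2, 1, 0]]                          # reorder channels: RGB -> BGR
assert bgr[0, 0].tolist() == [0, 0, 255]            # red now sits in the last channel
assert np.array_equal(bgr[:, :, [2, 1, 0]], rgb)    # the swap is its own inverse
```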

def predict_video(video, object, question, frame_drop_value):
    # Run GLIP + BLIP on every `frame_drop_value`-th frame and stream the results.
    vid = cv2.VideoCapture(video)
    frame_drop_value = max(int(frame_drop_value), 1)  # guard against a 0 value from the slider
    count = 0
    while True:
        ret, frame = vid.read()
        if ret:
            count += 1
            if count % frame_drop_value == 0:
                # image = Image.fromarray(frame)
                image = frame
                # Stamp the frame index onto the image for reference in the output.
                cv2.putText(
                    img=image,
                    text=str(count),
                    org=(20, 20),
                    fontFace=cv2.FONT_HERSHEY_DUPLEX,
                    fontScale=0.5,
                    color=(125, 246, 55),
                    thickness=1)
                result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], object, 0.5)
                answer = blip_demo.vqa_demo(image, question)
                yield result, answer
        else:
            break

    # Emit the last processed frame and answer once more after the video ends.
    yield result, answer
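Because `predict_video` is a generator, Gradio (with queuing enabled via `demo.queue()` below) streams every yielded `(frame, answer)` pair to the outputs as it is produced instead of waiting for the whole video. A stripped-down illustration of that pattern, independent of this repository:

```python
import time
import gradio as gr

def counter(n):
    # A generator handler: each yield updates the output component immediately.
    for i in range(int(n)):
        time.sleep(0.5)
        yield f"step {i + 1} of {int(n)}"

with gr.Blocks() as streaming_demo:
    steps = gr.Number(value=5, label="Steps")
    status = gr.Textbox(label="Status")
    gr.Button("Run").click(counter, inputs=steps, outputs=status)

streaming_demo.queue()       # queuing is required for generator (streaming) handlers
# streaming_demo.launch()
```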

with gr.Blocks() as demo:
    gr.Markdown("Text-Based Object Detection and Visual Question Answering")
    with gr.Tab("Image"):
        with gr.Row():
            with gr.Column():
                image_input = gr.Image(label='input image')
                obj_input = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
                vqa_input = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
                image_button = gr.Button("Submit")

            with gr.Column():
                image_output = gr.outputs.Image(type="pil", label="grounding results")
                vqa_output = gr.Textbox(label="Answer")

    with gr.Tab("Video"):
        with gr.Row():
            with gr.Column():
                video_input = gr.PlayableVideo(label='input video', mirror_webcam=False)
                obj_input_video = gr.Textbox(label='Objects', lines=1, placeholder="Objects here..")
                vqa_input_video = gr.Textbox(label='Question', lines=1, placeholder="Question here..")
                frame_drop_input = gr.Slider(label='Frames drop value', minimum=0, maximum=30, step=1, value=5)
                video_button = gr.Button("Submit")

            with gr.Column():
                video_output = gr.outputs.Image(type="pil", label="grounding results")
                vqa_output_video = gr.Textbox(label="Answer")

    # Wire the buttons to their handlers; predict_video is a generator, so the queue
    # must be enabled below for its yielded frames to stream to the outputs.
    image_button.click(predict_image, inputs=[image_input, obj_input, vqa_input], outputs=[image_output, vqa_output])
    video_button.click(predict_video, inputs=[video_input, obj_input_video, vqa_input_video, frame_drop_input], outputs=[video_output, vqa_output_video])

demo.queue()
demo.launch()
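Once the app is up, it can also be called without the browser. Below is a minimal sketch using `gradio_client` (not part of this repository); the endpoint index and argument formats vary across Gradio versions, so `fn_index=0` and the plain file-path input are assumptions to verify locally:

```python
from gradio_client import Client

# Hypothetical programmatic call against the locally running demo.
client = Client("http://127.0.0.1:7860")
result, answer = client.predict(
    "sample.jpg",                    # path to an input image
    "person. car.",                  # objects prompt
    "How many people are there?",    # question
    fn_index=0,                      # assumed: the Image tab's Submit handler
)
print(answer)                        # result is typically a path to the rendered image
```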
