Commit 6a5a5f0 (1 parent: d013fbb)
Showing 228 changed files with 47,864 additions and 2 deletions.

@@ -126,4 +126,6 @@ venv.bak/
dmypy.json

# Pyre type checker
.pyre/

flagged/

@@ -1 +1,101 @@
-# Vision-Langauge-Object-Det-VQA
+# Vision-Language Object Detection and Visual Question Answering
This repository provides an ensemble demo of Microsoft's GLIP and Salesforce's BLIP for text-prompted object detection and Visual Question Answering.

<br />

## About GLIP: Grounded Language-Image Pre-training
> GLIP demonstrates strong zero-shot and few-shot transferability to various object-level recognition tasks.
> The model used in this repo is GLIP-T; it was originally pre-trained on Conceptual Captions 3M and SBU captions.

<br />

## About BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
> BLIP introduces a new model architecture that enables a wider range of downstream tasks than existing methods, and a new dataset bootstrapping method for learning from noisy web data.

<br />

## Installation and Setup

***Environment*** - Due to limitations with `maskrcnn_benchmark`, this repo requires PyTorch 1.10 and torchvision.

Use `requirements.txt` to install dependencies:

```sh
pip3 install -r requirements.txt
```

Build `maskrcnn_benchmark`:

```sh
python setup.py build develop --user
```

To verify a successful build, check the terminal for the message
"Finished processing dependencies for maskrcnn-benchmark==0.1".
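
As an additional sanity check, you can try importing the package directly (a minimal sketch; it assumes the usual `maskrcnn_benchmark` layout in which the compiled C++/CUDA extension is exposed as `maskrcnn_benchmark._C`):

```sh
# If either import fails, the extension did not build correctly.
python3 -c "import maskrcnn_benchmark; from maskrcnn_benchmark import _C; print('maskrcnn_benchmark build OK')"
```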

## Checkpoints

> Download the pre-trained models into the `checkpoints` folder.

<br />

```sh
mkdir checkpoints
cd checkpoints
```

Model | Weight
-- | --
**GLIP-T** | [weight](https://drive.google.com/file/d/1nlPL6PHkslarP6RiWJJu6QGKjqHG4tkc/view?usp=sharing)
**BLIP** | [weight](https://drive.google.com/file/d/1QliNGiAcyCCJLd22eNOxWvMUDzb7GzrO/view?usp=sharing)
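
If you prefer the command line, the same files can be fetched with [gdown](https://github.com/wkentaro/gdown) (not listed in `requirements.txt`; this is a sketch that assumes the Google Drive file IDs from the links above and the checkpoint filenames referenced by the demo scripts):

```sh
# Run from inside the checkpoints/ directory created above.
pip3 install gdown
gdown 1nlPL6PHkslarP6RiWJJu6QGKjqHG4tkc -O glip_tiny_model_o365_goldg_cc_sbu.pth   # GLIP-T detector weights
gdown 1QliNGiAcyCCJLd22eNOxWvMUDzb7GzrO -O model_base_vqa_capfilt_large.pth        # BLIP VQA weights
```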

<br />

## Local Demo

If you have an NVIDIA GPU with 8 GB of VRAM, run the local demo through the Gradio interface:

```sh
python3 app_local.py
```
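
The demo pins `MODEL.DEVICE` to `cuda`; on a multi-GPU machine you can select a specific card with the standard `CUDA_VISIBLE_DEVICES` environment variable (a general CUDA convention, not something specific to this repo):

```sh
# Run the Gradio demo on GPU 0 only
CUDA_VISIBLE_DEVICES=0 python3 app_local.py
```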

## Sample display
<br />

> After the checkpoints are loaded, click the local URL printed in the terminal to run inference.

[]()

## Future Work

- [x] Frame-based Visual Question Answering
- [ ] Per-object Visual Question Answering

## Citations

```txt
@inproceedings{li2022blip,
  title={BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
  author={Junnan Li and Dongxu Li and Caiming Xiong and Steven Hoi},
  year={2022},
  booktitle={ICML},
}
@inproceedings{li2021grounded,
  title={Grounded Language-Image Pre-training},
  author={Liunian Harold Li* and Pengchuan Zhang* and Haotian Zhang* and Jianwei Yang and Chunyuan Li and Yiwu Zhong and Lijuan Wang and Lu Yuan and Lei Zhang and Jenq-Neng Hwang and Kai-Wei Chang and Jianfeng Gao},
  year={2022},
  booktitle={CVPR},
}
@article{zhang2022glipv2,
  title={GLIPv2: Unifying Localization and Vision-Language Understanding},
  author={Zhang, Haotian* and Zhang, Pengchuan* and Hu, Xiaowei and Chen, Yen-Chun and Li, Liunian Harold and Dai, Xiyang and Wang, Lijuan and Yuan, Lu and Hwang, Jenq-Neng and Gao, Jianfeng},
  journal={arXiv preprint arXiv:2206.05836},
  year={2022}
}
@article{li2022elevater,
  title={ELEVATER: A Benchmark and Toolkit for Evaluating Language-Augmented Visual Models},
  author={Li*, Chunyuan and Liu*, Haotian and Li, Liunian Harold and Zhang, Pengchuan and Aneja, Jyoti and Yang, Jianwei and Jin, Ping and Lee, Yong Jae and Hu, Houdong and Liu, Zicheng and others},
  journal={arXiv preprint arXiv:2204.08790},
  year={2022}
}
```
## Acknowledgement
The implementation of this work relies on resources from <a href="https://github.com/salesforce/BLIP">BLIP</a>, <a href="https://github.com/microsoft/GLIP">GLIP</a>, <a href="https://github.com/huggingface/transformers">Huggingface Transformers</a>, and <a href="https://github.com/rwightman/pytorch-image-models/tree/master/timm">timm</a>. We thank the original authors for open-sourcing their work.

@@ -0,0 +1,65 @@
import requests
import os
from io import BytesIO
from PIL import Image
import numpy as np
from pathlib import Path
import gradio as gr
import json
import traceback
import warnings
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
warnings.filterwarnings("ignore")

from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo

from models.blip import BLIP_Decoder

import vqa

# Use this config to evaluate the GLIP-T model
config_file = "configs/glip_Swin_T_O365_GoldG.yaml"
weight_file = "checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth"

# Manually override some options
cfg.local_rank = 0
cfg.num_gpus = 1
cfg.merge_from_file(config_file)
cfg.merge_from_list(["MODEL.WEIGHT", weight_file])
cfg.merge_from_list(["MODEL.DEVICE", "cuda"])

glip_demo = GLIPDemo(
    cfg,
    min_image_size=800,
    confidence_threshold=0.7,
    show_mask_heatmaps=False
)
blip_demo = vqa.VQA(
    model_path='checkpoints/model_base_vqa_capfilt_large.pth'
)

def predict(image, object, question):
    # Swap the RGB/BGR channel order for the GLIP predictor, then swap back for display.
    result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], object, 0.5)
    answer = blip_demo.vqa_demo(image, question)
    return result[:, :, [2, 1, 0]], answer

image = gr.inputs.Image()

gr.Interface(
    description="GLIP + BLIP VQA Demo.",
    fn=predict,
    inputs=[
        "image",
        gr.Textbox(label='Objects', lines=1, placeholder="Objects here.."),
        gr.Textbox(label='Question', lines=1, placeholder="Question here..")],
    outputs=[
        gr.outputs.Image(
            type="pil",
            label="grounding results"
        ),
        gr.Textbox(label="Answer")
    ],
).launch()

@@ -0,0 +1,56 @@
import os
import gradio as gr
import warnings

warnings.filterwarnings("ignore")

# Build the maskrcnn_benchmark extension at startup.
os.system("python setup.py build develop --user")

from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo
import vqa

# Use this config to evaluate the GLIP-T model
config_file = "configs/glip_Swin_T_O365_GoldG.yaml"
weight_file = "checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth"

# Manually override some options
cfg.local_rank = 0
cfg.num_gpus = 1
cfg.merge_from_file(config_file)
cfg.merge_from_list(["MODEL.WEIGHT", weight_file])
cfg.merge_from_list(["MODEL.DEVICE", "cuda"])

glip_demo = GLIPDemo(
    cfg,
    min_image_size=800,
    confidence_threshold=0.7,
    show_mask_heatmaps=False
)
blip_demo = vqa.VQA(
    model_path='checkpoints/model_base_vqa_capfilt_large.pth'
)

def predict(image, object, question):
    # Swap the RGB/BGR channel order for the GLIP predictor, then swap back for display.
    result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], object, 0.5)
    answer = blip_demo.vqa_demo(image, question)
    return result[:, :, [2, 1, 0]], answer

image = gr.inputs.Image()

gr.Interface(
    description="GLIP + BLIP VQA Demo.",
    fn=predict,
    inputs=[
        "image",
        gr.Textbox(label='Objects', lines=1, placeholder="Objects here.."),
        gr.Textbox(label='Question', lines=1, placeholder="Question here..")],
    outputs=[
        gr.outputs.Image(
            type="pil",
            label="grounding results"
        ),
        gr.Textbox(label="Answer")
    ],
).launch()

@@ -0,0 +1,100 @@
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  WEIGHT: "swin_tiny_patch4_window7_224.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  BACKBONE:
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256
    FREEZE_CONV_BODY_AT: -1

  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
    MASK_SPECIAL: False

  RPN:
    USE_FPN: True
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9 # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0

    FUSE_CONFIG:
      EARLY_FUSE_ON: True
      TYPE: "MHA-B"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_FUSED_FEATURES_DOT_PRODUCT: True
      USE_LAYER_SCALE: True
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True

    USE_CHECKPOINT: True

TEST:
  DURING_TRAINING: False
  IMS_PER_BATCH: 64

# use for grounding model
DATASETS:
  TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", )
  TEST: ("coco_2017_val", )
  DISABLE_SHUFFLE: False
  ADD_DET_PROMPT: False
  RANDOM_SAMPLE_NEG: 85
  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)

  SEPARATION_TOKENS: ". "

INPUT:
  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333

AUGMENT:
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)

DATALOADER:
  SIZE_DIVISIBILITY: 32

SOLVER:
  OPTIMIZER: ADAMW
  BASE_LR: 0.0001
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.0001
  STEPS: (0.67, 0.89)
  MAX_EPOCH: 30
  IMS_PER_BATCH: 64
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001
  USE_AMP: True
  MODEL_EMA: 0.999
  FIND_UNUSED_PARAMETERS: False

  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0

@@ -0,0 +1,21 @@
{
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30524,
  "encoder_width": 768,
  "add_cross_attention": true
}

@@ -0,0 +1,25 @@
vqa_root: '/export/share/datasets/vision/VQA/Images/mscoco/' #followed by train2014/
vg_root: '/export/share/datasets/vision/visual-genome/' #followed by image/
train_files: ['vqa_train','vqa_val','vg_qa']
ann_root: 'annotation'

# set pretrained as a file path or an url
pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth'

# size of vit model; base or large
vit: 'base'
batch_size_train: 16
batch_size_test: 32
vit_grad_ckpt: False
vit_ckpt_layer: 0
init_lr: 2e-5

image_size: 480

k_test: 128
inference: 'rank'

# optimizer
weight_decay: 0.05
min_lr: 0
max_epoch: 10