From ea89f6b5656807d404e784032e3d948fe2f1730b Mon Sep 17 00:00:00 2001 From: Zhe Chen Date: Thu, 9 Jun 2022 13:22:50 +0800 Subject: [PATCH] update ViT-Adapter-L + HTC++ --- README.md | 73 +++- detection/README.md | 121 ++++--- detection/configs/htc++/README.md | 105 +++++- .../htc++_beit_adapter_large_fpn_3x_coco.py | 2 +- ...tc++_beit_adapter_large_fpn_3x_coco_old.py | 326 ++++++++++++++++++ .../models/backbones/beit_adapter.py | 12 +- 6 files changed, 561 insertions(+), 78 deletions(-) create mode 100644 detection/configs/htc++/htc++_beit_adapter_large_fpn_3x_coco_old.py diff --git a/README.md b/README.md index 60425fdaa..55e9669a6 100644 --- a/README.md +++ b/README.md @@ -10,10 +10,9 @@ The official implementation of the paper "[Vision Transformer Adapter for Dense Predictions](https://arxiv.org/abs/2205.08534)". ## News - -(2022/06/04) Segmentation is released.\ -(2022/06/02) Detection is released and segmentation will come soon.\ -(2022/05/17) ViT-Adapter-L yields 60.1 box AP and 52.1 mask AP on COCO test-dev.\ +(2022/06/09) ViT-Adapter-L yields 60.4 box AP and 52.5 mask AP on COCO test-dev.\ +(2022/06/04) Code and models are released.\ +(2022/05/17) ~~ViT-Adapter-L yields 60.1 box AP and 52.1 mask AP on COCO test-dev.~~ \ (2022/05/12) ViT-Adapter-L reaches 85.2 mIoU on Cityscapes test set without coarse data.\ (2022/05/05) ViT-Adapter-L achieves the SOTA on ADE20K val set with 60.5 mIoU! @@ -29,14 +28,52 @@ This work investigates a simple yet powerful adapter for Vision Transformer (ViT ## SOTA Model Zoo -COCO test-dev - -| Method | Framework | Pre-train | Lr schd | box AP | mask AP | #Param | -|:------------------:|:---------:|:---------:|:-------:|:------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------:|:------:| -| ViT-Adapter-L | HTC++ | BEiT | 3x | [58.5](https://drive.google.com/file/d/11zpPSvmuAn7aP5brxzHE8naObnOfFxby/view?usp=sharing) | [50.8](https://drive.google.com/file/d/1wIbtzfHfPqkvZaSivzcsh4HWu1oSiun6/view?usp=sharing) | 401M | -| ViT-Adapter-L (MS) | HTC++ | BEiT | 3x | [60.1](https://drive.google.com/file/d/1i-qjgUK4CMwZcmu5pkndldwfVbdkw5sU/view?usp=sharing) | [52.1](https://drive.google.com/file/d/16mlEOPY7K-Xpx_CL650A-LWbVDm2vl4X/view?usp=sharing) | 401M | - -ADE20K val +**COCO mini-val test-dev** + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Method | Framework | Pre-train | Lr schd | mini-val box AP | mini-val mask AP | test-dev box AP | test-dev mask AP | #Param |
+|:------:|:---------:|:---------:|:-------:|:---------------:|:----------------:|:---------------:|:----------------:|:------:|
+| ViT-Adapter-L | HTC++ | BEiT | 3x | 58.4 | 50.8 | 58.9 | 51.3 | 401M |
+| ViT-Adapter-L$^\dagger$ | HTC++ | BEiT | 3x | 60.2 | 52.2 | 60.4 | 52.5 | 401M |
+
+$\dagger$ denotes multi-scale testing.
+
+**ADE20K val**

| Method | Framework | Pre-train | Iters | Crop Size | mIoU | +MS | #Param |
|:-------------:|:-----------:|:---------------:|:-----:|:---------:|:----:|:---:|:------:|
@@ -44,20 +81,20 @@ ADE20K val
| ViT-Adapter-L | Mask2Former | BEiT | 160k | 640 | [58.3](https://drive.google.com/file/d/1jj56lSbc2s4ZNc-Hi-w6o-OSS99oi-_g/view?usp=sharing) | [59.0](https://drive.google.com/file/d/1hgpZB5gsyd7LTS7Aay2CbHmlY10nafCw/view?usp=sharing) | 568M |
| ViT-Adapter-L | Mask2Former | COCO-Stuff-164k | 80k | 896 | [59.4](https://drive.google.com/file/d/1B_1XSwdnLhjJeUmn1g_nxfvGJpYmYWHa/view?usp=sharing) | [60.5](https://drive.google.com/file/d/1UtjmgcYKR-2h116oQXklUYOVcTw15woM/view?usp=sharing) | 571M |

-Cityscapes val/test
+**Cityscapes val/test**

| Method | Framework | Pre-train | Iters | Crop Size | val mIoU | val/test +MS | #Param |
|:-------------:|:-----------:|:---------:|:-----:|:---------:|:--------:|:------------:|:------:|
| ViT-Adapter-L | Mask2Former | Mapillary | 80k | 896 | [84.9](https://drive.google.com/file/d/1LKy0zz-brCBbKGmUWquadILaBHdDLR6s/view?usp=sharing) | [85.8](https://drive.google.com/file/d/1LSJvK1BPSbzm9eWpKL8Xo7RmYBrd2xux/view?usp=sharing)/[85.2](https://www.cityscapes-dataset.com/anonymous-results/?id=0ca6821dc3183ff970bd5266f812df2eaa4519ecb1973ca1308d65a3b546bf27) | 571M |

-COCO-Stuff-10K
+**COCO-Stuff-10K**

| Method | Framework | Pre-train | Iters | Crop Size | mIoU | +MS | #Param |
|:-------------:|:-----------:|:---------:|:-----:|:---------:|:----:|:---:|:------:|
| ViT-Adapter-L | UperNet | BEiT | 80k | 512 | [51.0](https://drive.google.com/file/d/1xZodiAvOLGaLtMGx_btYVZIMC2VKrDhI/view?usp=sharing) | [51.4](https://drive.google.com/file/d/1bmFG9GA4bRqOEJfqXcO7nWYPwG3wSk2J/view?usp=sharing) | 451M |
| ViT-Adapter-L | Mask2Former | BEiT | 40k | 512 | [53.2](https://drive.google.com/file/d/1Buewc1n7GBAcBDXeia-QarujrDZqc_Sx/view?usp=sharing) | [54.2](https://drive.google.com/file/d/1kQgJUHDeQoO3pPY6QoXRKwyF7heT7wCJ/view?usp=sharing) | 568M |

-Pascal Context
+**Pascal Context**

| Method | Framework | Pre-train | Iters | Crop Size | mIoU | +MS | #Param |
|:-------------:|:-----------:|:---------:|:-----:|:---------:|:----:|:---:|:------:|
@@ -68,7 +105,7 @@ Pascal Context

### COCO mini-val

-Baseline Detectors
+**Baseline Detectors**

| Method | Framework | Pre-train | Lr schd | Aug | box AP | mask AP | #Param |
|:-------------:|:----------:|:---------:|:-------:|:---:|:------:|:-------:|:------:|
@@ -77,7 +114,7 @@ Baseline Detectors
| ViT-Adapter-S | Mask R-CNN | DeiT | 3x | Yes | 48.2 | 42.8 | 48M |
| ViT-Adapter-B | Mask R-CNN | DeiT | 3x | Yes | 49.6 | 43.6 | 120M |
| ViT-Adapter-L | Mask R-CNN | AugReg | 3x | Yes | 50.9 | 44.8 | 348M |

-Advanced Detectors
+**Advanced Detectors**

| 
Method | Framework | Pre-train | Lr schd | Aug | box AP | mask AP | #Param | |:-------------:|:-------------------:|:---------:|:-------:|:---:|:------:|:-------:|:------:| @@ -88,7 +125,7 @@ Advanced Detectors | ViT-Adapter-B | Upgraded Mask R-CNN | MAE | 25ep | LSJ | 50.3 | 44.7 | 122M | | ViT-Adapter-B | Upgraded Mask R-CNN | MAE | 50ep | LSJ | 50.8 | 45.1 | 122M | -ADE20K val +**ADE20K val** | Method | Framework | Pre-train | Iters | Crop Size | mIoU | +MS | #Param | |:-------------:|:---------:|:---------:|:-----:|:---------:|:----:|:----:|:------:| diff --git a/detection/README.md b/detection/README.md index 7f61d7c4f..3caf54fa5 100644 --- a/detection/README.md +++ b/detection/README.md @@ -35,25 +35,54 @@ Prepare COCO according to the guidelines in [MMDetection v2.22.0](https://github ## Results and models -#### ViT-Adapter on COCO test-dev - -HTC++ - -| Method | Backbone | Pre-train | Lr schd | box AP | mask AP | #Param | Config | Download | -|:------:|:------------------:|:---------------------------------------------------------------------------------------------------------------------:|:-------:|:------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------:|:------:|:-----------------------------------------------------------------:|:--------------------------------------------------------------------------------------------------------------------:| -| HTC++ | ViT-Adapter-L | [BEiT-L](https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth) | 3x | [58.5](https://drive.google.com/file/d/11zpPSvmuAn7aP5brxzHE8naObnOfFxby/view?usp=sharing) | [50.8](https://drive.google.com/file/d/1wIbtzfHfPqkvZaSivzcsh4HWu1oSiun6/view?usp=sharing) | 401M | [config](./configs/htc++/htc++_beit_adapter_large_fpn_3x_coco.py) | [model](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.0/htc++_beit_adapter_large_fpn_3x_coco.pth.tar) | -| HTC++ | ViT-Adapter-L (MS) | [BEiT-L](https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth) | 3x | [60.1](https://drive.google.com/file/d/1i-qjgUK4CMwZcmu5pkndldwfVbdkw5sU/view?usp=sharing) | [52.1](https://drive.google.com/file/d/16mlEOPY7K-Xpx_CL650A-LWbVDm2vl4X/view?usp=sharing) | 401M | TODO | - | - -#### ViT-Adapter on COCO minival - -HTC++ - -| Method | Backbone | Pre-train | Lr schd | box AP | mask AP | #Param | Config | Download | -|:------:|:------------------:|:---------------------------------------------------------------------------------------------------------------------:|:-------:|:------:|:-------:|:------:|:-----------------------------------------------------------------:|:--------------------------------------------------------------------------------------------------------------------:| -| HTC++ | ViT-Adapter-L | [BEiT-L](https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth) | 3x | 57.9 | 50.2 | 401M | [config](./configs/htc++/htc++_beit_adapter_large_fpn_3x_coco.py) | [model](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.0/htc++_beit_adapter_large_fpn_3x_coco.pth.tar) | -| HTC++ | ViT-Adapter-L (MS) | [BEiT-L](https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth) | 3x | 59.8 | 51.7 | 401M | TODO | - | - -Baseline Detectors +**HTC++** + + + + + + + + + + + + + + + + + + + + + + + + + + 
+| Backbone | Pre-train | Lr schd | mini-val box AP | mini-val mask AP | test-dev box AP | test-dev mask AP | #Param | Config | Download |
+|:--------:|:---------:|:-------:|:---------------:|:----------------:|:---------------:|:----------------:|:------:|:------:|:--------:|
+| ViT-Adapter-L | [BEiT-L](https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth) | 3x | 58.4 | 50.8 | 58.9 | 51.3 | 401M | [config](./configs/htc++/htc++_beit_adapter_large_fpn_3x_coco.py) | model |
+| ViT-Adapter-L (MS) | [BEiT-L](https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth) | 3x | 60.2 | 52.2 | 60.4 | 52.5 | 401M | - | - |
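+
+The (MS) row uses multi-scale testing. In MMDetection v2.x this is purely a
+test-time change: only the `test_pipeline` of the config differs from the
+single-scale setting. The sketch below shows what such a pipeline typically
+looks like; the scales here are illustrative and not necessarily the exact
+ones used for the numbers above.
+
+```python
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='MultiScaleFlipAug',
+         # single-scale testing uses one img_scale, e.g. (1600, 1400);
+         # multi-scale testing evaluates several scales plus a horizontal flip
+         img_scale=[(1600, 1000), (1600, 1200), (1600, 1400)],
+         flip=True,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='RandomFlip'),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='Pad', size_divisor=32),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+]
+```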
+ + +**Mask R-CNN** | Method | Backbone | Pre-train | Lr schd | Aug | box AP | mask AP | #Param | Config | Download | |:----------:|:-------------:|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-------:|:---:|:------:|:-------:|:------:|:--------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------:| @@ -63,7 +92,7 @@ Baseline Detectors | Mask R-CNN | ViT-Adapter-B | [Uni-Perceiver](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.1/uniperceiver_pretrain.pth) | 3x | Yes | 50.7 | 44.9 | 120M | [config](./configs/mask_rcnn/mask_rcnn_uniperceiver_adapter_base_fpn_3x_coco.py) | [model](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.1/mask_rcnn_uniperceiver_adapter_base_fpn_3x_coco.pth.tar) | | Mask R-CNN | ViT-Adapter-L | [AugReg](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.6/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.pth) | 3x | Yes | 50.9 | 44.8 | 348M | [config](./configs/mask_rcnn/mask_rcnn_augreg_adapter_large_fpn_3x_coco.py) | [model](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.2/mask_rcnn_augreg_adapter_large_fpn_3x_coco.pth.tar) | -Advanced Detectors +**Advanced Detectors** | Method | Framework | Pre-train | Lr schd | Aug | box AP | mask AP | #Param | Config | Download | |:-------------:|:-------------------:|:---------------------------------------------------------------------------------:|:-------:|:---:|:------:|:-------:|:------:|:--------------------------------------------------------------------------------------:|:--------------------------------------------------------------------------------------------------------------------------------:| @@ -71,8 +100,8 @@ Advanced Detectors | ViT-Adapter-S | ATSS | [DeiT-S](https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth) | 3x | Yes | 49.6 | - | 36M | [config](./configs/atss/atss_deit_adapter_small_fpn_3x_coco.py) | [model](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.5/atss_deit_adapter_small_fpn_3x_coco.pth.tar) | | ViT-Adapter-S | GFL | [DeiT-S](https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth) | 3x | Yes | 50.0 | - | 36M | [config](./configs/gfl/gfl_deit_adapter_small_fpn_3x_coco.py) | [model](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.6/gfl_deit_adapter_small_fpn_3x_coco.pth.tar) | | ViT-Adapter-S | Sparse R-CNN | [DeiT-S](https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth) | 3x | Yes | 48.1 | - | 110M | [config](./configs/sparse_rcnn/sparse_rcnn_deit_adapter_small_fpn_3x_coco.py) | [model](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.6/sparse_rcnn_deit_adapter_small_fpn_3x_coco.pth.tar) | -| ViT-Adapter-B | Upgraded Mask R-CNN | [MAE](https://dl.fbaipublicfiles.com/mae/pretrain/mae_pretrain_vit_base.pth) | 25ep | LSJ | 50.3 | 44.7 | 122M | [config](./configs/upgraded_mask_rcnn/mask_rcnn_mae_adapter_base_lsj_fpn_25ep_coco.py) | [model](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.4/mask_rcnn_mae_adapter_base_lsj_fpn_25ep_coco.pth.tar) | -| ViT-Adapter-B | Upgraded Mask R-CNN | [MAE](https://dl.fbaipublicfiles.com/mae/pretrain/mae_pretrain_vit_base.pth) | 50ep | LSJ | 50.8 | 45.1 | 122M | 
[config](./configs/upgraded_mask_rcnn/mask_rcnn_mae_adapter_base_lsj_fpn_50ep_coco.py) | [model](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.4/mask_rcnn_mae_adapter_base_lsj_fpn_50ep_coco.pth.tar) | +| ViT-Adapter-B | Upgraded Mask R-CNN | [MAE-B](https://dl.fbaipublicfiles.com/mae/pretrain/mae_pretrain_vit_base.pth) | 25ep | LSJ | 50.3 | 44.7 | 122M | [config](./configs/upgraded_mask_rcnn/mask_rcnn_mae_adapter_base_lsj_fpn_25ep_coco.py) | [model](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.4/mask_rcnn_mae_adapter_base_lsj_fpn_25ep_coco.pth.tar) | +| ViT-Adapter-B | Upgraded Mask R-CNN | [MAE-B](https://dl.fbaipublicfiles.com/mae/pretrain/mae_pretrain_vit_base.pth) | 50ep | LSJ | 50.8 | 45.1 | 122M | [config](./configs/upgraded_mask_rcnn/mask_rcnn_mae_adapter_base_lsj_fpn_50ep_coco.py) | [model](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.4/mask_rcnn_mae_adapter_base_lsj_fpn_50ep_coco.pth.tar) | ## Evaluation @@ -86,32 +115,32 @@ This should give ``` Evaluate annotation type *bbox* - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.579 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = 0.766 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = 0.635 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=1000 ] = 0.436 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=1000 ] = 0.616 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=1000 ] = 0.726 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.736 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=300 ] = 0.736 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=1000 ] = 0.736 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=1000 ] = 0.608 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=1000 ] = 0.768 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=1000 ] = 0.863 + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.584 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = 0.771 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = 0.642 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=1000 ] = 0.441 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=1000 ] = 0.622 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=1000 ] = 0.725 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.742 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=300 ] = 0.742 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=1000 ] = 0.742 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=1000 ] = 0.615 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=1000 ] = 0.775 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=1000 ] = 0.864 Evaluate annotation type *segm* - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.502 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = 0.744 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = 0.549 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=1000 ] = 0.328 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=1000 ] = 0.533 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=1000 ] = 0.683 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.638 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=300 ] = 
0.638 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=1000 ] = 0.638 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=1000 ] = 0.499 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=1000 ] = 0.669 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=1000 ] = 0.776 + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.508 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=1000 ] = 0.750 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=1000 ] = 0.556 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=1000 ] = 0.331 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=1000 ] = 0.542 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=1000 ] = 0.687 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.645 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=300 ] = 0.645 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=1000 ] = 0.645 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=1000 ] = 0.503 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=1000 ] = 0.681 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=1000 ] = 0.780 ``` ## Training diff --git a/detection/configs/htc++/README.md b/detection/configs/htc++/README.md index d74ca2cf4..329dfd8b9 100644 --- a/detection/configs/htc++/README.md +++ b/detection/configs/htc++/README.md @@ -27,19 +27,104 @@ detection ## Results and Models -The results on **COCO 2017val** are shown in the below table. +The results on COCO mini-val and test-dev are shown in the below table. -| Backbone | Pre-train | Lr schd | box AP | mask AP | #Param | Config | Download | -|:------------------:|:---------------------------------------------------------------------------------------------------------------------:|:-------:|:------:|:-------:|:------:|:---------------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| -| ViT-Adapter-L | [BEiT-L](https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth) | 3x | 57.9 | 50.2 | 401M | [config](./htc++_beit_adapter_large_fpn_3x_coco.py) | [model](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.0/htc++_beit_adapter_large_fpn_3x_coco.pth.tar) \| [log](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.0/20220512_030903.log) | -| ViT-Adapter-L (MS) | - | - | 59.8 | 51.7 | 401M | TODO | - | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Backbone | Pre-train | Lr schd | mini-val box AP | mini-val mask AP | test-dev box AP | test-dev mask AP | #Param | Config | Download |
+|:--------:|:---------:|:-------:|:---------------:|:----------------:|:---------------:|:----------------:|:------:|:------:|:--------:|
+| ViT-Adapter-L | [BEiT-L](https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth) | 3x | 58.4 | 50.8 | 58.9 | 51.3 | 401M | [config](./htc++_beit_adapter_large_fpn_3x_coco.py) | model \| log |
+| ViT-Adapter-L (MS) | [BEiT-L](https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth) | 3x | 60.2 | 52.2 | 60.4 | 52.5 | 401M | - | - |
- MS denotes multi-scale testing. Note that the ms config is only for testing. - We use 16 A100 GPUs with 1 image/GPU for ViT-Adapter-L models. -The results on **COCO 2017test-dev** are shown in the below table. +## Old Results -| Backbone | Pre-train | Lr schd | box AP | mask AP | #Param | Config | Download | -|:------------------:|:---------------------------------------------------------------------------------------------------------------------:|:-------:|:------:|:-------:|:------:|:---------------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| -| ViT-Adapter-L | [BEiT-L](https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth) | 3x | 58.5 | 50.8 | 401M | [config](./htc++_beit_adapter_large_fpn_3x_coco.py) | [model](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.0/htc++_beit_adapter_large_fpn_3x_coco.pth.tar) \| [log](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.0/20220512_030903.log) | -| ViT-Adapter-L (MS) | - | - | 60.1 | 52.1 | 401M | TODO | - | +The results on COCO mini-val and test-dev are shown in the below table. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Backbone | Pre-train | Lr schd | mini-val box AP | mini-val mask AP | test-dev box AP | test-dev mask AP | #Param | Config | Download |
+|:--------:|:---------:|:-------:|:---------------:|:----------------:|:---------------:|:----------------:|:------:|:------:|:--------:|
+| ViT-Adapter-L | [BEiT-L](https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth) | 3x | 57.9 | 50.2 | 58.5 | 50.8 | 401M | [config](./htc++_beit_adapter_large_fpn_3x_coco_old.py) | [model](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.0/htc++_beit_adapter_large_fpn_3x_coco.pth.tar) \| [log](https://github.com/czczup/ViT-Adapter/releases/download/v0.1.0/20220512_030903.log) |
+| ViT-Adapter-L (MS) | [BEiT-L](https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth) | 3x | 59.8 | 51.7 | 60.1 | 52.1 | 401M | - | - |
+ +- MS denotes multi-scale testing. Note that the ms config is only for testing. +- We use 16 A100 GPUs with 1 image/GPU for ViT-Adapter-L models. diff --git a/detection/configs/htc++/htc++_beit_adapter_large_fpn_3x_coco.py b/detection/configs/htc++/htc++_beit_adapter_large_fpn_3x_coco.py index 6fe6fc8db..a319b800e 100644 --- a/detection/configs/htc++/htc++_beit_adapter_large_fpn_3x_coco.py +++ b/detection/configs/htc++/htc++_beit_adapter_large_fpn_3x_coco.py @@ -20,7 +20,7 @@ use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-6, - drop_path_rate=0.3, # maybe 0.4 is better + drop_path_rate=0.4, conv_inplane=64, n_points=4, deform_num_heads=16, diff --git a/detection/configs/htc++/htc++_beit_adapter_large_fpn_3x_coco_old.py b/detection/configs/htc++/htc++_beit_adapter_large_fpn_3x_coco_old.py new file mode 100644 index 000000000..9505993a6 --- /dev/null +++ b/detection/configs/htc++/htc++_beit_adapter_large_fpn_3x_coco_old.py @@ -0,0 +1,326 @@ +# Copyright (c) Shanghai AI Lab. All rights reserved. +_base_ = [ + '../_base_/datasets/coco_instance.py', + '../_base_/schedules/schedule_3x.py', + '../_base_/default_runtime.py' +] +# pretrained = 'https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth' +pretrained = 'pretrained/beit_large_patch16_224_pt22k_ft22k.pth' +model = dict( + type='HybridTaskCascade', + backbone=dict( + type='BEiTAdapter', + img_size=224, + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + use_abs_pos_emb=False, + use_rel_pos_bias=True, + init_values=1e-6, + drop_path_rate=0.3, # maybe 0.4 is better + conv_inplane=64, + n_points=4, + deform_num_heads=16, + cffn_ratio=0.25, + deform_ratio=0.5, + window_attn=[True, True, True, True, True, True, + True, True, True, True, True, True, + True, True, True, True, True, True, + True, True, True, True, True, True], + window_size=[14, 14, 14, 14, 14, 56, + 14, 14, 14, 14, 14, 56, + 14, 14, 14, 14, 14, 56, + 14, 14, 14, 14, 14, 56], + interaction_indexes=[[0, 5], [6, 11], [12, 17], [18, 23]], + pretrained=pretrained, + version='old'), + neck=[ + dict( + type='ExtraAttention', + in_channels=[1024, 1024, 1024, 1024], + num_head=32, + with_ffn=True, + ffn_ratio=4.0, + drop_path=0.3, + ), + dict( + type='FPN', + in_channels=[1024, 1024, 1024, 1024], + out_channels=256, + num_outs=5)], + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='HybridTaskCascadeRoIHead', + interleaved=True, + mask_info_flow=True, + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared4Conv1FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + 
loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + dict( + type='Shared4Conv1FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + dict( + type='Shared4Conv1FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=[ + dict( + type='HTCMaskHead', + with_conv_res=False, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)), + dict( + type='HTCMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)), + dict( + type='HTCMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)) + ], + semantic_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[8]), + semantic_head=dict( + type='FusedSemanticHead', + num_ins=5, + fusion_level=1, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=183, + loss_seg=dict( + type='CrossEntropyLoss', ignore_index=255, loss_weight=0.2))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type='soft_nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + 
max_per_img=1000, + nms=dict(type='soft_nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.001, + nms=dict(type='soft_nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) +# optimizer +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +# augmentation strategy originates from DETR / Sparse RCNN +# file_client_args = dict(backend='petrel') +file_client_args = dict(backend='disk') +train_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='InstaBoost', + action_candidate=('normal', 'horizontal', 'skip'), + action_prob=(1, 0, 0), + scale=(0.8, 1.2), + dx=15, + dy=15, + theta=(-1, 1), + color_prob=0.5, + hflag=False, + aug_ratio=0.5), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True, + with_seg=True), + dict(type='Resize', + img_scale=[(1600, 400), (1600, 1400)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='SegRescale', scale_factor=1 / 8), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=file_client_args), + dict(type='MultiScaleFlipAug', + img_scale=(1600, 1400), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict(samples_per_gpu=1, + train=dict(seg_prefix='data/coco/stuffthingmaps/train2017/', + pipeline=train_pipeline), + test=dict(pipeline=test_pipeline), + val=dict(pipeline=test_pipeline)) +optimizer = dict(_delete_=True, + type='AdamW', + lr=0.0001, + betas=(0.9, 0.999), + weight_decay=0.05, + constructor='LayerDecayOptimizerConstructor', + paramwise_cfg=dict(num_layers=24, layer_decay_rate=0.90)) +optimizer_config = dict(grad_clip=None) +checkpoint_config = dict( + interval=1, + max_keep_ckpts=2, + save_last=True, +) +# fp16 = dict(loss_scale=dict(init_scale=512)) +# find_unused_parameters=True diff --git a/detection/mmdet_custom/models/backbones/beit_adapter.py b/detection/mmdet_custom/models/backbones/beit_adapter.py index 0fda43780..200bf217c 100644 --- a/detection/mmdet_custom/models/backbones/beit_adapter.py +++ b/detection/mmdet_custom/models/backbones/beit_adapter.py @@ -20,12 +20,13 @@ class BEiTAdapter(BEiT): def __init__(self, pretrain_size=224, conv_inplane=64, n_points=4, deform_num_heads=6, init_values=0., cffn_ratio=0.25, deform_ratio=1.0, with_cffn=True, - interaction_indexes=None, add_vit_feature=True, *args, **kwargs): + interaction_indexes=None, add_vit_feature=True, version='new', *args, **kwargs): super().__init__(init_values=init_values, *args, **kwargs) self.num_classes = 80 # self.cls_token = None + self.version = version self.num_block = len(self.blocks) self.pretrain_size = (pretrain_size, pretrain_size) self.flags = [i for i in range(-1, self.num_block, self.num_block // 4)][1:] @@ -110,7 +111,8 @@ def forward(self, x): indexes = self.interaction_indexes[i] x, c = layer(x, c, self.blocks[indexes[0]:indexes[-1] + 1], deform_inputs1, deform_inputs2, H, W) - outs.append(x.transpose(1, 2).view(bs, dim, H, W).contiguous()) + if self.version == 'old': + outs.append(x.transpose(1, 2).view(bs, dim, H, W).contiguous()) # 
Split & Reshape c2 = c[:, 0:c2.size(1), :] @@ -123,7 +125,11 @@ def forward(self, x): c1 = self.up(c2) + c1 if self.add_vit_feature: - x1, x2, x3, x4 = outs + if self.version == 'old': + x1, x2, x3, x4 = outs + else: + x = x.transpose(1, 2).view(bs, dim, H, W).contiguous() + x1, x2, x3, x4 = x, x, x, x x1 = F.interpolate(x1, scale_factor=4, mode='bilinear', align_corners=False) x2 = F.interpolate(x2, scale_factor=2, mode='bilinear', align_corners=False) x4 = F.interpolate(x4, scale_factor=0.5, mode='bilinear', align_corners=False)
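
Note: the new `version` switch only affects how the ViT features that are added
back onto the adapter pyramid (`add_vit_feature`) are assembled. With 'old',
each pyramid level gets the output of its own interaction stage; with the new
behaviour, the final ViT output is reused at every level. A minimal,
self-contained sketch of the two code paths (toy shapes, not the repository's
actual module):

```python
import torch
import torch.nn.functional as F

bs, dim, H, W = 2, 8, 16, 16
# one flattened ViT feature per interaction stage, shape (bs, H*W, dim)
block_outs = [torch.randn(bs, H * W, dim) for _ in range(4)]

def to_map(tokens):
    # (bs, H*W, dim) -> (bs, dim, H, W)
    return tokens.transpose(1, 2).view(bs, dim, H, W).contiguous()

for version in ('old', 'new'):
    if version == 'old':
        # old checkpoints: each pyramid level uses its own stage's output
        x1, x2, x3, x4 = (to_map(t) for t in block_outs)
    else:
        # new checkpoints: the final ViT output is reused at every level
        x = to_map(block_outs[-1])
        x1, x2, x3, x4 = x, x, x, x
    # resize to the 4x / 2x / 1x / 0.5x resolutions of the adapter features c1..c4
    x1 = F.interpolate(x1, scale_factor=4, mode='bilinear', align_corners=False)
    x2 = F.interpolate(x2, scale_factor=2, mode='bilinear', align_corners=False)
    x4 = F.interpolate(x4, scale_factor=0.5, mode='bilinear', align_corners=False)
    print(version, [t.shape for t in (x1, x2, x3, x4)])
```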