From ee2e5423571d57d7f4156bade5a2cd6470fdffae Mon Sep 17 00:00:00 2001
From: Cycyes <92714336+Cycyes@users.noreply.github.com>
Date: Tue, 21 Nov 2023 09:13:26 +0800
Subject: [PATCH] Add GroundingDINO on ODinW results, and support caption
 prompt of GroundingDINO (#11187)

---
 configs/odinw/README.md                       | 116 +--
 .../grounding_dino_swin-b_pretrain_odinw13.py | 338 ++++++++
 .../grounding_dino_swin-b_pretrain_odinw35.py | 796 ++++++++++++++++++
 .../grounding_dino_swin-t_pretrain_odinw13.py | 338 ++++++++
 .../grounding_dino_swin-t_pretrain_odinw35.py | 796 ++++++++++++++++++
 mmdet/models/detectors/grounding_dino.py      | 105 ++-
 6 files changed, 2409 insertions(+), 80 deletions(-)
 create mode 100644 configs/odinw/grounding_dino_swin-b_pretrain_odinw13.py
 create mode 100644 configs/odinw/grounding_dino_swin-b_pretrain_odinw35.py
 create mode 100644 configs/odinw/grounding_dino_swin-t_pretrain_odinw13.py
 create mode 100644 configs/odinw/grounding_dino_swin-t_pretrain_odinw35.py

diff --git a/configs/odinw/README.md b/configs/odinw/README.md
index 108aac3685e..72d95933653 100644
--- a/configs/odinw/README.md
+++ b/configs/odinw/README.md
@@ -6,7 +6,7 @@
 
 ## Get Started
 
-1. development Developmennt Setup can reger to hits /\\To download dataset, you can refer to [reference document](../../docs/zh_cn/user_guides/dataset_prepare.md)
+1. To download dataset, you can refer to [reference document](../../docs/zh_cn/user_guides/dataset_prepare.md)
 
 2. You can use the following data to run the inference.
 
@@ -22,73 +22,75 @@ Learning visual representations from natural language supervision has recently s
 
 ## Results and models of odinw13
 
-| Method                | GLIP-T(A) | Official  | GLIP-T(B) | Official  | GLIP-T(C) | Official  |
-| --------------------- | --------- | --------- | --------- | --------- | --------- | --------- |
-| AerialMaritimeDrone   | 0.123     | 0.122     | 0.110     | 0.11      | 0.130     | 0.130     |
-| Aquarium              | 0.175     | 0.174     | 0.173     | 0.169     | 0.191     | 0.190     |
-| CottontailRabbits     | 0.686     | 0.686     | 0.688     | 0.688     | 0.744     | 0.744     |
-| EgoHands              | 0.013     | 0.013     | 0.003     | 0.540     | 0.314     | 0.315     |
-| NorthAmericaMushrooms | 0.502     | 0.502     | 0.367     | 0.051     | 0.297     | 0.296     |
-| Packages              | 0.589     | 0.589     | 0.083     | 0.030     | 0.699     | 0.699     |
-| PascalVOC             | 0.512     | 0.512     | 0.541     | 0.288     | 0.565     | 0.565     |
-| pistols               | 0.339     | 0.339     | 0.502     | 0.338     | 0.503     | 0.504     |
-| pothole               | 0.007     | 0.007     | 0.030     | 0.475     | 0.058     | 0.058     |
-| Raccoon               | 0.075     | 0.075     | 0.285     | 0.288     | 0.241     | 0.244     |
-| ShellfishOpenImages   | 0.372     | 0.372     | 0.337     | 0.338     | 0.300     | 0.302     |
-| thermalDogsAndPeople  | 0.372     | 0.372     | 0.475     | 0.475     | 0.510     | 0.510     |
-| VehiclesOpenImages    | 0.574     | 0.574     | 0.562     | 0.547     | 0.549     | 0.534     |
-| Average               | **0.334** | **0.324** | **0.320** | **0.318** | **0.392** | **0.392** |
+| Method                | GLIP-T(A) | Official  | GLIP-T(B) | Official  | GLIP-T(C) | Official  | GroundingDINO-T | GroundingDINO-B |
+| --------------------- | --------- | --------- | --------- | --------- | --------- | --------- | --------------- | --------------- |
+| AerialMaritimeDrone   | 0.123     | 0.122     | 0.110     | 0.110     | 0.130     | 0.130     | 0.173           | 0.281           |
+| Aquarium              | 0.175     | 0.174     | 0.173     | 0.169     | 0.191     | 0.190     | 0.195           | 0.445           |
+| CottontailRabbits     | 0.686     | 0.686     | 0.688     | 0.688     | 0.744     | 0.744     | 0.799           | 0.808           |
+| EgoHands              | 0.013     | 0.013     | 0.003     | 0.004     | 0.314     | 0.315     | 0.608           | 0.764           |
+| NorthAmericaMushrooms | 0.502     | 0.502     | 0.367     | 0.367     | 0.297     | 0.296     | 0.507           | 0.675           |
+| Packages              | 0.589     | 0.589     | 0.083     | 0.083     | 0.699     | 0.699     | 0.687           | 0.670           |
+| PascalVOC             | 0.512     | 0.512     | 0.541     | 0.540     | 0.565     | 0.565     | 0.563           | 0.711           |
+| pistols               | 0.339     | 0.339     | 0.502     | 0.501     | 0.503     | 0.504     | 0.726           | 0.771           |
+| pothole               | 0.007     | 0.007     | 0.030     | 0.030     | 0.058     | 0.058     | 0.215           | 0.478           |
+| Raccoon               | 0.075     | 0.074     | 0.285     | 0.288     | 0.241     | 0.244     | 0.549           | 0.541           |
+| ShellfishOpenImages   | 0.253     | 0.253     | 0.337     | 0.338     | 0.300     | 0.302     | 0.393           | 0.650           |
+| thermalDogsAndPeople  | 0.372     | 0.372     | 0.475     | 0.475     | 0.510     | 0.510     | 0.657           | 0.633           |
+| VehiclesOpenImages    | 0.574     | 0.566     | 0.562     | 0.547     | 0.549     | 0.534     | 0.613           | 0.647           |
+| Average               | **0.325** | **0.324** | **0.320** | **0.318** | **0.392** | **0.392** | **0.514**       | **0.621**       |
 
 Note:
 
 1. The above are zero-shot evaluation results.
-2. The config and weights can be found at [here](../glip/README.md)
+2. The configs and weights of the GLIP models can be found [here](../glip/README.md)
+3. The configs and weights of the GroundingDINO models can be found [here](../grounding_dino/README.md)
 
 ## Results and models of odinw35
 
-| Method                      | GLIP-T(A) | Official  | GLIP-T(B) | Official  | GLIP-T(C) | Official  |
-| --------------------------- | --------- | --------- | --------- | --------- | --------- | --------- |
-| AerialMaritimeDrone_large   | 0.123     | 0.122     | 0.110     | 0.110     | 0.130     | 0.130     |
-| AerialMaritimeDrone_tiled   | 0.174     | 0.174     | 0.172     | 0.172     | 0.172     | 0.172     |
-| AmericanSignLanguageLetters | 0.001     | 0.001     | 0.003     | 0.003     | 0.009     | 0.009     |
-| Aquarium                    | 0.175     | 0.175     | 0.173     | 0.171     | 0.192     | 0.182     |
-| BCCD                        | 0.016     | 0.016     | 0.001     | 0.001     | 0.000     | 0.000     |
-| boggleBoards                | 0.000     | 0.000     | 0.000     | 0.000     | 0.000     | 0.000     |
-| brackishUnderwater          | 0.016     | 0..013    | 0.021     | 0.027     | 0.020     | 0.022     |
-| ChessPieces                 | 0.001     | 0.001     | 0.000     | 0.000     | 0.001     | 0.001     |
-| CottontailRabbits           | 0.710     | 0.709     | 0.683     | 0.683     | 0.752     | 0.752     |
-| dice                        | 0.005     | 0.005     | 0.004     | 0.004     | 0.004     | 0.004     |
-| DroneControl                | 0.016     | 0.017     | 0.006     | 0.008     | 0.005     | 0.007     |
-| EgoHands_generic            | 0.009     | 0.010     | 0.005     | 0.006     | 0.510     | 0.508     |
-| EgoHands_specific           | 0.001     | 0.001     | 0.004     | 0.006     | 0.003     | 0.004     |
-| HardHatWorkers              | 0.029     | 0.029     | 0.023     | 0.023     | 0.033     | 0.033     |
-| MaskWearing                 | 0.007     | 0.007     | 0.003     | 0.002     | 0.005     | 0.005     |
-| MountainDewCommercial       | 0.218     | 0.227     | 0.199     | 0.197     | 0.478     | 0.463     |
-| NorthAmericaMushrooms       | 0.502     | 0.502     | 0.450     | 0.450     | 0.497     | 0.497     |
-| openPoetryVision            | 0.000     | 0.000     | 0.000     | 0.000     | 0.000     | 0.000     |
-| OxfordPets_by_breed         | 0.001     | 0.002     | 0.002     | 0.004     | 0.001     | 0.002     |
-| OxfordPets_by_species       | 0.016     | 0.011     | 0.012     | 0.009     | 0.013     | 0.009     |
-| PKLot                       | 0.002     | 0.002     | 0.000     | 0.000     | 0.000     | 0.000     |
-| Packages                    | 0.569     | 0.569     | 0.279     | 0.279     | 0.712     | 0.712     |
-| PascalVOC                   | 0.512     | 0.512     | 0.541     | 0.540     | 0.565     | 0.565     |
-| pistols                     | 0.339     | 0.339     | 0.502     | 0.501     | 0.503     | 0.504     |
-| plantdoc                    | 0.002     | 0.002     | 0.007     | 0.007     | 0.009     | 0.009     |
-| pothole                     | 0.007     | 0.010     | 0.024     | 0.025     | 0.085     | 0.101     |
-| Raccoons                    | 0.075     | 0.074     | 0.285     | 0.288     | 0.241     | 0.244     |
-| selfdrivingCar              | 0.071     | 0.072     | 0.074     | 0.074     | 0.081     | 0.080     |
-| ShellfishOpenImages         | 0.253     | 0.253     | 0.337     | 0.338     | 0.300     | 0.302     |
-| ThermalCheetah              | 0.028     | 0.028     | 0.000     | 0.000     | 0.028     | 0.028     |
-| thermalDogsAndPeople        | 0.372     | 0.372     | 0.475     | 0.475     | 0.510     | 0.510     |
-| UnoCards                    | 0.000     | 0.000     | 0.000     | 0.001     | 0.002     | 0.003     |
-| VehiclesOpenImages          | 0.574     | 0.566     | 0.562     | 0.547     | 0.549     | 0.534     |
-| WildfireSmoke               | 0.000     | 0.000     | 0.000     | 0.000     | 0.017     | 0.017     |
-| websiteScreenshots          | 0.003     | 0.004     | 0.003     | 0.005     | 0.005     | 0.006     |
-| Average                     | **0.134** | **0.134** | **0.138** | **0.138** | **0.179** | **0.178** |
+| Method                      | GLIP-T(A) | Official  | GLIP-T(B) | Official  | GLIP-T(C) | Official  | GroundingDINO-T | GroundingDINO-B |
+| --------------------------- | --------- | --------- | --------- | --------- | --------- | --------- | --------------- | --------------- |
+| AerialMaritimeDrone_large   | 0.123     | 0.122     | 0.110     | 0.110     | 0.130     | 0.130     | 0.173           | 0.281           |
+| AerialMaritimeDrone_tiled   | 0.174     | 0.174     | 0.172     | 0.172     | 0.172     | 0.172     | 0.206           | 0.364           |
+| AmericanSignLanguageLetters | 0.001     | 0.001     | 0.003     | 0.003     | 0.009     | 0.009     | 0.002           | 0.096           |
+| Aquarium                    | 0.175     | 0.175     | 0.173     | 0.171     | 0.192     | 0.182     | 0.195           | 0.445           |
+| BCCD                        | 0.016     | 0.016     | 0.001     | 0.001     | 0.000     | 0.000     | 0.161           | 0.584           |
+| boggleBoards                | 0.000     | 0.000     | 0.000     | 0.000     | 0.000     | 0.000     | 0.000           | 0.134           |
+| brackishUnderwater          | 0.016     | 0.013     | 0.021     | 0.027     | 0.020     | 0.022     | 0.021           | 0.454           |
+| ChessPieces                 | 0.001     | 0.001     | 0.000     | 0.000     | 0.001     | 0.001     | 0.000           | 0.000           |
+| CottontailRabbits           | 0.710     | 0.709     | 0.683     | 0.683     | 0.752     | 0.752     | 0.806           | 0.797           |
+| dice                        | 0.005     | 0.005     | 0.004     | 0.004     | 0.004     | 0.004     | 0.004           | 0.082           |
+| DroneControl                | 0.016     | 0.017     | 0.006     | 0.008     | 0.005     | 0.007     | 0.042           | 0.638           |
+| EgoHands_generic            | 0.009     | 0.010     | 0.005     | 0.006     | 0.510     | 0.508     | 0.608           | 0.764           |
+| EgoHands_specific           | 0.001     | 0.001     | 0.004     | 0.006     | 0.003     | 0.004     | 0.002           | 0.687           |
+| HardHatWorkers              | 0.029     | 0.029     | 0.023     | 0.023     | 0.033     | 0.033     | 0.046           | 0.439           |
+| MaskWearing                 | 0.007     | 0.007     | 0.003     | 0.002     | 0.005     | 0.005     | 0.004           | 0.406           |
+| MountainDewCommercial       | 0.218     | 0.227     | 0.199     | 0.197     | 0.478     | 0.463     | 0.430           | 0.580           |
+| NorthAmericaMushrooms       | 0.502     | 0.502     | 0.450     | 0.450     | 0.497     | 0.497     | 0.471           | 0.501           |
+| openPoetryVision            | 0.000     | 0.000     | 0.000     | 0.000     | 0.000     | 0.000     | 0.000           | 0.051           |
+| OxfordPets_by_breed         | 0.001     | 0.002     | 0.002     | 0.004     | 0.001     | 0.002     | 0.003           | 0.799           |
+| OxfordPets_by_species       | 0.016     | 0.011     | 0.012     | 0.009     | 0.013     | 0.009     | 0.011           | 0.872           |
+| PKLot                       | 0.002     | 0.002     | 0.000     | 0.000     | 0.000     | 0.000     | 0.001           | 0.774           |
+| Packages                    | 0.569     | 0.569     | 0.279     | 0.279     | 0.712     | 0.712     | 0.695           | 0.728           |
+| PascalVOC                   | 0.512     | 0.512     | 0.541     | 0.540     | 0.565     | 0.565     | 0.563           | 0.711           |
+| pistols                     | 0.339     | 0.339     | 0.502     | 0.501     | 0.503     | 0.504     | 0.726           | 0.771           |
+| plantdoc                    | 0.002     | 0.002     | 0.007     | 0.007     | 0.009     | 0.009     | 0.005           | 0.376           |
+| pothole                     | 0.007     | 0.010     | 0.024     | 0.025     | 0.085     | 0.101     | 0.215           | 0.478           |
+| Raccoons                    | 0.075     | 0.074     | 0.285     | 0.288     | 0.241     | 0.244     | 0.549           | 0.541           |
+| selfdrivingCar              | 0.071     | 0.072     | 0.074     | 0.074     | 0.081     | 0.080     | 0.089           | 0.318           |
+| ShellfishOpenImages         | 0.253     | 0.253     | 0.337     | 0.338     | 0.300     | 0.302     | 0.393           | 0.650           |
+| ThermalCheetah              | 0.028     | 0.028     | 0.000     | 0.000     | 0.028     | 0.028     | 0.087           | 0.290           |
+| thermalDogsAndPeople        | 0.372     | 0.372     | 0.475     | 0.475     | 0.510     | 0.510     | 0.657           | 0.633           |
+| UnoCards                    | 0.000     | 0.000     | 0.000     | 0.001     | 0.002     | 0.003     | 0.006           | 0.754           |
+| VehiclesOpenImages          | 0.574     | 0.566     | 0.562     | 0.547     | 0.549     | 0.534     | 0.613           | 0.647           |
+| WildfireSmoke               | 0.000     | 0.000     | 0.000     | 0.000     | 0.017     | 0.017     | 0.134           | 0.410           |
+| websiteScreenshots          | 0.003     | 0.004     | 0.003     | 0.005     | 0.005     | 0.006     | 0.012           | 0.175           |
+| Average                     | **0.134** | **0.134** | **0.138** | **0.138** | **0.179** | **0.178** | **0.227**       | **0.492**       |
 
 Note:
 
 1. The above are zero-shot evaluation results.
-2. The config and weights can be found at [here](../glip/README.md)
+2. The configs and weights of the GLIP models can be found [here](../glip/README.md)
+3. The configs and weights of the GroundingDINO models can be found [here](../grounding_dino/README.md)
 
 ## Citation
 
diff --git a/configs/odinw/grounding_dino_swin-b_pretrain_odinw13.py b/configs/odinw/grounding_dino_swin-b_pretrain_odinw13.py
new file mode 100644
index 00000000000..b853d23fafe
--- /dev/null
+++ b/configs/odinw/grounding_dino_swin-b_pretrain_odinw13.py
@@ -0,0 +1,338 @@
+_base_ = '../grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py'
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    test_mode=True,
+    pipeline=base_test_pipeline,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'penguin': {
+#         'suffix': ', which is black and white'
+#     },
+#     'puffin': {
+#         'suffix': ' with orange beaks'
+#     },
+#     'stingray': {
+#         'suffix': ' which is flat and round'
+#     },
+# }
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 CottontailRabbits---------------------#
+class_name = ('Cottontail-Rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+
+caption_prompt = None
+# caption_prompt = {'Cottontail-Rabbit': {'name': 'rabbit'}}
+
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 EgoHands---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+
+caption_prompt = None
+# caption_prompt = {'hand': {'suffix': ' of a person'}}
+
+dataset_EgoHands = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 NorthAmericaMushrooms---------------------#
+class_name = ('CoW', 'chanterelle')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+
+caption_prompt = None
+# caption_prompt = {
+#     'CoW': {
+#         'name': 'flat mushroom'
+#     },
+#     'chanterelle': {
+#         'name': 'yellow mushroom'
+#     }
+# }
+
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'package': {
+#         'prefix': 'there is a ',
+#         'suffix': ' on the porch'
+#     }
+# }
+
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'pothole': {
+#         'prefix': 'there are some ',
+#         'name': 'holes',
+#         'suffix': ' on the road'
+#     }
+# }
+
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,  # forward the pothole prompt defined above
+    test_mode=True, return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 Raccoon---------------------#
+class_name = ('raccoon', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# --------------------- Config---------------------#
+dataset_prefixes = [
+    'AerialMaritimeDrone', 'Aquarium', 'CottontailRabbits', 'EgoHands',
+    'NorthAmericaMushrooms', 'Packages', 'PascalVOC', 'pistols', 'pothole',
+    'Raccoon', 'ShellfishOpenImages', 'thermalDogsAndPeople',
+    'VehiclesOpenImages'
+]
+datasets = [
+    dataset_AerialMaritimeDrone, dataset_Aquarium, dataset_CottontailRabbits,
+    dataset_EgoHands, dataset_NorthAmericaMushrooms, dataset_Packages,
+    dataset_PascalVOC, dataset_pistols, dataset_pothole, dataset_Raccoon,
+    dataset_ShellfishOpenImages, dataset_thermalDogsAndPeople,
+    dataset_VehiclesOpenImages
+]
+metrics = [
+    val_evaluator_AerialMaritimeDrone, val_evaluator_Aquarium,
+    val_evaluator_CottontailRabbits, val_evaluator_EgoHands,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_Packages,
+    val_evaluator_PascalVOC, val_evaluator_pistols, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_ShellfishOpenImages,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_VehiclesOpenImages
+]
+
+# -------------------------------------------------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/configs/odinw/grounding_dino_swin-b_pretrain_odinw35.py b/configs/odinw/grounding_dino_swin-b_pretrain_odinw35.py
new file mode 100644
index 00000000000..a4b546b5998
--- /dev/null
+++ b/configs/odinw/grounding_dino_swin-b_pretrain_odinw35.py
@@ -0,0 +1,796 @@
+_base_ = '../grounding_dino/grounding_dino_swin-b_pretrain_mixeddata.py'
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone_large---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone_large = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_large = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 AerialMaritimeDrone_tiled---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/tiled/'
+dataset_AerialMaritimeDrone_tiled = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_tiled = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 AmericanSignLanguageLetters---------------------#
+class_name = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
+              'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AmericanSignLanguageLetters/American Sign Language Letters.v1-v1.coco/'  # noqa
+dataset_AmericanSignLanguageLetters = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AmericanSignLanguageLetters = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 BCCD---------------------#
+class_name = ('Platelets', 'RBC', 'WBC')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'BCCD/BCCD.v3-raw.coco/'
+dataset_BCCD = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_BCCD = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 boggleBoards---------------------#
+class_name = ('Q', 'a', 'an', 'b', 'c', 'd', 'e', 'er', 'f', 'g', 'h', 'he',
+              'i', 'in', 'j', 'k', 'l', 'm', 'n', 'o', 'o ', 'p', 'q', 'qu',
+              'r', 's', 't', 't\\', 'th', 'u', 'v', 'w', 'wild', 'x', 'y', 'z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'boggleBoards/416x416AutoOrient/export/'
+dataset_boggleBoards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_boggleBoards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 brackishUnderwater---------------------#
+class_name = ('crab', 'fish', 'jellyfish', 'shrimp', 'small_fish', 'starfish')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'brackishUnderwater/960x540/'
+dataset_brackishUnderwater = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_brackishUnderwater = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 ChessPieces---------------------#
+class_name = ('  ', 'black bishop', 'black king', 'black knight', 'black pawn',
+              'black queen', 'black rook', 'white bishop', 'white king',
+              'white knight', 'white pawn', 'white queen', 'white rook')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ChessPieces/Chess Pieces.v23-raw.coco/'
+dataset_ChessPieces = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ChessPieces = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 CottontailRabbits---------------------#
+class_name = ('rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 dice---------------------#
+class_name = ('1', '2', '3', '4', '5', '6')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'dice/mediumColor/export/'
+dataset_dice = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_dice = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 DroneControl---------------------#
+class_name = ('follow', 'follow_hand', 'land', 'land_hand', 'null', 'object',
+              'takeoff', 'takeoff-hand')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'DroneControl/Drone Control.v3-raw.coco/'
+dataset_DroneControl = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_DroneControl = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 EgoHands_generic---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+caption_prompt = {'hand': {'suffix': ' of a person'}}
+dataset_EgoHands_generic = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    # NOTE: with caption_prompt 0.548; without it 0.764, so it is disabled.
+    # caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_generic = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 EgoHands_specific---------------------#
+class_name = ('myleft', 'myright', 'yourleft', 'yourright')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/specific/'
+dataset_EgoHands_specific = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_specific = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------14 HardHatWorkers---------------------#
+class_name = ('head', 'helmet', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'HardHatWorkers/raw/'
+dataset_HardHatWorkers = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_HardHatWorkers = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------15 MaskWearing---------------------#
+class_name = ('mask', 'no-mask')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MaskWearing/raw/'
+dataset_MaskWearing = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MaskWearing = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------16 MountainDewCommercial---------------------#
+class_name = ('bottle', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MountainDewCommercial/'
+dataset_MountainDewCommercial = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MountainDewCommercial = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------17 NorthAmericaMushrooms---------------------#
+class_name = ('flat mushroom', 'yellow mushroom')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------18 openPoetryVision---------------------#
+class_name = ('American Typewriter', 'Andale Mono', 'Apple Chancery', 'Arial',
+              'Avenir', 'Baskerville', 'Big Caslon', 'Bradley Hand',
+              'Brush Script MT', 'Chalkboard', 'Comic Sans MS', 'Copperplate',
+              'Courier', 'Didot', 'Futura', 'Geneva', 'Georgia', 'Gill Sans',
+              'Helvetica', 'Herculanum', 'Impact', 'Kefa', 'Lucida Grande',
+              'Luminari', 'Marker Felt', 'Menlo', 'Monaco', 'Noteworthy',
+              'Optima', 'PT Sans', 'PT Serif', 'Palatino', 'Papyrus',
+              'Phosphate', 'Rockwell', 'SF Pro', 'SignPainter', 'Skia',
+              'Snell Roundhand', 'Tahoma', 'Times New Roman', 'Trebuchet MS',
+              'Verdana')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'openPoetryVision/512x512/'
+dataset_openPoetryVision = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_openPoetryVision = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------19 OxfordPets_by_breed---------------------#
+class_name = ('cat-Abyssinian', 'cat-Bengal', 'cat-Birman', 'cat-Bombay',
+              'cat-British_Shorthair', 'cat-Egyptian_Mau', 'cat-Maine_Coon',
+              'cat-Persian', 'cat-Ragdoll', 'cat-Russian_Blue', 'cat-Siamese',
+              'cat-Sphynx', 'dog-american_bulldog',
+              'dog-american_pit_bull_terrier', 'dog-basset_hound',
+              'dog-beagle', 'dog-boxer', 'dog-chihuahua',
+              'dog-english_cocker_spaniel', 'dog-english_setter',
+              'dog-german_shorthaired', 'dog-great_pyrenees', 'dog-havanese',
+              'dog-japanese_chin', 'dog-keeshond', 'dog-leonberger',
+              'dog-miniature_pinscher', 'dog-newfoundland', 'dog-pomeranian',
+              'dog-pug', 'dog-saint_bernard', 'dog-samoyed',
+              'dog-scottish_terrier', 'dog-shiba_inu',
+              'dog-staffordshire_bull_terrier', 'dog-wheaten_terrier',
+              'dog-yorkshire_terrier')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-breed/'  # noqa
+dataset_OxfordPets_by_breed = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_breed = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------20 OxfordPets_by_species---------------------#
+class_name = ('cat', 'dog')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-species/'  # noqa
+dataset_OxfordPets_by_species = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_species = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------21 PKLot---------------------#
+class_name = ('space-empty', 'space-occupied')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PKLot/640/'  # noqa
+dataset_PKLot = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PKLot = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------22 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+caption_prompt = {
+    'package': {
+        'prefix': 'there is a ',
+        'suffix': ' on the porch'
+    }
+}
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,  # NOTE w. prompt 0.728; wo. prompt 0.670
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------23 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------24 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------25 plantdoc---------------------#
+class_name = ('Apple Scab Leaf', 'Apple leaf', 'Apple rust leaf',
+              'Bell_pepper leaf', 'Bell_pepper leaf spot', 'Blueberry leaf',
+              'Cherry leaf', 'Corn Gray leaf spot', 'Corn leaf blight',
+              'Corn rust leaf', 'Peach leaf', 'Potato leaf',
+              'Potato leaf early blight', 'Potato leaf late blight',
+              'Raspberry leaf', 'Soyabean leaf', 'Soybean leaf',
+              'Squash Powdery mildew leaf', 'Strawberry leaf',
+              'Tomato Early blight leaf', 'Tomato Septoria leaf spot',
+              'Tomato leaf', 'Tomato leaf bacterial spot',
+              'Tomato leaf late blight', 'Tomato leaf mosaic virus',
+              'Tomato leaf yellow virus', 'Tomato mold leaf',
+              'Tomato two spotted spider mites leaf', 'grape leaf',
+              'grape leaf black rot')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'plantdoc/416x416/'
+dataset_plantdoc = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_plantdoc = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------26 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+caption_prompt = {
+    'pothole': {
+        'name': 'holes',
+        'prefix': 'there are some ',
+        'suffix': ' on the road'
+    }
+}
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    # NOTE: with caption_prompt 0.221; without it 0.478, so it is disabled.
+    # caption_prompt=caption_prompt,
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------27 Raccoon---------------------#
+class_name = ('raccoon', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------28 selfdrivingCar---------------------#
+class_name = ('biker', 'car', 'pedestrian', 'trafficLight',
+              'trafficLight-Green', 'trafficLight-GreenLeft',
+              'trafficLight-Red', 'trafficLight-RedLeft',
+              'trafficLight-Yellow', 'trafficLight-YellowLeft', 'truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'selfdrivingCar/fixedLarge/export/'
+dataset_selfdrivingCar = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_selfdrivingCar = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------29 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------30 ThermalCheetah---------------------#
+class_name = ('cheetah', 'human')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ThermalCheetah/'
+dataset_ThermalCheetah = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ThermalCheetah = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------31 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------32 UnoCards---------------------#
+class_name = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
+              '12', '13', '14')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'UnoCards/raw/'
+dataset_UnoCards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_UnoCards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------33 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------34 WildfireSmoke---------------------#
+class_name = ('smoke', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'WildfireSmoke/'
+dataset_WildfireSmoke = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_WildfireSmoke = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------35 websiteScreenshots---------------------#
+class_name = ('button', 'field', 'heading', 'iframe', 'image', 'label', 'link',
+              'text')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'websiteScreenshots/'
+dataset_websiteScreenshots = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_websiteScreenshots = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# --------------------- Config---------------------#
+
+dataset_prefixes = [
+    'AerialMaritimeDrone_large',
+    'AerialMaritimeDrone_tiled',
+    'AmericanSignLanguageLetters',
+    'Aquarium',
+    'BCCD',
+    'boggleBoards',
+    'brackishUnderwater',
+    'ChessPieces',
+    'CottontailRabbits',
+    'dice',
+    'DroneControl',
+    'EgoHands_generic',
+    'EgoHands_specific',
+    'HardHatWorkers',
+    'MaskWearing',
+    'MountainDewCommercial',
+    'NorthAmericaMushrooms',
+    'openPoetryVision',
+    'OxfordPets_by_breed',
+    'OxfordPets_by_species',
+    'PKLot',
+    'Packages',
+    'PascalVOC',
+    'pistols',
+    'plantdoc',
+    'pothole',
+    'Raccoon',  # same spelling as dataset_Raccoon / val_evaluator_Raccoon
+    'selfdrivingCar',
+    'ShellfishOpenImages',
+    'ThermalCheetah',
+    'thermalDogsAndPeople',
+    'UnoCards',
+    'VehiclesOpenImages',
+    'WildfireSmoke',
+    'websiteScreenshots',
+]
+
+datasets = [
+    dataset_AerialMaritimeDrone_large, dataset_AerialMaritimeDrone_tiled,
+    dataset_AmericanSignLanguageLetters, dataset_Aquarium, dataset_BCCD,
+    dataset_boggleBoards, dataset_brackishUnderwater, dataset_ChessPieces,
+    dataset_CottontailRabbits, dataset_dice, dataset_DroneControl,
+    dataset_EgoHands_generic, dataset_EgoHands_specific,
+    dataset_HardHatWorkers, dataset_MaskWearing, dataset_MountainDewCommercial,
+    dataset_NorthAmericaMushrooms, dataset_openPoetryVision,
+    dataset_OxfordPets_by_breed, dataset_OxfordPets_by_species, dataset_PKLot,
+    dataset_Packages, dataset_PascalVOC, dataset_pistols, dataset_plantdoc,
+    dataset_pothole, dataset_Raccoon, dataset_selfdrivingCar,
+    dataset_ShellfishOpenImages, dataset_ThermalCheetah,
+    dataset_thermalDogsAndPeople, dataset_UnoCards, dataset_VehiclesOpenImages,
+    dataset_WildfireSmoke, dataset_websiteScreenshots
+]
+
+metrics = [
+    val_evaluator_AerialMaritimeDrone_large,
+    val_evaluator_AerialMaritimeDrone_tiled,
+    val_evaluator_AmericanSignLanguageLetters, val_evaluator_Aquarium,
+    val_evaluator_BCCD, val_evaluator_boggleBoards,
+    val_evaluator_brackishUnderwater, val_evaluator_ChessPieces,
+    val_evaluator_CottontailRabbits, val_evaluator_dice,
+    val_evaluator_DroneControl, val_evaluator_EgoHands_generic,
+    val_evaluator_EgoHands_specific, val_evaluator_HardHatWorkers,
+    val_evaluator_MaskWearing, val_evaluator_MountainDewCommercial,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_openPoetryVision,
+    val_evaluator_OxfordPets_by_breed, val_evaluator_OxfordPets_by_species,
+    val_evaluator_PKLot, val_evaluator_Packages, val_evaluator_PascalVOC,
+    val_evaluator_pistols, val_evaluator_plantdoc, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_selfdrivingCar,
+    val_evaluator_ShellfishOpenImages, val_evaluator_ThermalCheetah,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_UnoCards,
+    val_evaluator_VehiclesOpenImages, val_evaluator_WildfireSmoke,
+    val_evaluator_websiteScreenshots
+]
+
+# -------------------------------------------------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/configs/odinw/grounding_dino_swin-t_pretrain_odinw13.py b/configs/odinw/grounding_dino_swin-t_pretrain_odinw13.py
new file mode 100644
index 00000000000..6421ffc24ab
--- /dev/null
+++ b/configs/odinw/grounding_dino_swin-t_pretrain_odinw13.py
@@ -0,0 +1,338 @@
+_base_ = '../grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'  # noqa
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    test_mode=True,
+    pipeline=base_test_pipeline,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'penguin': {
+#         'suffix': ', which is black and white'
+#     },
+#     'puffin': {
+#         'suffix': ' with orange beaks'
+#     },
+#     'stingray': {
+#         'suffix': ' which is flat and round'
+#     },
+# }
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 CottontailRabbits---------------------#
+class_name = ('Cottontail-Rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+
+caption_prompt = None
+# caption_prompt = {'Cottontail-Rabbit': {'name': 'rabbit'}}
+
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 EgoHands---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+
+caption_prompt = None
+# caption_prompt = {'hand': {'suffix': ' of a person'}}
+
+dataset_EgoHands = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 NorthAmericaMushrooms---------------------#
+class_name = ('CoW', 'chanterelle')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+
+caption_prompt = None
+# caption_prompt = {
+#     'CoW': {
+#         'name': 'flat mushroom'
+#     },
+#     'chanterelle': {
+#         'name': 'yellow mushroom'
+#     }
+# }
+
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+
+caption_prompt = None
+# caption_prompt = {
+#     'package': {
+#         'prefix': 'there is a ',
+#         'suffix': ' on the porch'
+#     }
+# }
+
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+# Caption override handed to the dataset below; swap in the dict to enable.
+caption_prompt = None
+# caption_prompt = {
+#     'pothole': {
+#         'prefix': 'there are some ',
+#         'name': 'holes',
+#         'suffix': ' on the road'
+#     }
+# }
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 Raccoon---------------------#
+class_name = ('raccoon', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# --------------------- Config---------------------#
+dataset_prefixes = [
+    'AerialMaritimeDrone', 'Aquarium', 'CottontailRabbits', 'EgoHands',
+    'NorthAmericaMushrooms', 'Packages', 'PascalVOC', 'pistols', 'pothole',
+    'Raccoon', 'ShellfishOpenImages', 'thermalDogsAndPeople',
+    'VehiclesOpenImages'
+]
+datasets = [
+    dataset_AerialMaritimeDrone, dataset_Aquarium, dataset_CottontailRabbits,
+    dataset_EgoHands, dataset_NorthAmericaMushrooms, dataset_Packages,
+    dataset_PascalVOC, dataset_pistols, dataset_pothole, dataset_Raccoon,
+    dataset_ShellfishOpenImages, dataset_thermalDogsAndPeople,
+    dataset_VehiclesOpenImages
+]
+metrics = [
+    val_evaluator_AerialMaritimeDrone, val_evaluator_Aquarium,
+    val_evaluator_CottontailRabbits, val_evaluator_EgoHands,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_Packages,
+    val_evaluator_PascalVOC, val_evaluator_pistols, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_ShellfishOpenImages,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_VehiclesOpenImages
+]
+
+# -------------------------------------------------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    _delete_=True,
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,
+    dataset_prefixes=dataset_prefixes)
+test_evaluator = val_evaluator
diff --git a/configs/odinw/grounding_dino_swin-t_pretrain_odinw35.py b/configs/odinw/grounding_dino_swin-t_pretrain_odinw35.py
new file mode 100644
index 00000000000..78a3d8626c0
--- /dev/null
+++ b/configs/odinw/grounding_dino_swin-t_pretrain_odinw35.py
@@ -0,0 +1,796 @@
+_base_ = '../grounding_dino/grounding_dino_swin-t_pretrain_obj365_goldg_cap4m.py'  # noqa
+
+dataset_type = 'CocoDataset'
+data_root = 'data/odinw/'
+
+base_test_pipeline = _base_.test_pipeline
+base_test_pipeline[-1]['meta_keys'] = ('img_id', 'img_path', 'ori_shape',
+                                       'img_shape', 'scale_factor', 'text',
+                                       'custom_entities', 'caption_prompt')
+
+# ---------------------1 AerialMaritimeDrone_large---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/large/'
+dataset_AerialMaritimeDrone_large = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_large = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------2 AerialMaritimeDrone_tiled---------------------#
+class_name = ('boat', 'car', 'dock', 'jetski', 'lift')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AerialMaritimeDrone/tiled/'
+dataset_AerialMaritimeDrone_tiled = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AerialMaritimeDrone_tiled = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------3 AmericanSignLanguageLetters---------------------#
+class_name = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
+              'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'AmericanSignLanguageLetters/American Sign Language Letters.v1-v1.coco/'  # noqa
+dataset_AmericanSignLanguageLetters = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_AmericanSignLanguageLetters = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------4 Aquarium---------------------#
+class_name = ('fish', 'jellyfish', 'penguin', 'puffin', 'shark', 'starfish',
+              'stingray')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Aquarium/Aquarium Combined.v2-raw-1024.coco/'
+dataset_Aquarium = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Aquarium = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------5 BCCD---------------------#
+class_name = ('Platelets', 'RBC', 'WBC')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'BCCD/BCCD.v3-raw.coco/'
+dataset_BCCD = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_BCCD = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------6 boggleBoards---------------------#
+class_name = ('Q', 'a', 'an', 'b', 'c', 'd', 'e', 'er', 'f', 'g', 'h', 'he',
+              'i', 'in', 'j', 'k', 'l', 'm', 'n', 'o', 'o ', 'p', 'q', 'qu',
+              'r', 's', 't', 't\\', 'th', 'u', 'v', 'w', 'wild', 'x', 'y', 'z')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'boggleBoards/416x416AutoOrient/export/'
+dataset_boggleBoards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_boggleBoards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------7 brackishUnderwater---------------------#
+class_name = ('crab', 'fish', 'jellyfish', 'shrimp', 'small_fish', 'starfish')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'brackishUnderwater/960x540/'
+dataset_brackishUnderwater = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_brackishUnderwater = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------8 ChessPieces---------------------#
+class_name = ('  ', 'black bishop', 'black king', 'black knight', 'black pawn',
+              'black queen', 'black rook', 'white bishop', 'white king',
+              'white knight', 'white pawn', 'white queen', 'white rook')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ChessPieces/Chess Pieces.v23-raw.coco/'
+dataset_ChessPieces = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ChessPieces = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------9 CottontailRabbits---------------------#
+class_name = ('rabbit', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'CottontailRabbits/'
+dataset_CottontailRabbits = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_CottontailRabbits = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------10 dice---------------------#
+class_name = ('1', '2', '3', '4', '5', '6')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'dice/mediumColor/export/'
+dataset_dice = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_dice = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------11 DroneControl---------------------#
+class_name = ('follow', 'follow_hand', 'land', 'land_hand', 'null', 'object',
+              'takeoff', 'takeoff-hand')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'DroneControl/Drone Control.v3-raw.coco/'
+dataset_DroneControl = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_DroneControl = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------12 EgoHands_generic---------------------#
+class_name = ('hand', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/generic/'
+caption_prompt = {'hand': {'suffix': ' of a person'}}
+dataset_EgoHands_generic = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    # NOTE: prompt left disabled; 0.526 with it vs. 0.608 without
+    # caption_prompt=caption_prompt,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_generic = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------13 EgoHands_specific---------------------#
+class_name = ('myleft', 'myright', 'yourleft', 'yourright')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'EgoHands/specific/'
+dataset_EgoHands_specific = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_EgoHands_specific = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------14 HardHatWorkers---------------------#
+class_name = ('head', 'helmet', 'person')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'HardHatWorkers/raw/'
+dataset_HardHatWorkers = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_HardHatWorkers = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------15 MaskWearing---------------------#
+class_name = ('mask', 'no-mask')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MaskWearing/raw/'
+dataset_MaskWearing = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MaskWearing = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------16 MountainDewCommercial---------------------#
+class_name = ('bottle', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'MountainDewCommercial/'
+dataset_MountainDewCommercial = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_MountainDewCommercial = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------17 NorthAmericaMushrooms---------------------#
+class_name = ('flat mushroom', 'yellow mushroom')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'NorthAmericaMushrooms/North American Mushrooms.v1-416x416.coco/'  # noqa
+dataset_NorthAmericaMushrooms = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/new_annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_NorthAmericaMushrooms = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/new_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------18 openPoetryVision---------------------#
+class_name = ('American Typewriter', 'Andale Mono', 'Apple Chancery', 'Arial',
+              'Avenir', 'Baskerville', 'Big Caslon', 'Bradley Hand',
+              'Brush Script MT', 'Chalkboard', 'Comic Sans MS', 'Copperplate',
+              'Courier', 'Didot', 'Futura', 'Geneva', 'Georgia', 'Gill Sans',
+              'Helvetica', 'Herculanum', 'Impact', 'Kefa', 'Lucida Grande',
+              'Luminari', 'Marker Felt', 'Menlo', 'Monaco', 'Noteworthy',
+              'Optima', 'PT Sans', 'PT Serif', 'Palatino', 'Papyrus',
+              'Phosphate', 'Rockwell', 'SF Pro', 'SignPainter', 'Skia',
+              'Snell Roundhand', 'Tahoma', 'Times New Roman', 'Trebuchet MS',
+              'Verdana')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'openPoetryVision/512x512/'
+dataset_openPoetryVision = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_openPoetryVision = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------19 OxfordPets_by_breed---------------------#
+class_name = ('cat-Abyssinian', 'cat-Bengal', 'cat-Birman', 'cat-Bombay',
+              'cat-British_Shorthair', 'cat-Egyptian_Mau', 'cat-Maine_Coon',
+              'cat-Persian', 'cat-Ragdoll', 'cat-Russian_Blue', 'cat-Siamese',
+              'cat-Sphynx', 'dog-american_bulldog',
+              'dog-american_pit_bull_terrier', 'dog-basset_hound',
+              'dog-beagle', 'dog-boxer', 'dog-chihuahua',
+              'dog-english_cocker_spaniel', 'dog-english_setter',
+              'dog-german_shorthaired', 'dog-great_pyrenees', 'dog-havanese',
+              'dog-japanese_chin', 'dog-keeshond', 'dog-leonberger',
+              'dog-miniature_pinscher', 'dog-newfoundland', 'dog-pomeranian',
+              'dog-pug', 'dog-saint_bernard', 'dog-samoyed',
+              'dog-scottish_terrier', 'dog-shiba_inu',
+              'dog-staffordshire_bull_terrier', 'dog-wheaten_terrier',
+              'dog-yorkshire_terrier')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-breed/'  # noqa
+dataset_OxfordPets_by_breed = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_breed = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------20 OxfordPets_by_species---------------------#
+class_name = ('cat', 'dog')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'OxfordPets/by-species/'  # noqa
+dataset_OxfordPets_by_species = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_OxfordPets_by_species = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------21 PKLot---------------------#
+class_name = ('space-empty', 'space-occupied')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PKLot/640/'  # noqa
+dataset_PKLot = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PKLot = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------22 Packages---------------------#
+class_name = ('package', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Packages/Raw/'
+caption_prompt = {
+    'package': {
+        'prefix': 'there is a ',
+        'suffix': ' on the porch'
+    }
+}
+dataset_Packages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=base_test_pipeline,
+    caption_prompt=caption_prompt,  # NOTE: 0.695 with prompt vs. 0.687 without
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Packages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------23 PascalVOC---------------------#
+class_name = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
+              'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
+              'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
+              'tvmonitor')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'PascalVOC/'
+dataset_PascalVOC = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_PascalVOC = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------24 pistols---------------------#
+class_name = ('pistol', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pistols/export/'
+dataset_pistols = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',
+    data_prefix=dict(img=''),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pistols = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------25 plantdoc---------------------#
+class_name = ('Apple Scab Leaf', 'Apple leaf', 'Apple rust leaf',
+              'Bell_pepper leaf', 'Bell_pepper leaf spot', 'Blueberry leaf',
+              'Cherry leaf', 'Corn Gray leaf spot', 'Corn leaf blight',
+              'Corn rust leaf', 'Peach leaf', 'Potato leaf',
+              'Potato leaf early blight', 'Potato leaf late blight',
+              'Raspberry leaf', 'Soyabean leaf', 'Soybean leaf',
+              'Squash Powdery mildew leaf', 'Strawberry leaf',
+              'Tomato Early blight leaf', 'Tomato Septoria leaf spot',
+              'Tomato leaf', 'Tomato leaf bacterial spot',
+              'Tomato leaf late blight', 'Tomato leaf mosaic virus',
+              'Tomato leaf yellow virus', 'Tomato mold leaf',
+              'Tomato two spotted spider mites leaf', 'grape leaf',
+              'grape leaf black rot')
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'plantdoc/416x416/'
+dataset_plantdoc = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_plantdoc = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------26 pothole---------------------#
+class_name = ('pothole', )
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'pothole/'
+caption_prompt = {
+    'pothole': {
+        'name': 'holes',
+        'prefix': 'there are some ',
+        'suffix': ' on the road'
+    }
+}
+dataset_pothole = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',
+    data_prefix=dict(img='valid/'),
+    # NOTE w. prompt 0.137 (may predate prefix fix; re-check); wo. 0.215
+    # caption_prompt=caption_prompt,
+    pipeline=base_test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_pothole = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',
+    metric='bbox')
+
+# ---------------------27 Raccoon---------------------#
+class_name = ('raccoon', )  # single-class dataset
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'Raccoon/Raccoon.v2-raw.coco/'  # scratch name, reassigned per section
+dataset_Raccoon = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',  # given without the root here
+    data_prefix=dict(img='valid/'),  # images share the 'valid/' folder with the annotations
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_Raccoon = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',  # root prepended explicitly
+    metric='bbox')
+
+# ---------------------28 selfdrivingCar---------------------#
+class_name = ('biker', 'car', 'pedestrian', 'trafficLight',
+              'trafficLight-Green', 'trafficLight-GreenLeft',
+              'trafficLight-Red', 'trafficLight-RedLeft',
+              'trafficLight-Yellow', 'trafficLight-YellowLeft', 'truck')
+metainfo = dict(classes=class_name)  # 11 classes, 7 of them trafficLight variants
+_data_root = data_root + 'selfdrivingCar/fixedLarge/export/'
+dataset_selfdrivingCar = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='val_annotations_without_background.json',  # no 'valid/' sub-folder here
+    data_prefix=dict(img=''),  # empty image prefix, unlike the 'valid/' sections
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_selfdrivingCar = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'val_annotations_without_background.json',  # root prepended explicitly
+    metric='bbox')
+
+# ---------------------29 ShellfishOpenImages---------------------#
+class_name = ('Crab', 'Lobster', 'Shrimp')  # capitalised names, kept as written
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ShellfishOpenImages/raw/'  # scratch name, reassigned per section
+dataset_ShellfishOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',  # given without the root here
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ShellfishOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',  # root prepended explicitly
+    metric='bbox')
+
+# ---------------------30 ThermalCheetah---------------------#
+class_name = ('cheetah', 'human')  # two classes
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'ThermalCheetah/'  # scratch name, reassigned per section
+dataset_ThermalCheetah = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',  # given without the root here
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_ThermalCheetah = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',  # root prepended explicitly
+    metric='bbox')
+
+# ---------------------31 thermalDogsAndPeople---------------------#
+class_name = ('dog', 'person')  # two classes
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'thermalDogsAndPeople/'  # scratch name, reassigned per section
+dataset_thermalDogsAndPeople = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',  # given without the root here
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_thermalDogsAndPeople = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',  # root prepended explicitly
+    metric='bbox')
+
+# ---------------------32 UnoCards---------------------#
+class_name = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
+              '12', '13', '14')
+metainfo = dict(classes=class_name)  # class names are the numeric strings '0'..'14'
+_data_root = data_root + 'UnoCards/raw/'  # scratch name, reassigned per section
+dataset_UnoCards = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',  # given without the root here
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_UnoCards = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',  # root prepended explicitly
+    metric='bbox')
+
+# ---------------------33 VehiclesOpenImages---------------------#
+class_name = ('Ambulance', 'Bus', 'Car', 'Motorcycle', 'Truck')  # capitalised, kept as written
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'VehiclesOpenImages/416x416/'  # scratch name, reassigned per section
+dataset_VehiclesOpenImages = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',  # given without the root here
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_VehiclesOpenImages = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',  # root prepended explicitly
+    metric='bbox')
+
+# ---------------------34 WildfireSmoke---------------------#
+class_name = ('smoke', )  # single-class dataset
+metainfo = dict(classes=class_name)
+_data_root = data_root + 'WildfireSmoke/'  # scratch name, reassigned per section
+dataset_WildfireSmoke = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',  # given without the root here
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_WildfireSmoke = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',  # root prepended explicitly
+    metric='bbox')
+
+# ---------------------35 websiteScreenshots---------------------#
+class_name = ('button', 'field', 'heading', 'iframe', 'image', 'label', 'link',
+              'text')
+metainfo = dict(classes=class_name)  # eight UI-element classes
+_data_root = data_root + 'websiteScreenshots/'  # last reassignment of the scratch name
+dataset_websiteScreenshots = dict(
+    type=dataset_type,
+    metainfo=metainfo,
+    data_root=_data_root,
+    ann_file='valid/annotations_without_background.json',  # given without the root here
+    data_prefix=dict(img='valid/'),
+    pipeline=_base_.test_pipeline,
+    test_mode=True,
+    return_classes=True)
+val_evaluator_websiteScreenshots = dict(
+    type='CocoMetric',
+    ann_file=_data_root + 'valid/annotations_without_background.json',  # root prepended explicitly
+    metric='bbox')
+
+# --------------------- Config---------------------#
+
+dataset_prefixes = [  # one label per dataset, same order as `datasets` and `metrics`
+    'AerialMaritimeDrone_large',
+    'AerialMaritimeDrone_tiled',
+    'AmericanSignLanguageLetters',
+    'Aquarium',
+    'BCCD',
+    'boggleBoards',
+    'brackishUnderwater',
+    'ChessPieces',
+    'CottontailRabbits',
+    'dice',
+    'DroneControl',
+    'EgoHands_generic',
+    'EgoHands_specific',
+    'HardHatWorkers',
+    'MaskWearing',
+    'MountainDewCommercial',
+    'NorthAmericaMushrooms',
+    'openPoetryVision',
+    'OxfordPets_by_breed',
+    'OxfordPets_by_species',
+    'PKLot',
+    'Packages',
+    'PascalVOC',
+    'pistols',
+    'plantdoc',
+    'pothole',
+    'Raccoons',  # NOTE(review): variables and data path spell it 'Raccoon' - confirm
+    'selfdrivingCar',
+    'ShellfishOpenImages',
+    'ThermalCheetah',
+    'thermalDogsAndPeople',
+    'UnoCards',
+    'VehiclesOpenImages',
+    'WildfireSmoke',
+    'websiteScreenshots',
+]  # 35 entries; handed to MultiDatasetsEvaluator as `dataset_prefixes`
+
+datasets = [  # per-dataset configs, same order as `dataset_prefixes` and `metrics`
+    dataset_AerialMaritimeDrone_large, dataset_AerialMaritimeDrone_tiled,
+    dataset_AmericanSignLanguageLetters, dataset_Aquarium, dataset_BCCD,
+    dataset_boggleBoards, dataset_brackishUnderwater, dataset_ChessPieces,
+    dataset_CottontailRabbits, dataset_dice, dataset_DroneControl,
+    dataset_EgoHands_generic, dataset_EgoHands_specific,
+    dataset_HardHatWorkers, dataset_MaskWearing, dataset_MountainDewCommercial,
+    dataset_NorthAmericaMushrooms, dataset_openPoetryVision,
+    dataset_OxfordPets_by_breed, dataset_OxfordPets_by_species, dataset_PKLot,
+    dataset_Packages, dataset_PascalVOC, dataset_pistols, dataset_plantdoc,
+    dataset_pothole, dataset_Raccoon, dataset_selfdrivingCar,
+    dataset_ShellfishOpenImages, dataset_ThermalCheetah,
+    dataset_thermalDogsAndPeople, dataset_UnoCards, dataset_VehiclesOpenImages,
+    dataset_WildfireSmoke, dataset_websiteScreenshots
+]  # wrapped into a single ConcatDataset by val_dataloader
+
+metrics = [  # one CocoMetric config per dataset, same order as `datasets`
+    val_evaluator_AerialMaritimeDrone_large,
+    val_evaluator_AerialMaritimeDrone_tiled,
+    val_evaluator_AmericanSignLanguageLetters, val_evaluator_Aquarium,
+    val_evaluator_BCCD, val_evaluator_boggleBoards,
+    val_evaluator_brackishUnderwater, val_evaluator_ChessPieces,
+    val_evaluator_CottontailRabbits, val_evaluator_dice,
+    val_evaluator_DroneControl, val_evaluator_EgoHands_generic,
+    val_evaluator_EgoHands_specific, val_evaluator_HardHatWorkers,
+    val_evaluator_MaskWearing, val_evaluator_MountainDewCommercial,
+    val_evaluator_NorthAmericaMushrooms, val_evaluator_openPoetryVision,
+    val_evaluator_OxfordPets_by_breed, val_evaluator_OxfordPets_by_species,
+    val_evaluator_PKLot, val_evaluator_Packages, val_evaluator_PascalVOC,
+    val_evaluator_pistols, val_evaluator_plantdoc, val_evaluator_pothole,
+    val_evaluator_Raccoon, val_evaluator_selfdrivingCar,
+    val_evaluator_ShellfishOpenImages, val_evaluator_ThermalCheetah,
+    val_evaluator_thermalDogsAndPeople, val_evaluator_UnoCards,
+    val_evaluator_VehiclesOpenImages, val_evaluator_WildfireSmoke,
+    val_evaluator_websiteScreenshots
+]  # handed to MultiDatasetsEvaluator as `metrics`
+
+# ---------- dataloader / evaluator wiring (overrides the base config) ----------#
+val_dataloader = dict(
+    dataset=dict(_delete_=True, type='ConcatDataset', datasets=datasets))  # `_delete_`: replace the inherited dict, do not merge
+test_dataloader = val_dataloader  # testing reuses the validation loader settings
+
+val_evaluator = dict(
+    _delete_=True,  # drop the inherited evaluator keys instead of merging with them
+    type='MultiDatasetsEvaluator',
+    metrics=metrics,  # per-dataset metric configs
+    dataset_prefixes=dataset_prefixes)  # per-dataset labels, parallel to `metrics`
+test_evaluator = val_evaluator  # testing reuses the validation evaluator settings
diff --git a/mmdet/models/detectors/grounding_dino.py b/mmdet/models/detectors/grounding_dino.py
index 24518e62edd..cc6cccedf29 100644
--- a/mmdet/models/detectors/grounding_dino.py
+++ b/mmdet/models/detectors/grounding_dino.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import re
 import warnings
-from typing import Dict, Tuple, Union
+from typing import Dict, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -8,6 +9,7 @@
 
 from mmdet.registry import MODELS
 from mmdet.structures import OptSampleList, SampleList
+from mmdet.utils import ConfigType
 from ..layers import SinePositionalEncoding
 from ..layers.transformer.grounding_dino_layers import (
     GroundingDinoTransformerDecoder, GroundingDinoTransformerEncoder)
@@ -16,6 +18,13 @@
                    run_ner)
 
 
+def clean_label_name(name: str) -> str:
+    """Drop parenthesised text, turn '_' into ' ', merge double spaces."""
+    name = re.sub(r'\(.*\)', '', name).replace('_', ' ')
+    # Single pass: each non-overlapping pair of spaces becomes one space.
+    return re.sub(r'  ', ' ', name)
+
+
 @MODELS.register_module()
 class GroundingDINO(DINO):
     """Implementation of `Grounding DINO: Marrying DINO with Grounded Pre-
@@ -64,10 +73,49 @@ def init_weights(self) -> None:
         nn.init.constant_(self.text_feat_map.bias.data, 0)
         nn.init.xavier_uniform_(self.text_feat_map.weight.data)
 
+    def to_enhance_text_prompts(self, original_caption, enhanced_text_prompts):
+        """Build the caption from entity names and their prompt overrides.
+
+        Args:
+            original_caption (list[str]): Entity names, in order.
+            enhanced_text_prompts (dict): Maps an entity name to a dict with
+                optional 'prefix', 'name' and 'suffix' strings.
+
+        Returns:
+            tuple: Caption string and, per entity, the ``[[start, end]]``
+            character span of its name (prefix and suffix excluded).
+        """
+        caption_string = ''
+        tokens_positive = []
+        for word in original_caption:
+            # Entities without an override fall back to the bare name.
+            prompt = (enhanced_text_prompts[word]
+                      if word in enhanced_text_prompts else {})
+            caption_string += prompt.get('prefix', '')
+            shown_name = prompt.get('name', word)
+            start_i = len(caption_string)
+            tokens_positive.append([[start_i, start_i + len(shown_name)]])
+            caption_string += shown_name + prompt.get('suffix', '')
+            caption_string += self._special_tokens
+        return caption_string, tokens_positive
+
+    def to_plain_text_prompts(self, original_caption):
+        """Join entity names into a caption; return it with name spans."""
+        caption_string = ''
+        tokens_positive = []
+        for word in original_caption:
+            start_i = len(caption_string)
+            tokens_positive.append([[start_i, start_i + len(word)]])
+            # Every name, including the last, is followed by the separator.
+            caption_string += word + self._special_tokens
+        return caption_string, tokens_positive
+
     def get_tokens_and_prompts(
-            self,
-            original_caption: Union[str, list, tuple],
-            custom_entities: bool = False) -> Tuple[dict, str, list]:
+        self,
+        original_caption: Union[str, list, tuple],
+        custom_entities: bool = False,
+        enhanced_text_prompts: Optional[ConfigType] = None
+    ) -> Tuple[dict, str, list]:
         """Get the tokens positive and prompts for the caption."""
         if isinstance(original_caption, (list, tuple)) or custom_entities:
             if custom_entities and isinstance(original_caption, str):
@@ -76,14 +124,15 @@ def get_tokens_and_prompts(
                 original_caption = list(
                     filter(lambda x: len(x) > 0, original_caption))
 
-            caption_string = ''
-            tokens_positive = []
-            for idx, word in enumerate(original_caption):
-                tokens_positive.append(
-                    [[len(caption_string),
-                      len(caption_string) + len(word)]])
-                caption_string += word
-                caption_string += self._special_tokens
+            original_caption = [clean_label_name(i) for i in original_caption]
+
+            if custom_entities and enhanced_text_prompts is not None:
+                caption_string, tokens_positive = self.to_enhance_text_prompts(
+                    original_caption, enhanced_text_prompts)
+            else:
+                caption_string, tokens_positive = self.to_plain_text_prompts(
+                    original_caption)
+
             # NOTE: Tokenizer in Grounding DINO is different from
             # that in GLIP. The tokenizer in GLIP will pad the
             # caption_string to max_length, while the tokenizer
@@ -123,9 +172,11 @@ def get_positive_map(self, tokenized, tokens_positive):
         return positive_map_label_to_token, positive_map
 
     def get_tokens_positive_and_prompts(
-            self,
-            original_caption: Union[str, list, tuple],
-            custom_entities: bool = False) -> Tuple[dict, str, Tensor, list]:
+        self,
+        original_caption: Union[str, list, tuple],
+        custom_entities: bool = False,
+        enhanced_text_prompt: Optional[ConfigType] = None
+    ) -> Tuple[dict, str, Tensor, list]:
         """Get the tokens positive and prompts for the caption.
 
         Args:
@@ -141,7 +192,7 @@ def get_tokens_positive_and_prompts(
         """
         tokenized, caption_string, tokens_positive, entities = \
             self.get_tokens_and_prompts(
-                original_caption, custom_entities)
+                original_caption, custom_entities, enhanced_text_prompt)
         positive_map_label_to_token, positive_map = self.get_positive_map(
             tokenized, tokens_positive)
         return positive_map_label_to_token, caption_string, \
@@ -326,9 +377,15 @@ def loss(self, batch_inputs: Tensor,
         return losses
 
     def predict(self, batch_inputs, batch_data_samples, rescale: bool = True):
-        text_prompts = [
-            data_samples.text for data_samples in batch_data_samples
-        ]
+        text_prompts = []
+        enhanced_text_prompts = []
+        for data_samples in batch_data_samples:
+            text_prompts.append(data_samples.text)
+            if 'caption_prompt' in data_samples:
+                enhanced_text_prompts.append(data_samples.caption_prompt)
+            else:
+                enhanced_text_prompts.append(None)
+
         if 'custom_entities' in batch_data_samples[0]:
             # Assuming that the `custom_entities` flag
             # inside a batch is always the same. For single image inference
@@ -339,14 +396,16 @@ def predict(self, batch_inputs, batch_data_samples, rescale: bool = True):
             # All the text prompts are the same,
             # so there is no need to calculate them multiple times.
             _positive_maps_and_prompts = [
-                self.get_tokens_positive_and_prompts(text_prompts[0],
-                                                     custom_entities)
+                self.get_tokens_positive_and_prompts(
+                    text_prompts[0], custom_entities, enhanced_text_prompts[0])
             ] * len(batch_inputs)
         else:
             _positive_maps_and_prompts = [
                 self.get_tokens_positive_and_prompts(text_prompt,
-                                                     custom_entities)
-                for text_prompt in text_prompts
+                                                     custom_entities,
+                                                     enhanced_text_prompt)
+                for text_prompt, enhanced_text_prompt in zip(
+                    text_prompts, enhanced_text_prompts)
             ]
         token_positive_maps, text_prompts, _, entities = zip(
             *_positive_maps_and_prompts)