From e9c5e34eda2d389bd01c482dab98bff5ac3b5164 Mon Sep 17 00:00:00 2001
From: zhe chen
Date: Tue, 29 Oct 2024 00:34:25 +0800
Subject: [PATCH] Update classification code

---
 classification/config.py | 14 +-
 ...ng_intern_vit_6b_224px_in1k_224_64gpu.yaml | 35 ++
 ...it_6b_224px_in1k_224_64gpu_imagenet_a.yaml | 36 ++
 ...it_6b_224px_in1k_224_64gpu_imagenet_r.yaml | 36 ++
 ...6b_224px_in1k_224_64gpu_imagenet_real.yaml | 36 ++
 ..._224px_in1k_224_64gpu_imagenet_sketch.yaml | 36 ++
 ...it_6b_224px_in1k_224_64gpu_imagenetv2.yaml | 36 ++
 ...tern_vit_6b_224px_in1k_224to448_64gpu.yaml | 36 ++
 ..._224px_in1k_224to448_64gpu_imagenet_a.yaml | 37 ++
 ..._224px_in1k_224to448_64gpu_imagenet_r.yaml | 37 ++
 ...4px_in1k_224to448_64gpu_imagenet_real.yaml | 37 ++
 ...x_in1k_224to448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._224px_in1k_224to448_64gpu_imagenetv2.yaml | 37 ++
 ...tern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml | 36 ++
 ..._448px_v1_0_in1k_448_64gpu_imagenet_a.yaml | 37 ++
 ..._448px_v1_0_in1k_448_64gpu_imagenet_r.yaml | 37 ++
 ...8px_v1_0_in1k_448_64gpu_imagenet_real.yaml | 37 ++
 ...x_v1_0_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._448px_v1_0_in1k_448_64gpu_imagenetv2.yaml | 37 ++
 ...tern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml | 36 ++
 ..._448px_v1_2_in1k_448_64gpu_imagenet_a.yaml | 37 ++
 ..._448px_v1_2_in1k_448_64gpu_imagenet_r.yaml | 37 ++
 ...8px_v1_2_in1k_448_64gpu_imagenet_real.yaml | 37 ++
 ...x_v1_2_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._448px_v1_2_in1k_448_64gpu_imagenetv2.yaml | 37 ++
 ...tern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml | 36 ++
 ..._448px_v1_5_in1k_448_64gpu_imagenet_a.yaml | 37 ++
 ..._448px_v1_5_in1k_448_64gpu_imagenet_r.yaml | 37 ++
 ...8px_v1_5_in1k_448_64gpu_imagenet_real.yaml | 37 ++
 ...x_v1_5_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._448px_v1_5_in1k_448_64gpu_imagenetv2.yaml | 37 ++
 ...tern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml | 36 ++
 ..._448px_v2_5_in1k_448_64gpu_imagenet_a.yaml | 37 ++
 ..._448px_v2_5_in1k_448_64gpu_imagenet_r.yaml | 37 ++
 ...8px_v2_5_in1k_448_64gpu_imagenet_real.yaml | 37 ++
 ...x_v2_5_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._448px_v2_5_in1k_448_64gpu_imagenetv2.yaml | 37 ++
 ...ng_intern_vit_6b_224px_in1k_224_64gpu.yaml | 35 ++
 ...it_6b_224px_in1k_224_64gpu_imagenet_a.yaml | 36 ++
 ...it_6b_224px_in1k_224_64gpu_imagenet_r.yaml | 36 ++
 ...6b_224px_in1k_224_64gpu_imagenet_real.yaml | 36 ++
 ..._224px_in1k_224_64gpu_imagenet_sketch.yaml | 36 ++
 ...it_6b_224px_in1k_224_64gpu_imagenetv2.yaml | 36 ++
 ...tern_vit_6b_224px_in1k_224to448_64gpu.yaml | 36 ++
 ..._224px_in1k_224to448_64gpu_imagenet_a.yaml | 37 ++
 ..._224px_in1k_224to448_64gpu_imagenet_r.yaml | 37 ++
 ...4px_in1k_224to448_64gpu_imagenet_real.yaml | 37 ++
 ...x_in1k_224to448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._224px_in1k_224to448_64gpu_imagenetv2.yaml | 37 ++
 ...tern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml | 36 ++
 ..._448px_v1_0_in1k_448_64gpu_imagenet_a.yaml | 37 ++
 ..._448px_v1_0_in1k_448_64gpu_imagenet_r.yaml | 37 ++
 ...8px_v1_0_in1k_448_64gpu_imagenet_real.yaml | 37 ++
 ...x_v1_0_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._448px_v1_0_in1k_448_64gpu_imagenetv2.yaml | 37 ++
 ...tern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml | 36 ++
 ..._448px_v1_2_in1k_448_64gpu_imagenet_a.yaml | 37 ++
 ..._448px_v1_2_in1k_448_64gpu_imagenet_r.yaml | 37 ++
 ...8px_v1_2_in1k_448_64gpu_imagenet_real.yaml | 37 ++
 ...x_v1_2_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._448px_v1_2_in1k_448_64gpu_imagenetv2.yaml | 37 ++
 ...tern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml | 36 ++
 ..._448px_v1_5_in1k_448_64gpu_imagenet_a.yaml | 37 ++
..._448px_v1_5_in1k_448_64gpu_imagenet_r.yaml | 37 ++ ...8px_v1_5_in1k_448_64gpu_imagenet_real.yaml | 37 ++ ...x_v1_5_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++ ..._448px_v1_5_in1k_448_64gpu_imagenetv2.yaml | 37 ++ ...tern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml | 36 ++ ..._448px_v2_5_in1k_448_64gpu_imagenet_a.yaml | 37 ++ ..._448px_v2_5_in1k_448_64gpu_imagenet_r.yaml | 37 ++ ...8px_v2_5_in1k_448_64gpu_imagenet_real.yaml | 37 ++ ...x_v2_5_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++ ..._448px_v2_5_in1k_448_64gpu_imagenetv2.yaml | 37 ++ classification/ema_deepspeed.py | 100 ---- classification/export.py | 128 ----- classification/gflops.py | 2 +- classification/hf2pytorch.py | 70 +++ classification/main_accelerate.py | 377 ------------ classification/main_deepspeed.py | 544 ------------------ classification/models/build.py | 17 +- classification/models/clip_vit.py | 195 +++++++ classification/models/intern_vit_6b.py | 28 +- classification/train_in1k_deepspeed.sh | 27 - 83 files changed, 2953 insertions(+), 1189 deletions(-) create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml create mode 100644 
classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml create mode 100644 
classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 
classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml delete mode 100644 classification/ema_deepspeed.py delete mode 100644 classification/export.py create mode 100644 classification/hf2pytorch.py delete mode 100644 classification/main_accelerate.py delete mode 100644 classification/main_deepspeed.py create mode 100644 classification/models/clip_vit.py delete mode 100644 classification/train_in1k_deepspeed.sh diff --git a/classification/config.py b/classification/config.py index f27ab8b9..53073592 100644 --- a/classification/config.py +++ b/classification/config.py @@ -81,7 +81,19 @@ _C.MODEL.INTERN_VIT_6B.FREEZE_VIT = True _C.MODEL.INTERN_VIT_6B.PRETRAINED = None _C.MODEL.INTERN_VIT_6B.CLS_TARGET = 'cls_patch_concat' -_C.MODEL.INTERN_VIT_6B.HEAD_NORM_TYPE = 'bn' +_C.MODEL.INTERN_VIT_6B.NORM_TYPE = 'rms' + +# CLIP_VIT parameters +_C.MODEL.CLIP_VIT = CN() +_C.MODEL.CLIP_VIT.PATCH_SIZE = 14 +_C.MODEL.CLIP_VIT.PRETRAIN_SIZE = 336 +_C.MODEL.CLIP_VIT.EMBED_DIM = 1024 +_C.MODEL.CLIP_VIT.NUM_HEADS = 16 +_C.MODEL.CLIP_VIT.MLP_RATIO = 4 +_C.MODEL.CLIP_VIT.DEPTH = 24 +_C.MODEL.CLIP_VIT.FREEZE_VIT = True +_C.MODEL.CLIP_VIT.PRETRAINED = 'openai/clip-vit-large-patch14-336' +_C.MODEL.CLIP_VIT.CLS_TARGET = 'cls_patch_concat' # ----------------------------------------------------------------------------- # Training settings diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml new file mode 100644 index 00000000..3eb85c31 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml @@ -0,0 +1,35 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..98d30d5c --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + 
WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..97017056 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..5bd19053 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..4dfdbddd --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 
0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..c310582a --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml new file mode 100644 index 00000000..4ca8070b --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..95f0dbaf --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + 
EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..f3771780 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..2b24800b --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..09953bf0 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: 
"./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..f6ec39fe --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml new file mode 100644 index 00000000..48bb2ff0 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..626e2aa2 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + 
USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..be2c07f3 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..1572d242 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..fde27a51 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 
448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..8b960969 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml new file mode 100644 index 00000000..751f3976 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..616789aa --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True 
+ PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..ce53ee02 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..8fd9f841 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..69d2eb9f --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: 
'./data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..f566dd75 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml new file mode 100644 index 00000000..33818830 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..767bb2b6 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 
'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..a94e7175 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..6095ad87 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..e860db91 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml @@ 
-0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..3d64a1bf --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml new file mode 100644 index 00000000..35115b6f --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..bb61d42f --- /dev/null +++ 
b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..339b665f --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..ff6f4658 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml 
b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..b4e9a207 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..16f40561 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml new file mode 100644 index 00000000..6788c616 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml @@ -0,0 +1,35 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml 
b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..80e751f9 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..f444f6f9 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..9525a450 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml 
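Note on the probing configs above and below: every variant freezes the backbone (FREEZE_VIT: True) and trains only the classification target with plain SGD, and the `# 512` comment next to BASE_LR marks the reference total batch size used for linear LR scaling (the same /512 rule as the scale_learning_rate helper in the main_accelerate.py removed further down). A minimal sketch of that setup follows; the helper names (build_probe_optimizer, head) and the momentum-free SGD call are illustrative assumptions, not the repository's actual builders.

import torch

def scale_lr(base_lr: float, batch_per_gpu: int, num_gpus: int, ref_batch: int = 512) -> float:
    # Linear LR scaling: BASE_LR is quoted for a reference total batch of 512,
    # so 16 images/GPU on 64 GPUs (total 1024) doubles the effective rate.
    return base_lr * batch_per_gpu * num_gpus / ref_batch

def build_probe_optimizer(backbone: torch.nn.Module, head: torch.nn.Module, lr: float):
    # FREEZE_VIT: True -- the ViT stays fixed; only the probing head receives gradients.
    for p in backbone.parameters():
        p.requires_grad = False
    # The configs only name 'sgd' with WEIGHT_DECAY: 0.0; momentum is left at the default here.
    return torch.optim.SGD(head.parameters(), lr=lr, weight_decay=0.0)

effective_lr = scale_lr(base_lr=0.1, batch_per_gpu=16, num_gpus=64)  # 0.2 for the 64-GPU runs

With WARMUP_LR and MIN_LR both at 0, the schedule warms up from zero over one epoch to the scaled rate and decays back to zero over the remaining nine.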
new file mode 100644 index 00000000..b1355e38 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..da992da2 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml new file mode 100644 index 00000000..a7d02825 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..de3a17d3 --- /dev/null +++ 
b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..014ee789 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..ea364c2f --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..ef10e8c7 --- /dev/null +++ 
b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..905b7f9c --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml new file mode 100644 index 00000000..004db231 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..fa863328 --- /dev/null +++ 
b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..a1968602 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..7607b577 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..0a669975 --- /dev/null 
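The linear-probing configs select CLS_TARGET: 'cls_patch_concat', while the attention-pooling configs use 'attention_pooling'. The sketch below is one plausible reading of those two feature-extraction modes, not the actual heads in intern_vit_6b.py: the concat variant is taken as [CLS token ; mean of patch tokens], and the pooling variant as a learned multi-head attention pool (16 heads, matching attn_pool_num_heads=16 in the gflops.py hunk later in this patch).

import torch
import torch.nn as nn

def cls_patch_concat(tokens: torch.Tensor) -> torch.Tensor:
    # tokens: (B, 1 + N, C) -- CLS token followed by N patch tokens.
    cls_tok, patches = tokens[:, 0], tokens[:, 1:]
    return torch.cat([cls_tok, patches.mean(dim=1)], dim=-1)  # (B, 2C)

class AttentionPooling(nn.Module):
    # A single learned query attends over all tokens and returns the pooled vector.
    def __init__(self, dim: int = 3200, num_heads: int = 16):
        super().__init__()
        self.query = nn.Parameter(torch.zeros(1, 1, dim))
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        q = self.query.expand(tokens.size(0), -1, -1)
        pooled, _ = self.attn(q, tokens, tokens)
        return pooled.squeeze(1)  # (B, C)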
+++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..70de8bb8 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml new file mode 100644 index 00000000..23e696be --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..60f9140e --- /dev/null +++ 
b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..1c2475bd --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..396157ec --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..9bbd942d --- /dev/null 
+++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..2a7c0232 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml new file mode 100644 index 00000000..cba76bde --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..6f4af1b5 --- /dev/null +++ 
b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..46a53bed --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..53607372 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..aa523d22 --- /dev/null 
+++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..e3d19a6b --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml new file mode 100644 index 00000000..f667bf10 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..7e48cabd --- /dev/null +++ 
b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..b114f88a --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..52f68723 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..6b879af1 --- /dev/null 
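Every probing config also keeps an EMA of the trained weights (TRAIN.EMA.ENABLE: True, DECAY: 0.998). Below is a minimal single-process sketch of that shadow update; it mirrors the decay warm-up and in-place update of the ema_deepspeed.py helper that this patch deletes further down, but without the DeepSpeed parameter gathering, so it is an illustration rather than a drop-in replacement.

import torch
import torch.nn as nn

class SimpleEMA:
    # shadow -= (1 - decay) * (shadow - param), with decay warmed up as
    # min(decay, (1 + n) / (10 + n)), as in the removed EMADeepspeed class.
    def __init__(self, model: nn.Module, decay: float = 0.998):
        self.decay = decay
        self.num_updates = 0
        self.shadow = {name: p.detach().clone()
                       for name, p in model.named_parameters() if p.requires_grad}

    @torch.no_grad()
    def update(self, model: nn.Module):
        self.num_updates += 1
        decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates))
        for name, p in model.named_parameters():
            if p.requires_grad:
                self.shadow[name].sub_((1.0 - decay) * (self.shadow[name] - p))

# Typical use: ema = SimpleEMA(head); then call ema.update(head) after each optimizer step,
# and evaluate with the shadow weights copied back in.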
+++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..03a59c64 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/ema_deepspeed.py b/classification/ema_deepspeed.py deleted file mode 100644 index 3ae92cb3..00000000 --- a/classification/ema_deepspeed.py +++ /dev/null @@ -1,100 +0,0 @@ -from contextlib import contextmanager - -import deepspeed -import torch -import torch.nn as nn -from deepspeed.runtime.zero import GatheredParameters - - -class EMADeepspeed(nn.Module): - """ migrated from https://github.com/microsoft/DeepSpeed/issues/2056 - """ - - def __init__(self, model, decay=0.9999, use_num_updates=True): - super().__init__() - if decay < 0.0 or decay > 1.0: - raise ValueError('Decay must be between 0 and 1') - - self.m_name2s_name = {} - self.decay = decay - self.num_updates = 0 if use_num_updates else -1 - - with GatheredParameters(model.parameters(), fwd_module=self): - for name, p in model.named_parameters(): - if p.requires_grad: - # remove as '.'-character is not allowed in buffers - s_name = name.replace('.', '') - self.m_name2s_name.update({name: s_name}) - self.register_buffer(s_name, p.clone().detach().data) - # remove as '.'-character is not allowed in buffers - self.collected_params = [] - - def forward(self, model): - decay = self.decay - - if self.num_updates >= 0: - self.num_updates += 1 - decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates)) - - one_minus_decay = 1.0 - decay - shadow_params = dict(self.named_buffers()) - - with torch.no_grad(): - with GatheredParameters(model.parameters()): - if 
deepspeed.comm.get_rank() == 0: - m_param = dict(model.named_parameters()) - - for key in m_param: - if m_param[key].requires_grad: - sname = self.m_name2s_name[key] - shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) - shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) - else: - assert key not in self.m_name2s_name - - def copy_to(self, model): - shadow_params = dict(self.named_buffers()) - with GatheredParameters(model.parameters(), modifier_rank=0): - if deepspeed.comm.get_rank() == 0: - m_param = dict(model.named_parameters()) - for key in m_param: - if m_param[key].requires_grad: - m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) - else: - assert key not in self.m_name2s_name - - def store(self, model): - """ - Save the current parameters for restoring later. - Args: - model: A model that parameters will be stored - """ - with GatheredParameters(model.parameters()): - if deepspeed.comm.get_rank() == 0: - parameters = model.parameters() - self.collected_params = [param.clone() for param in parameters] - - def restore(self, model): - """ - Restore the parameters stored with the `store` method. - Useful to validate the model with EMA parameters without affecting the - original optimization process. Store the parameters before the - `copy_to` method. After validation (or model saving), use this to - restore the former parameters. - Args: - model: A model that to restore its parameters. - """ - with GatheredParameters(model.parameters(), modifier_rank=0): - if deepspeed.comm.get_rank() == 0: - parameters = model.parameters() - for c_param, param in zip(self.collected_params, parameters): - param.data.copy_(c_param.data) - - @contextmanager - def activate(self, model): - try: - self.store(model) - self.copy_to(model) - yield - finally: - self.restore(model) diff --git a/classification/export.py b/classification/export.py deleted file mode 100644 index 0edb0d85..00000000 --- a/classification/export.py +++ /dev/null @@ -1,128 +0,0 @@ -# -------------------------------------------------------- -# InternVL -# Copyright (c) 2022 OpenGVLab -# Licensed under The MIT License [see LICENSE for details] -# -------------------------------------------------------- - -import argparse -import os -import time - -import torch -from config import get_config -from models import build_model -from tqdm import tqdm - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--model_name', type=str, - default='internimage_t_1k_224') - parser.add_argument('--ckpt_dir', type=str, - default='/mnt/petrelfs/share_data/huangzhenhang/code/internimage/checkpoint_dir/new/cls') - parser.add_argument('--onnx', default=False, action='store_true') - parser.add_argument('--trt', default=False, action='store_true') - - args = parser.parse_args() - args.cfg = os.path.join('./configs', f'{args.model_name}.yaml') - args.ckpt = os.path.join(args.ckpt_dir, f'{args.model_name}.pth') - args.size = int(args.model_name.split('.')[0].split('_')[-1]) - - cfg = get_config(args) - return args, cfg - - -def get_model(args, cfg): - model = build_model(cfg) - ckpt = torch.load(args.ckpt, map_location='cpu')['model'] - - model.load_state_dict(ckpt) - return model - - -def speed_test(model, input): - # warm-up - for _ in tqdm(range(100)): - _ = model(input) - - # speed test - torch.cuda.synchronize() - start = time.time() - for _ in tqdm(range(100)): - _ = model(input) - end = time.time() - th = 100 / (end - start) - print(f'using time: {end - start}, 
throughput {th}') - - -def torch2onnx(args, cfg): - model = get_model(args, cfg).cuda() - - # speed_test(model) - - onnx_name = f'{args.model_name}.onnx' - torch.onnx.export(model, - torch.rand(1, 3, args.size, args.size).cuda(), - onnx_name, - input_names=['input'], - output_names=['output']) - - return model - - -def onnx2trt(args): - from mmdeploy.backend.tensorrt import from_onnx - - onnx_name = f'{args.model_name}.onnx' - from_onnx( - onnx_name, - args.model_name, - dict( - input=dict( - min_shape=[1, 3, args.size, args.size], - opt_shape=[1, 3, args.size, args.size], - max_shape=[1, 3, args.size, args.size], - ) - ), - max_workspace_size=2 ** 30, - ) - - -def check(args, cfg): - from mmdeploy.backend.tensorrt.wrapper import TRTWrapper - - model = get_model(args, cfg).cuda() - model.eval() - trt_model = TRTWrapper(f'{args.model_name}.engine', - ['output']) - - x = torch.randn(1, 3, args.size, args.size).cuda() - - torch_out = model(x) - trt_out = trt_model(dict(input=x))['output'] - - print('torch out shape:', torch_out.shape) - print('trt out shape:', trt_out.shape) - - print('max delta:', (torch_out - trt_out).abs().max()) - print('mean delta:', (torch_out - trt_out).abs().mean()) - - speed_test(model, x) - speed_test(trt_model, dict(input=x)) - - -def main(): - args, cfg = get_args() - - if args.onnx or args.trt: - torch2onnx(args, cfg) - print('torch -> onnx: succeess') - - if args.trt: - onnx2trt(args) - print('onnx -> trt: success') - check(args, cfg) - - -if __name__ == '__main__': - main() diff --git a/classification/gflops.py b/classification/gflops.py index f6ef0228..dff9af67 100644 --- a/classification/gflops.py +++ b/classification/gflops.py @@ -98,7 +98,7 @@ def get_flops(model, input_shape): num_classes=0, attn_pool_num_heads=16, clip_embed_dim=768, - head_norm_type='bn').to(torch.bfloat16) + norm_type='rms').to(torch.bfloat16) for k, v in model.named_parameters(): v.requires_grad = True diff --git a/classification/hf2pytorch.py b/classification/hf2pytorch.py new file mode 100644 index 00000000..a1ba1dd3 --- /dev/null +++ b/classification/hf2pytorch.py @@ -0,0 +1,70 @@ +import argparse +import os + +import torch +from safetensors.torch import load_file as safetensors_load_file + +# Parse command-line arguments +parser = argparse.ArgumentParser(description='Process and convert model state_dicts.') +parser.add_argument('input_dir', type=str, help='Directory containing input .bin and .safetensors files.') +parser.add_argument('output_file', type=str, help='Output file to save the converted state_dict.') +args = parser.parse_args() + +# Verify that the input directory exists +if not os.path.isdir(args.input_dir): + raise ValueError(f'Input directory does not exist: {args.input_dir}') + +# List all files in the input directory +filenames = os.listdir(args.input_dir) + +# Filter files to include only .bin and .safetensors files +filenames = [f for f in filenames if f.endswith('.bin') or f.endswith('.safetensors')] +filepaths = [os.path.join(args.input_dir, f) for f in filenames] +print(f'Found files: {filenames}') + +# Initialize an empty state_dict to store the loaded data +state_dict = {} + +# Loop over each file and load its contents +for filepath in filepaths: + print(f'Loading: {filepath}') + if filepath.endswith('.bin'): + # Load .bin file using torch.load + loaded_dict = torch.load(filepath, map_location='cpu') + state_dict.update(loaded_dict) + elif filepath.endswith('.safetensors'): + # Load .safetensors file using safetensors + loaded_dict = 
safetensors_load_file(filepath, device='cpu') + state_dict.update(loaded_dict) + else: + raise ValueError(f'Unsupported file format: {filepath}') + +# Print the keys of the loaded state_dict +print(f'Loaded state_dict keys: {list(state_dict.keys())}') + +# Initialize a new state_dict to store the converted data +new_state_dict = {} + +# Iterate over each key-value pair in the original state_dict +for k, v in state_dict.items(): + # Replace key substrings according to specified mapping rules + k_new = k + k_new = k_new.replace('embeddings.class_embedding', 'cls_token') + k_new = k_new.replace('embeddings.position_embedding', 'pos_embed') + k_new = k_new.replace('embeddings.patch_embedding.weight', 'patch_embed.proj.weight') + k_new = k_new.replace('embeddings.patch_embedding.bias', 'patch_embed.proj.bias') + k_new = k_new.replace('ls1', 'ls1.gamma') + k_new = k_new.replace('ls2', 'ls2.gamma') + k_new = k_new.replace('encoder.layers.', 'blocks.') + # Update the new_state_dict with the transformed key and original value + new_state_dict[k_new] = v + +# Print the keys of the converted state_dict +print(f'Converted state_dict keys: {list(new_state_dict.keys())}') + +# Wrap the new_state_dict in a dictionary under the 'module' key +new_dict = {'module': new_state_dict} + +# Save the new_dict to the specified output file +print(f'Saving converted state_dict to: {args.output_file}') +torch.save(new_dict, args.output_file) diff --git a/classification/main_accelerate.py b/classification/main_accelerate.py deleted file mode 100644 index 975be24b..00000000 --- a/classification/main_accelerate.py +++ /dev/null @@ -1,377 +0,0 @@ -# -------------------------------------------------------- -# InternVL -# Copyright (c) 2022 OpenGVLab -# Licensed under The MIT License [see LICENSE for details] -# -------------------------------------------------------- - -import argparse -import datetime -import logging -import os -import random -import time -import warnings - -import numpy as np -import torch -import torch.backends.cudnn as cudnn -from accelerate import Accelerator, GradScalerKwargs -from accelerate.logging import get_logger -from config import get_config -from dataset import build_loader2 -from ddp_hooks import fp16_compress_hook -from lr_scheduler import build_scheduler -from models import build_model -from optimizer import build_optimizer -from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy -from timm.utils import AverageMeter, ModelEma, accuracy -from tqdm import tqdm -from utils import load_ema_checkpoint, load_pretrained - -logger = get_logger(__name__) -warnings.filterwarnings('ignore') - - -def parse_option(): - parser = argparse.ArgumentParser( - 'InternVL training and evaluation script', add_help=False) - parser.add_argument('--cfg', type=str, required=True, metavar='FILE', help='path to config file') - parser.add_argument('--opts', help="Modify config options by adding 'KEY VALUE' pairs. 
", default=None, nargs='+') - - # easy config modification - parser.add_argument('--batch-size', type=int, help='batch size for single GPU') - parser.add_argument('--dataset', type=str, help='dataset name', default=None) - parser.add_argument('--data-path', type=str, help='path to dataset') - parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset') - parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'], - help='no: no cache, ' - 'full: cache all data, ' - 'part: sharding the dataset into nonoverlapping pieces and only cache one piece' - ) - parser.add_argument('--pretrained', help='pretrained weight from checkpoint, could be imagenet22k pretrained weight') - parser.add_argument('--resume', help='resume from checkpoint') - parser.add_argument('--output', default='work_dirs', type=str, metavar='PATH', - help='root of output folder, the full path is // (default: output)' - ) - parser.add_argument('--eval', action='store_true', help='Perform evaluation only') - parser.add_argument('--throughput', action='store_true', help='Test throughput only') - parser.add_argument('--save-ckpt-num', default=1, type=int) - parser.add_argument('--accumulation-steps', type=int, default=1, help='gradient accumulation steps') - parser.add_argument('--disable-grad-scalar', action='store_true', help='disable Grad Scalar') - parser.add_argument( - '--logger', - type=str, - default='tensorboard', - choices=['tensorboard', 'wandb'], - help=( - 'Whether to use [tensorboard](https://www.tensorflow.org/tensorboard) or [wandb](https://www.wandb.ai)' - ' for experiment tracking and logging of model metrics and model checkpoints' - ), - ) - - args, unparsed = parser.parse_known_args() - config = get_config(args) - config.defrost() - config.TRAIN.OPTIMIZER.USE_ZERO = False - config.OUTPUT += '_deepspeed' - config.DATA.IMG_ON_MEMORY = False - config.freeze() - return args, config - - -def seed_everything(seed, rank): - seed = seed + rank - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - np.random.seed(seed) - random.seed(seed) - cudnn.benchmark = True - - -def save_config(config): - path = os.path.join(config.OUTPUT, 'config.json') - with open(path, 'w') as f: - f.write(config.dump()) - logger.info(f'Full config saved to {path}') - - -def build_criterion(config): - if config.AUG.MIXUP > 0.: - # smoothing is handled with mixup label transform - criterion = SoftTargetCrossEntropy() - elif config.MODEL.LABEL_SMOOTHING > 0.: - criterion = LabelSmoothingCrossEntropy( - smoothing=config.MODEL.LABEL_SMOOTHING) - else: - criterion = torch.nn.CrossEntropyLoss() - return criterion - - -def scale_learning_rate(config, num_processes): - # linear scale the learning rate according to total batch size, may not be optimal - linear_scaled_lr = config.TRAIN.BASE_LR * \ - config.DATA.BATCH_SIZE * num_processes / 512.0 - linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * \ - config.DATA.BATCH_SIZE * num_processes / 512.0 - linear_scaled_min_lr = config.TRAIN.MIN_LR * \ - config.DATA.BATCH_SIZE * num_processes / 512.0 - # gradient accumulation also need to scale the learning rate - if config.TRAIN.ACCUMULATION_STEPS > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUMULATION_STEPS - linear_scaled_warmup_lr = linear_scaled_warmup_lr * config.TRAIN.ACCUMULATION_STEPS - linear_scaled_min_lr = linear_scaled_min_lr * config.TRAIN.ACCUMULATION_STEPS - config.defrost() - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_LR = 
linear_scaled_warmup_lr - config.TRAIN.MIN_LR = linear_scaled_min_lr - config.freeze() - - logger.info('BASE_LR={}'.format(config.TRAIN.BASE_LR)) - logger.info('WARMUP_LR={}'.format(config.TRAIN.WARMUP_LR)) - logger.info('MIN_LR={}'.format(config.TRAIN.MIN_LR)) - - -def setup_autoresume(config): - if config.MODEL.RESUME == '' and config.TRAIN.AUTO_RESUME: - last_checkpoint = os.path.join(config.OUTPUT, 'last') - resume_file = last_checkpoint if os.path.exists(last_checkpoint) else None - - if resume_file: - if config.MODEL.RESUME: - logger.warning(f'auto-resume changing resume file from {config.MODEL.RESUME} to {resume_file}') - config.defrost() - config.MODEL.RESUME = resume_file - config.freeze() - logger.info(f'auto resuming from {resume_file}') - else: - logger.info(f'no checkpoint found in {config.OUTPUT}, ignoring auto resume') - - -def load_model_checkpoint(config, model, accelerator): - if config.MODEL.RESUME: - try: - checkpoint = torch.load(config.MODEL.RESUME)['model'] - checkpoint = {k.replace('module.', ''): v for k, v in checkpoint.items()} - model.load_state_dict(checkpoint) - except: - accelerator.load_state(config.MODEL.RESUME) - elif config.MODEL.PRETRAINED: - try: - load_pretrained(config, model, logger) - except: - accelerator.load_state(config.MODEL.PRETRAINED) - return model - - -def save_checkpoint(save_dir, accelerator, epoch, max_acc, config, lr_scheduler=None): - # let accelerator handle the model and optimizer state for ddp and deepspeed. - accelerator.save_state(save_dir) - - if accelerator.is_main_process: - save_state = { - 'lr_scheduler': lr_scheduler.state_dict(), - 'max_acc': max_acc, - 'epoch': epoch, - 'config': config - } - torch.save(save_state, os.path.join(save_dir, 'additional_state.pth')) - - -def load_checkpoint_if_needed(accelerator, config, lr_scheduler=None): - setup_autoresume(config) - save_dir = config.MODEL.RESUME - if not save_dir: - return 0.0 - accelerator.load_state(save_dir) - checkpoint = torch.load(os.path.join(save_dir, 'additional_state.pth'), map_location='cpu') - if lr_scheduler is not None: - logger.info('resuming lr_scheduler') - lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) - config.defrost() - config.TRAIN.START_EPOCH = checkpoint['epoch'] + 1 - config.freeze() - max_acc = checkpoint.get('max_acc', 0.0) - logger.info(f"=> loaded successfully {config.MODEL.RESUME} (epoch {checkpoint['epoch']})") - return max_acc - - -def log_model_statistic(model_wo_ddp): - n_parameters = sum(p.numel() for p in model_wo_ddp.parameters() - if p.requires_grad) - logger.info(f'number of params: {n_parameters}') - if hasattr(model_wo_ddp, 'flops'): - flops = model_wo_ddp.flops() - logger.info(f'number of GFLOPs: {flops / 1e9}') - - -def train_epoch(*, model, optimizer, data_loader, scheduler, criterion, mixup_fn, - accelerator: Accelerator, epoch, config): - model.train() - - num_steps = len(data_loader) - batch_time = AverageMeter() - model_time = AverageMeter() - loss_meter = AverageMeter() - - end = time.time() - - gradient_accumulation_steps = config.TRAIN.ACCUMULATION_STEPS - - for step, (samples, targets) in enumerate(data_loader): - iter_begin_time = time.time() - - if mixup_fn is not None: - samples, targets = mixup_fn(samples, targets) - - with accelerator.accumulate(model): - outputs = model(samples) - loss = criterion(outputs, targets) - accelerator.backward(loss) - if accelerator.sync_gradients: - accelerator.clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD) - optimizer.step() - optimizer.zero_grad() - - 
accelerator.wait_for_everyone() - - if (step + 1) % gradient_accumulation_steps == 0: - if scheduler is not None: - scheduler.step_update((epoch * num_steps + step) // gradient_accumulation_steps) - - batch_time.update(time.time() - end) - model_time.update(time.time() - iter_begin_time) - loss_meter.update(loss.item()) - end = time.time() - - if accelerator.is_main_process and step % config.PRINT_FREQ == 0: - lr = optimizer.param_groups[0]['lr'] - memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0) - etas = batch_time.avg * (num_steps - step) - - logger.info( - f'Train: [{epoch}/{config.TRAIN.EPOCHS}][{step}/{num_steps}]\t' - f'eta {datetime.timedelta(seconds=int(etas))} lr {lr:.10f}\t' - f'time {batch_time.val:.4f} ({batch_time.avg:.4f})\t' - f'model_time {model_time.val:.4f} ({model_time.avg:.4f})\t' - f'loss {loss_meter.val:.8f} ({loss_meter.avg:.4f})\t' - f'mem {memory_used:.0f}MB') - - -@torch.no_grad() -def eval_epoch(*, config, data_loader, model, accelerator: Accelerator): - model.eval() - - acc1_meter = AverageMeter() - acc5_meter = AverageMeter() - - for idx, (images, target) in enumerate(tqdm(data_loader, disable=accelerator.is_main_process)): - output = model(images) - - # convert 22k to 1k to evaluate - if output.size(-1) == 21841: - convert_file = './meta_data/map22kto1k.txt' - with open(convert_file, 'r') as f: - convert_list = [int(line) for line in f.readlines()] - output = output[:, convert_list] - - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - acc1 = accelerator.gather(acc1).mean(0) - acc5 = accelerator.gather(acc5).mean(0) - - acc1_meter.update(acc1.item(), target.size(0)) - acc5_meter.update(acc5.item(), target.size(0)) - - if (idx + 1) % config.PRINT_FREQ == 0 or idx + 1 == len(data_loader): - logger.info(f'Test: [{idx+1}/{len(data_loader)}]\t' - f'Acc@1 {acc1_meter.val:.3f} ({acc1_meter.avg:.3f})\t' - f'Acc@5 {acc5_meter.val:.3f} ({acc5_meter.avg:.3f})\t' - ) - return acc1_meter.avg - - -def eval(config, accelerator: Accelerator): - _, _, _, _, validate_dataloader, _, _ = build_loader2(config) - model = build_model(config) - model, validate_dataloader = accelerator.prepare(model, validate_dataloader) - model = load_model_checkpoint(config, model, accelerator) - log_model_statistic(accelerator.unwrap_model(model)) - eval_epoch(config=config, data_loader=validate_dataloader, model=model, accelerator=accelerator) - - -def train(config, accelerator: Accelerator): - _, _, _, training_dataloader, validate_dataloader, _, mixup_fn = build_loader2(config) - model = build_model(config) - optimizer = build_optimizer(config, model) - criterion = build_criterion(config) - - model, optimizer, training_dataloader, validate_dataloader = accelerator.prepare( - model, optimizer, training_dataloader, validate_dataloader) - - effective_update_steps_per_epoch = len(training_dataloader) // config.TRAIN.ACCUMULATION_STEPS - lr_scheduler = build_scheduler(config, optimizer, effective_update_steps_per_epoch) - - try: - model.register_comm_hook(state=None, hook=fp16_compress_hook) - logger.info('using fp16_compress_hook!') - except: - logger.info('cannot register fp16_compress_hook!') - - max_acc = load_checkpoint_if_needed(accelerator, config, lr_scheduler) - - logger.info(f'Created model:{config.MODEL.TYPE}/{config.MODEL.NAME}') - logger.info(str(model)) - logger.info('Effective Optimizer Steps: {}'.format(effective_update_steps_per_epoch)) - logger.info('Start training') - logger.info('Max accuracy: {}'.format(max_acc)) - 
log_model_statistic(accelerator.unwrap_model(model)) - - for epoch in range(config.TRAIN.START_EPOCH, config.TRAIN.EPOCHS): - train_epoch(model=model, optimizer=optimizer, data_loader=training_dataloader, - scheduler=lr_scheduler, criterion=criterion, mixup_fn=mixup_fn, - accelerator=accelerator, epoch=epoch, config=config) - acc = eval_epoch(config=config, data_loader=validate_dataloader, model=model, - accelerator=accelerator) - - accelerator.wait_for_everyone() - if acc > max_acc: - max_acc = acc - save_checkpoint(os.path.join(config.OUTPUT, 'best'), accelerator, epoch, max_acc, config, lr_scheduler) - logger.info(f'Max Acc@1 {max_acc:.3f}') - save_checkpoint(os.path.join(config.OUTPUT, 'last'), accelerator, epoch, max_acc, config, lr_scheduler) - - -def main(): - args, config = parse_option() - os.makedirs(config.OUTPUT, exist_ok=True) - logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - filename=os.path.join(config.OUTPUT, 'run.log'), - level=logging.INFO, - ) - - loggers = ['tensorboard'] - accelerator = Accelerator( - log_with=loggers, - project_dir=config.OUTPUT, - gradient_accumulation_steps=config.TRAIN.ACCUMULATION_STEPS, - # When use deepspeed, you could not comment this out - # even if you set loss scale to 1.0 in deepspeed config. - kwargs_handlers=[GradScalerKwargs(enabled=not args.disable_grad_scalar)], - ) - logger.info(accelerator.state, main_process_only=False) - - scale_learning_rate(config, accelerator.num_processes) - seed_everything(config.SEED, accelerator.process_index) - save_config(config) - - logger.info(config.dump()) - - if config.EVAL_MODE: - eval(config, accelerator) - else: - train(config, accelerator) - - -if __name__ == '__main__': - main() diff --git a/classification/main_deepspeed.py b/classification/main_deepspeed.py deleted file mode 100644 index 67068e24..00000000 --- a/classification/main_deepspeed.py +++ /dev/null @@ -1,544 +0,0 @@ -# -------------------------------------------------------- -# InternVL -# Copyright (c) 2022 OpenGVLab -# Licensed under The MIT License [see LICENSE for details] -# -------------------------------------------------------- - -import argparse -import datetime -import os -import random -import subprocess -import time - -import deepspeed -import numpy as np -import torch -import torch.backends.cudnn as cudnn -import torch.distributed as dist -from config import get_config -from dataset import build_loader -from ddp_hooks import fp16_compress_hook -from ema_deepspeed import EMADeepspeed -from logger import create_logger -from lr_scheduler import build_scheduler -from models import build_model -from optimizer import set_weight_decay_and_lr -from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy -from timm.utils import AverageMeter, accuracy -from utils import MyAverageMeter, load_pretrained, reduce_tensor - - -def parse_option(): - parser = argparse.ArgumentParser( - 'InternVL training and evaluation script', add_help=False) - parser.add_argument('--cfg', type=str, required=True, metavar='FILE', help='path to config file') - parser.add_argument('--opts', help="Modify config options by adding 'KEY VALUE' pairs. 
", default=None, nargs='+') - - # easy config modification - parser.add_argument('--batch-size', type=int, help='batch size for single GPU') - parser.add_argument('--dataset', type=str, help='dataset name', default=None) - parser.add_argument('--data-path', type=str, help='path to dataset') - parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset') - parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'], - help='no: no cache, ' - 'full: cache all data, ' - 'part: sharding the dataset into nonoverlapping pieces and only cache one piece' - ) - parser.add_argument('--pretrained', - help='pretrained weight from checkpoint, could be imagenet22k pretrained weight') - parser.add_argument('--resume', help='resume from checkpoint') - parser.add_argument('--output', default='work_dirs', type=str, metavar='PATH', - help='root of output folder, the full path is // (default: output)' - ) - - parser.add_argument('--eval', action='store_true', help='Perform evaluation only') - parser.add_argument('--throughput', action='store_true', help='Test throughput only') - parser.add_argument('--save-ckpt-num', default=1, type=int) - parser.add_argument('--accumulation-steps', type=int, default=1, help='gradient accumulation steps') - - # distributed training - parser.add_argument('--local-rank', type=int, required=True, help='local rank for DistributedDataParallel') - - # deepspeed config - parser.add_argument('--disable-grad-scalar', action='store_true', help='disable Grad Scalar') - parser.add_argument('--offload-optimizer', type=str, default='none', choices=['cpu', 'none'], - help='enable optimizer offloading') - parser.add_argument('--offload-param', type=str, default='none', choices=['cpu', 'none'], - help='enable model offloading') - # To use Zero3, Please use main_accelerate.py instead. 
- # For this script, we are facing a similar issue as https://github.com/microsoft/DeepSpeed/issues/3068 - parser.add_argument('--zero-stage', type=int, default=1, choices=[1, 2], help='deep speed zero stage') - - args, unparsed = parser.parse_known_args() - config = get_config(args) - - return args, config - - -def seed_everything(seed, rank): - seed = seed + rank - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - np.random.seed(seed) - random.seed(seed) - cudnn.benchmark = True - - -def save_config(config): - path = os.path.join(config.OUTPUT, 'config.json') - with open(path, 'w') as f: - f.write(config.dump()) - logger.info(f'Full config saved to {path}') - - -def build_criterion(config): - if config.AUG.MIXUP > 0.: - # smoothing is handled with mixup label transform - criterion = SoftTargetCrossEntropy() - elif config.MODEL.LABEL_SMOOTHING > 0.: - criterion = LabelSmoothingCrossEntropy( - smoothing=config.MODEL.LABEL_SMOOTHING) - else: - criterion = torch.nn.CrossEntropyLoss() - return criterion - - -def scale_learning_rate(config, num_processes): - # linear scale the learning rate according to total batch size, may not be optimal - linear_scaled_lr = config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * num_processes / 512.0 - linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * config.DATA.BATCH_SIZE * num_processes / 512.0 - linear_scaled_min_lr = config.TRAIN.MIN_LR * config.DATA.BATCH_SIZE * num_processes / 512.0 - # gradient accumulation also need to scale the learning rate - if config.TRAIN.ACCUMULATION_STEPS > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUMULATION_STEPS - linear_scaled_warmup_lr = linear_scaled_warmup_lr * config.TRAIN.ACCUMULATION_STEPS - linear_scaled_min_lr = linear_scaled_min_lr * config.TRAIN.ACCUMULATION_STEPS - config.defrost() - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_LR = linear_scaled_warmup_lr - config.TRAIN.MIN_LR = linear_scaled_min_lr - config.freeze() - - logger.info('BASE_LR={}'.format(config.TRAIN.BASE_LR)) - logger.info('WARMUP_LR={}'.format(config.TRAIN.WARMUP_LR)) - logger.info('MIN_LR={}'.format(config.TRAIN.MIN_LR)) - - -def log_model_statistic(model_wo_ddp): - n_parameters = sum(p.numel() for p in model_wo_ddp.parameters() - if p.requires_grad) - logger.info(f'number of params: {n_parameters / 1e6} M') - if hasattr(model_wo_ddp, 'flops'): - flops = model_wo_ddp.flops() - logger.info(f'number of GFLOPs: {flops / 1e9}') - - -def get_parameter_groups(model, config): - skip = {} - skip_keywords = {} - if hasattr(model, 'no_weight_decay'): - skip = model.no_weight_decay() - if hasattr(model, 'no_weight_decay_keywords'): - skip_keywords = model.no_weight_decay_keywords() - - parameters = set_weight_decay_and_lr( - model, - config.TRAIN.WEIGHT_DECAY, - config.TRAIN.BASE_LR, - skip, - skip_keywords, - lr_layer_decay=config.TRAIN.LR_LAYER_DECAY, - lr_layer_decay_ratio=config.TRAIN.LR_LAYER_DECAY_RATIO, - freeze_backbone=config.TRAIN.OPTIMIZER.FREEZE_BACKBONE, - dcn_lr_mul=config.TRAIN.OPTIMIZER.DCN_LR_MUL, - ) - return parameters - - -def get_optimizer_state_str(optimizer): - states = [] - for param_group in optimizer.param_groups: - states.append(f'name={param_group["name"]} lr={param_group["lr"]} weight_decay={param_group["weight_decay"]}') - return '\n'.join(states) - - -def build_ds_config(config, args): - opt_lower = config.TRAIN.OPTIMIZER.NAME.lower() - if opt_lower == 'adamw': - optimizer = { - 'type': 'AdamW', - 'params': { - 'lr': config.TRAIN.BASE_LR, - 'eps': config.TRAIN.OPTIMIZER.EPS, - 'betas': 
config.TRAIN.OPTIMIZER.BETAS, - 'weight_decay': config.TRAIN.WEIGHT_DECAY - } - } - else: - return NotImplemented - - ds_config = { - 'train_micro_batch_size_per_gpu': config.DATA.BATCH_SIZE, - 'optimizer': optimizer, - 'bf16': { - 'enabled': True, - }, - 'zero_optimization': { - 'stage': 1, - 'allgather_partitions': True, - 'allgather_bucket_size': 1e9, - 'overlap_comm': True, - 'reduce_scatter': True, - 'reduce_bucket_size': 1e9, - 'contiguous_gradients': True - }, - 'steps_per_print': 1e10, - 'gradient_accumulation_steps': config.TRAIN.ACCUMULATION_STEPS, - 'gradient_clipping': config.TRAIN.CLIP_GRAD, - } - return ds_config - - -@torch.no_grad() -def throughput(data_loader, model, logger): - model.eval() - - for idx, (images, _) in enumerate(data_loader): - images = images.cuda(non_blocking=True) - batch_size = images.shape[0] - for i in range(50): - model(images) - torch.cuda.synchronize() - logger.info(f'throughput averaged with 30 times') - tic1 = time.time() - for i in range(30): - model(images) - torch.cuda.synchronize() - tic2 = time.time() - logger.info( - f'batch_size {batch_size} throughput {30 * batch_size / (tic2 - tic1)}' - ) - return - - -def train_epoch(config, model, criterion, data_loader, optimizer, epoch, mixup_fn, lr_scheduler, model_ema=None): - model.train() - - num_steps = len(data_loader) - batch_time = AverageMeter() - model_time = AverageMeter() - loss_meter = AverageMeter() - norm_meter = MyAverageMeter(300) - - start = time.time() - end = time.time() - - for idx, (samples, targets) in enumerate(data_loader): - iter_begin_time = time.time() - samples = samples.cuda(non_blocking=True) - targets = targets.cuda(non_blocking=True) - - if mixup_fn is not None: - samples, targets = mixup_fn(samples, targets) - - outputs = model(samples) - loss = criterion(outputs, targets) - - model.backward(loss) - model.step() - - if model_ema is not None: - model_ema(model) - - if (idx + 1) % config.TRAIN.ACCUMULATION_STEPS == 0: - lr_scheduler.step_update(epoch * num_steps + idx) - - torch.cuda.synchronize() - loss_meter.update(loss.item(), targets.size(0)) - norm_meter.update(optimizer._global_grad_norm) - batch_time.update(time.time() - end) - model_time.update(time.time() - iter_begin_time) - end = time.time() - - if idx % config.PRINT_FREQ == 0: - lr = optimizer.param_groups[0]['lr'] - memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0) - etas = batch_time.avg * (num_steps - idx) - logger.info( - f'Train: [{epoch}/{config.TRAIN.EPOCHS}][{idx}/{num_steps}]\t' - f'eta {datetime.timedelta(seconds=int(etas))} lr {lr:.6f}\t' - f'time {batch_time.val:.4f} ({batch_time.avg:.4f})\t' - f'model_time {model_time.val:.4f} ({model_time.avg:.4f})\t' - f'loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t' - f'grad_norm {norm_meter.val:.4f} ({norm_meter.avg:.4f}/{norm_meter.var:.4f})\t' - f'mem {memory_used:.0f}MB') - - epoch_time = time.time() - start - logger.info(f'EPOCH {epoch} training takes {datetime.timedelta(seconds=int(epoch_time))}') - - -@torch.no_grad() -def eval_epoch(config, data_loader, model, epoch=None): - criterion = torch.nn.CrossEntropyLoss() - model.eval() - - batch_time = AverageMeter() - loss_meter = AverageMeter() - acc1_meter = AverageMeter() - acc5_meter = AverageMeter() - - end = time.time() - for idx, (images, target) in enumerate(data_loader): - images = images.cuda(non_blocking=True) - target = target.cuda(non_blocking=True) - output = model(images) - - # convert 22k to 1k to evaluate - if output.size(-1) == 21841: - convert_file = 
'./meta_data/map22kto1k.txt' - with open(convert_file, 'r') as f: - convert_list = [int(line) for line in f.readlines()] - output = output[:, convert_list] - - # measure accuracy and record loss - loss = criterion(output, target) - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - - acc1 = reduce_tensor(acc1) - acc5 = reduce_tensor(acc5) - loss = reduce_tensor(loss) - - loss_meter.update(loss.item(), target.size(0)) - acc1_meter.update(acc1.item(), target.size(0)) - acc5_meter.update(acc5.item(), target.size(0)) - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if idx % config.PRINT_FREQ == 0: - memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0) - logger.info(f'Test: [{idx}/{len(data_loader)}]\t' - f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' - f'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t' - f'Acc@1 {acc1_meter.val:.3f} ({acc1_meter.avg:.3f})\t' - f'Acc@5 {acc5_meter.val:.3f} ({acc5_meter.avg:.3f})\t' - f'Mem {memory_used:.0f}MB') - if epoch is not None: - logger.info(f'[Epoch:{epoch}] * Acc@1 {acc1_meter.avg:.3f} Acc@5 {acc5_meter.avg:.3f}') - else: - logger.info(f' * Acc@1 {acc1_meter.avg:.3f} Acc@5 {acc5_meter.avg:.3f}') - - return acc1_meter.avg, acc5_meter.avg, loss_meter.avg - - -def train(config, ds_config): - # -------------- build ---------------- # - - _, dataset_val, _, data_loader_train, data_loader_val, _, mixup_fn = build_loader(config) - model = build_model(config) - model.cuda() - - if config.MODEL.PRETRAINED: - load_pretrained(config, model, logger) - - logger.info(ds_config) - model, optimizer, _, _ = deepspeed.initialize( - config=ds_config, - model=model, - model_parameters=get_parameter_groups(model, config), - dist_init_required=False, - ) - - try: - model.register_comm_hook(state=None, hook=fp16_compress_hook) - logger.info('using fp16_compress_hook!') - except: - logger.info('cannot register fp16_compress_hook!') - - model_without_ddp = model.module - - lr_scheduler = build_scheduler(config, optimizer, len(data_loader_train)) - criterion = build_criterion(config) - - model_ema = None - if config.TRAIN.EMA.ENABLE: - model_ema = EMADeepspeed(model, config.TRAIN.EMA.DECAY) - - # -------------- resume ---------------- # - - max_accuracy = 0.0 - max_accuracy_ema = 0.0 - client_state = {} - if config.MODEL.RESUME == '' and config.TRAIN.AUTO_RESUME: - if os.path.exists(os.path.join(config.OUTPUT, 'latest')): - config.defrost() - config.MODEL.RESUME = config.OUTPUT - config.freeze() - tag = None - elif config.MODEL.RESUME: - config.MODEL.RESUME = os.path.dirname(config.MODEL.RESUME) - tag = os.path.basename(config.MODEL.RESUME) - if config.MODEL.RESUME: - logger.info('loading checkpoint from {}'.format(config.MODEL.RESUME)) - _, client_state = model.load_checkpoint(load_dir=config.MODEL.RESUME, tag=tag) - logger.info(f'client_state={client_state.keys()}') - lr_scheduler.load_state_dict(client_state['custom_lr_scheduler']) - max_accuracy = client_state['max_accuracy'] - - if model_ema is not None: - max_accuracy_ema = client_state.get('max_accuracy_ema', 0.0) - model_ema.load_state_dict((client_state['model_ema'])) - - # -------------- training ---------------- # - - logger.info(f'Creating model:{config.MODEL.TYPE}/{config.MODEL.NAME}') - logger.info(str(model)) - logger.info(get_optimizer_state_str(optimizer)) - logger.info('Start training') - logger.info('max_accuracy: {}'.format(max_accuracy)) - log_model_statistic(model_without_ddp) - - start_time = time.time() - start_epoch = 
client_state['epoch'] + 1 if 'epoch' in client_state else config.TRAIN.START_EPOCH - for epoch in range(start_epoch, config.TRAIN.EPOCHS): - data_loader_train.sampler.set_epoch(epoch) - train_epoch(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler, - model_ema=model_ema) - - if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.EPOCHS - 1: - model.save_checkpoint( - save_dir=config.OUTPUT, - tag=f'epoch{epoch}', - client_state={ - 'custom_lr_scheduler': lr_scheduler.state_dict(), - 'max_accuracy': max_accuracy, - 'epoch': epoch, - 'config': config, - 'max_accuracy_ema': max_accuracy_ema if model_ema is not None else 0.0, - 'model_ema': model_ema.state_dict() if model_ema is not None else None, - } - ) - - if epoch % config.EVAL_FREQ == 0: - acc1, _, _ = eval_epoch(config, data_loader_val, model, epoch) - logger.info(f'Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%') - - if acc1 > max_accuracy: - model.save_checkpoint( - save_dir=config.OUTPUT, - tag='best', - client_state={ - 'custom_lr_scheduler': lr_scheduler.state_dict(), - 'max_accuracy': max_accuracy, - 'epoch': epoch, - 'config': config, - 'max_accuracy_ema': max_accuracy_ema if model_ema is not None else 0.0, - 'model_ema': model_ema.state_dict() if model_ema is not None else None, - } - ) - - max_accuracy = max(max_accuracy, acc1) - logger.info(f'Max accuracy: {max_accuracy:.2f}%') - - if model_ema is not None: - with model_ema.activate(model): - acc1_ema, _, _ = eval_epoch(config, data_loader_val, model, epoch) - logger.info(f'[EMA] Accuracy of the network on the {len(dataset_val)} test images: {acc1_ema:.1f}%') - max_accuracy_ema = max(max_accuracy_ema, acc1_ema) - logger.info(f'[EMA] Max accuracy: {max_accuracy_ema:.2f}%') - - total_time = time.time() - start_time - total_time_str = str(datetime.timedelta(seconds=int(total_time))) - logger.info('Training time {}'.format(total_time_str)) - - -def eval(config): - _, _, _, _, data_loader_val, _, _ = build_loader(config) - model = build_model(config) - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, device_ids=[config.LOCAL_RANK], broadcast_buffers=False) - - model_wo_ddp = model.module - if config.MODEL.RESUME: - try: - checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu') - msg = model_wo_ddp.load_state_dict(checkpoint['model'], strict=False) - logger.info(msg) - except: - try: - from deepspeed.utils.zero_to_fp32 import \ - get_fp32_state_dict_from_zero_checkpoint - ckpt_dir = os.path.dirname(config.MODEL.RESUME) - tag = os.path.basename(config.MODEL.RESUME) - state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir=ckpt_dir, tag=tag) - model_wo_ddp.load_state_dict(state_dict) - except: - checkpoint = torch.load(os.path.join(config.MODEL.RESUME, 'mp_rank_00_model_states.pt'), - map_location='cpu') - model_wo_ddp.load_state_dict(checkpoint['module']) - elif config.MODEL.PRETRAINED: - load_pretrained(config, model_wo_ddp, logger) - - if config.THROUGHPUT_MODE: - throughput(data_loader_val, model, logger) - - eval_epoch(config, data_loader_val, model) - - -if __name__ == '__main__': - args, config = parse_option() - - # init distributed env - if 'SLURM_PROCID' in os.environ: - print('\nDist init: SLURM') - rank = int(os.environ['SLURM_PROCID']) - gpu = rank % torch.cuda.device_count() - config.defrost() - config.LOCAL_RANK = gpu - config.freeze() - - world_size = int(os.environ['SLURM_NTASKS']) - if 'MASTER_PORT' not in os.environ: - os.environ['MASTER_PORT'] = 
'29501' - node_list = os.environ['SLURM_NODELIST'] - addr = subprocess.getoutput( - f'scontrol show hostname {node_list} | head -n1') - if 'MASTER_ADDR' not in os.environ: - os.environ['MASTER_ADDR'] = addr - - os.environ['RANK'] = str(rank) - os.environ['LOCAL_RANK'] = str(gpu) - os.environ['LOCAL_SIZE'] = str(torch.cuda.device_count()) - os.environ['WORLD_SIZE'] = str(world_size) - if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: - rank = int(os.environ['RANK']) - world_size = int(os.environ['WORLD_SIZE']) - print(f'RANK and WORLD_SIZE in environ: {rank}/{world_size}') - else: - rank = -1 - world_size = -1 - torch.cuda.set_device(config.LOCAL_RANK) - torch.distributed.init_process_group(backend='nccl', - init_method='env://', - world_size=world_size, - rank=rank) - torch.distributed.barrier() - - os.makedirs(config.OUTPUT, exist_ok=True) - logger = create_logger(output_dir=config.OUTPUT, - dist_rank=dist.get_rank(), - name=f'{config.MODEL.NAME}') - logger.info(config.dump()) - - if dist.get_rank() == 0: - save_config(config) - scale_learning_rate(config, dist.get_world_size()) - seed_everything(config.SEED, dist.get_rank()) - - if config.EVAL_MODE: - eval(config) - else: - train(config, build_ds_config(config, args)) diff --git a/classification/models/build.py b/classification/models/build.py index 708737ea..85c15d84 100644 --- a/classification/models/build.py +++ b/classification/models/build.py @@ -4,6 +4,7 @@ # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- +from .clip_vit import CLIPViT from .intern_vit_6b import InternViT6B @@ -28,7 +29,21 @@ def build_model(config): freeze_vit=config.MODEL.INTERN_VIT_6B.FREEZE_VIT, pretrained=config.MODEL.INTERN_VIT_6B.PRETRAINED, cls_target=config.MODEL.INTERN_VIT_6B.CLS_TARGET, - head_norm_type=config.MODEL.INTERN_VIT_6B.HEAD_NORM_TYPE, + norm_type=config.MODEL.INTERN_VIT_6B.NORM_TYPE, + ) + elif model_type == 'clip_vit': + model = CLIPViT( + patch_size=config.MODEL.CLIP_VIT.PATCH_SIZE, + img_size=config.DATA.IMG_SIZE, + pretrain_size=config.MODEL.CLIP_VIT.PRETRAIN_SIZE, + embed_dim=config.MODEL.CLIP_VIT.EMBED_DIM, + num_heads=config.MODEL.CLIP_VIT.NUM_HEADS, + mlp_ratio=config.MODEL.CLIP_VIT.MLP_RATIO, + depth=config.MODEL.CLIP_VIT.DEPTH, + with_cp=config.TRAIN.USE_CHECKPOINT, + freeze_vit=config.MODEL.CLIP_VIT.FREEZE_VIT, + pretrained=config.MODEL.CLIP_VIT.PRETRAINED, + cls_target=config.MODEL.CLIP_VIT.CLS_TARGET, ) else: raise NotImplementedError(f'Unkown model: {model_type}') diff --git a/classification/models/clip_vit.py b/classification/models/clip_vit.py new file mode 100644 index 00000000..33e86640 --- /dev/null +++ b/classification/models/clip_vit.py @@ -0,0 +1,195 @@ +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2024 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import DropPath +from transformers import CLIPModel + + +def _freeze_params(module): + for param in module.parameters(): + param.requires_grad = False + + +class CrossAttention(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., + proj_drop=0., attn_head_dim=None, out_dim=None): + super().__init__() + if out_dim is None: + out_dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + if 
attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim ** -0.5 + assert all_head_dim == dim + + self.q = nn.Linear(dim, all_head_dim, bias=False) + self.k = nn.Linear(dim, all_head_dim, bias=False) + self.v = nn.Linear(dim, all_head_dim, bias=False) + + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.k_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.k_bias = None + self.v_bias = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, out_dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, k=None, v=None): + B, N, C = x.shape + N_k = k.shape[1] + N_v = v.shape[1] + + q_bias, k_bias, v_bias = None, None, None + if self.q_bias is not None: + q_bias = self.q_bias + k_bias = self.k_bias + v_bias = self.v_bias + + q = F.linear(input=x, weight=self.q.weight, bias=q_bias) + q = q.reshape(B, N, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0) # (B, N_head, N_q, dim) + + k = F.linear(input=k, weight=self.k.weight, bias=k_bias) + k = k.reshape(B, N_k, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0) + + v = F.linear(input=v, weight=self.v.weight, bias=v_bias) + v = v.reshape(B, N_v, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) # (B, N_head, N_q, N_k) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class AttentiveBlock(nn.Module): + + def __init__(self, dim, num_heads, qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., norm_layer=nn.LayerNorm, attn_head_dim=None, out_dim=None): + super().__init__() + + self.norm1_q = norm_layer(dim) + self.norm1_k = norm_layer(dim) + self.norm1_v = norm_layer(dim) + self.cross_attn = CrossAttention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, + proj_drop=drop, attn_head_dim=attn_head_dim, out_dim=out_dim) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + def forward(self, x_q, x_kv, pos_q, pos_k, bool_masked_pos, rel_pos_bias=None): + x_q = self.norm1_q(x_q + pos_q) + x_k = self.norm1_k(x_kv + pos_k) + x_v = self.norm1_v(x_kv) + x = self.cross_attn(x_q, k=x_k, v=x_v) + + return x + + +class AttentionPoolingBlock(AttentiveBlock): + + def forward(self, x): + x_q = x.mean(1, keepdim=True) + x_kv, pos_q, pos_k = x, 0, 0 + x = super().forward(x_q, x_kv, pos_q, pos_k, bool_masked_pos=None, rel_pos_bias=None) + x = x.squeeze(1) + return x + + +class CLIPViT(nn.Module): + + def __init__(self, patch_size=14, img_size=336, pretrain_size=336, embed_dim=1024, num_heads=16, + mlp_ratio=4, depth=48, with_cp=True, freeze_vit=True, cls_target='cls_patch_concat', + num_classes=1000, pretrained=None): + super().__init__() + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.pretrain_size = pretrain_size + self.img_size = img_size + self.patch_size = patch_size + self.cls_target = cls_target + self.depth = depth + self.mlp_ratio = mlp_ratio + self.with_cp = with_cp + + model = CLIPModel.from_pretrained(pretrained) + model.post_layernorm = nn.Identity() + self.model = model.vision_model + + if freeze_vit: + _freeze_params(self) + + if cls_target == 'cls_patch_concat': + self.norm = nn.SyncBatchNorm(embed_dim * 2, eps=1e-6) + self.head = nn.Linear(embed_dim * 2, num_classes) if num_classes > 0 else nn.Identity() + elif cls_target == 'attention_pooling': + self.attn_pooling = AttentionPoolingBlock( + dim=embed_dim, num_heads=num_heads, qkv_bias=True, qk_scale=None, + drop=0., attn_drop=0.0, norm_layer=partial(nn.LayerNorm, eps=1e-5), out_dim=embed_dim) + self.norm = nn.SyncBatchNorm(embed_dim, eps=1e-6) + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + else: + raise NotImplementedError + + if type(self.head) != nn.Identity: + self.head.weight.data.normal_(mean=0.0, std=0.01) + self.head.bias.data.zero_() + + @property + def dtype(self): + return self.model.embeddings.patch_embedding.weight.dtype + + def forward_features(self, x): + x = x.type(self.dtype) + x = self.model(x) + x = x.last_hidden_state + return x + + def forward(self, x): + x = self.forward_features(x) + if self.cls_target == 'cls_patch_concat': + x = torch.cat((x[:, 0, :], x[:, 1:, :].mean(dim=1)), dim=-1) + elif self.cls_target == 'attention_pooling': + x = self.attn_pooling(x) + else: + raise NotImplementedError + x = self.norm(x) + x = self.head(x) + return x + + @torch.jit.ignore + def lr_decay_keywords(self, decay_ratio=0.95): + lr_ratios = {} + + # layers + for idx in range(self.depth): + tag = 'layers.{}.'.format(idx) + decay = 1.0 * (decay_ratio ** (self.depth - idx)) + lr_ratios[tag] = decay + + # patch_embedding + lr_ratios['patch_embedding'] = 1.0 * (decay_ratio ** (self.depth + 1)) + lr_ratios['position_embedding'] = 1.0 * (decay_ratio ** (self.depth + 1)) + lr_ratios['pre_layrnorm'] = 1.0 * (decay_ratio ** (self.depth + 1)) + + return lr_ratios diff --git a/classification/models/intern_vit_6b.py b/classification/models/intern_vit_6b.py index f3347629..336a8585 100644 --- a/classification/models/intern_vit_6b.py +++ b/classification/models/intern_vit_6b.py @@ -333,7 +333,7 @@ def __init__(self, in_chans=3, patch_size=14, img_size=224, pretrain_size=224, q embed_dim=3200, num_heads=25, mlp_ratio=4, init_values=0.1, qk_normalization=True, depth=48, use_flash_attn=True, with_cp=True, layerscale_force_fp32=False, freeze_vit=True, cls_target='cls_patch_concat', 
num_classes=1000, attn_pool_num_heads=16, clip_embed_dim=768, - head_norm_type='bn', pretrained=None): + norm_type='rms', pretrained=None): super().__init__() self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models @@ -349,7 +349,13 @@ def __init__(self, in_chans=3, patch_size=14, img_size=224, pretrain_size=224, q print('Warning: Flash Attention is not available, use_flash_attn is set to False.') use_flash_attn = [use_flash_attn] * depth if not isinstance(use_flash_attn, list) else use_flash_attn - norm_layer_for_blocks = partial(RMSNorm, eps=1e-6) + if norm_type == 'rms': + norm_layer_for_blocks = partial(RMSNorm, eps=1e-6) + elif norm_type == 'ln': + norm_layer_for_blocks = partial(nn.LayerNorm, eps=1e-6) + else: + raise NotImplementedError + self.norm_layer_for_blocks = norm_layer_for_blocks self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim) num_patches = self.patch_embed.num_patches @@ -381,16 +387,16 @@ def __init__(self, in_chans=3, patch_size=14, img_size=224, pretrain_size=224, q _freeze_params(self) if cls_target == 'cls_patch_concat': - if head_norm_type == 'bn': - self.norm = nn.SyncBatchNorm(embed_dim * 2, eps=1e-6) - else: - self.norm = nn.LayerNorm(embed_dim * 2, eps=1e-6) + self.norm = nn.SyncBatchNorm(embed_dim * 2, eps=1e-6) self.head = nn.Linear(embed_dim * 2, num_classes) if num_classes > 0 else nn.Identity() + elif cls_target == 'attention_pooling': + self.attn_pooling = AttentionPoolingBlock( + dim=embed_dim, num_heads=num_heads, qkv_bias=True, qk_scale=None, + drop=0., attn_drop=0.0, norm_layer=partial(nn.LayerNorm, eps=1e-5), out_dim=embed_dim) + self.norm = nn.SyncBatchNorm(embed_dim, eps=1e-6) + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() elif cls_target == 'clip_projector': - if head_norm_type == 'bn': - self.norm = nn.SyncBatchNorm(clip_embed_dim, eps=1e-6) - else: - self.norm = nn.LayerNorm(clip_embed_dim, eps=1e-6) + self.norm = nn.SyncBatchNorm(clip_embed_dim, eps=1e-6) self.head = nn.Linear(clip_embed_dim, num_classes) if num_classes > 0 else nn.Identity() else: raise NotImplementedError @@ -447,6 +453,8 @@ def forward(self, x): x = self.forward_features(x) if self.cls_target == 'cls_patch_concat': x = torch.cat((x[:, 0, :], x[:, 1:, :].mean(dim=1)), dim=-1) + elif self.cls_target == 'attention_pooling': + x = self.attn_pooling(x) elif self.cls_target == 'clip_projector': x = self.clip_projector(x) else: diff --git a/classification/train_in1k_deepspeed.sh b/classification/train_in1k_deepspeed.sh deleted file mode 100644 index 83c9f0b9..00000000 --- a/classification/train_in1k_deepspeed.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash - -set -x - -PARTITION=$1 -JOB_NAME=$2 -CONFIG=$3 -GPUS=${GPUS:-8} -GPUS_PER_NODE=${GPUS_PER_NODE:-8} -CPUS_PER_TASK=${CPUS_PER_TASK:-10} -SRUN_ARGS=${SRUN_ARGS:-""} - -PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ - srun -p ${PARTITION} \ - --job-name=${JOB_NAME} \ - --gres=gpu:${GPUS_PER_NODE} \ - --ntasks=${GPUS} \ - --ntasks-per-node=${GPUS_PER_NODE} \ - --cpus-per-task=${CPUS_PER_TASK} \ - --kill-on-bad-exit=1 \ - --quotatype=spot \ - ${SRUN_ARGS} \ - python -u main_deepspeed.py \ - --cfg ${CONFIG} \ - --local-rank 0 \ - --data-path /mnt/lustre/share/images \ - --output work_dirs_deepspeed ${@:4}
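
For reference, the cls_target='attention_pooling' branch introduced above (in both clip_vit.py and intern_vit_6b.py) pools the token sequence with a single cross-attention block whose query is the mean token, then applies SyncBatchNorm and a linear classifier. The snippet below is only a minimal standalone sketch of that idea, assuming torch.nn.MultiheadAttention in place of the patch's custom CrossAttention/AttentionPoolingBlock modules; the class name and the plain BatchNorm1d are stand-ins for illustration, not code from this repository.

# Illustrative sketch only -- not part of this patch. It mimics the idea of the
# attention_pooling head: mean token as query, all tokens as keys/values,
# followed by a norm layer and a linear classifier.
import torch
import torch.nn as nn


class SimpleAttentionPoolingHead(nn.Module):
    """Pool a (B, N, C) token sequence into class logits with one cross-attention step."""

    def __init__(self, embed_dim: int, num_heads: int, num_classes: int):
        super().__init__()
        # The patch uses separate LayerNorms for q, k and v; a shared kv norm is used here.
        self.norm_q = nn.LayerNorm(embed_dim, eps=1e-5)
        self.norm_kv = nn.LayerNorm(embed_dim, eps=1e-5)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.norm = nn.BatchNorm1d(embed_dim, eps=1e-6)  # SyncBatchNorm in the patch
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        query = tokens.mean(dim=1, keepdim=True)  # (B, 1, C) mean-token query
        kv = self.norm_kv(tokens)
        pooled, _ = self.attn(self.norm_q(query), kv, kv)  # attend over all tokens
        pooled = pooled.squeeze(1)  # (B, C)
        return self.head(self.norm(pooled))  # (B, num_classes)


if __name__ == '__main__':
    # e.g. 257 tokens (1 cls + 16x16 patches) from a 224px, patch-14 backbone
    feats = torch.randn(2, 257, 1024)
    head = SimpleAttentionPoolingHead(embed_dim=1024, num_heads=16, num_classes=1000)
    print(head(feats).shape)  # torch.Size([2, 1000])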