From e9c5e34eda2d389bd01c482dab98bff5ac3b5164 Mon Sep 17 00:00:00 2001
From: zhe chen
Date: Tue, 29 Oct 2024 00:34:25 +0800
Subject: [PATCH] Update classification code

---
 classification/config.py | 14 +-
 ...ng_intern_vit_6b_224px_in1k_224_64gpu.yaml | 35 ++
 ...it_6b_224px_in1k_224_64gpu_imagenet_a.yaml | 36 ++
 ...it_6b_224px_in1k_224_64gpu_imagenet_r.yaml | 36 ++
 ...6b_224px_in1k_224_64gpu_imagenet_real.yaml | 36 ++
 ..._224px_in1k_224_64gpu_imagenet_sketch.yaml | 36 ++
 ...it_6b_224px_in1k_224_64gpu_imagenetv2.yaml | 36 ++
 ...tern_vit_6b_224px_in1k_224to448_64gpu.yaml | 36 ++
 ..._224px_in1k_224to448_64gpu_imagenet_a.yaml | 37 ++
 ..._224px_in1k_224to448_64gpu_imagenet_r.yaml | 37 ++
 ...4px_in1k_224to448_64gpu_imagenet_real.yaml | 37 ++
 ...x_in1k_224to448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._224px_in1k_224to448_64gpu_imagenetv2.yaml | 37 ++
 ...tern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml | 36 ++
 ..._448px_v1_0_in1k_448_64gpu_imagenet_a.yaml | 37 ++
 ..._448px_v1_0_in1k_448_64gpu_imagenet_r.yaml | 37 ++
 ...8px_v1_0_in1k_448_64gpu_imagenet_real.yaml | 37 ++
 ...x_v1_0_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._448px_v1_0_in1k_448_64gpu_imagenetv2.yaml | 37 ++
 ...tern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml | 36 ++
 ..._448px_v1_2_in1k_448_64gpu_imagenet_a.yaml | 37 ++
 ..._448px_v1_2_in1k_448_64gpu_imagenet_r.yaml | 37 ++
 ...8px_v1_2_in1k_448_64gpu_imagenet_real.yaml | 37 ++
 ...x_v1_2_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._448px_v1_2_in1k_448_64gpu_imagenetv2.yaml | 37 ++
 ...tern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml | 36 ++
 ..._448px_v1_5_in1k_448_64gpu_imagenet_a.yaml | 37 ++
 ..._448px_v1_5_in1k_448_64gpu_imagenet_r.yaml | 37 ++
 ...8px_v1_5_in1k_448_64gpu_imagenet_real.yaml | 37 ++
 ...x_v1_5_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._448px_v1_5_in1k_448_64gpu_imagenetv2.yaml | 37 ++
 ...tern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml | 36 ++
 ..._448px_v2_5_in1k_448_64gpu_imagenet_a.yaml | 37 ++
 ..._448px_v2_5_in1k_448_64gpu_imagenet_r.yaml | 37 ++
 ...8px_v2_5_in1k_448_64gpu_imagenet_real.yaml | 37 ++
 ...x_v2_5_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._448px_v2_5_in1k_448_64gpu_imagenetv2.yaml | 37 ++
 ...ng_intern_vit_6b_224px_in1k_224_64gpu.yaml | 35 ++
 ...it_6b_224px_in1k_224_64gpu_imagenet_a.yaml | 36 ++
 ...it_6b_224px_in1k_224_64gpu_imagenet_r.yaml | 36 ++
 ...6b_224px_in1k_224_64gpu_imagenet_real.yaml | 36 ++
 ..._224px_in1k_224_64gpu_imagenet_sketch.yaml | 36 ++
 ...it_6b_224px_in1k_224_64gpu_imagenetv2.yaml | 36 ++
 ...tern_vit_6b_224px_in1k_224to448_64gpu.yaml | 36 ++
 ..._224px_in1k_224to448_64gpu_imagenet_a.yaml | 37 ++
 ..._224px_in1k_224to448_64gpu_imagenet_r.yaml | 37 ++
 ...4px_in1k_224to448_64gpu_imagenet_real.yaml | 37 ++
 ...x_in1k_224to448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._224px_in1k_224to448_64gpu_imagenetv2.yaml | 37 ++
 ...tern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml | 36 ++
 ..._448px_v1_0_in1k_448_64gpu_imagenet_a.yaml | 37 ++
 ..._448px_v1_0_in1k_448_64gpu_imagenet_r.yaml | 37 ++
 ...8px_v1_0_in1k_448_64gpu_imagenet_real.yaml | 37 ++
 ...x_v1_0_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._448px_v1_0_in1k_448_64gpu_imagenetv2.yaml | 37 ++
 ...tern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml | 36 ++
 ..._448px_v1_2_in1k_448_64gpu_imagenet_a.yaml | 37 ++
 ..._448px_v1_2_in1k_448_64gpu_imagenet_r.yaml | 37 ++
 ...8px_v1_2_in1k_448_64gpu_imagenet_real.yaml | 37 ++
 ...x_v1_2_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++
 ..._448px_v1_2_in1k_448_64gpu_imagenetv2.yaml | 37 ++
 ...tern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml | 36 ++
 ..._448px_v1_5_in1k_448_64gpu_imagenet_a.yaml | 37 ++
..._448px_v1_5_in1k_448_64gpu_imagenet_r.yaml | 37 ++ ...8px_v1_5_in1k_448_64gpu_imagenet_real.yaml | 37 ++ ...x_v1_5_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++ ..._448px_v1_5_in1k_448_64gpu_imagenetv2.yaml | 37 ++ ...tern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml | 36 ++ ..._448px_v2_5_in1k_448_64gpu_imagenet_a.yaml | 37 ++ ..._448px_v2_5_in1k_448_64gpu_imagenet_r.yaml | 37 ++ ...8px_v2_5_in1k_448_64gpu_imagenet_real.yaml | 37 ++ ...x_v2_5_in1k_448_64gpu_imagenet_sketch.yaml | 37 ++ ..._448px_v2_5_in1k_448_64gpu_imagenetv2.yaml | 37 ++ classification/ema_deepspeed.py | 100 ---- classification/export.py | 128 ----- classification/gflops.py | 2 +- classification/hf2pytorch.py | 70 +++ classification/main_accelerate.py | 377 ------------ classification/main_deepspeed.py | 544 ------------------ classification/models/build.py | 17 +- classification/models/clip_vit.py | 195 +++++++ classification/models/intern_vit_6b.py | 28 +- classification/train_in1k_deepspeed.sh | 27 - 83 files changed, 2953 insertions(+), 1189 deletions(-) create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml create mode 100644 
classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml create mode 100644 
classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml create mode 100644 classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml create mode 100644 
classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml delete mode 100644 classification/ema_deepspeed.py delete mode 100644 classification/export.py create mode 100644 classification/hf2pytorch.py delete mode 100644 classification/main_accelerate.py delete mode 100644 classification/main_deepspeed.py create mode 100644 classification/models/clip_vit.py delete mode 100644 classification/train_in1k_deepspeed.sh diff --git a/classification/config.py b/classification/config.py index f27ab8b9..53073592 100644 --- a/classification/config.py +++ b/classification/config.py @@ -81,7 +81,19 @@ _C.MODEL.INTERN_VIT_6B.FREEZE_VIT = True _C.MODEL.INTERN_VIT_6B.PRETRAINED = None _C.MODEL.INTERN_VIT_6B.CLS_TARGET = 'cls_patch_concat' -_C.MODEL.INTERN_VIT_6B.HEAD_NORM_TYPE = 'bn' +_C.MODEL.INTERN_VIT_6B.NORM_TYPE = 'rms' + +# CLIP_VIT parameters +_C.MODEL.CLIP_VIT = CN() +_C.MODEL.CLIP_VIT.PATCH_SIZE = 14 +_C.MODEL.CLIP_VIT.PRETRAIN_SIZE = 336 +_C.MODEL.CLIP_VIT.EMBED_DIM = 1024 +_C.MODEL.CLIP_VIT.NUM_HEADS = 16 +_C.MODEL.CLIP_VIT.MLP_RATIO = 4 +_C.MODEL.CLIP_VIT.DEPTH = 24 +_C.MODEL.CLIP_VIT.FREEZE_VIT = True +_C.MODEL.CLIP_VIT.PRETRAINED = 'openai/clip-vit-large-patch14-336' +_C.MODEL.CLIP_VIT.CLS_TARGET = 'cls_patch_concat' # ----------------------------------------------------------------------------- # Training settings diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml new file mode 100644 index 00000000..3eb85c31 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml @@ -0,0 +1,35 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..98d30d5c --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + 
WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..97017056 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..5bd19053 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..4dfdbddd --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 
0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..c310582a --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml new file mode 100644 index 00000000..4ca8070b --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..95f0dbaf --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + 
EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..f3771780 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..2b24800b --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..09953bf0 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: 
"./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..f6ec39fe --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml new file mode 100644 index 00000000..48bb2ff0 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..626e2aa2 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + 
USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..be2c07f3 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..1572d242 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..fde27a51 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 
448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..8b960969 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml new file mode 100644 index 00000000..751f3976 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..616789aa --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True 
+ PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..ce53ee02 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..8fd9f841 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..69d2eb9f --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: 
'./data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..f566dd75 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml new file mode 100644 index 00000000..33818830 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..767bb2b6 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 
'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..a94e7175 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..6095ad87 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..e860db91 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml @@ 
-0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..3d64a1bf --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml new file mode 100644 index 00000000..35115b6f --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..bb61d42f --- /dev/null +++ 
b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..339b665f --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..ff6f4658 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml 
b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..b4e9a207 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..16f40561 --- /dev/null +++ b/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml new file mode 100644 index 00000000..6788c616 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu.yaml @@ -0,0 +1,35 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml 
b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..80e751f9 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..f444f6f9 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_r.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..9525a450 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_real.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml 
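Note on the probing configs above and below: every variant freezes the backbone (FREEZE_VIT: True) and trains only the classification target with plain SGD, and the `# 512` comment next to BASE_LR marks the reference total batch size used for linear LR scaling (the same /512 rule as the scale_learning_rate helper in the main_accelerate.py removed further down). A minimal sketch of that setup follows; the helper names (build_probe_optimizer, head) and the momentum-free SGD call are illustrative assumptions, not the repository's actual builders.

import torch

def scale_lr(base_lr: float, batch_per_gpu: int, num_gpus: int, ref_batch: int = 512) -> float:
    # Linear LR scaling: BASE_LR is quoted for a reference total batch of 512,
    # so 16 images/GPU on 64 GPUs (total 1024) doubles the effective rate.
    return base_lr * batch_per_gpu * num_gpus / ref_batch

def build_probe_optimizer(backbone: torch.nn.Module, head: torch.nn.Module, lr: float):
    # FREEZE_VIT: True -- the ViT stays fixed; only the probing head receives gradients.
    for p in backbone.parameters():
        p.requires_grad = False
    # The configs only name 'sgd' with WEIGHT_DECAY: 0.0; momentum is left at the default here.
    return torch.optim.SGD(head.parameters(), lr=lr, weight_decay=0.0)

effective_lr = scale_lr(base_lr=0.1, batch_per_gpu=16, num_gpus=64)  # 0.2 for the 64-GPU runs

With WARMUP_LR and MIN_LR both at 0, the schedule warms up from zero over one epoch to the scaled rate and decays back to zero over the remaining nine.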
new file mode 100644 index 00000000..b1355e38 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_sketch.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..da992da2 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenetv2.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml new file mode 100644 index 00000000..a7d02825 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..de3a17d3 --- /dev/null +++ 
b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..014ee789 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..ea364c2f --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..ef10e8c7 --- /dev/null +++ 
b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..905b7f9c --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml new file mode 100644 index 00000000..004db231 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..fa863328 --- /dev/null +++ 
b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..a1968602 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..7607b577 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..0a669975 --- /dev/null 
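The linear-probing configs select CLS_TARGET: 'cls_patch_concat', while the attention-pooling configs use 'attention_pooling'. The sketch below is one plausible reading of those two feature-extraction modes, not the actual heads in intern_vit_6b.py: the concat variant is taken as [CLS token ; mean of patch tokens], and the pooling variant as a learned multi-head attention pool (16 heads, matching attn_pool_num_heads=16 in the gflops.py hunk later in this patch).

import torch
import torch.nn as nn

def cls_patch_concat(tokens: torch.Tensor) -> torch.Tensor:
    # tokens: (B, 1 + N, C) -- CLS token followed by N patch tokens.
    cls_tok, patches = tokens[:, 0], tokens[:, 1:]
    return torch.cat([cls_tok, patches.mean(dim=1)], dim=-1)  # (B, 2C)

class AttentionPooling(nn.Module):
    # A single learned query attends over all tokens and returns the pooled vector.
    def __init__(self, dim: int = 3200, num_heads: int = 16):
        super().__init__()
        self.query = nn.Parameter(torch.zeros(1, 1, dim))
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        q = self.query.expand(tokens.size(0), -1, -1)
        pooled, _ = self.attn(q, tokens, tokens)
        return pooled.squeeze(1)  # (B, C)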
+++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..70de8bb8 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml new file mode 100644 index 00000000..23e696be --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..60f9140e --- /dev/null +++ 
b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..1c2475bd --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..396157ec --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..9bbd942d --- /dev/null 
+++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..2a7c0232 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml new file mode 100644 index 00000000..cba76bde --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..6f4af1b5 --- /dev/null +++ 
b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..46a53bed --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..53607372 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..aa523d22 --- /dev/null 
+++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..e3d19a6b --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml new file mode 100644 index 00000000..f667bf10 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml new file mode 100644 index 00000000..7e48cabd --- /dev/null +++ 
b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_a.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_a' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-a' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml new file mode 100644 index 00000000..b114f88a --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_r' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-r' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 00000000..52f68723 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml new file mode 100644 index 00000000..6b879af1 --- /dev/null 
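Every probing config also keeps an EMA of the trained weights (TRAIN.EMA.ENABLE: True, DECAY: 0.998). Below is a minimal single-process sketch of that shadow update; it mirrors the decay warm-up and in-place update of the ema_deepspeed.py helper that this patch deletes further down, but without the DeepSpeed parameter gathering, so it is an illustration rather than a drop-in replacement.

import torch
import torch.nn as nn

class SimpleEMA:
    # shadow -= (1 - decay) * (shadow - param), with decay warmed up as
    # min(decay, (1 + n) / (10 + n)), as in the removed EMADeepspeed class.
    def __init__(self, model: nn.Module, decay: float = 0.998):
        self.decay = decay
        self.num_updates = 0
        self.shadow = {name: p.detach().clone()
                       for name, p in model.named_parameters() if p.requires_grad}

    @torch.no_grad()
    def update(self, model: nn.Module):
        self.num_updates += 1
        decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates))
        for name, p in model.named_parameters():
            if p.requires_grad:
                self.shadow[name].sub_((1.0 - decay) * (self.shadow[name] - p))

# Typical use: ema = SimpleEMA(head); then call ema.update(head) after each optimizer step,
# and evaluate with the shadow weights copied back in.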
+++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet_sketch' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-sketch' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml new file mode 100644 index 00000000..03a59c64 --- /dev/null +++ b/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenetv2.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenetv2' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenetv2' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth" + CLS_TARGET: 'cls_patch_concat' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/classification/ema_deepspeed.py b/classification/ema_deepspeed.py deleted file mode 100644 index 3ae92cb3..00000000 --- a/classification/ema_deepspeed.py +++ /dev/null @@ -1,100 +0,0 @@ -from contextlib import contextmanager - -import deepspeed -import torch -import torch.nn as nn -from deepspeed.runtime.zero import GatheredParameters - - -class EMADeepspeed(nn.Module): - """ migrated from https://github.com/microsoft/DeepSpeed/issues/2056 - """ - - def __init__(self, model, decay=0.9999, use_num_updates=True): - super().__init__() - if decay < 0.0 or decay > 1.0: - raise ValueError('Decay must be between 0 and 1') - - self.m_name2s_name = {} - self.decay = decay - self.num_updates = 0 if use_num_updates else -1 - - with GatheredParameters(model.parameters(), fwd_module=self): - for name, p in model.named_parameters(): - if p.requires_grad: - # remove as '.'-character is not allowed in buffers - s_name = name.replace('.', '') - self.m_name2s_name.update({name: s_name}) - self.register_buffer(s_name, p.clone().detach().data) - # remove as '.'-character is not allowed in buffers - self.collected_params = [] - - def forward(self, model): - decay = self.decay - - if self.num_updates >= 0: - self.num_updates += 1 - decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates)) - - one_minus_decay = 1.0 - decay - shadow_params = dict(self.named_buffers()) - - with torch.no_grad(): - with GatheredParameters(model.parameters()): - if 
deepspeed.comm.get_rank() == 0: - m_param = dict(model.named_parameters()) - - for key in m_param: - if m_param[key].requires_grad: - sname = self.m_name2s_name[key] - shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) - shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) - else: - assert key not in self.m_name2s_name - - def copy_to(self, model): - shadow_params = dict(self.named_buffers()) - with GatheredParameters(model.parameters(), modifier_rank=0): - if deepspeed.comm.get_rank() == 0: - m_param = dict(model.named_parameters()) - for key in m_param: - if m_param[key].requires_grad: - m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) - else: - assert key not in self.m_name2s_name - - def store(self, model): - """ - Save the current parameters for restoring later. - Args: - model: A model that parameters will be stored - """ - with GatheredParameters(model.parameters()): - if deepspeed.comm.get_rank() == 0: - parameters = model.parameters() - self.collected_params = [param.clone() for param in parameters] - - def restore(self, model): - """ - Restore the parameters stored with the `store` method. - Useful to validate the model with EMA parameters without affecting the - original optimization process. Store the parameters before the - `copy_to` method. After validation (or model saving), use this to - restore the former parameters. - Args: - model: A model that to restore its parameters. - """ - with GatheredParameters(model.parameters(), modifier_rank=0): - if deepspeed.comm.get_rank() == 0: - parameters = model.parameters() - for c_param, param in zip(self.collected_params, parameters): - param.data.copy_(c_param.data) - - @contextmanager - def activate(self, model): - try: - self.store(model) - self.copy_to(model) - yield - finally: - self.restore(model) diff --git a/classification/export.py b/classification/export.py deleted file mode 100644 index 0edb0d85..00000000 --- a/classification/export.py +++ /dev/null @@ -1,128 +0,0 @@ -# -------------------------------------------------------- -# InternVL -# Copyright (c) 2022 OpenGVLab -# Licensed under The MIT License [see LICENSE for details] -# -------------------------------------------------------- - -import argparse -import os -import time - -import torch -from config import get_config -from models import build_model -from tqdm import tqdm - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument('--model_name', type=str, - default='internimage_t_1k_224') - parser.add_argument('--ckpt_dir', type=str, - default='/mnt/petrelfs/share_data/huangzhenhang/code/internimage/checkpoint_dir/new/cls') - parser.add_argument('--onnx', default=False, action='store_true') - parser.add_argument('--trt', default=False, action='store_true') - - args = parser.parse_args() - args.cfg = os.path.join('./configs', f'{args.model_name}.yaml') - args.ckpt = os.path.join(args.ckpt_dir, f'{args.model_name}.pth') - args.size = int(args.model_name.split('.')[0].split('_')[-1]) - - cfg = get_config(args) - return args, cfg - - -def get_model(args, cfg): - model = build_model(cfg) - ckpt = torch.load(args.ckpt, map_location='cpu')['model'] - - model.load_state_dict(ckpt) - return model - - -def speed_test(model, input): - # warm-up - for _ in tqdm(range(100)): - _ = model(input) - - # speed test - torch.cuda.synchronize() - start = time.time() - for _ in tqdm(range(100)): - _ = model(input) - end = time.time() - th = 100 / (end - start) - print(f'using time: {end - start}, 
throughput {th}') - - -def torch2onnx(args, cfg): - model = get_model(args, cfg).cuda() - - # speed_test(model) - - onnx_name = f'{args.model_name}.onnx' - torch.onnx.export(model, - torch.rand(1, 3, args.size, args.size).cuda(), - onnx_name, - input_names=['input'], - output_names=['output']) - - return model - - -def onnx2trt(args): - from mmdeploy.backend.tensorrt import from_onnx - - onnx_name = f'{args.model_name}.onnx' - from_onnx( - onnx_name, - args.model_name, - dict( - input=dict( - min_shape=[1, 3, args.size, args.size], - opt_shape=[1, 3, args.size, args.size], - max_shape=[1, 3, args.size, args.size], - ) - ), - max_workspace_size=2 ** 30, - ) - - -def check(args, cfg): - from mmdeploy.backend.tensorrt.wrapper import TRTWrapper - - model = get_model(args, cfg).cuda() - model.eval() - trt_model = TRTWrapper(f'{args.model_name}.engine', - ['output']) - - x = torch.randn(1, 3, args.size, args.size).cuda() - - torch_out = model(x) - trt_out = trt_model(dict(input=x))['output'] - - print('torch out shape:', torch_out.shape) - print('trt out shape:', trt_out.shape) - - print('max delta:', (torch_out - trt_out).abs().max()) - print('mean delta:', (torch_out - trt_out).abs().mean()) - - speed_test(model, x) - speed_test(trt_model, dict(input=x)) - - -def main(): - args, cfg = get_args() - - if args.onnx or args.trt: - torch2onnx(args, cfg) - print('torch -> onnx: succeess') - - if args.trt: - onnx2trt(args) - print('onnx -> trt: success') - check(args, cfg) - - -if __name__ == '__main__': - main() diff --git a/classification/gflops.py b/classification/gflops.py index f6ef0228..dff9af67 100644 --- a/classification/gflops.py +++ b/classification/gflops.py @@ -98,7 +98,7 @@ def get_flops(model, input_shape): num_classes=0, attn_pool_num_heads=16, clip_embed_dim=768, - head_norm_type='bn').to(torch.bfloat16) + norm_type='rms').to(torch.bfloat16) for k, v in model.named_parameters(): v.requires_grad = True diff --git a/classification/hf2pytorch.py b/classification/hf2pytorch.py new file mode 100644 index 00000000..a1ba1dd3 --- /dev/null +++ b/classification/hf2pytorch.py @@ -0,0 +1,70 @@ +import argparse +import os + +import torch +from safetensors.torch import load_file as safetensors_load_file + +# Parse command-line arguments +parser = argparse.ArgumentParser(description='Process and convert model state_dicts.') +parser.add_argument('input_dir', type=str, help='Directory containing input .bin and .safetensors files.') +parser.add_argument('output_file', type=str, help='Output file to save the converted state_dict.') +args = parser.parse_args() + +# Verify that the input directory exists +if not os.path.isdir(args.input_dir): + raise ValueError(f'Input directory does not exist: {args.input_dir}') + +# List all files in the input directory +filenames = os.listdir(args.input_dir) + +# Filter files to include only .bin and .safetensors files +filenames = [f for f in filenames if f.endswith('.bin') or f.endswith('.safetensors')] +filepaths = [os.path.join(args.input_dir, f) for f in filenames] +print(f'Found files: {filenames}') + +# Initialize an empty state_dict to store the loaded data +state_dict = {} + +# Loop over each file and load its contents +for filepath in filepaths: + print(f'Loading: {filepath}') + if filepath.endswith('.bin'): + # Load .bin file using torch.load + loaded_dict = torch.load(filepath, map_location='cpu') + state_dict.update(loaded_dict) + elif filepath.endswith('.safetensors'): + # Load .safetensors file using safetensors + loaded_dict = 
safetensors_load_file(filepath, device='cpu') + state_dict.update(loaded_dict) + else: + raise ValueError(f'Unsupported file format: {filepath}') + +# Print the keys of the loaded state_dict +print(f'Loaded state_dict keys: {list(state_dict.keys())}') + +# Initialize a new state_dict to store the converted data +new_state_dict = {} + +# Iterate over each key-value pair in the original state_dict +for k, v in state_dict.items(): + # Replace key substrings according to specified mapping rules + k_new = k + k_new = k_new.replace('embeddings.class_embedding', 'cls_token') + k_new = k_new.replace('embeddings.position_embedding', 'pos_embed') + k_new = k_new.replace('embeddings.patch_embedding.weight', 'patch_embed.proj.weight') + k_new = k_new.replace('embeddings.patch_embedding.bias', 'patch_embed.proj.bias') + k_new = k_new.replace('ls1', 'ls1.gamma') + k_new = k_new.replace('ls2', 'ls2.gamma') + k_new = k_new.replace('encoder.layers.', 'blocks.') + # Update the new_state_dict with the transformed key and original value + new_state_dict[k_new] = v + +# Print the keys of the converted state_dict +print(f'Converted state_dict keys: {list(new_state_dict.keys())}') + +# Wrap the new_state_dict in a dictionary under the 'module' key +new_dict = {'module': new_state_dict} + +# Save the new_dict to the specified output file +print(f'Saving converted state_dict to: {args.output_file}') +torch.save(new_dict, args.output_file) diff --git a/classification/main_accelerate.py b/classification/main_accelerate.py deleted file mode 100644 index 975be24b..00000000 --- a/classification/main_accelerate.py +++ /dev/null @@ -1,377 +0,0 @@ -# -------------------------------------------------------- -# InternVL -# Copyright (c) 2022 OpenGVLab -# Licensed under The MIT License [see LICENSE for details] -# -------------------------------------------------------- - -import argparse -import datetime -import logging -import os -import random -import time -import warnings - -import numpy as np -import torch -import torch.backends.cudnn as cudnn -from accelerate import Accelerator, GradScalerKwargs -from accelerate.logging import get_logger -from config import get_config -from dataset import build_loader2 -from ddp_hooks import fp16_compress_hook -from lr_scheduler import build_scheduler -from models import build_model -from optimizer import build_optimizer -from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy -from timm.utils import AverageMeter, ModelEma, accuracy -from tqdm import tqdm -from utils import load_ema_checkpoint, load_pretrained - -logger = get_logger(__name__) -warnings.filterwarnings('ignore') - - -def parse_option(): - parser = argparse.ArgumentParser( - 'InternVL training and evaluation script', add_help=False) - parser.add_argument('--cfg', type=str, required=True, metavar='FILE', help='path to config file') - parser.add_argument('--opts', help="Modify config options by adding 'KEY VALUE' pairs. 
", default=None, nargs='+') - - # easy config modification - parser.add_argument('--batch-size', type=int, help='batch size for single GPU') - parser.add_argument('--dataset', type=str, help='dataset name', default=None) - parser.add_argument('--data-path', type=str, help='path to dataset') - parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset') - parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'], - help='no: no cache, ' - 'full: cache all data, ' - 'part: sharding the dataset into nonoverlapping pieces and only cache one piece' - ) - parser.add_argument('--pretrained', help='pretrained weight from checkpoint, could be imagenet22k pretrained weight') - parser.add_argument('--resume', help='resume from checkpoint') - parser.add_argument('--output', default='work_dirs', type=str, metavar='PATH', - help='root of output folder, the full path is // (default: output)' - ) - parser.add_argument('--eval', action='store_true', help='Perform evaluation only') - parser.add_argument('--throughput', action='store_true', help='Test throughput only') - parser.add_argument('--save-ckpt-num', default=1, type=int) - parser.add_argument('--accumulation-steps', type=int, default=1, help='gradient accumulation steps') - parser.add_argument('--disable-grad-scalar', action='store_true', help='disable Grad Scalar') - parser.add_argument( - '--logger', - type=str, - default='tensorboard', - choices=['tensorboard', 'wandb'], - help=( - 'Whether to use [tensorboard](https://www.tensorflow.org/tensorboard) or [wandb](https://www.wandb.ai)' - ' for experiment tracking and logging of model metrics and model checkpoints' - ), - ) - - args, unparsed = parser.parse_known_args() - config = get_config(args) - config.defrost() - config.TRAIN.OPTIMIZER.USE_ZERO = False - config.OUTPUT += '_deepspeed' - config.DATA.IMG_ON_MEMORY = False - config.freeze() - return args, config - - -def seed_everything(seed, rank): - seed = seed + rank - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - np.random.seed(seed) - random.seed(seed) - cudnn.benchmark = True - - -def save_config(config): - path = os.path.join(config.OUTPUT, 'config.json') - with open(path, 'w') as f: - f.write(config.dump()) - logger.info(f'Full config saved to {path}') - - -def build_criterion(config): - if config.AUG.MIXUP > 0.: - # smoothing is handled with mixup label transform - criterion = SoftTargetCrossEntropy() - elif config.MODEL.LABEL_SMOOTHING > 0.: - criterion = LabelSmoothingCrossEntropy( - smoothing=config.MODEL.LABEL_SMOOTHING) - else: - criterion = torch.nn.CrossEntropyLoss() - return criterion - - -def scale_learning_rate(config, num_processes): - # linear scale the learning rate according to total batch size, may not be optimal - linear_scaled_lr = config.TRAIN.BASE_LR * \ - config.DATA.BATCH_SIZE * num_processes / 512.0 - linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * \ - config.DATA.BATCH_SIZE * num_processes / 512.0 - linear_scaled_min_lr = config.TRAIN.MIN_LR * \ - config.DATA.BATCH_SIZE * num_processes / 512.0 - # gradient accumulation also need to scale the learning rate - if config.TRAIN.ACCUMULATION_STEPS > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUMULATION_STEPS - linear_scaled_warmup_lr = linear_scaled_warmup_lr * config.TRAIN.ACCUMULATION_STEPS - linear_scaled_min_lr = linear_scaled_min_lr * config.TRAIN.ACCUMULATION_STEPS - config.defrost() - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_LR = 
linear_scaled_warmup_lr - config.TRAIN.MIN_LR = linear_scaled_min_lr - config.freeze() - - logger.info('BASE_LR={}'.format(config.TRAIN.BASE_LR)) - logger.info('WARMUP_LR={}'.format(config.TRAIN.WARMUP_LR)) - logger.info('MIN_LR={}'.format(config.TRAIN.MIN_LR)) - - -def setup_autoresume(config): - if config.MODEL.RESUME == '' and config.TRAIN.AUTO_RESUME: - last_checkpoint = os.path.join(config.OUTPUT, 'last') - resume_file = last_checkpoint if os.path.exists(last_checkpoint) else None - - if resume_file: - if config.MODEL.RESUME: - logger.warning(f'auto-resume changing resume file from {config.MODEL.RESUME} to {resume_file}') - config.defrost() - config.MODEL.RESUME = resume_file - config.freeze() - logger.info(f'auto resuming from {resume_file}') - else: - logger.info(f'no checkpoint found in {config.OUTPUT}, ignoring auto resume') - - -def load_model_checkpoint(config, model, accelerator): - if config.MODEL.RESUME: - try: - checkpoint = torch.load(config.MODEL.RESUME)['model'] - checkpoint = {k.replace('module.', ''): v for k, v in checkpoint.items()} - model.load_state_dict(checkpoint) - except: - accelerator.load_state(config.MODEL.RESUME) - elif config.MODEL.PRETRAINED: - try: - load_pretrained(config, model, logger) - except: - accelerator.load_state(config.MODEL.PRETRAINED) - return model - - -def save_checkpoint(save_dir, accelerator, epoch, max_acc, config, lr_scheduler=None): - # let accelerator handle the model and optimizer state for ddp and deepspeed. - accelerator.save_state(save_dir) - - if accelerator.is_main_process: - save_state = { - 'lr_scheduler': lr_scheduler.state_dict(), - 'max_acc': max_acc, - 'epoch': epoch, - 'config': config - } - torch.save(save_state, os.path.join(save_dir, 'additional_state.pth')) - - -def load_checkpoint_if_needed(accelerator, config, lr_scheduler=None): - setup_autoresume(config) - save_dir = config.MODEL.RESUME - if not save_dir: - return 0.0 - accelerator.load_state(save_dir) - checkpoint = torch.load(os.path.join(save_dir, 'additional_state.pth'), map_location='cpu') - if lr_scheduler is not None: - logger.info('resuming lr_scheduler') - lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) - config.defrost() - config.TRAIN.START_EPOCH = checkpoint['epoch'] + 1 - config.freeze() - max_acc = checkpoint.get('max_acc', 0.0) - logger.info(f"=> loaded successfully {config.MODEL.RESUME} (epoch {checkpoint['epoch']})") - return max_acc - - -def log_model_statistic(model_wo_ddp): - n_parameters = sum(p.numel() for p in model_wo_ddp.parameters() - if p.requires_grad) - logger.info(f'number of params: {n_parameters}') - if hasattr(model_wo_ddp, 'flops'): - flops = model_wo_ddp.flops() - logger.info(f'number of GFLOPs: {flops / 1e9}') - - -def train_epoch(*, model, optimizer, data_loader, scheduler, criterion, mixup_fn, - accelerator: Accelerator, epoch, config): - model.train() - - num_steps = len(data_loader) - batch_time = AverageMeter() - model_time = AverageMeter() - loss_meter = AverageMeter() - - end = time.time() - - gradient_accumulation_steps = config.TRAIN.ACCUMULATION_STEPS - - for step, (samples, targets) in enumerate(data_loader): - iter_begin_time = time.time() - - if mixup_fn is not None: - samples, targets = mixup_fn(samples, targets) - - with accelerator.accumulate(model): - outputs = model(samples) - loss = criterion(outputs, targets) - accelerator.backward(loss) - if accelerator.sync_gradients: - accelerator.clip_grad_norm_(model.parameters(), config.TRAIN.CLIP_GRAD) - optimizer.step() - optimizer.zero_grad() - - 
accelerator.wait_for_everyone() - - if (step + 1) % gradient_accumulation_steps == 0: - if scheduler is not None: - scheduler.step_update((epoch * num_steps + step) // gradient_accumulation_steps) - - batch_time.update(time.time() - end) - model_time.update(time.time() - iter_begin_time) - loss_meter.update(loss.item()) - end = time.time() - - if accelerator.is_main_process and step % config.PRINT_FREQ == 0: - lr = optimizer.param_groups[0]['lr'] - memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0) - etas = batch_time.avg * (num_steps - step) - - logger.info( - f'Train: [{epoch}/{config.TRAIN.EPOCHS}][{step}/{num_steps}]\t' - f'eta {datetime.timedelta(seconds=int(etas))} lr {lr:.10f}\t' - f'time {batch_time.val:.4f} ({batch_time.avg:.4f})\t' - f'model_time {model_time.val:.4f} ({model_time.avg:.4f})\t' - f'loss {loss_meter.val:.8f} ({loss_meter.avg:.4f})\t' - f'mem {memory_used:.0f}MB') - - -@torch.no_grad() -def eval_epoch(*, config, data_loader, model, accelerator: Accelerator): - model.eval() - - acc1_meter = AverageMeter() - acc5_meter = AverageMeter() - - for idx, (images, target) in enumerate(tqdm(data_loader, disable=accelerator.is_main_process)): - output = model(images) - - # convert 22k to 1k to evaluate - if output.size(-1) == 21841: - convert_file = './meta_data/map22kto1k.txt' - with open(convert_file, 'r') as f: - convert_list = [int(line) for line in f.readlines()] - output = output[:, convert_list] - - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - acc1 = accelerator.gather(acc1).mean(0) - acc5 = accelerator.gather(acc5).mean(0) - - acc1_meter.update(acc1.item(), target.size(0)) - acc5_meter.update(acc5.item(), target.size(0)) - - if (idx + 1) % config.PRINT_FREQ == 0 or idx + 1 == len(data_loader): - logger.info(f'Test: [{idx+1}/{len(data_loader)}]\t' - f'Acc@1 {acc1_meter.val:.3f} ({acc1_meter.avg:.3f})\t' - f'Acc@5 {acc5_meter.val:.3f} ({acc5_meter.avg:.3f})\t' - ) - return acc1_meter.avg - - -def eval(config, accelerator: Accelerator): - _, _, _, _, validate_dataloader, _, _ = build_loader2(config) - model = build_model(config) - model, validate_dataloader = accelerator.prepare(model, validate_dataloader) - model = load_model_checkpoint(config, model, accelerator) - log_model_statistic(accelerator.unwrap_model(model)) - eval_epoch(config=config, data_loader=validate_dataloader, model=model, accelerator=accelerator) - - -def train(config, accelerator: Accelerator): - _, _, _, training_dataloader, validate_dataloader, _, mixup_fn = build_loader2(config) - model = build_model(config) - optimizer = build_optimizer(config, model) - criterion = build_criterion(config) - - model, optimizer, training_dataloader, validate_dataloader = accelerator.prepare( - model, optimizer, training_dataloader, validate_dataloader) - - effective_update_steps_per_epoch = len(training_dataloader) // config.TRAIN.ACCUMULATION_STEPS - lr_scheduler = build_scheduler(config, optimizer, effective_update_steps_per_epoch) - - try: - model.register_comm_hook(state=None, hook=fp16_compress_hook) - logger.info('using fp16_compress_hook!') - except: - logger.info('cannot register fp16_compress_hook!') - - max_acc = load_checkpoint_if_needed(accelerator, config, lr_scheduler) - - logger.info(f'Created model:{config.MODEL.TYPE}/{config.MODEL.NAME}') - logger.info(str(model)) - logger.info('Effective Optimizer Steps: {}'.format(effective_update_steps_per_epoch)) - logger.info('Start training') - logger.info('Max accuracy: {}'.format(max_acc)) - 
log_model_statistic(accelerator.unwrap_model(model)) - - for epoch in range(config.TRAIN.START_EPOCH, config.TRAIN.EPOCHS): - train_epoch(model=model, optimizer=optimizer, data_loader=training_dataloader, - scheduler=lr_scheduler, criterion=criterion, mixup_fn=mixup_fn, - accelerator=accelerator, epoch=epoch, config=config) - acc = eval_epoch(config=config, data_loader=validate_dataloader, model=model, - accelerator=accelerator) - - accelerator.wait_for_everyone() - if acc > max_acc: - max_acc = acc - save_checkpoint(os.path.join(config.OUTPUT, 'best'), accelerator, epoch, max_acc, config, lr_scheduler) - logger.info(f'Max Acc@1 {max_acc:.3f}') - save_checkpoint(os.path.join(config.OUTPUT, 'last'), accelerator, epoch, max_acc, config, lr_scheduler) - - -def main(): - args, config = parse_option() - os.makedirs(config.OUTPUT, exist_ok=True) - logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - filename=os.path.join(config.OUTPUT, 'run.log'), - level=logging.INFO, - ) - - loggers = ['tensorboard'] - accelerator = Accelerator( - log_with=loggers, - project_dir=config.OUTPUT, - gradient_accumulation_steps=config.TRAIN.ACCUMULATION_STEPS, - # When use deepspeed, you could not comment this out - # even if you set loss scale to 1.0 in deepspeed config. - kwargs_handlers=[GradScalerKwargs(enabled=not args.disable_grad_scalar)], - ) - logger.info(accelerator.state, main_process_only=False) - - scale_learning_rate(config, accelerator.num_processes) - seed_everything(config.SEED, accelerator.process_index) - save_config(config) - - logger.info(config.dump()) - - if config.EVAL_MODE: - eval(config, accelerator) - else: - train(config, accelerator) - - -if __name__ == '__main__': - main() diff --git a/classification/main_deepspeed.py b/classification/main_deepspeed.py deleted file mode 100644 index 67068e24..00000000 --- a/classification/main_deepspeed.py +++ /dev/null @@ -1,544 +0,0 @@ -# -------------------------------------------------------- -# InternVL -# Copyright (c) 2022 OpenGVLab -# Licensed under The MIT License [see LICENSE for details] -# -------------------------------------------------------- - -import argparse -import datetime -import os -import random -import subprocess -import time - -import deepspeed -import numpy as np -import torch -import torch.backends.cudnn as cudnn -import torch.distributed as dist -from config import get_config -from dataset import build_loader -from ddp_hooks import fp16_compress_hook -from ema_deepspeed import EMADeepspeed -from logger import create_logger -from lr_scheduler import build_scheduler -from models import build_model -from optimizer import set_weight_decay_and_lr -from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy -from timm.utils import AverageMeter, accuracy -from utils import MyAverageMeter, load_pretrained, reduce_tensor - - -def parse_option(): - parser = argparse.ArgumentParser( - 'InternVL training and evaluation script', add_help=False) - parser.add_argument('--cfg', type=str, required=True, metavar='FILE', help='path to config file') - parser.add_argument('--opts', help="Modify config options by adding 'KEY VALUE' pairs. 
", default=None, nargs='+') - - # easy config modification - parser.add_argument('--batch-size', type=int, help='batch size for single GPU') - parser.add_argument('--dataset', type=str, help='dataset name', default=None) - parser.add_argument('--data-path', type=str, help='path to dataset') - parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset') - parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'], - help='no: no cache, ' - 'full: cache all data, ' - 'part: sharding the dataset into nonoverlapping pieces and only cache one piece' - ) - parser.add_argument('--pretrained', - help='pretrained weight from checkpoint, could be imagenet22k pretrained weight') - parser.add_argument('--resume', help='resume from checkpoint') - parser.add_argument('--output', default='work_dirs', type=str, metavar='PATH', - help='root of output folder, the full path is // (default: output)' - ) - - parser.add_argument('--eval', action='store_true', help='Perform evaluation only') - parser.add_argument('--throughput', action='store_true', help='Test throughput only') - parser.add_argument('--save-ckpt-num', default=1, type=int) - parser.add_argument('--accumulation-steps', type=int, default=1, help='gradient accumulation steps') - - # distributed training - parser.add_argument('--local-rank', type=int, required=True, help='local rank for DistributedDataParallel') - - # deepspeed config - parser.add_argument('--disable-grad-scalar', action='store_true', help='disable Grad Scalar') - parser.add_argument('--offload-optimizer', type=str, default='none', choices=['cpu', 'none'], - help='enable optimizer offloading') - parser.add_argument('--offload-param', type=str, default='none', choices=['cpu', 'none'], - help='enable model offloading') - # To use Zero3, Please use main_accelerate.py instead. 
- # For this script, we are facing a similar issue as https://github.com/microsoft/DeepSpeed/issues/3068 - parser.add_argument('--zero-stage', type=int, default=1, choices=[1, 2], help='deep speed zero stage') - - args, unparsed = parser.parse_known_args() - config = get_config(args) - - return args, config - - -def seed_everything(seed, rank): - seed = seed + rank - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - np.random.seed(seed) - random.seed(seed) - cudnn.benchmark = True - - -def save_config(config): - path = os.path.join(config.OUTPUT, 'config.json') - with open(path, 'w') as f: - f.write(config.dump()) - logger.info(f'Full config saved to {path}') - - -def build_criterion(config): - if config.AUG.MIXUP > 0.: - # smoothing is handled with mixup label transform - criterion = SoftTargetCrossEntropy() - elif config.MODEL.LABEL_SMOOTHING > 0.: - criterion = LabelSmoothingCrossEntropy( - smoothing=config.MODEL.LABEL_SMOOTHING) - else: - criterion = torch.nn.CrossEntropyLoss() - return criterion - - -def scale_learning_rate(config, num_processes): - # linear scale the learning rate according to total batch size, may not be optimal - linear_scaled_lr = config.TRAIN.BASE_LR * config.DATA.BATCH_SIZE * num_processes / 512.0 - linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * config.DATA.BATCH_SIZE * num_processes / 512.0 - linear_scaled_min_lr = config.TRAIN.MIN_LR * config.DATA.BATCH_SIZE * num_processes / 512.0 - # gradient accumulation also need to scale the learning rate - if config.TRAIN.ACCUMULATION_STEPS > 1: - linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUMULATION_STEPS - linear_scaled_warmup_lr = linear_scaled_warmup_lr * config.TRAIN.ACCUMULATION_STEPS - linear_scaled_min_lr = linear_scaled_min_lr * config.TRAIN.ACCUMULATION_STEPS - config.defrost() - config.TRAIN.BASE_LR = linear_scaled_lr - config.TRAIN.WARMUP_LR = linear_scaled_warmup_lr - config.TRAIN.MIN_LR = linear_scaled_min_lr - config.freeze() - - logger.info('BASE_LR={}'.format(config.TRAIN.BASE_LR)) - logger.info('WARMUP_LR={}'.format(config.TRAIN.WARMUP_LR)) - logger.info('MIN_LR={}'.format(config.TRAIN.MIN_LR)) - - -def log_model_statistic(model_wo_ddp): - n_parameters = sum(p.numel() for p in model_wo_ddp.parameters() - if p.requires_grad) - logger.info(f'number of params: {n_parameters / 1e6} M') - if hasattr(model_wo_ddp, 'flops'): - flops = model_wo_ddp.flops() - logger.info(f'number of GFLOPs: {flops / 1e9}') - - -def get_parameter_groups(model, config): - skip = {} - skip_keywords = {} - if hasattr(model, 'no_weight_decay'): - skip = model.no_weight_decay() - if hasattr(model, 'no_weight_decay_keywords'): - skip_keywords = model.no_weight_decay_keywords() - - parameters = set_weight_decay_and_lr( - model, - config.TRAIN.WEIGHT_DECAY, - config.TRAIN.BASE_LR, - skip, - skip_keywords, - lr_layer_decay=config.TRAIN.LR_LAYER_DECAY, - lr_layer_decay_ratio=config.TRAIN.LR_LAYER_DECAY_RATIO, - freeze_backbone=config.TRAIN.OPTIMIZER.FREEZE_BACKBONE, - dcn_lr_mul=config.TRAIN.OPTIMIZER.DCN_LR_MUL, - ) - return parameters - - -def get_optimizer_state_str(optimizer): - states = [] - for param_group in optimizer.param_groups: - states.append(f'name={param_group["name"]} lr={param_group["lr"]} weight_decay={param_group["weight_decay"]}') - return '\n'.join(states) - - -def build_ds_config(config, args): - opt_lower = config.TRAIN.OPTIMIZER.NAME.lower() - if opt_lower == 'adamw': - optimizer = { - 'type': 'AdamW', - 'params': { - 'lr': config.TRAIN.BASE_LR, - 'eps': config.TRAIN.OPTIMIZER.EPS, - 'betas': 
config.TRAIN.OPTIMIZER.BETAS, - 'weight_decay': config.TRAIN.WEIGHT_DECAY - } - } - else: - return NotImplemented - - ds_config = { - 'train_micro_batch_size_per_gpu': config.DATA.BATCH_SIZE, - 'optimizer': optimizer, - 'bf16': { - 'enabled': True, - }, - 'zero_optimization': { - 'stage': 1, - 'allgather_partitions': True, - 'allgather_bucket_size': 1e9, - 'overlap_comm': True, - 'reduce_scatter': True, - 'reduce_bucket_size': 1e9, - 'contiguous_gradients': True - }, - 'steps_per_print': 1e10, - 'gradient_accumulation_steps': config.TRAIN.ACCUMULATION_STEPS, - 'gradient_clipping': config.TRAIN.CLIP_GRAD, - } - return ds_config - - -@torch.no_grad() -def throughput(data_loader, model, logger): - model.eval() - - for idx, (images, _) in enumerate(data_loader): - images = images.cuda(non_blocking=True) - batch_size = images.shape[0] - for i in range(50): - model(images) - torch.cuda.synchronize() - logger.info(f'throughput averaged with 30 times') - tic1 = time.time() - for i in range(30): - model(images) - torch.cuda.synchronize() - tic2 = time.time() - logger.info( - f'batch_size {batch_size} throughput {30 * batch_size / (tic2 - tic1)}' - ) - return - - -def train_epoch(config, model, criterion, data_loader, optimizer, epoch, mixup_fn, lr_scheduler, model_ema=None): - model.train() - - num_steps = len(data_loader) - batch_time = AverageMeter() - model_time = AverageMeter() - loss_meter = AverageMeter() - norm_meter = MyAverageMeter(300) - - start = time.time() - end = time.time() - - for idx, (samples, targets) in enumerate(data_loader): - iter_begin_time = time.time() - samples = samples.cuda(non_blocking=True) - targets = targets.cuda(non_blocking=True) - - if mixup_fn is not None: - samples, targets = mixup_fn(samples, targets) - - outputs = model(samples) - loss = criterion(outputs, targets) - - model.backward(loss) - model.step() - - if model_ema is not None: - model_ema(model) - - if (idx + 1) % config.TRAIN.ACCUMULATION_STEPS == 0: - lr_scheduler.step_update(epoch * num_steps + idx) - - torch.cuda.synchronize() - loss_meter.update(loss.item(), targets.size(0)) - norm_meter.update(optimizer._global_grad_norm) - batch_time.update(time.time() - end) - model_time.update(time.time() - iter_begin_time) - end = time.time() - - if idx % config.PRINT_FREQ == 0: - lr = optimizer.param_groups[0]['lr'] - memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0) - etas = batch_time.avg * (num_steps - idx) - logger.info( - f'Train: [{epoch}/{config.TRAIN.EPOCHS}][{idx}/{num_steps}]\t' - f'eta {datetime.timedelta(seconds=int(etas))} lr {lr:.6f}\t' - f'time {batch_time.val:.4f} ({batch_time.avg:.4f})\t' - f'model_time {model_time.val:.4f} ({model_time.avg:.4f})\t' - f'loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t' - f'grad_norm {norm_meter.val:.4f} ({norm_meter.avg:.4f}/{norm_meter.var:.4f})\t' - f'mem {memory_used:.0f}MB') - - epoch_time = time.time() - start - logger.info(f'EPOCH {epoch} training takes {datetime.timedelta(seconds=int(epoch_time))}') - - -@torch.no_grad() -def eval_epoch(config, data_loader, model, epoch=None): - criterion = torch.nn.CrossEntropyLoss() - model.eval() - - batch_time = AverageMeter() - loss_meter = AverageMeter() - acc1_meter = AverageMeter() - acc5_meter = AverageMeter() - - end = time.time() - for idx, (images, target) in enumerate(data_loader): - images = images.cuda(non_blocking=True) - target = target.cuda(non_blocking=True) - output = model(images) - - # convert 22k to 1k to evaluate - if output.size(-1) == 21841: - convert_file = 
'./meta_data/map22kto1k.txt' - with open(convert_file, 'r') as f: - convert_list = [int(line) for line in f.readlines()] - output = output[:, convert_list] - - # measure accuracy and record loss - loss = criterion(output, target) - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - - acc1 = reduce_tensor(acc1) - acc5 = reduce_tensor(acc5) - loss = reduce_tensor(loss) - - loss_meter.update(loss.item(), target.size(0)) - acc1_meter.update(acc1.item(), target.size(0)) - acc5_meter.update(acc5.item(), target.size(0)) - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if idx % config.PRINT_FREQ == 0: - memory_used = torch.cuda.max_memory_allocated() / (1024.0 * 1024.0) - logger.info(f'Test: [{idx}/{len(data_loader)}]\t' - f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' - f'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f})\t' - f'Acc@1 {acc1_meter.val:.3f} ({acc1_meter.avg:.3f})\t' - f'Acc@5 {acc5_meter.val:.3f} ({acc5_meter.avg:.3f})\t' - f'Mem {memory_used:.0f}MB') - if epoch is not None: - logger.info(f'[Epoch:{epoch}] * Acc@1 {acc1_meter.avg:.3f} Acc@5 {acc5_meter.avg:.3f}') - else: - logger.info(f' * Acc@1 {acc1_meter.avg:.3f} Acc@5 {acc5_meter.avg:.3f}') - - return acc1_meter.avg, acc5_meter.avg, loss_meter.avg - - -def train(config, ds_config): - # -------------- build ---------------- # - - _, dataset_val, _, data_loader_train, data_loader_val, _, mixup_fn = build_loader(config) - model = build_model(config) - model.cuda() - - if config.MODEL.PRETRAINED: - load_pretrained(config, model, logger) - - logger.info(ds_config) - model, optimizer, _, _ = deepspeed.initialize( - config=ds_config, - model=model, - model_parameters=get_parameter_groups(model, config), - dist_init_required=False, - ) - - try: - model.register_comm_hook(state=None, hook=fp16_compress_hook) - logger.info('using fp16_compress_hook!') - except: - logger.info('cannot register fp16_compress_hook!') - - model_without_ddp = model.module - - lr_scheduler = build_scheduler(config, optimizer, len(data_loader_train)) - criterion = build_criterion(config) - - model_ema = None - if config.TRAIN.EMA.ENABLE: - model_ema = EMADeepspeed(model, config.TRAIN.EMA.DECAY) - - # -------------- resume ---------------- # - - max_accuracy = 0.0 - max_accuracy_ema = 0.0 - client_state = {} - if config.MODEL.RESUME == '' and config.TRAIN.AUTO_RESUME: - if os.path.exists(os.path.join(config.OUTPUT, 'latest')): - config.defrost() - config.MODEL.RESUME = config.OUTPUT - config.freeze() - tag = None - elif config.MODEL.RESUME: - config.MODEL.RESUME = os.path.dirname(config.MODEL.RESUME) - tag = os.path.basename(config.MODEL.RESUME) - if config.MODEL.RESUME: - logger.info('loading checkpoint from {}'.format(config.MODEL.RESUME)) - _, client_state = model.load_checkpoint(load_dir=config.MODEL.RESUME, tag=tag) - logger.info(f'client_state={client_state.keys()}') - lr_scheduler.load_state_dict(client_state['custom_lr_scheduler']) - max_accuracy = client_state['max_accuracy'] - - if model_ema is not None: - max_accuracy_ema = client_state.get('max_accuracy_ema', 0.0) - model_ema.load_state_dict((client_state['model_ema'])) - - # -------------- training ---------------- # - - logger.info(f'Creating model:{config.MODEL.TYPE}/{config.MODEL.NAME}') - logger.info(str(model)) - logger.info(get_optimizer_state_str(optimizer)) - logger.info('Start training') - logger.info('max_accuracy: {}'.format(max_accuracy)) - log_model_statistic(model_without_ddp) - - start_time = time.time() - start_epoch = 
client_state['epoch'] + 1 if 'epoch' in client_state else config.TRAIN.START_EPOCH - for epoch in range(start_epoch, config.TRAIN.EPOCHS): - data_loader_train.sampler.set_epoch(epoch) - train_epoch(config, model, criterion, data_loader_train, optimizer, epoch, mixup_fn, lr_scheduler, - model_ema=model_ema) - - if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.EPOCHS - 1: - model.save_checkpoint( - save_dir=config.OUTPUT, - tag=f'epoch{epoch}', - client_state={ - 'custom_lr_scheduler': lr_scheduler.state_dict(), - 'max_accuracy': max_accuracy, - 'epoch': epoch, - 'config': config, - 'max_accuracy_ema': max_accuracy_ema if model_ema is not None else 0.0, - 'model_ema': model_ema.state_dict() if model_ema is not None else None, - } - ) - - if epoch % config.EVAL_FREQ == 0: - acc1, _, _ = eval_epoch(config, data_loader_val, model, epoch) - logger.info(f'Accuracy of the network on the {len(dataset_val)} test images: {acc1:.1f}%') - - if acc1 > max_accuracy: - model.save_checkpoint( - save_dir=config.OUTPUT, - tag='best', - client_state={ - 'custom_lr_scheduler': lr_scheduler.state_dict(), - 'max_accuracy': max_accuracy, - 'epoch': epoch, - 'config': config, - 'max_accuracy_ema': max_accuracy_ema if model_ema is not None else 0.0, - 'model_ema': model_ema.state_dict() if model_ema is not None else None, - } - ) - - max_accuracy = max(max_accuracy, acc1) - logger.info(f'Max accuracy: {max_accuracy:.2f}%') - - if model_ema is not None: - with model_ema.activate(model): - acc1_ema, _, _ = eval_epoch(config, data_loader_val, model, epoch) - logger.info(f'[EMA] Accuracy of the network on the {len(dataset_val)} test images: {acc1_ema:.1f}%') - max_accuracy_ema = max(max_accuracy_ema, acc1_ema) - logger.info(f'[EMA] Max accuracy: {max_accuracy_ema:.2f}%') - - total_time = time.time() - start_time - total_time_str = str(datetime.timedelta(seconds=int(total_time))) - logger.info('Training time {}'.format(total_time_str)) - - -def eval(config): - _, _, _, _, data_loader_val, _, _ = build_loader(config) - model = build_model(config) - model.cuda() - model = torch.nn.parallel.DistributedDataParallel( - model, device_ids=[config.LOCAL_RANK], broadcast_buffers=False) - - model_wo_ddp = model.module - if config.MODEL.RESUME: - try: - checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu') - msg = model_wo_ddp.load_state_dict(checkpoint['model'], strict=False) - logger.info(msg) - except: - try: - from deepspeed.utils.zero_to_fp32 import \ - get_fp32_state_dict_from_zero_checkpoint - ckpt_dir = os.path.dirname(config.MODEL.RESUME) - tag = os.path.basename(config.MODEL.RESUME) - state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir=ckpt_dir, tag=tag) - model_wo_ddp.load_state_dict(state_dict) - except: - checkpoint = torch.load(os.path.join(config.MODEL.RESUME, 'mp_rank_00_model_states.pt'), - map_location='cpu') - model_wo_ddp.load_state_dict(checkpoint['module']) - elif config.MODEL.PRETRAINED: - load_pretrained(config, model_wo_ddp, logger) - - if config.THROUGHPUT_MODE: - throughput(data_loader_val, model, logger) - - eval_epoch(config, data_loader_val, model) - - -if __name__ == '__main__': - args, config = parse_option() - - # init distributed env - if 'SLURM_PROCID' in os.environ: - print('\nDist init: SLURM') - rank = int(os.environ['SLURM_PROCID']) - gpu = rank % torch.cuda.device_count() - config.defrost() - config.LOCAL_RANK = gpu - config.freeze() - - world_size = int(os.environ['SLURM_NTASKS']) - if 'MASTER_PORT' not in os.environ: - os.environ['MASTER_PORT'] = 
'29501' - node_list = os.environ['SLURM_NODELIST'] - addr = subprocess.getoutput( - f'scontrol show hostname {node_list} | head -n1') - if 'MASTER_ADDR' not in os.environ: - os.environ['MASTER_ADDR'] = addr - - os.environ['RANK'] = str(rank) - os.environ['LOCAL_RANK'] = str(gpu) - os.environ['LOCAL_SIZE'] = str(torch.cuda.device_count()) - os.environ['WORLD_SIZE'] = str(world_size) - if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: - rank = int(os.environ['RANK']) - world_size = int(os.environ['WORLD_SIZE']) - print(f'RANK and WORLD_SIZE in environ: {rank}/{world_size}') - else: - rank = -1 - world_size = -1 - torch.cuda.set_device(config.LOCAL_RANK) - torch.distributed.init_process_group(backend='nccl', - init_method='env://', - world_size=world_size, - rank=rank) - torch.distributed.barrier() - - os.makedirs(config.OUTPUT, exist_ok=True) - logger = create_logger(output_dir=config.OUTPUT, - dist_rank=dist.get_rank(), - name=f'{config.MODEL.NAME}') - logger.info(config.dump()) - - if dist.get_rank() == 0: - save_config(config) - scale_learning_rate(config, dist.get_world_size()) - seed_everything(config.SEED, dist.get_rank()) - - if config.EVAL_MODE: - eval(config) - else: - train(config, build_ds_config(config, args)) diff --git a/classification/models/build.py b/classification/models/build.py index 708737ea..85c15d84 100644 --- a/classification/models/build.py +++ b/classification/models/build.py @@ -4,6 +4,7 @@ # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- +from .clip_vit import CLIPViT from .intern_vit_6b import InternViT6B @@ -28,7 +29,21 @@ def build_model(config): freeze_vit=config.MODEL.INTERN_VIT_6B.FREEZE_VIT, pretrained=config.MODEL.INTERN_VIT_6B.PRETRAINED, cls_target=config.MODEL.INTERN_VIT_6B.CLS_TARGET, - head_norm_type=config.MODEL.INTERN_VIT_6B.HEAD_NORM_TYPE, + norm_type=config.MODEL.INTERN_VIT_6B.NORM_TYPE, + ) + elif model_type == 'clip_vit': + model = CLIPViT( + patch_size=config.MODEL.CLIP_VIT.PATCH_SIZE, + img_size=config.DATA.IMG_SIZE, + pretrain_size=config.MODEL.CLIP_VIT.PRETRAIN_SIZE, + embed_dim=config.MODEL.CLIP_VIT.EMBED_DIM, + num_heads=config.MODEL.CLIP_VIT.NUM_HEADS, + mlp_ratio=config.MODEL.CLIP_VIT.MLP_RATIO, + depth=config.MODEL.CLIP_VIT.DEPTH, + with_cp=config.TRAIN.USE_CHECKPOINT, + freeze_vit=config.MODEL.CLIP_VIT.FREEZE_VIT, + pretrained=config.MODEL.CLIP_VIT.PRETRAINED, + cls_target=config.MODEL.CLIP_VIT.CLS_TARGET, ) else: raise NotImplementedError(f'Unkown model: {model_type}') diff --git a/classification/models/clip_vit.py b/classification/models/clip_vit.py new file mode 100644 index 00000000..33e86640 --- /dev/null +++ b/classification/models/clip_vit.py @@ -0,0 +1,195 @@ +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2024 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import DropPath +from transformers import CLIPModel + + +def _freeze_params(module): + for param in module.parameters(): + param.requires_grad = False + + +class CrossAttention(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., + proj_drop=0., attn_head_dim=None, out_dim=None): + super().__init__() + if out_dim is None: + out_dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + if 
attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim ** -0.5 + assert all_head_dim == dim + + self.q = nn.Linear(dim, all_head_dim, bias=False) + self.k = nn.Linear(dim, all_head_dim, bias=False) + self.v = nn.Linear(dim, all_head_dim, bias=False) + + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.k_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.k_bias = None + self.v_bias = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, out_dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, k=None, v=None): + B, N, C = x.shape + N_k = k.shape[1] + N_v = v.shape[1] + + q_bias, k_bias, v_bias = None, None, None + if self.q_bias is not None: + q_bias = self.q_bias + k_bias = self.k_bias + v_bias = self.v_bias + + q = F.linear(input=x, weight=self.q.weight, bias=q_bias) + q = q.reshape(B, N, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0) # (B, N_head, N_q, dim) + + k = F.linear(input=k, weight=self.k.weight, bias=k_bias) + k = k.reshape(B, N_k, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0) + + v = F.linear(input=v, weight=self.v.weight, bias=v_bias) + v = v.reshape(B, N_v, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) # (B, N_head, N_q, N_k) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class AttentiveBlock(nn.Module): + + def __init__(self, dim, num_heads, qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., norm_layer=nn.LayerNorm, attn_head_dim=None, out_dim=None): + super().__init__() + + self.norm1_q = norm_layer(dim) + self.norm1_k = norm_layer(dim) + self.norm1_v = norm_layer(dim) + self.cross_attn = CrossAttention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, + proj_drop=drop, attn_head_dim=attn_head_dim, out_dim=out_dim) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + def forward(self, x_q, x_kv, pos_q, pos_k, bool_masked_pos, rel_pos_bias=None): + x_q = self.norm1_q(x_q + pos_q) + x_k = self.norm1_k(x_kv + pos_k) + x_v = self.norm1_v(x_kv) + x = self.cross_attn(x_q, k=x_k, v=x_v) + + return x + + +class AttentionPoolingBlock(AttentiveBlock): + + def forward(self, x): + x_q = x.mean(1, keepdim=True) + x_kv, pos_q, pos_k = x, 0, 0 + x = super().forward(x_q, x_kv, pos_q, pos_k, bool_masked_pos=None, rel_pos_bias=None) + x = x.squeeze(1) + return x + + +class CLIPViT(nn.Module): + + def __init__(self, patch_size=14, img_size=336, pretrain_size=336, embed_dim=1024, num_heads=16, + mlp_ratio=4, depth=48, with_cp=True, freeze_vit=True, cls_target='cls_patch_concat', + num_classes=1000, pretrained=None): + super().__init__() + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.pretrain_size = pretrain_size + self.img_size = img_size + self.patch_size = patch_size + self.cls_target = cls_target + self.depth = depth + self.mlp_ratio = mlp_ratio + self.with_cp = with_cp + + model = CLIPModel.from_pretrained(pretrained) + model.post_layernorm = nn.Identity() + self.model = model.vision_model + + if freeze_vit: + _freeze_params(self) + + if cls_target == 'cls_patch_concat': + self.norm = nn.SyncBatchNorm(embed_dim * 2, eps=1e-6) + self.head = nn.Linear(embed_dim * 2, num_classes) if num_classes > 0 else nn.Identity() + elif cls_target == 'attention_pooling': + self.attn_pooling = AttentionPoolingBlock( + dim=embed_dim, num_heads=num_heads, qkv_bias=True, qk_scale=None, + drop=0., attn_drop=0.0, norm_layer=partial(nn.LayerNorm, eps=1e-5), out_dim=embed_dim) + self.norm = nn.SyncBatchNorm(embed_dim, eps=1e-6) + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + else: + raise NotImplementedError + + if type(self.head) != nn.Identity: + self.head.weight.data.normal_(mean=0.0, std=0.01) + self.head.bias.data.zero_() + + @property + def dtype(self): + return self.model.embeddings.patch_embedding.weight.dtype + + def forward_features(self, x): + x = x.type(self.dtype) + x = self.model(x) + x = x.last_hidden_state + return x + + def forward(self, x): + x = self.forward_features(x) + if self.cls_target == 'cls_patch_concat': + x = torch.cat((x[:, 0, :], x[:, 1:, :].mean(dim=1)), dim=-1) + elif self.cls_target == 'attention_pooling': + x = self.attn_pooling(x) + else: + raise NotImplementedError + x = self.norm(x) + x = self.head(x) + return x + + @torch.jit.ignore + def lr_decay_keywords(self, decay_ratio=0.95): + lr_ratios = {} + + # layers + for idx in range(self.depth): + tag = 'layers.{}.'.format(idx) + decay = 1.0 * (decay_ratio ** (self.depth - idx)) + lr_ratios[tag] = decay + + # patch_embedding + lr_ratios['patch_embedding'] = 1.0 * (decay_ratio ** (self.depth + 1)) + lr_ratios['position_embedding'] = 1.0 * (decay_ratio ** (self.depth + 1)) + lr_ratios['pre_layrnorm'] = 1.0 * (decay_ratio ** (self.depth + 1)) + + return lr_ratios diff --git a/classification/models/intern_vit_6b.py b/classification/models/intern_vit_6b.py index f3347629..336a8585 100644 --- a/classification/models/intern_vit_6b.py +++ b/classification/models/intern_vit_6b.py @@ -333,7 +333,7 @@ def __init__(self, in_chans=3, patch_size=14, img_size=224, pretrain_size=224, q embed_dim=3200, num_heads=25, mlp_ratio=4, init_values=0.1, qk_normalization=True, depth=48, use_flash_attn=True, with_cp=True, layerscale_force_fp32=False, freeze_vit=True, cls_target='cls_patch_concat', 
num_classes=1000, attn_pool_num_heads=16, clip_embed_dim=768, - head_norm_type='bn', pretrained=None): + norm_type='rms', pretrained=None): super().__init__() self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models @@ -349,7 +349,13 @@ def __init__(self, in_chans=3, patch_size=14, img_size=224, pretrain_size=224, q print('Warning: Flash Attention is not available, use_flash_attn is set to False.') use_flash_attn = [use_flash_attn] * depth if not isinstance(use_flash_attn, list) else use_flash_attn - norm_layer_for_blocks = partial(RMSNorm, eps=1e-6) + if norm_type == 'rms': + norm_layer_for_blocks = partial(RMSNorm, eps=1e-6) + elif norm_type == 'ln': + norm_layer_for_blocks = partial(nn.LayerNorm, eps=1e-6) + else: + raise NotImplementedError + self.norm_layer_for_blocks = norm_layer_for_blocks self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim) num_patches = self.patch_embed.num_patches @@ -381,16 +387,16 @@ def __init__(self, in_chans=3, patch_size=14, img_size=224, pretrain_size=224, q _freeze_params(self) if cls_target == 'cls_patch_concat': - if head_norm_type == 'bn': - self.norm = nn.SyncBatchNorm(embed_dim * 2, eps=1e-6) - else: - self.norm = nn.LayerNorm(embed_dim * 2, eps=1e-6) + self.norm = nn.SyncBatchNorm(embed_dim * 2, eps=1e-6) self.head = nn.Linear(embed_dim * 2, num_classes) if num_classes > 0 else nn.Identity() + elif cls_target == 'attention_pooling': + self.attn_pooling = AttentionPoolingBlock( + dim=embed_dim, num_heads=num_heads, qkv_bias=True, qk_scale=None, + drop=0., attn_drop=0.0, norm_layer=partial(nn.LayerNorm, eps=1e-5), out_dim=embed_dim) + self.norm = nn.SyncBatchNorm(embed_dim, eps=1e-6) + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() elif cls_target == 'clip_projector': - if head_norm_type == 'bn': - self.norm = nn.SyncBatchNorm(clip_embed_dim, eps=1e-6) - else: - self.norm = nn.LayerNorm(clip_embed_dim, eps=1e-6) + self.norm = nn.SyncBatchNorm(clip_embed_dim, eps=1e-6) self.head = nn.Linear(clip_embed_dim, num_classes) if num_classes > 0 else nn.Identity() else: raise NotImplementedError @@ -447,6 +453,8 @@ def forward(self, x): x = self.forward_features(x) if self.cls_target == 'cls_patch_concat': x = torch.cat((x[:, 0, :], x[:, 1:, :].mean(dim=1)), dim=-1) + elif self.cls_target == 'attention_pooling': + x = self.attn_pooling(x) elif self.cls_target == 'clip_projector': x = self.clip_projector(x) else: diff --git a/classification/train_in1k_deepspeed.sh b/classification/train_in1k_deepspeed.sh deleted file mode 100644 index 83c9f0b9..00000000 --- a/classification/train_in1k_deepspeed.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash - -set -x - -PARTITION=$1 -JOB_NAME=$2 -CONFIG=$3 -GPUS=${GPUS:-8} -GPUS_PER_NODE=${GPUS_PER_NODE:-8} -CPUS_PER_TASK=${CPUS_PER_TASK:-10} -SRUN_ARGS=${SRUN_ARGS:-""} - -PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ - srun -p ${PARTITION} \ - --job-name=${JOB_NAME} \ - --gres=gpu:${GPUS_PER_NODE} \ - --ntasks=${GPUS} \ - --ntasks-per-node=${GPUS_PER_NODE} \ - --cpus-per-task=${CPUS_PER_TASK} \ - --kill-on-bad-exit=1 \ - --quotatype=spot \ - ${SRUN_ARGS} \ - python -u main_deepspeed.py \ - --cfg ${CONFIG} \ - --local-rank 0 \ - --data-path /mnt/lustre/share/images \ - --output work_dirs_deepspeed ${@:4}
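
For reference, the cls_target='attention_pooling' branch introduced above (in both clip_vit.py and intern_vit_6b.py) pools the token sequence with a single cross-attention block whose query is the mean token, then applies SyncBatchNorm and a linear classifier. The snippet below is only a minimal standalone sketch of that idea, assuming torch.nn.MultiheadAttention in place of the patch's custom CrossAttention/AttentionPoolingBlock modules; the class name and the plain BatchNorm1d are stand-ins for illustration, not code from this repository.

# Illustrative sketch only -- not part of this patch. It mimics the idea of the
# attention_pooling head: mean token as query, all tokens as keys/values,
# followed by a norm layer and a linear classifier.
import torch
import torch.nn as nn


class SimpleAttentionPoolingHead(nn.Module):
    """Pool a (B, N, C) token sequence into class logits with one cross-attention step."""

    def __init__(self, embed_dim: int, num_heads: int, num_classes: int):
        super().__init__()
        # The patch uses separate LayerNorms for q, k and v; a shared kv norm is used here.
        self.norm_q = nn.LayerNorm(embed_dim, eps=1e-5)
        self.norm_kv = nn.LayerNorm(embed_dim, eps=1e-5)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.norm = nn.BatchNorm1d(embed_dim, eps=1e-6)  # SyncBatchNorm in the patch
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:
        query = tokens.mean(dim=1, keepdim=True)  # (B, 1, C) mean-token query
        kv = self.norm_kv(tokens)
        pooled, _ = self.attn(self.norm_q(query), kv, kv)  # attend over all tokens
        pooled = pooled.squeeze(1)  # (B, C)
        return self.head(self.norm(pooled))  # (B, num_classes)


if __name__ == '__main__':
    # e.g. 257 tokens (1 cls + 16x16 patches) from a 224px, patch-14 backbone
    feats = torch.randn(2, 257, 1024)
    head = SimpleAttentionPoolingHead(embed_dim=1024, num_heads=16, num_classes=1000)
    print(head(feats).shape)  # torch.Size([2, 1000])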