From d0bdd8f5b06cc3f41f0f6ea01c3285176781e97b Mon Sep 17 00:00:00 2001
From: Peisong Wang
Date: Fri, 13 Dec 2024 20:55:23 +0800
Subject: [PATCH] Low-precision quantization training (LPT, QAT)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 example/low_precision_training/README.md | 112 + .../low_precision_training/ppcls/.DS_Store | Bin 0 -> 6148 bytes .../low_precision_training/ppcls/__init__.py | 20 + .../ppcls/arch/__init__.py | 177 ++ .../ppcls/arch/backbone/__init__.py | 117 + .../ppcls/arch/backbone/base/__init__.py | 0 .../ppcls/arch/backbone/base/dbb/dbb_block.py | 365 +++ .../arch/backbone/base/dbb/dbb_transforms.py | 73 + .../ppcls/arch/backbone/base/theseus_layer.py | 398 ++++ .../backbone/legendary_models/__init__.py | 8 + .../legendary_models/custom_devices_layers.py | 37 + .../arch/backbone/legendary_models/esnet.py | 369 +++ .../arch/backbone/legendary_models/hrnet.py | 797 +++++++ .../backbone/legendary_models/inception_v3.py | 559 +++++ .../backbone/legendary_models/mobilenet_v1.py | 259 +++ .../backbone/legendary_models/mobilenet_v3.py | 591 +++++ .../backbone/legendary_models/mobilenet_v4.py | 836 +++++++ .../backbone/legendary_models/pp_hgnet.py | 376 +++ .../backbone/legendary_models/pp_hgnet_v2.py | 706 ++++++ .../backbone/legendary_models/pp_lcnet.py | 520 +++++ .../backbone/legendary_models/pp_lcnet_v2.py | 394 ++++ .../arch/backbone/legendary_models/resnet.py | 649 ++++++ .../legendary_models/swin_transformer.py | 1002 ++++++++ .../arch/backbone/legendary_models/vgg.py | 261 +++ .../ppcls/arch/backbone/model_zoo/__init__.py | 0 .../arch/backbone/model_zoo/adaface_ir_net.py | 529 +++++ .../ppcls/arch/backbone/model_zoo/alexnet.py | 170 ++ .../ppcls/arch/backbone/model_zoo/cae.py | 860 +++++++ .../ppcls/arch/backbone/model_zoo/convnext.py | 282 +++ .../ppcls/arch/backbone/model_zoo/cspnet.py | 377 +++ .../backbone/model_zoo/cswin_transformer.py | 651 ++++++ .../ppcls/arch/backbone/model_zoo/cvt.py | 723 ++++++ .../ppcls/arch/backbone/model_zoo/darknet.py | 199 ++ .../ppcls/arch/backbone/model_zoo/densenet.py | 346 +++ .../model_zoo/distilled_vision_transformer.py | 273 +++ .../ppcls/arch/backbone/model_zoo/dla.py | 529 +++++ .../ppcls/arch/backbone/model_zoo/dpn.py | 453 ++++ .../ppcls/arch/backbone/model_zoo/dsnet.py | 701 ++++++ .../arch/backbone/model_zoo/efficientnet.py | 1028 +++++++++ .../backbone/model_zoo/efficientnet_v2.py | 994 ++++++++ .../arch/backbone/model_zoo/fasternet.py | 399 ++++ .../arch/backbone/model_zoo/foundation_vit.py | 1261 +++++++++++ .../ppcls/arch/backbone/model_zoo/ghostnet.py | 364 +++ .../arch/backbone/model_zoo/googlenet.py | 365 +++ .../ppcls/arch/backbone/model_zoo/hardnet.py | 294 +++ .../arch/backbone/model_zoo/inception_v4.py | 479 ++++ .../ppcls/arch/backbone/model_zoo/levit.py | 590 +++++ .../ppcls/arch/backbone/model_zoo/micronet.py | 618 +++++ .../ppcls/arch/backbone/model_zoo/mixnet.py | 812 +++++++ .../arch/backbone/model_zoo/mobilenet_v2.py | 316 +++ .../arch/backbone/model_zoo/mobilenext.py | 262 +++ .../arch/backbone/model_zoo/mobilevit.py | 479 ++++ .../arch/backbone/model_zoo/mobilevit_v2.py | 593 +++++ .../arch/backbone/model_zoo/mobilevit_v3.py | 1445 ++++++++++++ .../ppcls/arch/backbone/model_zoo/nextvit.py | 643 ++++++ .../ppcls/arch/backbone/model_zoo/peleenet.py | 264 +++ .../ppcls/arch/backbone/model_zoo/pvt_v2.py | 493 ++++ .../ppcls/arch/backbone/model_zoo/rednet.py | 204 ++ 
.../ppcls/arch/backbone/model_zoo/regnet.py | 531 +++++ .../ppcls/arch/backbone/model_zoo/repvgg.py | 451 ++++ .../ppcls/arch/backbone/model_zoo/res2net.py | 266 +++ .../arch/backbone/model_zoo/res2net_vd.py | 308 +++ .../ppcls/arch/backbone/model_zoo/resnest.py | 780 +++++++ .../arch/backbone/model_zoo/resnet_vc.py | 311 +++ .../ppcls/arch/backbone/model_zoo/resnext.py | 303 +++ .../arch/backbone/model_zoo/resnext101_wsl.py | 506 +++++ .../arch/backbone/model_zoo/resnext_vd.py | 319 +++ .../ppcls/arch/backbone/model_zoo/rexnet.py | 283 +++ .../arch/backbone/model_zoo/se_resnet_vd.py | 392 ++++ .../arch/backbone/model_zoo/se_resnext.py | 369 +++ .../arch/backbone/model_zoo/se_resnext_vd.py | 311 +++ .../arch/backbone/model_zoo/shufflenet_v2.py | 364 +++ .../arch/backbone/model_zoo/squeezenet.py | 196 ++ .../ppcls/arch/backbone/model_zoo/starnet.py | 197 ++ .../ppcls/arch/backbone/model_zoo/svtrnet.py | 699 ++++++ .../backbone/model_zoo/swin_transformer_v2.py | 1061 +++++++++ .../ppcls/arch/backbone/model_zoo/tinynet.py | 196 ++ .../ppcls/arch/backbone/model_zoo/tnt.py | 410 ++++ .../ppcls/arch/backbone/model_zoo/twins.py | 692 ++++++ .../arch/backbone/model_zoo/uniformer.py | 552 +++++ .../ppcls/arch/backbone/model_zoo/van.py | 362 +++ .../backbone/model_zoo/vision_transformer.py | 459 ++++ .../arch/backbone/model_zoo/wideresnet.py | 236 ++ .../ppcls/arch/backbone/model_zoo/xception.py | 393 ++++ .../backbone/model_zoo/xception_deeplab.py | 423 ++++ .../arch/backbone/variant_models/__init__.py | 5 + .../variant_models/efficientnet_variant.py | 44 + .../variant_models/foundation_vit_variant.py | 52 + .../variant_models/pp_lcnet_variant.py | 29 + .../variant_models/pp_lcnetv2_variant.py | 56 + .../backbone/variant_models/resnet_variant.py | 203 ++ .../swin_transformer_variant.py | 355 +++ .../backbone/variant_models/vgg_variant.py | 28 + .../ppcls/arch/distill/afd_attention.py | 123 + .../ppcls/arch/gears/__init__.py | 74 + .../ppcls/arch/gears/adamargin.py | 111 + .../ppcls/arch/gears/arcmargin.py | 74 + .../ppcls/arch/gears/bnneck.py | 56 + .../ppcls/arch/gears/circlemargin.py | 61 + .../ppcls/arch/gears/cosmargin.py | 57 + .../ppcls/arch/gears/fc.py | 48 + .../ppcls/arch/gears/frfn_neck.py | 32 + .../ppcls/arch/gears/identity_head.py | 9 + .../ppcls/arch/gears/metabnneck.py | 122 + .../ppcls/arch/gears/ml_decoder.py | 124 + .../ppcls/arch/gears/vehicle_neck.py | 52 + .../ppcls/arch/slim/__init__.py | 16 + .../ppcls/arch/slim/prune.py | 64 + .../ppcls/arch/slim/quant.py | 63 + .../ppcls/arch/utils.py | 99 + .../ppcls/configs/.DS_Store | Bin 0 -> 6148 bytes .../PPLCNet_x1_0_pedestrian_attribute.yaml | 148 ++ .../Attr/PPLCNet_x1_0_vehicle_attribute.yaml | 149 ++ .../configs/Attr/StrongBaselineAttr.yaml | 113 + .../CAE/cae_base_patch16_224_finetune.yaml | 169 ++ .../CAE/cae_large_patch16_224_finetune.yaml | 169 ++ .../CLIP_vit_base_patch16_224_finetune.yaml | 162 ++ .../CLIP_vit_large_patch14_224_finetune.yaml | 162 ++ .../Cartoonface/ResNet50_icartoon.yaml | 149 ++ .../ppcls/configs/DeepHash/DCH.yaml | 141 ++ .../ppcls/configs/DeepHash/DSHSD.yaml | 142 ++ .../ppcls/configs/DeepHash/LCDSH.yaml | 138 ++ .../Gallery2FC_PPLCNet_x2_5.yaml | 51 + .../GeneralRecognition_PPLCNet_x2_5.yaml | 148 ++ ...eneralRecognition_PPLCNet_x2_5_binary.yaml | 145 ++ .../GeneralRecognition_PPLCNet_x2_5_dml.yaml | 188 ++ .../GeneralRecognition_PPLCNet_x2_5_udml.yaml | 193 ++ .../GeneralRecognitionV2_CLIP_vit_base.yaml | 169 ++ .../GeneralRecognitionV2_CLIP_vit_large.yaml | 169 ++ 
.../GeneralRecognitionV2_PPLCNetV2_base.yaml | 209 ++ .../configs/ImageNet/AlexNet/AlexNet.yaml | 141 ++ .../configs/ImageNet/CSPNet/CSPDarkNet53.yaml | 143 ++ .../CSWinTransformer_base_224.yaml | 174 ++ .../CSWinTransformer_base_384.yaml | 173 ++ .../CSWinTransformer_large_224.yaml | 173 ++ .../CSWinTransformer_large_384.yaml | 173 ++ .../CSWinTransformer_small_224.yaml | 173 ++ .../CSWinTransformer_tiny_224.yaml | 173 ++ .../ImageNet/ConvNeXt/ConvNeXt_base_224.yaml | 182 ++ .../ImageNet/ConvNeXt/ConvNeXt_base_384.yaml | 182 ++ .../ImageNet/ConvNeXt/ConvNeXt_large_224.yaml | 182 ++ .../ImageNet/ConvNeXt/ConvNeXt_large_384.yaml | 182 ++ .../ImageNet/ConvNeXt/ConvNeXt_small.yaml | 182 ++ .../ImageNet/ConvNeXt/ConvNeXt_tiny.yaml | 182 ++ .../configs/ImageNet/CvT/CvT_13_224.yaml | 174 ++ .../configs/ImageNet/CvT/CvT_13_384.yaml | 170 ++ .../configs/ImageNet/CvT/CvT_21_224.yaml | 154 ++ .../configs/ImageNet/CvT/CvT_21_384.yaml | 170 ++ .../configs/ImageNet/CvT/CvT_W24_384.yaml | 170 ++ .../ppcls/configs/ImageNet/DLA/DLA102.yaml | 142 ++ .../ppcls/configs/ImageNet/DLA/DLA102x.yaml | 142 ++ .../ppcls/configs/ImageNet/DLA/DLA102x2.yaml | 142 ++ .../ppcls/configs/ImageNet/DLA/DLA169.yaml | 142 ++ .../ppcls/configs/ImageNet/DLA/DLA34.yaml | 142 ++ .../ppcls/configs/ImageNet/DLA/DLA46_c.yaml | 142 ++ .../ppcls/configs/ImageNet/DLA/DLA46x_c.yaml | 142 ++ .../ppcls/configs/ImageNet/DLA/DLA60.yaml | 142 ++ .../ppcls/configs/ImageNet/DLA/DLA60x.yaml | 142 ++ .../ppcls/configs/ImageNet/DLA/DLA60x_c.yaml | 142 ++ .../ppcls/configs/ImageNet/DPN/DPN107.yaml | 142 ++ .../ppcls/configs/ImageNet/DPN/DPN131.yaml | 142 ++ .../ppcls/configs/ImageNet/DPN/DPN68.yaml | 142 ++ .../ppcls/configs/ImageNet/DPN/DPN92.yaml | 142 ++ .../ppcls/configs/ImageNet/DPN/DPN98.yaml | 142 ++ .../configs/ImageNet/DSNet/DSNet_base.yaml | 169 ++ .../configs/ImageNet/DSNet/DSNet_small.yaml | 170 ++ .../configs/ImageNet/DSNet/DSNet_tiny.yaml | 169 ++ .../configs/ImageNet/DarkNet/DarkNet53.yaml | 142 ++ .../DataAugment/ResNet50_AutoAugment.yaml | 141 ++ .../DataAugment/ResNet50_Baseline.yaml | 140 ++ .../ImageNet/DataAugment/ResNet50_Cutmix.yaml | 140 ++ .../ImageNet/DataAugment/ResNet50_Cutout.yaml | 143 ++ .../DataAugment/ResNet50_GridMask.yaml | 146 ++ .../DataAugment/ResNet50_HideAndSeek.yaml | 141 ++ .../ImageNet/DataAugment/ResNet50_Mixup.yaml | 140 ++ .../DataAugment/ResNet50_RandAugment.yaml | 143 ++ .../DataAugment/ResNet50_RandomErasing.yaml | 146 ++ .../DeiT/DeiT_base_distilled_patch16_224.yaml | 169 ++ .../DeiT/DeiT_base_distilled_patch16_384.yaml | 169 ++ .../ImageNet/DeiT/DeiT_base_patch16_224.yaml | 169 ++ .../ImageNet/DeiT/DeiT_base_patch16_384.yaml | 169 ++ .../DeiT_small_distilled_patch16_224.yaml | 168 ++ .../ImageNet/DeiT/DeiT_small_patch16_224.yaml | 169 ++ .../DeiT/DeiT_tiny_distilled_patch16_224.yaml | 169 ++ .../ImageNet/DeiT/DeiT_tiny_patch16_224.yaml | 169 ++ .../ImageNet/DenseNet/DenseNet121.yaml | 142 ++ .../ImageNet/DenseNet/DenseNet161.yaml | 142 ++ .../ImageNet/DenseNet/DenseNet169.yaml | 142 ++ .../ImageNet/DenseNet/DenseNet201.yaml | 142 ++ .../ImageNet/DenseNet/DenseNet264.yaml | 142 ++ .../Distillation/PPLCNet_x1_0_ssld.yaml | 161 ++ .../Distillation/PPLCNet_x2_5_dml.yaml | 161 ++ .../Distillation/PPLCNet_x2_5_ssld.yaml | 160 ++ .../Distillation/PPLCNet_x2_5_udml.yaml | 171 ++ ...mv3_large_x1_0_distill_mv3_small_x1_0.yaml | 167 ++ .../res2net200_vd_distill_pphgnet_base.yaml | 171 ++ .../resnet34_distill_resnet18_afd.yaml | 211 ++ .../resnet34_distill_resnet18_dist.yaml | 164 ++ 
.../resnet34_distill_resnet18_dkd.yaml | 166 ++ .../resnet34_distill_resnet18_mgd.yaml | 171 ++ .../resnet34_distill_resnet18_pefd.yaml | 171 ++ .../resnet34_distill_resnet18_skd.yaml | 163 ++ .../resnet34_distill_resnet18_wsl.yaml | 164 ++ .../configs/ImageNet/ESNet/ESNet_x0_25.yaml | 141 ++ .../configs/ImageNet/ESNet/ESNet_x0_5.yaml | 141 ++ .../configs/ImageNet/ESNet/ESNet_x0_75.yaml | 141 ++ .../configs/ImageNet/ESNet/ESNet_x1_0.yaml | 141 ++ .../ImageNet/EfficientNet/EfficientNetB0.yaml | 145 ++ .../ImageNet/EfficientNet/EfficientNetB1.yaml | 145 ++ .../ImageNet/EfficientNet/EfficientNetB2.yaml | 145 ++ .../ImageNet/EfficientNet/EfficientNetB3.yaml | 145 ++ .../ImageNet/EfficientNet/EfficientNetB4.yaml | 145 ++ .../ImageNet/EfficientNet/EfficientNetB5.yaml | 145 ++ .../ImageNet/EfficientNet/EfficientNetB6.yaml | 145 ++ .../ImageNet/EfficientNet/EfficientNetB7.yaml | 145 ++ .../EfficientNetV2/EfficientNetV2_S.yaml | 147 ++ .../ImageNet/FasterNet/FasterNet_L.yaml | 163 ++ .../ImageNet/FasterNet/FasterNet_M.yaml | 163 ++ .../ImageNet/FasterNet/FasterNet_S.yaml | 163 ++ .../ImageNet/FasterNet/FasterNet_T0.yaml | 163 ++ .../ImageNet/FasterNet/FasterNet_T1.yaml | 163 ++ .../ImageNet/FasterNet/FasterNet_T2.yaml | 162 ++ .../ImageNet/GhostNet/GhostNet_x0_5.yaml | 142 ++ .../ImageNet/GhostNet/GhostNet_x1_0.yaml | 142 ++ .../ImageNet/GhostNet/GhostNet_x1_3.yaml | 142 ++ .../configs/ImageNet/HRNet/HRNet_W18_C.yaml | 142 ++ .../configs/ImageNet/HRNet/HRNet_W30_C.yaml | 142 ++ .../configs/ImageNet/HRNet/HRNet_W32_C.yaml | 142 ++ .../configs/ImageNet/HRNet/HRNet_W40_C.yaml | 142 ++ .../configs/ImageNet/HRNet/HRNet_W44_C.yaml | 142 ++ .../configs/ImageNet/HRNet/HRNet_W48_C.yaml | 142 ++ .../configs/ImageNet/HRNet/HRNet_W64_C.yaml | 142 ++ .../ImageNet/HarDNet/HarDNet39_ds.yaml | 142 ++ .../configs/ImageNet/HarDNet/HarDNet68.yaml | 142 ++ .../ImageNet/HarDNet/HarDNet68_ds.yaml | 142 ++ .../configs/ImageNet/HarDNet/HarDNet85.yaml | 142 ++ .../configs/ImageNet/Inception/GoogLeNet.yaml | 141 ++ .../ImageNet/Inception/InceptionV3.yaml | 142 ++ .../ImageNet/Inception/InceptionV4.yaml | 142 ++ .../configs/ImageNet/LeViT/LeViT_128.yaml | 142 ++ .../configs/ImageNet/LeViT/LeViT_128S.yaml | 142 ++ .../configs/ImageNet/LeViT/LeViT_192.yaml | 142 ++ .../configs/ImageNet/LeViT/LeViT_256.yaml | 142 ++ .../configs/ImageNet/LeViT/LeViT_384.yaml | 142 ++ .../ImageNet/MicroNet/MicroNet_M0.yaml | 147 ++ .../ImageNet/MicroNet/MicroNet_M1.yaml | 147 ++ .../ImageNet/MicroNet/MicroNet_M2.yaml | 147 ++ .../ImageNet/MicroNet/MicroNet_M3.yaml | 152 ++ .../configs/ImageNet/MixNet/MixNet_L.yaml | 144 ++ .../configs/ImageNet/MixNet/MixNet_M.yaml | 144 ++ .../configs/ImageNet/MixNet/MixNet_S.yaml | 144 ++ .../ImageNet/MobileNeXt/MobileNeXt_x1_0.yaml | 160 ++ .../ImageNet/MobileNetV1/MobileNetV1.yaml | 144 ++ .../MobileNetV1/MobileNetV1_x0_25.yaml | 142 ++ .../MobileNetV1/MobileNetV1_x0_5.yaml | 142 ++ .../MobileNetV1/MobileNetV1_x0_75.yaml | 142 ++ .../ImageNet/MobileNetV2/MobileNetV2.yaml | 142 ++ .../MobileNetV2/MobileNetV2_x0_25.yaml | 140 ++ .../MobileNetV2/MobileNetV2_x0_5.yaml | 140 ++ .../MobileNetV2/MobileNetV2_x0_75.yaml | 140 ++ .../MobileNetV2/MobileNetV2_x1_5.yaml | 140 ++ .../MobileNetV2/MobileNetV2_x2_0.yaml | 140 ++ .../MobileNetV3/MobileNetV3_large_x0_35.yaml | 142 ++ .../MobileNetV3/MobileNetV3_large_x0_5.yaml | 142 ++ .../MobileNetV3/MobileNetV3_large_x0_75.yaml | 142 ++ .../MobileNetV3/MobileNetV3_large_x1_0.yaml | 143 ++ .../MobileNetV3/MobileNetV3_large_x1_25.yaml | 142 ++ 
.../MobileNetV3/MobileNetV3_small_x0_35.yaml | 142 ++ .../MobileNetV3/MobileNetV3_small_x0_5.yaml | 142 ++ .../MobileNetV3/MobileNetV3_small_x0_75.yaml | 142 ++ .../MobileNetV3/MobileNetV3_small_x1_0.yaml | 143 ++ .../MobileNetV3_small_x1_0_ampo2_ultra.yaml | 141 ++ .../MobileNetV3_small_x1_0_fp32_ultra.yaml | 143 ++ .../MobileNetV3/MobileNetV3_small_x1_25.yaml | 142 ++ .../MobileNetV4/MobileNetV4_conv_large.yaml | 181 ++ .../MobileNetV4/MobileNetV4_conv_medium.yaml | 181 ++ .../MobileNetV4/MobileNetV4_conv_small.yaml | 181 ++ .../MobileNetV4/MobileNetV4_hybrid_large.yaml | 182 ++ .../MobileNetV4_hybrid_medium.yaml | 176 ++ .../ImageNet/MobileViT/MobileViT_S.yaml | 151 ++ .../ImageNet/MobileViT/MobileViT_XS.yaml | 151 ++ .../ImageNet/MobileViT/MobileViT_XXS.yaml | 151 ++ .../MobileViTV2/MobileViTV2_x0_5.yaml | 174 ++ .../MobileViTV2/MobileViTV2_x1_0.yaml | 174 ++ .../MobileViTV2/MobileViTV2_x1_5.yaml | 174 ++ .../MobileViTV2/MobileViTV2_x2_0.yaml | 174 ++ .../ImageNet/MobileViTV3/MobileViTV3_S.yaml | 153 ++ .../MobileViTV3/MobileViTV3_S_L2.yaml | 153 ++ .../ImageNet/MobileViTV3/MobileViTV3_XS.yaml | 153 ++ .../MobileViTV3/MobileViTV3_XS_L2.yaml | 153 ++ .../ImageNet/MobileViTV3/MobileViTV3_XXS.yaml | 153 ++ .../MobileViTV3/MobileViTV3_XXS_L2.yaml | 153 ++ .../MobileViTV3/MobileViTV3_x0_5.yaml | 175 ++ .../MobileViTV3/MobileViTV3_x0_75.yaml | 175 ++ .../MobileViTV3/MobileViTV3_x1_0.yaml | 175 ++ .../ImageNet/NextViT/NextViT_base_224.yaml | 172 ++ .../ImageNet/NextViT/NextViT_base_384.yaml | 172 ++ .../ImageNet/NextViT/NextViT_large_224.yaml | 172 ++ .../ImageNet/NextViT/NextViT_large_384.yaml | 172 ++ .../ImageNet/NextViT/NextViT_small_224.yaml | 172 ++ .../ImageNet/NextViT/NextViT_small_384.yaml | 172 ++ .../ImageNet/PPHGNet/PPHGNet_base.yaml | 169 ++ .../ImageNet/PPHGNet/PPHGNet_small.yaml | 170 ++ .../ImageNet/PPHGNet/PPHGNet_tiny.yaml | 170 ++ .../ImageNet/PPHGNetV2/PPHGNetV2_B0.yaml | 164 ++ .../ImageNet/PPHGNetV2/PPHGNetV2_B1.yaml | 164 ++ .../ImageNet/PPHGNetV2/PPHGNetV2_B2.yaml | 164 ++ .../ImageNet/PPHGNetV2/PPHGNetV2_B3.yaml | 164 ++ .../ImageNet/PPHGNetV2/PPHGNetV2_B4.yaml | 164 ++ .../PPHGNetV2/PPHGNetV2_B4_ssld_stage1.yaml | 172 ++ .../PPHGNetV2/PPHGNetV2_B4_ssld_stage2.yaml | 173 ++ .../ImageNet/PPHGNetV2/PPHGNetV2_B5.yaml | 164 ++ .../ImageNet/PPHGNetV2/PPHGNetV2_B6.yaml | 164 ++ .../ImageNet/PPLCNet/PPLCNet_x0_25.yaml | 141 ++ .../ImageNet/PPLCNet/PPLCNet_x0_35.yaml | 141 ++ .../ImageNet/PPLCNet/PPLCNet_x0_5.yaml | 141 ++ .../ImageNet/PPLCNet/PPLCNet_x0_75.yaml | 141 ++ .../ImageNet/PPLCNet/PPLCNet_x1_0.yaml | 141 ++ .../PPLCNet/PPLCNet_x1_0_ampo2_ultra.yaml | 141 ++ .../PPLCNet/PPLCNet_x1_0_fp32_ultra.yaml | 143 ++ .../ImageNet/PPLCNet/PPLCNet_x1_5.yaml | 141 ++ .../ImageNet/PPLCNet/PPLCNet_x2_0.yaml | 140 ++ .../ImageNet/PPLCNet/PPLCNet_x2_5.yaml | 142 ++ .../ImageNet/PPLCNetV2/PPLCNetV2_base.yaml | 145 ++ .../ImageNet/PPLCNetV2/PPLCNetV2_large.yaml | 145 ++ .../ImageNet/PPLCNetV2/PPLCNetV2_small.yaml | 145 ++ .../configs/ImageNet/PVTV2/PVT_V2_B0.yaml | 174 ++ .../configs/ImageNet/PVTV2/PVT_V2_B1.yaml | 174 ++ .../configs/ImageNet/PVTV2/PVT_V2_B2.yaml | 174 ++ .../ImageNet/PVTV2/PVT_V2_B2_Linear.yaml | 174 ++ .../configs/ImageNet/PVTV2/PVT_V2_B3.yaml | 175 ++ .../configs/ImageNet/PVTV2/PVT_V2_B4.yaml | 175 ++ .../configs/ImageNet/PVTV2/PVT_V2_B5.yaml | 175 ++ .../configs/ImageNet/PeleeNet/PeleeNet.yaml | 148 ++ .../configs/ImageNet/ReXNet/ReXNet_1_0.yaml | 144 ++ .../configs/ImageNet/ReXNet/ReXNet_1_3.yaml | 144 ++ .../configs/ImageNet/ReXNet/ReXNet_1_5.yaml | 144 ++ 
.../configs/ImageNet/ReXNet/ReXNet_2_0.yaml | 144 ++ .../configs/ImageNet/ReXNet/ReXNet_3_0.yaml | 144 ++ .../configs/ImageNet/RedNet/RedNet101.yaml | 142 ++ .../configs/ImageNet/RedNet/RedNet152.yaml | 142 ++ .../configs/ImageNet/RedNet/RedNet26.yaml | 142 ++ .../configs/ImageNet/RedNet/RedNet38.yaml | 142 ++ .../configs/ImageNet/RedNet/RedNet50.yaml | 142 ++ .../configs/ImageNet/RegNet/RegNetX_12GF.yaml | 142 ++ .../ImageNet/RegNet/RegNetX_1600MF.yaml | 142 ++ .../configs/ImageNet/RegNet/RegNetX_16GF.yaml | 142 ++ .../ImageNet/RegNet/RegNetX_200MF.yaml | 142 ++ .../ImageNet/RegNet/RegNetX_3200MF.yaml | 142 ++ .../configs/ImageNet/RegNet/RegNetX_32GF.yaml | 142 ++ .../ImageNet/RegNet/RegNetX_400MF.yaml | 142 ++ .../ImageNet/RegNet/RegNetX_600MF.yaml | 142 ++ .../ImageNet/RegNet/RegNetX_6400MF.yaml | 142 ++ .../ImageNet/RegNet/RegNetX_800MF.yaml | 142 ++ .../configs/ImageNet/RegNet/RegNetX_8GF.yaml | 142 ++ .../configs/ImageNet/RepVGG/RepVGG_A0.yaml | 140 ++ .../configs/ImageNet/RepVGG/RepVGG_A1.yaml | 140 ++ .../configs/ImageNet/RepVGG/RepVGG_A2.yaml | 140 ++ .../configs/ImageNet/RepVGG/RepVGG_B0.yaml | 140 ++ .../configs/ImageNet/RepVGG/RepVGG_B1.yaml | 140 ++ .../configs/ImageNet/RepVGG/RepVGG_B1g2.yaml | 140 ++ .../configs/ImageNet/RepVGG/RepVGG_B1g4.yaml | 140 ++ .../configs/ImageNet/RepVGG/RepVGG_B2.yaml | 145 ++ .../configs/ImageNet/RepVGG/RepVGG_B2g4.yaml | 145 ++ .../configs/ImageNet/RepVGG/RepVGG_B3.yaml | 149 ++ .../configs/ImageNet/RepVGG/RepVGG_B3g4.yaml | 149 ++ .../configs/ImageNet/RepVGG/RepVGG_D2se.yaml | 149 ++ .../Res2Net/Res2Net101_vd_26w_4s.yaml | 142 ++ .../Res2Net/Res2Net200_vd_26w_4s.yaml | 142 ++ .../ImageNet/Res2Net/Res2Net50_14w_8s.yaml | 142 ++ .../ImageNet/Res2Net/Res2Net50_26w_4s.yaml | 142 ++ .../ImageNet/Res2Net/Res2Net50_vd_26w_4s.yaml | 142 ++ .../configs/ImageNet/ResNeSt/ResNeSt101.yaml | 143 ++ .../configs/ImageNet/ResNeSt/ResNeSt200.yaml | 143 ++ .../configs/ImageNet/ResNeSt/ResNeSt269.yaml | 143 ++ .../configs/ImageNet/ResNeSt/ResNeSt50.yaml | 143 ++ .../ResNeSt/ResNeSt50_fast_1s1x64d.yaml | 143 ++ .../ImageNet/ResNeXt/ResNeXt101_32x4d.yaml | 142 ++ .../ImageNet/ResNeXt/ResNeXt101_64x4d.yaml | 142 ++ .../ImageNet/ResNeXt/ResNeXt101_vd_32x4d.yaml | 142 ++ .../ImageNet/ResNeXt/ResNeXt101_vd_64x4d.yaml | 142 ++ .../ImageNet/ResNeXt/ResNeXt152_32x4d.yaml | 142 ++ .../ImageNet/ResNeXt/ResNeXt152_64x4d.yaml | 142 ++ .../ImageNet/ResNeXt/ResNeXt152_vd_32x4d.yaml | 142 ++ .../ImageNet/ResNeXt/ResNeXt152_vd_64x4d.yaml | 142 ++ .../ImageNet/ResNeXt/ResNeXt50_32x4d.yaml | 142 ++ .../ImageNet/ResNeXt/ResNeXt50_64x4d.yaml | 142 ++ .../ImageNet/ResNeXt/ResNeXt50_vd_32x4d.yaml | 142 ++ .../ImageNet/ResNeXt/ResNeXt50_vd_64x4d.yaml | 142 ++ .../ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml | 142 ++ .../ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml | 142 ++ .../ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml | 142 ++ .../ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml | 142 ++ .../configs/ImageNet/ResNet/ResNet101.yaml | 144 ++ .../configs/ImageNet/ResNet/ResNet101_vd.yaml | 142 ++ .../configs/ImageNet/ResNet/ResNet152.yaml | 144 ++ .../configs/ImageNet/ResNet/ResNet152_vd.yaml | 142 ++ .../configs/ImageNet/ResNet/ResNet18.yaml | 146 ++ .../ImageNet/ResNet/ResNet18_custom.yaml | 147 ++ .../configs/ImageNet/ResNet/ResNet18_dbb.yaml | 153 ++ .../ImageNet/ResNet/ResNet18_exactmatch.yaml | 142 ++ .../configs/ImageNet/ResNet/ResNet18_vd.yaml | 142 ++ .../configs/ImageNet/ResNet/ResNet200_vd.yaml | 142 ++ .../configs/ImageNet/ResNet/ResNet34.yaml | 142 ++ 
.../configs/ImageNet/ResNet/ResNet34_vd.yaml | 142 ++ .../configs/ImageNet/ResNet/ResNet50.yaml | 145 ++ .../ImageNet/ResNet/ResNet50_amp_O1.yaml | 144 ++ .../ResNet/ResNet50_amp_O1_ultra.yaml | 150 ++ .../ResNet/ResNet50_amp_O2_ultra.yaml | 150 ++ .../ImageNet/ResNet/ResNet50_ampo2_ultra.yaml | 144 ++ .../ImageNet/ResNet/ResNet50_fp32_ultra.yaml | 146 ++ .../configs/ImageNet/ResNet/ResNet50_vd.yaml | 142 ++ .../configs/ImageNet/SENet/SENet154_vd.yaml | 142 ++ .../ImageNet/SENet/SE_ResNeXt101_32x4d.yaml | 142 ++ .../SE_ResNeXt101_32x4d_amp_O2_ultra.yaml | 144 ++ .../ImageNet/SENet/SE_ResNeXt50_32x4d.yaml | 142 ++ .../ImageNet/SENet/SE_ResNeXt50_vd_32x4d.yaml | 142 ++ .../ImageNet/SENet/SE_ResNet18_vd.yaml | 142 ++ .../ImageNet/SENet/SE_ResNet34_vd.yaml | 142 ++ .../ImageNet/SENet/SE_ResNet50_vd.yaml | 142 ++ .../ShuffleNet/ShuffleNetV2_swish.yaml | 141 ++ .../ShuffleNet/ShuffleNetV2_x0_25.yaml | 141 ++ .../ShuffleNet/ShuffleNetV2_x0_33.yaml | 141 ++ .../ShuffleNet/ShuffleNetV2_x0_5.yaml | 141 ++ .../ShuffleNet/ShuffleNetV2_x1_0.yaml | 141 ++ .../ShuffleNet/ShuffleNetV2_x1_5.yaml | 141 ++ .../ShuffleNet/ShuffleNetV2_x2_0.yaml | 141 ++ .../ImageNet/SqueezeNet/SqueezeNet1_0.yaml | 140 ++ .../ImageNet/SqueezeNet/SqueezeNet1_1.yaml | 140 ++ .../configs/ImageNet/StarNet/StarNet_S1.yaml | 165 ++ .../configs/ImageNet/StarNet/StarNet_S2.yaml | 165 ++ .../configs/ImageNet/StarNet/StarNet_S3.yaml | 166 ++ .../configs/ImageNet/StarNet/StarNet_S4.yaml | 165 ++ ...nTransformer_base_patch4_window12_384.yaml | 175 ++ ...inTransformer_base_patch4_window7_224.yaml | 176 ++ ...Transformer_large_patch4_window12_384.yaml | 175 ++ ...nTransformer_large_patch4_window7_224.yaml | 175 ++ ...nTransformer_small_patch4_window7_224.yaml | 175 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 175 ++ ...ransformerV2_base_patch4_window16_256.yaml | 172 ++ ...ransformerV2_base_patch4_window24_384.yaml | 172 ++ ...TransformerV2_base_patch4_window8_256.yaml | 172 ++ ...ansformerV2_large_patch4_window16_256.yaml | 172 ++ ...ansformerV2_large_patch4_window24_384.yaml | 172 ++ ...ansformerV2_small_patch4_window16_256.yaml | 172 ++ ...ransformerV2_small_patch4_window8_256.yaml | 172 ++ ...ransformerV2_tiny_patch4_window16_256.yaml | 172 ++ ...TransformerV2_tiny_patch4_window8_256.yaml | 172 ++ .../ppcls/configs/ImageNet/TNT/TNT_base.yaml | 146 ++ .../ppcls/configs/ImageNet/TNT/TNT_small.yaml | 146 ++ .../configs/ImageNet/TinyNet/TinyNet_A.yaml | 167 ++ .../configs/ImageNet/TinyNet/TinyNet_B.yaml | 167 ++ .../configs/ImageNet/TinyNet/TinyNet_C.yaml | 167 ++ .../configs/ImageNet/TinyNet/TinyNet_D.yaml | 167 ++ .../configs/ImageNet/TinyNet/TinyNet_E.yaml | 167 ++ .../configs/ImageNet/Twins/alt_gvt_base.yaml | 174 ++ .../configs/ImageNet/Twins/alt_gvt_large.yaml | 174 ++ .../configs/ImageNet/Twins/alt_gvt_small.yaml | 174 ++ .../configs/ImageNet/Twins/pcpvt_base.yaml | 174 ++ .../configs/ImageNet/Twins/pcpvt_large.yaml | 174 ++ .../configs/ImageNet/Twins/pcpvt_small.yaml | 174 ++ .../ImageNet/UniFormer/UniFormer_base.yaml | 174 ++ .../ImageNet/UniFormer/UniFormer_base_ls.yaml | 174 ++ .../ImageNet/UniFormer/UniFormer_small.yaml | 174 ++ .../UniFormer/UniFormer_small_plus.yaml | 174 ++ .../UniFormer/UniFormer_small_plus_dim64.yaml | 174 ++ .../ppcls/configs/ImageNet/VAN/VAN_B0.yaml | 170 ++ .../ppcls/configs/ImageNet/VAN/VAN_B1.yaml | 170 ++ .../ppcls/configs/ImageNet/VAN/VAN_B2.yaml | 170 ++ .../ppcls/configs/ImageNet/VAN/VAN_B3.yaml | 170 ++ .../ppcls/configs/ImageNet/VGG/VGG11.yaml | 140 ++ 
.../ppcls/configs/ImageNet/VGG/VGG13.yaml | 140 ++ .../ppcls/configs/ImageNet/VGG/VGG16.yaml | 140 ++ .../ppcls/configs/ImageNet/VGG/VGG19.yaml | 140 ++ .../ViT_base_patch16_224.yaml | 142 ++ .../ViT_base_patch16_384.yaml | 142 ++ .../ViT_base_patch32_384.yaml | 142 ++ .../ViT_large_patch16_224.yaml | 142 ++ .../ViT_large_patch16_384.yaml | 142 ++ .../ViT_large_patch32_384.yaml | 142 ++ .../ViT_small_patch16_224.yaml | 142 ++ .../configs/ImageNet/Xception/Xception41.yaml | 141 ++ .../ImageNet/Xception/Xception41_deeplab.yaml | 141 ++ .../configs/ImageNet/Xception/Xception65.yaml | 142 ++ .../ImageNet/Xception/Xception65_deeplab.yaml | 141 ++ .../configs/ImageNet/Xception/Xception71.yaml | 142 ++ .../ppcls/configs/Logo/ResNet50_ReID.yaml | 151 ++ ...P_vit_base_patch16_448_ml_decoder_448.yaml | 172 ++ .../PP-HGNetV2-B0_ml_decoder_448.yaml | 168 ++ .../PP-HGNetV2-B4_ml_decoder_448.yaml | 168 ++ .../PP-HGNetV2-B6_ml_decoder_448.yaml | 168 ++ .../PP-LCNet_x1_0_ml_decoder_448.yaml | 170 ++ .../MultiLabelCOCO/MLDecoder/README.md | 272 +++ .../MLDecoder/ResNet101_ml_decoder_448.yaml | 168 ++ .../MLDecoder/ResNet50_ml_decoder_448.yaml | 168 ++ .../car_exists/MobileNetV3_small_x0_35.yaml | 139 ++ .../configs/PULC/car_exists/PPLCNet_x1_0.yaml | 152 ++ .../car_exists/PPLCNet_x1_0_distillation.yaml | 169 ++ .../PULC/car_exists/PPLCNet_x1_0_search.yaml | 152 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 169 ++ .../ppcls/configs/PULC/car_exists/search.yaml | 40 + .../PULC/clarity_assessment/PPLCNet_x1_0.yaml | 133 ++ .../code_exists/MobileNetV3_small_x0_35.yaml | 137 ++ .../PULC/code_exists/PPLCNet_x1_0.yaml | 145 ++ .../PPLCNet_x1_0_distillation.yaml | 167 ++ .../PULC/code_exists/PPLCNet_x1_0_search.yaml | 150 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 167 ++ .../configs/PULC/code_exists/search.yaml | 40 + .../PULC/image_orientation/PPLCNet_x1_0.yaml | 144 ++ .../MobileNetV3_small_x0_35.yaml | 132 ++ .../language_classification/PPLCNet_x1_0.yaml | 143 ++ .../PPLCNet_x1_0_distillation.yaml | 164 ++ .../PPLCNet_x1_0_search.yaml | 142 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 160 ++ .../PULC/language_classification/search.yaml | 40 + .../MobileNetV3_small_x0_35.yaml | 135 ++ .../PULC/person_attribute/PPLCNet_x1_0.yaml | 149 ++ .../PPLCNet_x1_0_Distillation.yaml | 172 ++ .../person_attribute/PPLCNet_x1_0_search.yaml | 149 ++ .../Res2Net200_vd_26w_4s.yaml | 134 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 135 ++ .../configs/PULC/person_attribute/search.yaml | 41 + .../MobileNetV3_small_x0_35.yaml | 138 ++ .../PULC/person_exists/PPLCNet_x1_0.yaml | 151 ++ .../PPLCNet_x1_0_distillation.yaml | 168 ++ .../person_exists/PPLCNet_x1_0_search.yaml | 151 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 168 ++ .../configs/PULC/person_exists/search.yaml | 40 + .../MobileNetV3_small_x0_35.yaml | 134 ++ .../PULC/safety_helmet/PPLCNet_x1_0.yaml | 148 ++ .../PPLCNet_x1_0_distillation.yaml | 185 ++ .../safety_helmet/PPLCNet_x1_0_search.yaml | 148 ++ .../safety_helmet/Res2Net200_vd_26w_4s.yaml | 137 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 159 ++ .../configs/PULC/safety_helmet/search.yaml | 36 + .../PULC/table_attribute/PPLCNet_x1_0.yaml | 133 ++ .../PPLCNet_x1_0_distillation.yaml | 155 ++ .../MobileNetV3_small_x0_35.yaml | 132 ++ .../text_image_orientation/PPLCNet_x1_0.yaml | 143 ++ .../PPLCNet_x1_0_distillation.yaml | 164 ++ .../PPLCNet_x1_0_search.yaml | 146 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 157 ++ .../PULC/text_image_orientation/search.yaml | 41 + 
.../MobileNetV3_small_x0_35.yaml | 134 ++ .../textline_orientation/PPLCNet_x1_0.yaml | 143 ++ .../PPLCNet_x1_0_224x224.yaml | 132 ++ .../PPLCNet_x1_0_distillation.yaml | 162 ++ .../PPLCNet_x1_0_search.yaml | 144 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 164 ++ .../PULC/textline_orientation/search.yaml | 41 + .../traffic_sign/MobileNetV3_samll_x0_35.yaml | 132 ++ .../PULC/traffic_sign/PPLCNet_x1_0.yaml | 148 ++ .../PPLCNet_x1_0_distillation.yaml | 172 ++ .../traffic_sign/PPLCNet_x1_0_search.yaml | 148 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 170 ++ .../configs/PULC/traffic_sign/search.yaml | 41 + .../MobileNetV3_small_x0_35.yaml | 115 + .../PULC/vehicle_attribute/PPLCNet_x1_0.yaml | 149 ++ .../PPLCNet_x1_0_distillation.yaml | 171 ++ .../PPLCNet_x1_0_search.yaml | 129 ++ .../Res2Net200_vd_26w_4s.yaml | 122 + .../PULC/vehicle_attribute/ResNet50.yaml | 116 + .../PULC/vehicle_attribute/search.yaml | 35 + .../MV3_Large_1x_Aliproduct_DLBHC.yaml | 149 ++ .../Products/ResNet50_vd_Aliproduct.yaml | 119 + .../configs/Products/ResNet50_vd_Inshop.yaml | 157 ++ .../configs/Products/ResNet50_vd_SOP.yaml | 156 ++ .../ppcls/configs/ResNet50_UReID_infer.yaml | 152 ++ .../ppcls/configs/SVTR/svtr_base.yml | 146 ++ .../ppcls/configs/SVTR/svtr_large.yml | 146 ++ .../ppcls/configs/SVTR/svtr_tiny.yml | 146 ++ .../ppcls/configs/StrategySearch/person.yaml | 40 + .../configs/Vehicle/PPLCNet_2.5x_ReID.yaml | 158 ++ .../ppcls/configs/Vehicle/ResNet50.yaml | 130 ++ .../ppcls/configs/Vehicle/ResNet50_ReID.yaml | 155 ++ .../configs/metric_learning/adaface_ir18.yaml | 105 + .../configs/metric_learning/xbm_resnet50.yaml | 170 ++ .../multi_scale/MobileNetV1_multi_scale.yaml | 138 ++ .../ppcls/configs/practical_models/.gitkeep | 1 + .../CLIP_large_patch14_224_aesthetic.yaml | 78 + .../EfficientNetB3_watermark.yaml | 81 + .../PPHGNet_tiny_calling_halfbody.yaml | 150 ++ .../quick_start/MobileNetV1_retrieval.yaml | 157 ++ .../quick_start/MobileNetV3_large_x1_0.yaml | 130 ++ .../configs/quick_start/ResNet50_vd.yaml | 129 ++ .../kunlun/HRNet_W18_C_finetune_kunlun.yaml | 68 + .../kunlun/ResNet50_vd_finetune_kunlun.yaml | 69 + .../kunlun/VGG16_finetune_kunlun.yaml | 70 + .../kunlun/VGG19_finetune_kunlun.yaml | 70 + .../new_user/ShuffleNetV2_x0_25.yaml | 129 ++ .../professional/MobileNetV1_multilabel.yaml | 130 ++ ...ileNetV3_large_x1_0_CIFAR100_finetune.yaml | 127 ++ ...50_vd_distill_MV3_large_x1_0_CIFAR100.yaml | 151 ++ .../professional/ResNet50_vd_CIFAR100.yaml | 127 ++ .../ResNet50_vd_mixup_CIFAR100_finetune.yaml | 127 ++ .../professional/VGG19_CIFAR10_DeepHash.yaml | 147 ++ .../reid/MetaBIN_ResNet50_cross_domain.yaml | 277 +++ .../reid/strong_baseline/baseline.yaml | 158 ++ .../reid/strong_baseline/softmax_triplet.yaml | 176 ++ .../softmax_triplet_with_center.yaml | 187 ++ ...Recognition_PPLCNet_x2_5_quantization.yaml | 154 ++ .../slim/MobileNetV3_large_x1_0_prune.yaml | 139 ++ .../MobileNetV3_large_x1_0_quantization.yaml | 138 ++ .../slim/PPLCNet_x1_0_quantization.yaml | 138 ++ .../ppcls/configs/slim/ResNet50_vd_prune.yaml | 138 ++ .../slim/ResNet50_vd_quantization.yaml | 137 ++ .../slim/ResNet50_vehicle_cls_prune.yaml | 135 ++ .../ResNet50_vehicle_cls_quantization.yaml | 134 ++ .../slim/ResNet50_vehicle_reid_prune.yaml | 162 ++ .../ResNet50_vehicle_reid_quantization.yaml | 161 ++ .../FixMatchCCSSL_cifar100_10000_4gpu.yaml | 209 ++ .../FixMatchCCSSL_cifar10_4000_4gpu.yaml | 208 ++ .../ssl/FixMatch/FixMatch_cifar10_250.yaml | 175 ++ .../ssl/FixMatch/FixMatch_cifar10_40.yaml | 175 ++ 
.../ssl/FixMatch/FixMatch_cifar10_4000.yaml | 175 ++ .../FixMatch/FixMatch_cifar10_40_4gpu.yaml | 176 ++ .../ppcls/data/__init__.py | 197 ++ .../DistributedRandomIdentitySampler.py | 135 ++ .../ppcls/data/dataloader/__init__.py | 16 + .../ppcls/data/dataloader/cifar.py | 136 ++ .../ppcls/data/dataloader/common_dataset.py | 88 + .../data/dataloader/custom_label_dataset.py | 88 + .../ppcls/data/dataloader/dali.py | 795 +++++++ .../ppcls/data/dataloader/face_dataset.py | 163 ++ .../ppcls/data/dataloader/icartoon_dataset.py | 36 + .../ppcls/data/dataloader/imagenet_dataset.py | 75 + .../ppcls/data/dataloader/logo_dataset.py | 46 + .../ppcls/data/dataloader/metabin_sampler.py | 290 +++ .../ppcls/data/dataloader/mix_dataset.py | 49 + .../ppcls/data/dataloader/mix_sampler.py | 79 + .../data/dataloader/multi_scale_dataset.py | 118 + .../data/dataloader/multi_scale_sampler.py | 133 ++ .../data/dataloader/multilabel_dataset.py | 65 + .../ppcls/data/dataloader/person_dataset.py | 269 +++ .../ppcls/data/dataloader/pk_sampler.py | 134 ++ .../ppcls/data/dataloader/ra_sampler.py | 57 + .../ppcls/data/dataloader/vehicle_dataset.py | 173 ++ .../ppcls/data/postprocess/__init__.py | 44 + .../ppcls/data/postprocess/attr_rec.py | 284 +++ .../ppcls/data/postprocess/scoreoutput.py | 18 + .../ppcls/data/postprocess/threshoutput.py | 90 + .../ppcls/data/postprocess/topk.py | 84 + .../ppcls/data/preprocess/__init__.py | 183 ++ .../data/preprocess/batch_ops/__init__.py | 1 + .../preprocess/batch_ops/batch_operators.py | 501 ++++ .../ppcls/data/preprocess/ops/__init__.py | 1 + .../ppcls/data/preprocess/ops/autoaugment.py | 265 +++ .../ppcls/data/preprocess/ops/cutout.py | 55 + .../data/preprocess/ops/dali_operators.py | 235 ++ .../ppcls/data/preprocess/ops/fmix.py | 220 ++ .../ppcls/data/preprocess/ops/functional.py | 138 ++ .../ppcls/data/preprocess/ops/grid.py | 90 + .../data/preprocess/ops/hide_and_seek.py | 45 + .../ppcls/data/preprocess/ops/operators.py | 926 ++++++++ .../ppcls/data/preprocess/ops/randaugment.py | 477 ++++ .../data/preprocess/ops/random_erasing.py | 113 + .../data/preprocess/ops/timm_autoaugment.py | 878 +++++++ .../ppcls/data/utils/__init__.py | 13 + .../ppcls/data/utils/get_image_list.py | 59 + .../ppcls/engine/__init__.py | 0 .../ppcls/engine/engine.py | 2013 +++++++++++++++++ .../ppcls/engine/evaluation/__init__.py | 17 + .../ppcls/engine/evaluation/adaface.py | 260 +++ .../ppcls/engine/evaluation/classification.py | 175 ++ .../ppcls/engine/evaluation/retrieval.py | 327 +++ .../ppcls/engine/train/__init__.py | 18 + .../ppcls/engine/train/train.py | 103 + .../ppcls/engine/train/train_fixmatch.py | 152 ++ .../engine/train/train_fixmatch_ccssl.py | 125 + .../ppcls/engine/train/train_metabin.py | 251 ++ .../ppcls/engine/train/train_progressive.py | 72 + .../ppcls/engine/train/utils.py | 94 + .../ppcls/loss/__init__.py | 91 + .../ppcls/loss/afdloss.py | 130 ++ .../ppcls/loss/ccssl_loss.py | 19 + .../ppcls/loss/celoss.py | 76 + .../ppcls/loss/centerloss.py | 80 + .../ppcls/loss/comfunc.py | 45 + .../ppcls/loss/contrasiveloss.py | 152 ++ .../ppcls/loss/deephashloss.py | 149 ++ .../ppcls/loss/dist_loss.py | 52 + .../ppcls/loss/distanceloss.py | 43 + .../ppcls/loss/distillationloss.py | 426 ++++ .../ppcls/loss/dkdloss.py | 68 + .../ppcls/loss/dmlloss.py | 62 + .../ppcls/loss/emlloss.py | 102 + .../ppcls/loss/googlenetloss.py | 43 + .../ppcls/loss/kldivloss.py | 33 + .../ppcls/loss/metabinloss.py | 206 ++ .../ppcls/loss/mgd_loss.py | 84 + .../ppcls/loss/msmloss.py | 80 + 
.../ppcls/loss/multilabelloss.py | 119 + .../ppcls/loss/npairsloss.py | 43 + .../ppcls/loss/pairwisecosface.py | 64 + .../ppcls/loss/pefdloss.py | 83 + .../ppcls/loss/rkdloss.py | 99 + .../ppcls/loss/skdloss.py | 72 + .../ppcls/loss/softsuploss.py | 75 + .../ppcls/loss/softtargetceloss.py | 16 + .../ppcls/loss/supconloss.py | 109 + .../ppcls/loss/trihardloss.py | 84 + .../ppcls/loss/triplet.py | 157 ++ .../ppcls/loss/tripletangularmarginloss.py | 241 ++ .../ppcls/loss/wslloss.py | 66 + .../low_precision_training/ppcls/loss/xbm.py | 89 + .../ppcls/metric/__init__.py | 71 + .../ppcls/metric/avg_metrics.py | 20 + .../ppcls/metric/metrics.py | 661 ++++++ .../ppcls/optimizer/__init__.py | 137 ++ .../ppcls/optimizer/learning_rate.py | 687 ++++++ .../ppcls/optimizer/optimizer.py | 518 +++++ .../ppcls/static/README.md | 19 + .../ppcls/static/program.py | 445 ++++ .../ppcls/static/run_dali.sh | 8 + .../ppcls/static/save_load.py | 139 ++ .../ppcls/static/train.py | 227 ++ .../ppcls/utils/COCO2017_label_list.txt | 80 + .../ppcls/utils/NUS-WIDE-SCENE_label_list.txt | 33 + .../image_orientation_label_list.txt | 4 + .../language_classification_label_list.txt | 10 + .../text_image_orientation_label_list.txt | 4 + .../textline_orientation_label_list.txt | 2 + .../traffic_sign_label_list.txt | 232 ++ .../ppcls/utils/__init__.py | 29 + .../low_precision_training/ppcls/utils/amp.py | 61 + .../ppcls/utils/check.py | 149 ++ .../ppcls/utils/config.py | 323 +++ .../ppcls/utils/create_cls_trainval_lists.py | 111 + .../utils/create_coco_multilabel_lists.py | 126 ++ .../ppcls/utils/dist_utils.py | 36 + .../ppcls/utils/download.py | 304 +++ .../low_precision_training/ppcls/utils/ema.py | 45 + .../feature_maps_visualization/fm_vis.py | 97 + .../feature_maps_visualization/resnet.py | 535 +++++ .../utils/feature_maps_visualization/utils.py | 85 + .../ppcls/utils/imagenet1k_label_list.txt | 1000 ++++++++ .../ppcls/utils/initializer.py | 318 +++ .../ppcls/utils/logger.py | 173 ++ .../ppcls/utils/metrics.py | 107 + .../ppcls/utils/misc.py | 155 ++ .../ppcls/utils/model_zoo.py | 213 ++ .../utils/pedestrian_attribute_label_list.txt | 26 + .../ppcls/utils/pretrained.list | 121 + .../ppcls/utils/profiler.py | 129 ++ .../ppcls/utils/save_load.py | 225 ++ .../ppcls/utils/save_result.py | 102 + .../utils/vehicle_attribute_label_list.txt | 19 + .../qat_config/qat_a4_mse.yaml | 15 + ...W4_mse_det__A4_mse_det__G4_minmax_sto.yaml | 71 + .../quantization/__init__.py | 6 + .../quantization/config.py | 743 ++++++ .../quantization/cudnn_convolution.cpp | 124 + .../quantization/install_paddle_so.py | 29 + .../quantization/layers.py | 310 +++ .../quantization/managers/__init__.py | 20 + .../managers/base_quantization_manager.py | 23 + .../managers/qat_quantization_manager.py | 131 ++ .../managers/qt_quantization_manager.py | 158 ++ .../quantization/ops.py | 207 ++ .../quantization/qconfig.py | 22 + .../quantization/quantization_utils.py | 93 + .../quantization/quantizer/__init__.py | 3 + .../quantization/quantizer/dummy_quantizer.py | 22 + .../quantization/quantizer/int_quantizer.py | 232 ++ .../quantization/registry.py | 57 + .../quantization/setup.py | 23 + .../quantization/utils.py | 52 + .../low_precision_training/requirements.txt | 213 ++ .../low_precision_training/tools/__init__.py | 1 + .../low_precision_training/tools/eval_qt.py | 49 + .../low_precision_training/tools/eval_qt.sh | 9 + example/low_precision_training/tools/train.py | 53 + example/low_precision_training/tools/train.sh | 8 + 
.../low_precision_training/tools/train_qat.py | 64 + .../low_precision_training/tools/train_qt.py | 66 + .../low_precision_training/tools/train_qt.sh | 8 + 776 files changed, 140549 insertions(+) create mode 100755 example/low_precision_training/README.md create mode 100755 example/low_precision_training/ppcls/.DS_Store create mode 100755 example/low_precision_training/ppcls/__init__.py create mode 100755 example/low_precision_training/ppcls/arch/__init__.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/__init__.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/base/__init__.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/base/dbb/dbb_block.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/base/dbb/dbb_transforms.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/base/theseus_layer.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/legendary_models/__init__.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/legendary_models/custom_devices_layers.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/legendary_models/esnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/legendary_models/hrnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/legendary_models/inception_v3.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/legendary_models/mobilenet_v1.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/legendary_models/mobilenet_v3.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/legendary_models/mobilenet_v4.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_hgnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_hgnet_v2.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_lcnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_lcnet_v2.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/legendary_models/resnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/legendary_models/swin_transformer.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/legendary_models/vgg.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/__init__.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/adaface_ir_net.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/alexnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/cae.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/convnext.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/cspnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/cswin_transformer.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/cvt.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/darknet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/densenet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/distilled_vision_transformer.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/dla.py create 
mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/dpn.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/dsnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/efficientnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/efficientnet_v2.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/fasternet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/foundation_vit.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/ghostnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/googlenet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/hardnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/inception_v4.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/levit.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/micronet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/mixnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilenet_v2.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilenext.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilevit.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilevit_v2.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilevit_v3.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/nextvit.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/peleenet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/pvt_v2.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/rednet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/regnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/repvgg.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/res2net.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/res2net_vd.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/resnest.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/resnet_vc.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/resnext.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/resnext101_wsl.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/resnext_vd.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/rexnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/se_resnet_vd.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/se_resnext.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/se_resnext_vd.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/shufflenet_v2.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/squeezenet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/starnet.py create mode 100755 
example/low_precision_training/ppcls/arch/backbone/model_zoo/svtrnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/swin_transformer_v2.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/tinynet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/tnt.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/twins.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/uniformer.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/van.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/vision_transformer.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/wideresnet.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/xception.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/model_zoo/xception_deeplab.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/variant_models/__init__.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/variant_models/efficientnet_variant.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/variant_models/foundation_vit_variant.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/variant_models/pp_lcnet_variant.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/variant_models/pp_lcnetv2_variant.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/variant_models/resnet_variant.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/variant_models/swin_transformer_variant.py create mode 100755 example/low_precision_training/ppcls/arch/backbone/variant_models/vgg_variant.py create mode 100755 example/low_precision_training/ppcls/arch/distill/afd_attention.py create mode 100755 example/low_precision_training/ppcls/arch/gears/__init__.py create mode 100755 example/low_precision_training/ppcls/arch/gears/adamargin.py create mode 100755 example/low_precision_training/ppcls/arch/gears/arcmargin.py create mode 100755 example/low_precision_training/ppcls/arch/gears/bnneck.py create mode 100755 example/low_precision_training/ppcls/arch/gears/circlemargin.py create mode 100755 example/low_precision_training/ppcls/arch/gears/cosmargin.py create mode 100755 example/low_precision_training/ppcls/arch/gears/fc.py create mode 100755 example/low_precision_training/ppcls/arch/gears/frfn_neck.py create mode 100755 example/low_precision_training/ppcls/arch/gears/identity_head.py create mode 100755 example/low_precision_training/ppcls/arch/gears/metabnneck.py create mode 100755 example/low_precision_training/ppcls/arch/gears/ml_decoder.py create mode 100755 example/low_precision_training/ppcls/arch/gears/vehicle_neck.py create mode 100755 example/low_precision_training/ppcls/arch/slim/__init__.py create mode 100755 example/low_precision_training/ppcls/arch/slim/prune.py create mode 100755 example/low_precision_training/ppcls/arch/slim/quant.py create mode 100755 example/low_precision_training/ppcls/arch/utils.py create mode 100755 example/low_precision_training/ppcls/configs/.DS_Store create mode 100755 example/low_precision_training/ppcls/configs/Attr/PPLCNet_x1_0_pedestrian_attribute.yaml create mode 100755 example/low_precision_training/ppcls/configs/Attr/PPLCNet_x1_0_vehicle_attribute.yaml create mode 100755 
example/low_precision_training/ppcls/configs/Attr/StrongBaselineAttr.yaml create mode 100755 example/low_precision_training/ppcls/configs/CAE/cae_base_patch16_224_finetune.yaml create mode 100755 example/low_precision_training/ppcls/configs/CAE/cae_large_patch16_224_finetune.yaml create mode 100755 example/low_precision_training/ppcls/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml create mode 100755 example/low_precision_training/ppcls/configs/CLIP/CLIP_vit_large_patch14_224_finetune.yaml create mode 100755 example/low_precision_training/ppcls/configs/Cartoonface/ResNet50_icartoon.yaml create mode 100755 example/low_precision_training/ppcls/configs/DeepHash/DCH.yaml create mode 100755 example/low_precision_training/ppcls/configs/DeepHash/DSHSD.yaml create mode 100755 example/low_precision_training/ppcls/configs/DeepHash/LCDSH.yaml create mode 100755 example/low_precision_training/ppcls/configs/GeneralRecognition/Gallery2FC_PPLCNet_x2_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_binary.yaml create mode 100755 example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml create mode 100755 example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_udml.yaml create mode 100755 example/low_precision_training/ppcls/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_base.yaml create mode 100755 example/low_precision_training/ppcls/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_large.yaml create mode 100755 example/low_precision_training/ppcls/configs/GeneralRecognitionV2/GeneralRecognitionV2_PPLCNetV2_base.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/AlexNet/AlexNet.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/CSPNet/CSPDarkNet53.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_small_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_tiny_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_base_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_base_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_large_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_large_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_small.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_tiny.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_13_224.yaml create mode 100755 
example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_13_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_21_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_21_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_W24_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA102.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA102x.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA102x2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA169.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA34.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA46_c.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA46x_c.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA60.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA60x.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA60x_c.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN107.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN131.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN68.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN92.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN98.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DSNet/DSNet_base.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DSNet/DSNet_small.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DSNet/DSNet_tiny.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DarkNet/DarkNet53.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_384.yaml create mode 100755 
example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_small_distilled_patch16_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_small_patch16_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_tiny_distilled_patch16_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_tiny_patch16_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet121.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet161.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet169.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet201.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet264.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x1_0_ssld.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x2_5_dml.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x2_5_ssld.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x2_5_udml.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Distillation/res2net200_vd_distill_pphgnet_base.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_afd.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_dist.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_dkd.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_mgd.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_pefd.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_skd.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_wsl.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x0_25.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x0_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x0_75.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB1.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB3.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB4.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB5.yaml create mode 100755 
example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB6.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB7.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/EfficientNetV2/EfficientNetV2_S.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_L.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_M.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_S.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_T0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_T1.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_T2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/GhostNet/GhostNet_x0_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_3.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W18_C.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W30_C.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W32_C.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W40_C.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W44_C.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W64_C.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet39_ds.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet68.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet68_ds.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet85.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Inception/GoogLeNet.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Inception/InceptionV3.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Inception/InceptionV4.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_128.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_128S.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_192.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_256.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M1.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M3.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MixNet/MixNet_L.yaml create mode 100755 
example/low_precision_training/ppcls/configs/ImageNet/MixNet/MixNet_M.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MixNet/MixNet_S.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNeXt/MobileNeXt_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_25.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_75.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_25.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_75.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x1_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_35.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_75.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_25.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_35.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_75.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_ampo2_ultra.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_fp32_ultra.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_25.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_conv_large.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_conv_medium.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_conv_small.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_large.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_medium.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileViT/MobileViT_S.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileViT/MobileViT_XS.yaml create mode 100755 
example/low_precision_training/ppcls/configs/ImageNet/MobileViT/MobileViT_XXS.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x0_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x1_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x2_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_S.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_S_L2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XS.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XS_L2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XXS.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XXS_L2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_x0_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_x0_75.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_base_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_base_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_large_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_large_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_small_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_small_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPHGNet/PPHGNet_base.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPHGNet/PPHGNet_tiny.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B1.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B3.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage1.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B6.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_25.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_35.yaml create mode 100755 
example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_75.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0_ampo2_ultra.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0_fp32_ultra.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x2_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x2_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_large.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_small.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B1.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B3.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B4.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/PeleeNet/PeleeNet.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_1_3.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_1_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_2_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_3_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet101.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet152.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet26.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet38.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet50.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_12GF.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_1600MF.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_16GF.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_200MF.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_3200MF.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_32GF.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_400MF.yaml create mode 100755 
example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_600MF.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_6400MF.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_800MF.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_8GF.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_A0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_A1.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_A2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B1.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B1g2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B1g4.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B2g4.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B3.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B3g4.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_D2se.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net101_vd_26w_4s.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net200_vd_26w_4s.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net50_14w_8s.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net50_26w_4s.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net50_vd_26w_4s.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt101.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt200.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt269.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt50.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt50_fast_1s1x64d.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_32x4d.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_64x4d.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_32x4d.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_64x4d.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_32x4d.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_64x4d.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_vd_32x4d.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_vd_64x4d.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_32x4d.yaml create mode 100755 
example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_64x4d.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_vd_32x4d.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_vd_64x4d.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet101.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet101_vd.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet152.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet152_vd.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_custom.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_dbb.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_exactmatch.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_vd.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet200_vd.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet34.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet34_vd.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1_ultra.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O2_ultra.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_ampo2_ultra.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_fp32_ultra.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SENet/SENet154_vd.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2_ultra.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt50_32x4d.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt50_vd_32x4d.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNet18_vd.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNet34_vd.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNet50_vd.yaml create mode 100755 
example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_swish.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_25.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_33.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_5.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x2_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SqueezeNet/SqueezeNet1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SqueezeNet/SqueezeNet1_1.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S1.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S3.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S4.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window16_256.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window24_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window8_256.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window16_256.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window24_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window16_256.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window8_256.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window16_256.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window8_256.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/TNT/TNT_base.yaml create mode 100755 
example/low_precision_training/ppcls/configs/ImageNet/TNT/TNT_small.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_A.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_B.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_C.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_D.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_E.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_base.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_base_ls.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_small.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_small_plus.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_small_plus_dim64.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B0.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B1.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B2.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B3.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG11.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG13.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG16.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG19.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch16_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch16_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch32_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch16_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch16_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch32_384.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_small_patch16_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception41.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception41_deeplab.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception65.yaml create mode 100755 
example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception65_deeplab.yaml create mode 100755 example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception71.yaml create mode 100755 example/low_precision_training/ppcls/configs/Logo/ResNet50_ReID.yaml create mode 100755 example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/CLIP_vit_base_patch16_448_ml_decoder_448.yaml create mode 100755 example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B0_ml_decoder_448.yaml create mode 100755 example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B4_ml_decoder_448.yaml create mode 100755 example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B6_ml_decoder_448.yaml create mode 100755 example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-LCNet_x1_0_ml_decoder_448.yaml create mode 100755 example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/README.md create mode 100755 example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml create mode 100755 example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet50_ml_decoder_448.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/car_exists/MobileNetV3_small_x0_35.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/car_exists/PPLCNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/car_exists/PPLCNet_x1_0_distillation.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/car_exists/PPLCNet_x1_0_search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/car_exists/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/car_exists/search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/clarity_assessment/PPLCNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/code_exists/MobileNetV3_small_x0_35.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/code_exists/PPLCNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/code_exists/PPLCNet_x1_0_distillation.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/code_exists/PPLCNet_x1_0_search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/code_exists/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/code_exists/search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/image_orientation/PPLCNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/language_classification/MobileNetV3_small_x0_35.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/language_classification/PPLCNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/language_classification/PPLCNet_x1_0_distillation.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/language_classification/PPLCNet_x1_0_search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/language_classification/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/language_classification/search.yaml create mode 100755 
example/low_precision_training/ppcls/configs/PULC/person_attribute/MobileNetV3_small_x0_35.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/person_attribute/PPLCNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/person_attribute/PPLCNet_x1_0_Distillation.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/person_attribute/PPLCNet_x1_0_search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/person_attribute/Res2Net200_vd_26w_4s.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/person_attribute/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/person_attribute/search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/person_exists/MobileNetV3_small_x0_35.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/person_exists/PPLCNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/person_exists/PPLCNet_x1_0_distillation.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/person_exists/PPLCNet_x1_0_search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/person_exists/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/person_exists/search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/safety_helmet/MobileNetV3_small_x0_35.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_distillation.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/safety_helmet/Res2Net200_vd_26w_4s.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/safety_helmet/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/safety_helmet/search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/table_attribute/PPLCNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/table_attribute/PPLCNet_x1_0_distillation.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/text_image_orientation/MobileNetV3_small_x0_35.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_distillation.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/text_image_orientation/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/text_image_orientation/search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/textline_orientation/MobileNetV3_small_x0_35.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_224x224.yaml create mode 100755 
example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_distillation.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/textline_orientation/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/textline_orientation/search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/traffic_sign/MobileNetV3_samll_x0_35.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0_distillation.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0_search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/traffic_sign/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/traffic_sign/search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/MobileNetV3_small_x0_35.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0_distillation.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0_search.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/Res2Net200_vd_26w_4s.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/ResNet50.yaml create mode 100755 example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/search.yaml create mode 100755 example/low_precision_training/ppcls/configs/Products/MV3_Large_1x_Aliproduct_DLBHC.yaml create mode 100755 example/low_precision_training/ppcls/configs/Products/ResNet50_vd_Aliproduct.yaml create mode 100755 example/low_precision_training/ppcls/configs/Products/ResNet50_vd_Inshop.yaml create mode 100755 example/low_precision_training/ppcls/configs/Products/ResNet50_vd_SOP.yaml create mode 100755 example/low_precision_training/ppcls/configs/ResNet50_UReID_infer.yaml create mode 100755 example/low_precision_training/ppcls/configs/SVTR/svtr_base.yml create mode 100755 example/low_precision_training/ppcls/configs/SVTR/svtr_large.yml create mode 100755 example/low_precision_training/ppcls/configs/SVTR/svtr_tiny.yml create mode 100755 example/low_precision_training/ppcls/configs/StrategySearch/person.yaml create mode 100755 example/low_precision_training/ppcls/configs/Vehicle/PPLCNet_2.5x_ReID.yaml create mode 100755 example/low_precision_training/ppcls/configs/Vehicle/ResNet50.yaml create mode 100755 example/low_precision_training/ppcls/configs/Vehicle/ResNet50_ReID.yaml create mode 100755 example/low_precision_training/ppcls/configs/metric_learning/adaface_ir18.yaml create mode 100755 example/low_precision_training/ppcls/configs/metric_learning/xbm_resnet50.yaml create mode 100755 example/low_precision_training/ppcls/configs/multi_scale/MobileNetV1_multi_scale.yaml create mode 100755 example/low_precision_training/ppcls/configs/practical_models/.gitkeep create mode 100755 example/low_precision_training/ppcls/configs/practical_models/CLIP_large_patch14_224_aesthetic.yaml create mode 100755 
example/low_precision_training/ppcls/configs/practical_models/EfficientNetB3_watermark.yaml create mode 100755 example/low_precision_training/ppcls/configs/practical_models/PPHGNet_tiny_calling_halfbody.yaml create mode 100755 example/low_precision_training/ppcls/configs/quick_start/MobileNetV1_retrieval.yaml create mode 100755 example/low_precision_training/ppcls/configs/quick_start/MobileNetV3_large_x1_0.yaml create mode 100755 example/low_precision_training/ppcls/configs/quick_start/ResNet50_vd.yaml create mode 100755 example/low_precision_training/ppcls/configs/quick_start/kunlun/HRNet_W18_C_finetune_kunlun.yaml create mode 100755 example/low_precision_training/ppcls/configs/quick_start/kunlun/ResNet50_vd_finetune_kunlun.yaml create mode 100755 example/low_precision_training/ppcls/configs/quick_start/kunlun/VGG16_finetune_kunlun.yaml create mode 100755 example/low_precision_training/ppcls/configs/quick_start/kunlun/VGG19_finetune_kunlun.yaml create mode 100755 example/low_precision_training/ppcls/configs/quick_start/new_user/ShuffleNetV2_x0_25.yaml create mode 100755 example/low_precision_training/ppcls/configs/quick_start/professional/MobileNetV1_multilabel.yaml create mode 100755 example/low_precision_training/ppcls/configs/quick_start/professional/MobileNetV3_large_x1_0_CIFAR100_finetune.yaml create mode 100755 example/low_precision_training/ppcls/configs/quick_start/professional/R50_vd_distill_MV3_large_x1_0_CIFAR100.yaml create mode 100755 example/low_precision_training/ppcls/configs/quick_start/professional/ResNet50_vd_CIFAR100.yaml create mode 100755 example/low_precision_training/ppcls/configs/quick_start/professional/ResNet50_vd_mixup_CIFAR100_finetune.yaml create mode 100755 example/low_precision_training/ppcls/configs/quick_start/professional/VGG19_CIFAR10_DeepHash.yaml create mode 100755 example/low_precision_training/ppcls/configs/reid/MetaBIN_ResNet50_cross_domain.yaml create mode 100755 example/low_precision_training/ppcls/configs/reid/strong_baseline/baseline.yaml create mode 100755 example/low_precision_training/ppcls/configs/reid/strong_baseline/softmax_triplet.yaml create mode 100755 example/low_precision_training/ppcls/configs/reid/strong_baseline/softmax_triplet_with_center.yaml create mode 100755 example/low_precision_training/ppcls/configs/slim/GeneralRecognition_PPLCNet_x2_5_quantization.yaml create mode 100755 example/low_precision_training/ppcls/configs/slim/MobileNetV3_large_x1_0_prune.yaml create mode 100755 example/low_precision_training/ppcls/configs/slim/MobileNetV3_large_x1_0_quantization.yaml create mode 100755 example/low_precision_training/ppcls/configs/slim/PPLCNet_x1_0_quantization.yaml create mode 100755 example/low_precision_training/ppcls/configs/slim/ResNet50_vd_prune.yaml create mode 100755 example/low_precision_training/ppcls/configs/slim/ResNet50_vd_quantization.yaml create mode 100755 example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_cls_prune.yaml create mode 100755 example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_cls_quantization.yaml create mode 100755 example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_reid_prune.yaml create mode 100755 example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_reid_quantization.yaml create mode 100755 example/low_precision_training/ppcls/configs/ssl/CCSSL/FixMatchCCSSL_cifar100_10000_4gpu.yaml create mode 100755 example/low_precision_training/ppcls/configs/ssl/CCSSL/FixMatchCCSSL_cifar10_4000_4gpu.yaml create mode 100755 
example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_250.yaml create mode 100755 example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_40.yaml create mode 100755 example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_4000.yaml create mode 100755 example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_40_4gpu.yaml create mode 100755 example/low_precision_training/ppcls/data/__init__.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/DistributedRandomIdentitySampler.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/__init__.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/cifar.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/common_dataset.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/custom_label_dataset.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/dali.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/face_dataset.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/icartoon_dataset.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/imagenet_dataset.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/logo_dataset.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/metabin_sampler.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/mix_dataset.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/mix_sampler.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/multi_scale_dataset.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/multi_scale_sampler.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/multilabel_dataset.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/person_dataset.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/pk_sampler.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/ra_sampler.py create mode 100755 example/low_precision_training/ppcls/data/dataloader/vehicle_dataset.py create mode 100755 example/low_precision_training/ppcls/data/postprocess/__init__.py create mode 100755 example/low_precision_training/ppcls/data/postprocess/attr_rec.py create mode 100755 example/low_precision_training/ppcls/data/postprocess/scoreoutput.py create mode 100755 example/low_precision_training/ppcls/data/postprocess/threshoutput.py create mode 100755 example/low_precision_training/ppcls/data/postprocess/topk.py create mode 100755 example/low_precision_training/ppcls/data/preprocess/__init__.py create mode 100755 example/low_precision_training/ppcls/data/preprocess/batch_ops/__init__.py create mode 100755 example/low_precision_training/ppcls/data/preprocess/batch_ops/batch_operators.py create mode 100755 example/low_precision_training/ppcls/data/preprocess/ops/__init__.py create mode 100755 example/low_precision_training/ppcls/data/preprocess/ops/autoaugment.py create mode 100755 example/low_precision_training/ppcls/data/preprocess/ops/cutout.py create mode 100755 example/low_precision_training/ppcls/data/preprocess/ops/dali_operators.py create mode 100755 example/low_precision_training/ppcls/data/preprocess/ops/fmix.py create mode 100755 example/low_precision_training/ppcls/data/preprocess/ops/functional.py create mode 100755 
example/low_precision_training/ppcls/data/preprocess/ops/grid.py create mode 100755 example/low_precision_training/ppcls/data/preprocess/ops/hide_and_seek.py create mode 100755 example/low_precision_training/ppcls/data/preprocess/ops/operators.py create mode 100755 example/low_precision_training/ppcls/data/preprocess/ops/randaugment.py create mode 100755 example/low_precision_training/ppcls/data/preprocess/ops/random_erasing.py create mode 100755 example/low_precision_training/ppcls/data/preprocess/ops/timm_autoaugment.py create mode 100755 example/low_precision_training/ppcls/data/utils/__init__.py create mode 100755 example/low_precision_training/ppcls/data/utils/get_image_list.py create mode 100755 example/low_precision_training/ppcls/engine/__init__.py create mode 100755 example/low_precision_training/ppcls/engine/engine.py create mode 100755 example/low_precision_training/ppcls/engine/evaluation/__init__.py create mode 100755 example/low_precision_training/ppcls/engine/evaluation/adaface.py create mode 100755 example/low_precision_training/ppcls/engine/evaluation/classification.py create mode 100755 example/low_precision_training/ppcls/engine/evaluation/retrieval.py create mode 100755 example/low_precision_training/ppcls/engine/train/__init__.py create mode 100755 example/low_precision_training/ppcls/engine/train/train.py create mode 100755 example/low_precision_training/ppcls/engine/train/train_fixmatch.py create mode 100755 example/low_precision_training/ppcls/engine/train/train_fixmatch_ccssl.py create mode 100755 example/low_precision_training/ppcls/engine/train/train_metabin.py create mode 100755 example/low_precision_training/ppcls/engine/train/train_progressive.py create mode 100755 example/low_precision_training/ppcls/engine/train/utils.py create mode 100755 example/low_precision_training/ppcls/loss/__init__.py create mode 100755 example/low_precision_training/ppcls/loss/afdloss.py create mode 100755 example/low_precision_training/ppcls/loss/ccssl_loss.py create mode 100755 example/low_precision_training/ppcls/loss/celoss.py create mode 100755 example/low_precision_training/ppcls/loss/centerloss.py create mode 100755 example/low_precision_training/ppcls/loss/comfunc.py create mode 100755 example/low_precision_training/ppcls/loss/contrasiveloss.py create mode 100755 example/low_precision_training/ppcls/loss/deephashloss.py create mode 100755 example/low_precision_training/ppcls/loss/dist_loss.py create mode 100755 example/low_precision_training/ppcls/loss/distanceloss.py create mode 100755 example/low_precision_training/ppcls/loss/distillationloss.py create mode 100755 example/low_precision_training/ppcls/loss/dkdloss.py create mode 100755 example/low_precision_training/ppcls/loss/dmlloss.py create mode 100755 example/low_precision_training/ppcls/loss/emlloss.py create mode 100755 example/low_precision_training/ppcls/loss/googlenetloss.py create mode 100755 example/low_precision_training/ppcls/loss/kldivloss.py create mode 100755 example/low_precision_training/ppcls/loss/metabinloss.py create mode 100755 example/low_precision_training/ppcls/loss/mgd_loss.py create mode 100755 example/low_precision_training/ppcls/loss/msmloss.py create mode 100755 example/low_precision_training/ppcls/loss/multilabelloss.py create mode 100755 example/low_precision_training/ppcls/loss/npairsloss.py create mode 100755 example/low_precision_training/ppcls/loss/pairwisecosface.py create mode 100755 example/low_precision_training/ppcls/loss/pefdloss.py create mode 100755 
example/low_precision_training/ppcls/loss/rkdloss.py create mode 100755 example/low_precision_training/ppcls/loss/skdloss.py create mode 100755 example/low_precision_training/ppcls/loss/softsuploss.py create mode 100755 example/low_precision_training/ppcls/loss/softtargetceloss.py create mode 100755 example/low_precision_training/ppcls/loss/supconloss.py create mode 100755 example/low_precision_training/ppcls/loss/trihardloss.py create mode 100755 example/low_precision_training/ppcls/loss/triplet.py create mode 100755 example/low_precision_training/ppcls/loss/tripletangularmarginloss.py create mode 100755 example/low_precision_training/ppcls/loss/wslloss.py create mode 100755 example/low_precision_training/ppcls/loss/xbm.py create mode 100755 example/low_precision_training/ppcls/metric/__init__.py create mode 100755 example/low_precision_training/ppcls/metric/avg_metrics.py create mode 100755 example/low_precision_training/ppcls/metric/metrics.py create mode 100755 example/low_precision_training/ppcls/optimizer/__init__.py create mode 100755 example/low_precision_training/ppcls/optimizer/learning_rate.py create mode 100755 example/low_precision_training/ppcls/optimizer/optimizer.py create mode 100755 example/low_precision_training/ppcls/static/README.md create mode 100755 example/low_precision_training/ppcls/static/program.py create mode 100755 example/low_precision_training/ppcls/static/run_dali.sh create mode 100755 example/low_precision_training/ppcls/static/save_load.py create mode 100755 example/low_precision_training/ppcls/static/train.py create mode 100755 example/low_precision_training/ppcls/utils/COCO2017_label_list.txt create mode 100755 example/low_precision_training/ppcls/utils/NUS-WIDE-SCENE_label_list.txt create mode 100755 example/low_precision_training/ppcls/utils/PULC_label_list/image_orientation_label_list.txt create mode 100755 example/low_precision_training/ppcls/utils/PULC_label_list/language_classification_label_list.txt create mode 100755 example/low_precision_training/ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt create mode 100755 example/low_precision_training/ppcls/utils/PULC_label_list/textline_orientation_label_list.txt create mode 100755 example/low_precision_training/ppcls/utils/PULC_label_list/traffic_sign_label_list.txt create mode 100755 example/low_precision_training/ppcls/utils/__init__.py create mode 100755 example/low_precision_training/ppcls/utils/amp.py create mode 100755 example/low_precision_training/ppcls/utils/check.py create mode 100755 example/low_precision_training/ppcls/utils/config.py create mode 100755 example/low_precision_training/ppcls/utils/create_cls_trainval_lists.py create mode 100755 example/low_precision_training/ppcls/utils/create_coco_multilabel_lists.py create mode 100755 example/low_precision_training/ppcls/utils/dist_utils.py create mode 100755 example/low_precision_training/ppcls/utils/download.py create mode 100755 example/low_precision_training/ppcls/utils/ema.py create mode 100755 example/low_precision_training/ppcls/utils/feature_maps_visualization/fm_vis.py create mode 100755 example/low_precision_training/ppcls/utils/feature_maps_visualization/resnet.py create mode 100755 example/low_precision_training/ppcls/utils/feature_maps_visualization/utils.py create mode 100755 example/low_precision_training/ppcls/utils/imagenet1k_label_list.txt create mode 100755 example/low_precision_training/ppcls/utils/initializer.py create mode 100755 example/low_precision_training/ppcls/utils/logger.py create mode 100755 
example/low_precision_training/ppcls/utils/metrics.py create mode 100755 example/low_precision_training/ppcls/utils/misc.py create mode 100755 example/low_precision_training/ppcls/utils/model_zoo.py create mode 100755 example/low_precision_training/ppcls/utils/pedestrian_attribute_label_list.txt create mode 100755 example/low_precision_training/ppcls/utils/pretrained.list create mode 100755 example/low_precision_training/ppcls/utils/profiler.py create mode 100755 example/low_precision_training/ppcls/utils/save_load.py create mode 100755 example/low_precision_training/ppcls/utils/save_result.py create mode 100755 example/low_precision_training/ppcls/utils/vehicle_attribute_label_list.txt create mode 100755 example/low_precision_training/qat_config/qat_a4_mse.yaml create mode 100755 example/low_precision_training/qt_config/qt_W4_mse_det__A4_mse_det__G4_minmax_sto.yaml create mode 100755 example/low_precision_training/quantization/__init__.py create mode 100755 example/low_precision_training/quantization/config.py create mode 100755 example/low_precision_training/quantization/cudnn_convolution.cpp create mode 100755 example/low_precision_training/quantization/install_paddle_so.py create mode 100755 example/low_precision_training/quantization/layers.py create mode 100755 example/low_precision_training/quantization/managers/__init__.py create mode 100755 example/low_precision_training/quantization/managers/base_quantization_manager.py create mode 100755 example/low_precision_training/quantization/managers/qat_quantization_manager.py create mode 100755 example/low_precision_training/quantization/managers/qt_quantization_manager.py create mode 100755 example/low_precision_training/quantization/ops.py create mode 100755 example/low_precision_training/quantization/qconfig.py create mode 100755 example/low_precision_training/quantization/quantization_utils.py create mode 100755 example/low_precision_training/quantization/quantizer/__init__.py create mode 100755 example/low_precision_training/quantization/quantizer/dummy_quantizer.py create mode 100755 example/low_precision_training/quantization/quantizer/int_quantizer.py create mode 100755 example/low_precision_training/quantization/registry.py create mode 100755 example/low_precision_training/quantization/setup.py create mode 100755 example/low_precision_training/quantization/utils.py create mode 100755 example/low_precision_training/requirements.txt create mode 100755 example/low_precision_training/tools/__init__.py create mode 100755 example/low_precision_training/tools/eval_qt.py create mode 100755 example/low_precision_training/tools/eval_qt.sh create mode 100755 example/low_precision_training/tools/train.py create mode 100755 example/low_precision_training/tools/train.sh create mode 100755 example/low_precision_training/tools/train_qat.py create mode 100755 example/low_precision_training/tools/train_qt.py create mode 100755 example/low_precision_training/tools/train_qt.sh diff --git a/example/low_precision_training/README.md b/example/low_precision_training/README.md new file mode 100755 index 000000000..0a11d0e1e --- /dev/null +++ b/example/low_precision_training/README.md @@ -0,0 +1,112 @@ +# Low-precision Training & Quantization-aware Training With PaddlePaddle + +## Introduction +**Quantization-aware Training:** network quantization for inference acceleration, i.e., the weights and activations are quantized into low-bit integers so that the forward pass can be converted into integer matrix multiplications. 
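+
+For intuition, QAT keeps floating-point master weights and runs the forward pass on a quantize-dequantize ("fake quantization") of each tensor. Below is a minimal NumPy sketch of symmetric min-max fake quantization; it is illustrative only, and `fake_quantize` is not a function from this repo:
+
+```py
+import numpy as np
+
+def fake_quantize(x, num_bits=4):
+    """Quantize-dequantize a tensor with symmetric min-max scaling."""
+    qmax = 2 ** (num_bits - 1) - 1                      # 7 for int4
+    scale = max(float(np.abs(x).max()), 1e-8) / qmax    # dynamic scaling factor
+    q = np.clip(np.round(x / scale), -qmax - 1, qmax)   # low-bit integer grid
+    return (q * scale).astype(x.dtype)                  # values fed to the matmul
+
+w = np.random.randn(64, 64).astype("float32")
+w_q = fake_quantize(w, num_bits=4)
+```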
+
+**Low-precision Training:** network quantization for training acceleration, i.e., the weights, activations, and back-propagated errors are all quantized into low-bit integers so that both the forward and backward passes can be converted into integer matrix multiplications.
+
+
+## Environment Configuration
+python 3.10 \
+paddlepaddle-gpu 2.6
+```sh
+cd classification/
+pip install -r requirements.txt
+cd quantization
+python setup.py build
+python install_paddle_so.py build/cudnn_convolution_custom/lib.linux-x86_64-cpython-310/cudnn_convolution_custom.so
+```
+
+## Train
+Low-precision Training:
+```sh
+export CUDA_VISIBLE_DEVICES=4,5,6,7
+python -m paddle.distributed.launch --gpus="4,5,6,7" train_qt.py -c ../ppcls/configs/ImageNet/ResNet/ResNet18_custom.yaml
+```
+The quantization configuration is selected via `config.qconfigdir`:
+```py
+config.qconfigdir = '../qt_config/qt_W4_mse_det__A4_mse_det__G4_minmax_sto.yaml'
+```
+Quantization-aware Training:
+```sh
+export CUDA_VISIBLE_DEVICES=4,5,6,7
+python -m paddle.distributed.launch --gpus="4,5,6,7" train_qat.py -c ../ppcls/configs/ImageNet/ResNet/ResNet18_custom.yaml
+```
+
+```py
+config.qconfigdir = '../qat_config/qat_w4_mse_a4_mse.yaml'
+```
+## Evaluate
+```sh
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle.distributed.launch --gpus="0,1,2,3" eval_qt.py -c ../ppcls/configs/ImageNet/ResNet/ResNet18_custom.yaml
+```
+
+## Results
+Dataset: ImageNet (ILSVRC-2012) \
+Model: ResNet18
+| config | top-1(%) | top-5(%) |
+| ---- | ---- | ---- |
+| original train | 70.852 | 89.700 |
+| low-precision train (test with w4a4) | 66.498 | 86.858 |
+| low-precision train (test with w4) | 67.200 | 87.196 |
+| low-precision train (test with fp) | 66.330 | 86.790 |
+
+## Directory Structure
+```
+/classification
+├── README.md          # project documentation
+├── requirements.txt   # project dependencies
+├── tools/             # training / evaluation entry scripts
+│   ├── train_qt.py    # entry point for low-precision training
+│   ├── train_qat.py   # entry point for quantization-aware training
+│   ├── train.py       # entry point for full-precision training
+│   ├── eval_qt.py     # entry point for evaluation
+├── ppcls/             # classification network implementation
+├── qt_config/         # low-precision training configs; change quantization settings here
+├── qat_config/        # Quantization-aware Training configs
+└── quantization/      # quantized-training implementation
+```
+
+## Quantization Parameters Explained
+Take `qt_W4_minmax_det__A4_mse_det__G4_minmax_sto.yaml` as an example:
+
+`quantization_manager`: `qt_quantization_manager` selects Low-precision Training; `qat_quantization_manager` selects Quantization-aware Training.
+
+#### weight_forward_config (forward weight quantization)
+`qtype`: "int4"
+Quantization type: weights are quantized to 4-bit integers (int4).
+
+`granularity`: "N"
+Quantization granularity: "N" means one set of quantization parameters per layer.
+
+`scaling`: "dynamic"
+Scaling method: "dynamic" means the scaling factor is recomputed from the data during quantization.
+
+`observer`: "minmax"
+Observer (calibration criterion): "minmax" derives the quantization parameters from the minimum and maximum values.
+
+`stochastic`: False
+Whether stochastic rounding is used: False disables it.
+
+`filters`:
+Specifies which layers (or parts of the network) receive special treatment.
+
+id: 0, qtype: "full": the layer with ID 0 (usually the first layer) uses qtype "full", i.e., it is kept at full precision.
+name: "fc", qtype: "full": the layer named fc (usually the fully connected layer) is kept at full precision.
+
+#### weight_backward_config (weight quantization for the backward pass)
+`granularity`: "C"
+Quantization granularity: "C" means per-channel quantization.
+
+#### input_config (input/activation quantization)
+`observer`: "mse"
+Observer: "mse" chooses quantization parameters by minimizing the mean squared error (MSE).
+
+#### grad_input_config (input-gradient quantization)
+`granularity`: "NHW"
+Quantization granularity: "NHW" means quantization along the batch and spatial dimensions (batch size, height, width).
+
+#### grad_weight_config (weight-gradient quantization)
+`granularity`: "NC"
+Quantization granularity: "NC" means quantization along the batch and channel dimensions.
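+
+Putting these fields together, a quantization config has roughly the following shape. This is an illustrative sketch assembled from the field descriptions above; consult the shipped YAMLs under `qt_config/` and `qat_config/` for the exact schema:
+
+```yaml
+quantization_manager: qt_quantization_manager   # or qat_quantization_manager
+
+weight_forward_config:
+  qtype: int4          # 4-bit integer weights
+  granularity: N       # per-layer quantization parameters
+  scaling: dynamic     # scale recomputed from the data
+  observer: minmax
+  stochastic: False
+  filters:             # layers exempted from low-bit quantization
+    - {id: 0, qtype: full}
+    - {name: fc, qtype: full}
+
+weight_backward_config:
+  granularity: C       # per-channel
+
+input_config:
+  observer: mse
+
+grad_input_config:
+  granularity: NHW
+
+grad_weight_config:
+  granularity: NC
+```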
zx)xJ|=s^)C715*$`@|3?9qrP_xfWA{CLM%c8RxMpXO9=5S4X?l;UHXt+%p5rz#;>A zGpx}0e~Q1%(ntPc3XhlpX5gPOAPOC?)5fCg+4`+KI%_4iTWlm0my&{l`obju2eglD fYp3x`(lO4pm>Q&6$ga~7`68eR;f@*j1qMC<-}p@O literal 0 HcmV?d00001 diff --git a/example/low_precision_training/ppcls/__init__.py b/example/low_precision_training/ppcls/__init__.py new file mode 100755 index 000000000..d6cdb6f8f --- /dev/null +++ b/example/low_precision_training/ppcls/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import optimizer + +from .arch import * +from .optimizer import * +from .data import * +from .utils import * diff --git a/example/low_precision_training/ppcls/arch/__init__.py b/example/low_precision_training/ppcls/arch/__init__.py new file mode 100755 index 000000000..798df62eb --- /dev/null +++ b/example/low_precision_training/ppcls/arch/__init__.py @@ -0,0 +1,177 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import copy +import importlib +import paddle.nn as nn +from paddle.jit import to_static +from paddle.static import InputSpec + +from . import backbone, gears +from .backbone import * +from .gears import build_gear, add_ml_decoder_head +from .utils import * +from .backbone.base.theseus_layer import TheseusLayer +from ..utils import logger +from ..utils.save_load import load_dygraph_pretrain +from .slim import prune_model, quantize_model +from .distill.afd_attention import LinearTransformStudent, LinearTransformTeacher + +__all__ = ["build_model", "RecModel", "DistillationModel", "AttentionModel"] + + +def build_model(config, mode="train"): + arch_config = copy.deepcopy(config["Arch"]) + model_type = arch_config.pop("name") + use_sync_bn = arch_config.pop("use_sync_bn", False) + use_ml_decoder = arch_config.pop("use_ml_decoder", False) + mod = importlib.import_module(__name__) + arch = getattr(mod, model_type)(**arch_config) + if use_sync_bn: + if config["Global"]["device"] == "gpu": + arch = nn.SyncBatchNorm.convert_sync_batchnorm(arch) + else: + msg = "SyncBatchNorm can only be used on GPU device. The releated setting has been ignored." 
+ logger.warning(msg) + + if use_ml_decoder: + add_ml_decoder_head(arch, config.get("MLDecoder", {})) + + if isinstance(arch, TheseusLayer): + prune_model(config, arch) + quantize_model(config, arch, mode) + + return arch + + +def apply_to_static(config, model, is_rec): + support_to_static = config['Global'].get('to_static', False) + + if support_to_static: + specs = None + if 'image_shape' in config['Global']: + specs = [InputSpec([None] + config['Global']['image_shape'])] + specs[0].stop_gradient = True + if is_rec: + specs.append(InputSpec([None, 1], 'int64', stop_gradient=True)) + model = to_static(model, input_spec=specs) + logger.info("Successfully to apply @to_static with specs: {}".format( + specs)) + return model + + +class RecModel(TheseusLayer): + def __init__(self, **config): + super().__init__() + backbone_config = config["Backbone"] + backbone_name = backbone_config.pop("name") + self.backbone = eval(backbone_name)(**backbone_config) + self.head_feature_from = config.get('head_feature_from', 'neck') + + if "BackboneStopLayer" in config: + backbone_stop_layer = config["BackboneStopLayer"]["name"] + self.backbone.stop_after(backbone_stop_layer) + + if "Neck" in config: + self.neck = build_gear(config["Neck"]) + else: + self.neck = None + + if "Head" in config: + self.head = build_gear(config["Head"]) + else: + self.head = None + + def forward(self, x, label=None): + + out = dict() + x = self.backbone(x) + out["backbone"] = x + if self.neck is not None: + feat = self.neck(x) + out["neck"] = feat + out["features"] = out['neck'] if self.neck else x + if self.head is not None: + if self.head_feature_from == 'backbone': + y = self.head(out['backbone'], label) + elif self.head_feature_from == 'neck': + y = self.head(out['features'], label) + out["logits"] = y + return out + + +class DistillationModel(nn.Layer): + def __init__(self, + models=None, + pretrained_list=None, + freeze_params_list=None, + **kargs): + super().__init__() + assert isinstance(models, list) + self.model_list = [] + self.model_name_list = [] + if pretrained_list is not None: + assert len(pretrained_list) == len(models) + + if freeze_params_list is None: + freeze_params_list = [False] * len(models) + assert len(freeze_params_list) == len(models) + for idx, model_config in enumerate(models): + assert len(model_config) == 1 + key = list(model_config.keys())[0] + model_config = model_config[key] + model_name = model_config.pop("name") + model = eval(model_name)(**model_config) + + if freeze_params_list[idx]: + for param in model.parameters(): + param.trainable = False + self.model_list.append(self.add_sublayer(key, model)) + self.model_name_list.append(key) + + if pretrained_list is not None: + for idx, pretrained in enumerate(pretrained_list): + if pretrained is not None: + load_dygraph_pretrain( + self.model_name_list[idx], path=pretrained) + + def forward(self, x, label=None): + result_dict = dict() + for idx, model_name in enumerate(self.model_name_list): + if label is None: + result_dict[model_name] = self.model_list[idx](x) + else: + result_dict[model_name] = self.model_list[idx](x, label) + return result_dict + + +class AttentionModel(DistillationModel): + def __init__(self, + models=None, + pretrained_list=None, + freeze_params_list=None, + **kargs): + super().__init__(models, pretrained_list, freeze_params_list, **kargs) + + def forward(self, x, label=None): + result_dict = dict() + out = x + for idx, model_name in enumerate(self.model_name_list): + if label is None: + out = self.model_list[idx](out) + 
result_dict.update(out) + else: + out = self.model_list[idx](out, label) + result_dict.update(out) + return result_dict \ No newline at end of file diff --git a/example/low_precision_training/ppcls/arch/backbone/__init__.py b/example/low_precision_training/ppcls/arch/backbone/__init__.py new file mode 100755 index 000000000..a080443b8 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/__init__.py @@ -0,0 +1,117 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import inspect + +from .legendary_models.mobilenet_v1 import MobileNetV1_x0_25, MobileNetV1_x0_5, MobileNetV1_x0_75, MobileNetV1 +from .legendary_models.mobilenet_v3 import MobileNetV3_small_x0_35, MobileNetV3_small_x0_5, MobileNetV3_small_x0_75, MobileNetV3_small_x1_0, MobileNetV3_small_x1_25, MobileNetV3_large_x0_35, MobileNetV3_large_x0_5, MobileNetV3_large_x0_75, MobileNetV3_large_x1_0, MobileNetV3_large_x1_25 +from .legendary_models.mobilenet_v4 import MobileNetV4_conv_small, MobileNetV4_conv_medium, MobileNetV4_conv_large, MobileNetV4_hybrid_medium, MobileNetV4_hybrid_large +from .model_zoo.fasternet import FasterNet_T0, FasterNet_T1, FasterNet_T2, FasterNet_S, FasterNet_M, FasterNet_L +from .model_zoo.starnet import StarNet_S1, StarNet_S2, StarNet_S3, StarNet_S4 +from .legendary_models.resnet import ResNet18, ResNet18_vd, ResNet34, ResNet34_vd, ResNet50, ResNet50_vd, ResNet101, ResNet101_vd, ResNet152, ResNet152_vd, ResNet200_vd +from .legendary_models.vgg import VGG11, VGG13, VGG16, VGG19 +from .legendary_models.inception_v3 import InceptionV3 +from .legendary_models.hrnet import HRNet_W18_C, HRNet_W30_C, HRNet_W32_C, HRNet_W40_C, HRNet_W44_C, HRNet_W48_C, HRNet_W60_C, HRNet_W64_C, SE_HRNet_W64_C +from .legendary_models.pp_lcnet import PPLCNetBaseNet, PPLCNet_x0_25, PPLCNet_x0_35, PPLCNet_x0_5, PPLCNet_x0_75, PPLCNet_x1_0, PPLCNet_x1_5, PPLCNet_x2_0, PPLCNet_x2_5 +from .legendary_models.pp_lcnet_v2 import PPLCNetV2_small, PPLCNetV2_base, PPLCNetV2_large +from .legendary_models.esnet import ESNet_x0_25, ESNet_x0_5, ESNet_x0_75, ESNet_x1_0 +from .legendary_models.pp_hgnet import PPHGNet_tiny, PPHGNet_small, PPHGNet_base +from .legendary_models.pp_hgnet_v2 import PPHGNetV2_B0, PPHGNetV2_B1, PPHGNetV2_B2, PPHGNetV2_B3, PPHGNetV2_B4, PPHGNetV2_B5, PPHGNetV2_B6 + +from .model_zoo.resnet_vc import ResNet50_vc +from .model_zoo.resnext import ResNeXt50_32x4d, ResNeXt50_64x4d, ResNeXt101_32x4d, ResNeXt101_64x4d, ResNeXt152_32x4d, ResNeXt152_64x4d +from .model_zoo.resnext_vd import ResNeXt50_vd_32x4d, ResNeXt50_vd_64x4d, ResNeXt101_vd_32x4d, ResNeXt101_vd_64x4d, ResNeXt152_vd_32x4d, ResNeXt152_vd_64x4d +from .model_zoo.res2net import Res2Net50_26w_4s, Res2Net50_14w_8s +from .model_zoo.res2net_vd import Res2Net50_vd_26w_4s, Res2Net101_vd_26w_4s, Res2Net200_vd_26w_4s +from .model_zoo.se_resnet_vd import SE_ResNet18_vd, SE_ResNet34_vd, SE_ResNet50_vd +from .model_zoo.se_resnext_vd import SE_ResNeXt50_vd_32x4d, SE_ResNeXt50_vd_32x4d, SENet154_vd 
+from .model_zoo.se_resnext import SE_ResNeXt50_32x4d, SE_ResNeXt101_32x4d, SE_ResNeXt152_64x4d +from .model_zoo.dpn import DPN68, DPN92, DPN98, DPN107, DPN131 +from .model_zoo.dsnet import DSNet_tiny, DSNet_small, DSNet_base +from .model_zoo.densenet import DenseNet121, DenseNet161, DenseNet169, DenseNet201, DenseNet264 +from .model_zoo.efficientnet import EfficientNetB0, EfficientNetB1, EfficientNetB2, EfficientNetB3, EfficientNetB4, EfficientNetB5, EfficientNetB6, EfficientNetB7, EfficientNetB0_small +from .model_zoo.efficientnet_v2 import EfficientNetV2_S +from .model_zoo.resnest import ResNeSt50_fast_1s1x64d, ResNeSt50, ResNeSt101, ResNeSt200, ResNeSt269 +from .model_zoo.googlenet import GoogLeNet +from .model_zoo.mobilenet_v2 import MobileNetV2_x0_25, MobileNetV2_x0_5, MobileNetV2_x0_75, MobileNetV2, MobileNetV2_x1_5, MobileNetV2_x2_0 +from .model_zoo.shufflenet_v2 import ShuffleNetV2_x0_25, ShuffleNetV2_x0_33, ShuffleNetV2_x0_5, ShuffleNetV2_x1_0, ShuffleNetV2_x1_5, ShuffleNetV2_x2_0, ShuffleNetV2_swish +from .model_zoo.ghostnet import GhostNet_x0_5, GhostNet_x1_0, GhostNet_x1_3 +from .model_zoo.alexnet import AlexNet +from .model_zoo.inception_v4 import InceptionV4 +from .model_zoo.xception import Xception41, Xception65, Xception71 +from .model_zoo.xception_deeplab import Xception41_deeplab, Xception65_deeplab +from .model_zoo.resnext101_wsl import ResNeXt101_32x8d_wsl, ResNeXt101_32x16d_wsl, ResNeXt101_32x32d_wsl, ResNeXt101_32x48d_wsl +from .model_zoo.squeezenet import SqueezeNet1_0, SqueezeNet1_1 +from .model_zoo.darknet import DarkNet53 +from .model_zoo.regnet import RegNetX_200MF, RegNetX_400MF, RegNetX_600MF, RegNetX_800MF, RegNetX_1600MF, RegNetX_3200MF, RegNetX_4GF, RegNetX_6400MF, RegNetX_8GF, RegNetX_12GF, RegNetX_16GF, RegNetX_32GF +from .model_zoo.vision_transformer import ViT_small_patch16_224, ViT_base_patch16_224, ViT_base_patch16_384, ViT_base_patch32_384, ViT_large_patch16_224, ViT_large_patch16_384, ViT_large_patch32_384 +from .model_zoo.distilled_vision_transformer import DeiT_tiny_patch16_224, DeiT_small_patch16_224, DeiT_base_patch16_224, DeiT_tiny_distilled_patch16_224, DeiT_small_distilled_patch16_224, DeiT_base_distilled_patch16_224, DeiT_base_patch16_384, DeiT_base_distilled_patch16_384 +from .legendary_models.swin_transformer import SwinTransformer_tiny_patch4_window7_224, SwinTransformer_small_patch4_window7_224, SwinTransformer_base_patch4_window7_224, SwinTransformer_base_patch4_window12_384, SwinTransformer_large_patch4_window7_224, SwinTransformer_large_patch4_window12_384 +from .model_zoo.swin_transformer_v2 import SwinTransformerV2_tiny_patch4_window8_256, SwinTransformerV2_small_patch4_window8_256, SwinTransformerV2_base_patch4_window8_256, SwinTransformerV2_tiny_patch4_window16_256, SwinTransformerV2_small_patch4_window16_256, SwinTransformerV2_base_patch4_window16_256, SwinTransformerV2_base_patch4_window24_384, SwinTransformerV2_large_patch4_window16_256, SwinTransformerV2_large_patch4_window24_384 +from .model_zoo.cswin_transformer import CSWinTransformer_tiny_224, CSWinTransformer_small_224, CSWinTransformer_base_224, CSWinTransformer_large_224, CSWinTransformer_base_384, CSWinTransformer_large_384 +from .model_zoo.mixnet import MixNet_S, MixNet_M, MixNet_L +from .model_zoo.rexnet import ReXNet_1_0, ReXNet_1_3, ReXNet_1_5, ReXNet_2_0, ReXNet_3_0 +from .model_zoo.twins import pcpvt_small, pcpvt_base, pcpvt_large, alt_gvt_small, alt_gvt_base, alt_gvt_large +from .model_zoo.levit import LeViT_128S, LeViT_128, LeViT_192, LeViT_256, LeViT_384 +from 
.model_zoo.dla import DLA34, DLA46_c, DLA46x_c, DLA60, DLA60x, DLA60x_c, DLA102, DLA102x, DLA102x2, DLA169 +from .model_zoo.rednet import RedNet26, RedNet38, RedNet50, RedNet101, RedNet152 +from .model_zoo.tnt import TNT_small, TNT_base +from .model_zoo.hardnet import HarDNet68, HarDNet85, HarDNet39_ds, HarDNet68_ds +from .model_zoo.cspnet import CSPDarkNet53 +from .model_zoo.pvt_v2 import PVT_V2_B0, PVT_V2_B1, PVT_V2_B2_Linear, PVT_V2_B2, PVT_V2_B3, PVT_V2_B4, PVT_V2_B5 +from .model_zoo.mobilevit import MobileViT_XXS, MobileViT_XS, MobileViT_S +from .model_zoo.repvgg import RepVGG_A0, RepVGG_A1, RepVGG_A2, RepVGG_B0, RepVGG_B1, RepVGG_B2, RepVGG_B1g2, RepVGG_B1g4, RepVGG_B2g4, RepVGG_B3, RepVGG_B3g4, RepVGG_D2se +from .model_zoo.van import VAN_B0, VAN_B1, VAN_B2, VAN_B3 +from .model_zoo.peleenet import PeleeNet +from .model_zoo.foundation_vit import CLIP_vit_base_patch32_224, CLIP_vit_base_patch16_224, CLIP_vit_large_patch14_336, CLIP_vit_large_patch14_224, BEiTv2_vit_base_patch16_224, BEiTv2_vit_large_patch16_224, CAE_vit_base_patch16_224, EVA_vit_giant_patch14, MOCOV3_vit_small, MOCOV3_vit_base, MAE_vit_huge_patch14, MAE_vit_large_patch16, MAE_vit_base_patch16 +from .model_zoo.convnext import ConvNeXt_tiny, ConvNeXt_small, ConvNeXt_base_224, ConvNeXt_base_384, ConvNeXt_large_224, ConvNeXt_large_384 +from .model_zoo.nextvit import NextViT_small_224, NextViT_base_224, NextViT_large_224, NextViT_small_384, NextViT_base_384, NextViT_large_384 +from .model_zoo.cae import cae_base_patch16_224, cae_large_patch16_224 +from .model_zoo.cvt import CvT_13_224, CvT_13_384, CvT_21_224, CvT_21_384, CvT_W24_384 +from .model_zoo.micronet import MicroNet_M0, MicroNet_M1, MicroNet_M2, MicroNet_M3 +from .model_zoo.mobilenext import MobileNeXt_x0_35, MobileNeXt_x0_5, MobileNeXt_x0_75, MobileNeXt_x1_0, MobileNeXt_x1_4 +from .model_zoo.mobilevit_v2 import MobileViTV2_x0_5, MobileViTV2_x0_75, MobileViTV2_x1_0, MobileViTV2_x1_25, MobileViTV2_x1_5, MobileViTV2_x1_75, MobileViTV2_x2_0 +from .model_zoo.tinynet import TinyNet_A, TinyNet_B, TinyNet_C, TinyNet_D, TinyNet_E +from .model_zoo.mobilevit_v3 import MobileViTV3_XXS, MobileViTV3_XS, MobileViTV3_S, MobileViTV3_XXS_L2, MobileViTV3_XS_L2, MobileViTV3_S_L2, MobileViTV3_x0_5, MobileViTV3_x0_75, MobileViTV3_x1_0 +from .model_zoo.svtrnet import SVTR_tiny, SVTR_base, SVTR_large + +from .variant_models.resnet_variant import ResNet50_last_stage_stride1 +from .variant_models.resnet_variant import ResNet50_adaptive_max_pool2d +from .variant_models.resnet_variant import ResNet50_metabin +from .variant_models.vgg_variant import VGG19Sigmoid +from .variant_models.pp_lcnet_variant import PPLCNet_x2_5_Tanh +from .variant_models.pp_lcnetv2_variant import PPLCNetV2_base_ShiTu +from .variant_models.efficientnet_variant import EfficientNetB3_watermark +from .variant_models.foundation_vit_variant import CLIP_large_patch14_224_aesthetic +from .variant_models.swin_transformer_variant import SwinTransformer_tiny_patch4_window7_224_SOLIDER, SwinTransformer_small_patch4_window7_224_SOLIDER, SwinTransformer_base_patch4_window7_224_SOLIDER +from .model_zoo.adaface_ir_net import AdaFace_IR_18, AdaFace_IR_34, AdaFace_IR_50, AdaFace_IR_101, AdaFace_IR_152, AdaFace_IR_SE_50, AdaFace_IR_SE_101, AdaFace_IR_SE_152, AdaFace_IR_SE_200 +from .model_zoo.wideresnet import WideResNet +from .model_zoo.uniformer import UniFormer_small, UniFormer_small_plus, UniFormer_small_plus_dim64, UniFormer_base, UniFormer_base_ls + + +# help whl get all the models' api (class type) and components' api (func type) 
+def get_apis(): + current_func = sys._getframe().f_code.co_name + current_module = sys.modules[__name__] + api = [] + for _, obj in inspect.getmembers(current_module, + inspect.isclass) + inspect.getmembers( + current_module, inspect.isfunction): + api.append(obj.__name__) + api.remove(current_func) + return api + + +__all__ = get_apis() diff --git a/example/low_precision_training/ppcls/arch/backbone/base/__init__.py b/example/low_precision_training/ppcls/arch/backbone/base/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/example/low_precision_training/ppcls/arch/backbone/base/dbb/dbb_block.py b/example/low_precision_training/ppcls/arch/backbone/base/dbb/dbb_block.py new file mode 100755 index 000000000..f38c5c257 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/base/dbb/dbb_block.py @@ -0,0 +1,365 @@ +# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/2103.13425, https://github.com/DingXiaoH/DiverseBranchBlock + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +from .dbb_transforms import * + + +def conv_bn(in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros'): + conv_layer = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=False, + padding_mode=padding_mode) + bn_layer = nn.BatchNorm2D(num_features=out_channels) + se = nn.Sequential() + se.add_sublayer('conv', conv_layer) + se.add_sublayer('bn', bn_layer) + return se + + +class IdentityBasedConv1x1(nn.Conv2D): + def __init__(self, channels, groups=1): + super(IdentityBasedConv1x1, self).__init__( + in_channels=channels, + out_channels=channels, + kernel_size=1, + stride=1, + padding=0, + groups=groups, + bias_attr=False) + + assert channels % groups == 0 + input_dim = channels // groups + id_value = np.zeros((channels, input_dim, 1, 1)) + for i in range(channels): + id_value[i, i % input_dim, 0, 0] = 1 + self.id_tensor = paddle.to_tensor(id_value) + self.weight.set_value(paddle.zeros_like(self.weight)) + + def forward(self, input): + kernel = self.weight + self.id_tensor + result = F.conv2d( + input, + kernel, + None, + stride=1, + padding=0, + dilation=self._dilation, + groups=self._groups) + return result + + def get_actual_kernel(self): + return self.weight + self.id_tensor + + +class BNAndPad(nn.Layer): + def __init__(self, + pad_pixels, + num_features, + epsilon=1e-5, + momentum=0.1, + last_conv_bias=None, + bn=nn.BatchNorm2D): + super().__init__() + self.bn = bn(num_features, momentum=momentum, epsilon=epsilon) + self.pad_pixels = pad_pixels + self.last_conv_bias = last_conv_bias + + def forward(self, input): + output = self.bn(input) + if self.pad_pixels > 0: + bias = -self.bn._mean + if self.last_conv_bias is not None: + bias += self.last_conv_bias + 
pad_values = self.bn.bias + self.bn.weight * ( + bias / paddle.sqrt(self.bn._variance + self.bn._epsilon)) + ''' pad ''' + # TODO: n,h,w,c format is not supported yet + n, c, h, w = output.shape + values = pad_values.reshape([1, -1, 1, 1]) + w_values = values.expand([n, -1, self.pad_pixels, w]) + x = paddle.concat([w_values, output, w_values], axis=2) + h = h + self.pad_pixels * 2 + h_values = values.expand([n, -1, h, self.pad_pixels]) + x = paddle.concat([h_values, x, h_values], axis=3) + output = x + return output + + @property + def weight(self): + return self.bn.weight + + @property + def bias(self): + return self.bn.bias + + @property + def _mean(self): + return self.bn._mean + + @property + def _variance(self): + return self.bn._variance + + @property + def _epsilon(self): + return self.bn._epsilon + + +class DiverseBranchBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + is_repped=False, + single_init=False, + **kwargs): + super().__init__() + + padding = (filter_size - 1) // 2 + dilation = 1 + + in_channels = num_channels + out_channels = num_filters + kernel_size = filter_size + internal_channels_1x1_3x3 = None + nonlinear = act + + self.is_repped = is_repped + + if nonlinear is None: + self.nonlinear = nn.Identity() + else: + self.nonlinear = nn.ReLU() + + self.kernel_size = kernel_size + self.out_channels = out_channels + self.groups = groups + assert padding == kernel_size // 2 + + if is_repped: + self.dbb_reparam = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=True) + else: + self.dbb_origin = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups) + + self.dbb_avg = nn.Sequential() + if groups < out_channels: + self.dbb_avg.add_sublayer( + 'conv', + nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + groups=groups, + bias_attr=False)) + self.dbb_avg.add_sublayer( + 'bn', + BNAndPad( + pad_pixels=padding, num_features=out_channels)) + self.dbb_avg.add_sublayer( + 'avg', + nn.AvgPool2D( + kernel_size=kernel_size, stride=stride, padding=0)) + self.dbb_1x1 = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=0, + groups=groups) + else: + self.dbb_avg.add_sublayer( + 'avg', + nn.AvgPool2D( + kernel_size=kernel_size, + stride=stride, + padding=padding)) + + self.dbb_avg.add_sublayer('avgbn', nn.BatchNorm2D(out_channels)) + + if internal_channels_1x1_3x3 is None: + internal_channels_1x1_3x3 = in_channels if groups < out_channels else 2 * in_channels # For mobilenet, it is better to have 2X internal channels + + self.dbb_1x1_kxk = nn.Sequential() + if internal_channels_1x1_3x3 == in_channels: + self.dbb_1x1_kxk.add_sublayer( + 'idconv1', + IdentityBasedConv1x1( + channels=in_channels, groups=groups)) + else: + self.dbb_1x1_kxk.add_sublayer( + 'conv1', + nn.Conv2D( + in_channels=in_channels, + out_channels=internal_channels_1x1_3x3, + kernel_size=1, + stride=1, + padding=0, + groups=groups, + bias_attr=False)) + self.dbb_1x1_kxk.add_sublayer( + 'bn1', + BNAndPad( + pad_pixels=padding, + num_features=internal_channels_1x1_3x3)) + self.dbb_1x1_kxk.add_sublayer( + 'conv2', + nn.Conv2D( + in_channels=internal_channels_1x1_3x3, + out_channels=out_channels, + 
kernel_size=kernel_size, + stride=stride, + padding=0, + groups=groups, + bias_attr=False)) + self.dbb_1x1_kxk.add_sublayer('bn2', nn.BatchNorm2D(out_channels)) + + # The experiments reported in the paper used the default initialization of bn.weight (all as 1). But changing the initialization may be useful in some cases. + if single_init: + # Initialize the bn.weight of dbb_origin as 1 and others as 0. This is not the default setting. + self.single_init() + + def forward(self, inputs): + if self.is_repped: + return self.nonlinear(self.dbb_reparam(inputs)) + + out = self.dbb_origin(inputs) + if hasattr(self, 'dbb_1x1'): + out += self.dbb_1x1(inputs) + out += self.dbb_avg(inputs) + out += self.dbb_1x1_kxk(inputs) + return self.nonlinear(out) + + def init_gamma(self, gamma_value): + if hasattr(self, "dbb_origin"): + paddle.nn.init.constant_(self.dbb_origin.bn.weight, gamma_value) + if hasattr(self, "dbb_1x1"): + paddle.nn.init.constant_(self.dbb_1x1.bn.weight, gamma_value) + if hasattr(self, "dbb_avg"): + paddle.nn.init.constant_(self.dbb_avg.avgbn.weight, gamma_value) + if hasattr(self, "dbb_1x1_kxk"): + paddle.nn.init.constant_(self.dbb_1x1_kxk.bn2.weight, gamma_value) + + def single_init(self): + self.init_gamma(0.0) + if hasattr(self, "dbb_origin"): + paddle.nn.init.constant_(self.dbb_origin.bn.weight, 1.0) + + def get_equivalent_kernel_bias(self): + k_origin, b_origin = transI_fusebn(self.dbb_origin.conv.weight, + self.dbb_origin.bn) + + if hasattr(self, 'dbb_1x1'): + k_1x1, b_1x1 = transI_fusebn(self.dbb_1x1.conv.weight, + self.dbb_1x1.bn) + k_1x1 = transVI_multiscale(k_1x1, self.kernel_size) + else: + k_1x1, b_1x1 = 0, 0 + + if hasattr(self.dbb_1x1_kxk, 'idconv1'): + k_1x1_kxk_first = self.dbb_1x1_kxk.idconv1.get_actual_kernel() + else: + k_1x1_kxk_first = self.dbb_1x1_kxk.conv1.weight + k_1x1_kxk_first, b_1x1_kxk_first = transI_fusebn(k_1x1_kxk_first, + self.dbb_1x1_kxk.bn1) + k_1x1_kxk_second, b_1x1_kxk_second = transI_fusebn( + self.dbb_1x1_kxk.conv2.weight, self.dbb_1x1_kxk.bn2) + k_1x1_kxk_merged, b_1x1_kxk_merged = transIII_1x1_kxk( + k_1x1_kxk_first, + b_1x1_kxk_first, + k_1x1_kxk_second, + b_1x1_kxk_second, + groups=self.groups) + + k_avg = transV_avg(self.out_channels, self.kernel_size, self.groups) + k_1x1_avg_second, b_1x1_avg_second = transI_fusebn(k_avg, + self.dbb_avg.avgbn) + if hasattr(self.dbb_avg, 'conv'): + k_1x1_avg_first, b_1x1_avg_first = transI_fusebn( + self.dbb_avg.conv.weight, self.dbb_avg.bn) + k_1x1_avg_merged, b_1x1_avg_merged = transIII_1x1_kxk( + k_1x1_avg_first, + b_1x1_avg_first, + k_1x1_avg_second, + b_1x1_avg_second, + groups=self.groups) + else: + k_1x1_avg_merged, b_1x1_avg_merged = k_1x1_avg_second, b_1x1_avg_second + + return transII_addbranch( + (k_origin, k_1x1, k_1x1_kxk_merged, k_1x1_avg_merged), + (b_origin, b_1x1, b_1x1_kxk_merged, b_1x1_avg_merged)) + + def re_parameterize(self): + if self.is_repped: + return + + kernel, bias = self.get_equivalent_kernel_bias() + self.dbb_reparam = nn.Conv2D( + in_channels=self.dbb_origin.conv._in_channels, + out_channels=self.dbb_origin.conv._out_channels, + kernel_size=self.dbb_origin.conv._kernel_size, + stride=self.dbb_origin.conv._stride, + padding=self.dbb_origin.conv._padding, + dilation=self.dbb_origin.conv._dilation, + groups=self.dbb_origin.conv._groups, + bias_attr=True) + + self.dbb_reparam.weight.set_value(kernel) + self.dbb_reparam.bias.set_value(bias) + + self.__delattr__('dbb_origin') + self.__delattr__('dbb_avg') + if hasattr(self, 'dbb_1x1'): + self.__delattr__('dbb_1x1') + 
self.__delattr__('dbb_1x1_kxk')
+        self.is_repped = True
diff --git a/example/low_precision_training/ppcls/arch/backbone/base/dbb/dbb_transforms.py b/example/low_precision_training/ppcls/arch/backbone/base/dbb/dbb_transforms.py
new file mode 100755
index 000000000..70f55fb09
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/backbone/base/dbb/dbb_transforms.py
@@ -0,0 +1,73 @@
+# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# reference: https://arxiv.org/abs/2103.13425, https://github.com/DingXiaoH/DiverseBranchBlock
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+
+
+def transI_fusebn(kernel, bn):
+    gamma = bn.weight
+    std = (bn._variance + bn._epsilon).sqrt()
+    return kernel * (
+        (gamma / std).reshape([-1, 1, 1, 1])), bn.bias - bn._mean * gamma / std
+
+
+def transII_addbranch(kernels, biases):
+    return sum(kernels), sum(biases)
+
+
+def transIII_1x1_kxk(k1, b1, k2, b2, groups):
+    if groups == 1:
+        k = F.conv2d(k2, k1.transpose([1, 0, 2, 3]))
+        b_hat = (k2 * b1.reshape([1, -1, 1, 1])).sum((1, 2, 3))
+    else:
+        k_slices = []
+        b_slices = []
+        k1_T = k1.transpose([1, 0, 2, 3])
+        k1_group_width = k1.shape[0] // groups
+        k2_group_width = k2.shape[0] // groups
+        for g in range(groups):
+            k1_T_slice = k1_T[:, g * k1_group_width:(g + 1) *
+                              k1_group_width, :, :]
+            k2_slice = k2[g * k2_group_width:(g + 1) * k2_group_width, :, :, :]
+            k_slices.append(F.conv2d(k2_slice, k1_T_slice))
+            b_slices.append((k2_slice * b1[g * k1_group_width:(
+                g + 1) * k1_group_width].reshape([1, -1, 1, 1])).sum((1, 2, 3)))
+        k, b_hat = transIV_depthconcat(k_slices, b_slices)
+    return k, b_hat + b2
+
+
+def transIV_depthconcat(kernels, biases):
+    # paddle has no `cat`; use `concat` to join the per-group slices
+    return paddle.concat(kernels, axis=0), paddle.concat(biases)
+
+
+def transV_avg(channels, kernel_size, groups):
+    input_dim = channels // groups
+    k = paddle.zeros((channels, input_dim, kernel_size, kernel_size))
+    k[np.arange(channels), np.tile(np.arange(input_dim),
+                                   groups), :, :] = 1.0 / kernel_size**2
+    return k
+
+
+# This has not been tested with non-square kernels (kernel.shape[2] != kernel.shape[3]) nor even-size kernels
+def transVI_multiscale(kernel, target_kernel_size):
+    H_pixels_to_pad = (target_kernel_size - kernel.shape[2]) // 2
+    W_pixels_to_pad = (target_kernel_size - kernel.shape[3]) // 2
+    return F.pad(
+        kernel,
+        [H_pixels_to_pad, H_pixels_to_pad, W_pixels_to_pad, W_pixels_to_pad])
diff --git a/example/low_precision_training/ppcls/arch/backbone/base/theseus_layer.py b/example/low_precision_training/ppcls/arch/backbone/base/theseus_layer.py
new file mode 100755
index 000000000..30c5a9f8b
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/backbone/base/theseus_layer.py
@@ -0,0 +1,398 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Tuple, List, Dict, Union, Callable, Any + +from paddle import nn +from ....utils import logger + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, inputs): + return inputs + + +class TheseusLayer(nn.Layer): + def __init__(self, *args, **kwargs): + super().__init__() + self.res_dict = {} + self.res_name = self.full_name() + self.pruner = None + self.quanter = None + + self.init_net(*args, **kwargs) + + def _return_dict_hook(self, layer, input, output): + res_dict = {"logits": output} + # 'list' is needed to avoid error raised by popping self.res_dict + for res_key in list(self.res_dict): + # clear the res_dict because the forward process may change according to input + res_dict[res_key] = self.res_dict.pop(res_key) + return res_dict + + def init_net(self, + stages_pattern=None, + return_patterns=None, + return_stages=None, + freeze_befor=None, + stop_after=None, + *args, + **kwargs): + # init the output of net + if return_patterns or return_stages: + if return_patterns and return_stages: + msg = f"The 'return_patterns' would be ignored when 'return_stages' is set." + logger.warning(msg) + return_stages = None + + if return_stages is True: + return_patterns = stages_pattern + + # return_stages is int or bool + if type(return_stages) is int: + return_stages = [return_stages] + if isinstance(return_stages, list): + if max(return_stages) > len(stages_pattern) or min( + return_stages) < 0: + msg = f"The 'return_stages' set error. Illegal value(s) have been ignored. The stages' pattern list is {stages_pattern}." + logger.warning(msg) + return_stages = [ + val for val in return_stages + if val >= 0 and val < len(stages_pattern) + ] + return_patterns = [stages_pattern[i] for i in return_stages] + + if return_patterns: + # call update_res function after the __init__ of the object has completed execution, that is, the contructing of layer or model has been completed. + def update_res_hook(layer, input): + self.update_res(return_patterns) + + self.register_forward_pre_hook(update_res_hook) + + # freeze subnet + if freeze_befor is not None: + self.freeze_befor(freeze_befor) + + # set subnet to Identity + if stop_after is not None: + self.stop_after(stop_after) + + def init_res(self, + stages_pattern, + return_patterns=None, + return_stages=None): + msg = "\"init_res\" will be deprecated, please use \"init_net\" instead." + logger.warning(DeprecationWarning(msg)) + + if return_patterns and return_stages: + msg = f"The 'return_patterns' would be ignored when 'return_stages' is set." + logger.warning(msg) + return_stages = None + + if return_stages is True: + return_patterns = stages_pattern + # return_stages is int or bool + if type(return_stages) is int: + return_stages = [return_stages] + if isinstance(return_stages, list): + if max(return_stages) > len(stages_pattern) or min( + return_stages) < 0: + msg = f"The 'return_stages' set error. Illegal value(s) have been ignored. The stages' pattern list is {stages_pattern}." 
+ logger.warning(msg) + return_stages = [ + val for val in return_stages + if val >= 0 and val < len(stages_pattern) + ] + return_patterns = [stages_pattern[i] for i in return_stages] + + if return_patterns: + self.update_res(return_patterns) + + def replace_sub(self, *args, **kwargs) -> None: + msg = "The function 'replace_sub()' is deprecated, please use 'upgrade_sublayer()' instead." + logger.error(DeprecationWarning(msg)) + raise DeprecationWarning(msg) + + def upgrade_sublayer(self, + layer_name_pattern: Union[str, List[str]], + handle_func: Callable[[nn.Layer, str], nn.Layer] + ) -> Dict[str, nn.Layer]: + """use 'handle_func' to modify the sub-layer(s) specified by 'layer_name_pattern'. + + Args: + layer_name_pattern (Union[str, List[str]]): The name of layer to be modified by 'handle_func'. + handle_func (Callable[[nn.Layer, str], nn.Layer]): The function to modify target layer specified by 'layer_name_pattern'. The formal params are the layer(nn.Layer) and pattern(str) that is (a member of) layer_name_pattern (when layer_name_pattern is List type). And the return is the layer processed. + + Returns: + Dict[str, nn.Layer]: The key is the pattern and corresponding value is the result returned by 'handle_func()'. + + Examples: + + from paddle import nn + import paddleclas + + def rep_func(layer: nn.Layer, pattern: str): + new_layer = nn.Conv2D( + in_channels=layer._in_channels, + out_channels=layer._out_channels, + kernel_size=5, + padding=2 + ) + return new_layer + + net = paddleclas.MobileNetV1() + res = net.upgrade_sublayer(layer_name_pattern=["blocks[11].depthwise_conv.conv", "blocks[12].depthwise_conv.conv"], handle_func=rep_func) + print(res) + # {'blocks[11].depthwise_conv.conv': the corresponding new_layer, 'blocks[12].depthwise_conv.conv': the corresponding new_layer} + """ + + if not isinstance(layer_name_pattern, list): + layer_name_pattern = [layer_name_pattern] + + hit_layer_pattern_list = [] + for pattern in layer_name_pattern: + # parse pattern to find target layer and its parent + layer_list = parse_pattern_str(pattern=pattern, parent_layer=self) + if not layer_list: + continue + + sub_layer_parent = layer_list[-2]["layer"] if len( + layer_list) > 1 else self + sub_layer = layer_list[-1]["layer"] + sub_layer_name = layer_list[-1]["name"] + sub_layer_index_list = layer_list[-1]["index_list"] + + new_sub_layer = handle_func(sub_layer, pattern) + + if sub_layer_index_list: + if len(sub_layer_index_list) > 1: + sub_layer_parent = getattr( + sub_layer_parent, + sub_layer_name)[sub_layer_index_list[0]] + for sub_layer_index in sub_layer_index_list[1:-1]: + sub_layer_parent = sub_layer_parent[sub_layer_index] + sub_layer_parent[sub_layer_index_list[-1]] = new_sub_layer + else: + getattr(sub_layer_parent, sub_layer_name)[ + sub_layer_index_list[0]] = new_sub_layer + else: + setattr(sub_layer_parent, sub_layer_name, new_sub_layer) + + hit_layer_pattern_list.append(pattern) + return hit_layer_pattern_list + + def stop_after(self, stop_layer_name: str) -> bool: + """stop forward and backward after 'stop_layer_name'. + + Args: + stop_layer_name (str): The name of layer that stop forward and backward after this layer. + + Returns: + bool: 'True' if successful, 'False' otherwise. 
+ """ + + layer_list = parse_pattern_str(stop_layer_name, self) + if not layer_list: + return False + + parent_layer = self + for layer_dict in layer_list: + name, index_list = layer_dict["name"], layer_dict["index_list"] + if not set_identity(parent_layer, name, index_list): + msg = f"Failed to set the layers that after stop_layer_name('{stop_layer_name}') to IdentityLayer. The error layer's name is '{name}'." + logger.warning(msg) + return False + parent_layer = layer_dict["layer"] + + return True + + def freeze_befor(self, layer_name: str) -> bool: + """freeze the layer named layer_name and its previous layer. + + Args: + layer_name (str): The name of layer that would be freezed. + + Returns: + bool: 'True' if successful, 'False' otherwise. + """ + + def stop_grad(layer, pattern): + class StopGradLayer(nn.Layer): + def __init__(self): + super().__init__() + self.layer = layer + + def forward(self, x): + x = self.layer(x) + x.stop_gradient = True + return x + + new_layer = StopGradLayer() + return new_layer + + res = self.upgrade_sublayer(layer_name, stop_grad) + if len(res) == 0: + msg = "Failed to stop the gradient befor the layer named '{layer_name}'" + logger.warning(msg) + return False + return True + + def update_res( + self, + return_patterns: Union[str, List[str]]) -> Dict[str, nn.Layer]: + """update the result(s) to be returned. + + Args: + return_patterns (Union[str, List[str]]): The name of layer to return output. + + Returns: + Dict[str, nn.Layer]: The pattern(str) and corresponding layer(nn.Layer) that have been set successfully. + """ + + # clear res_dict that could have been set + self.res_dict = {} + + class Handler(object): + def __init__(self, res_dict): + # res_dict is a reference + self.res_dict = res_dict + + def __call__(self, layer, pattern): + layer.res_dict = self.res_dict + layer.res_name = pattern + if hasattr(layer, "hook_remove_helper"): + layer.hook_remove_helper.remove() + layer.hook_remove_helper = layer.register_forward_post_hook( + save_sub_res_hook) + return layer + + handle_func = Handler(self.res_dict) + + hit_layer_pattern_list = self.upgrade_sublayer( + return_patterns, handle_func=handle_func) + + if hasattr(self, "hook_remove_helper"): + self.hook_remove_helper.remove() + self.hook_remove_helper = self.register_forward_post_hook( + self._return_dict_hook) + + return hit_layer_pattern_list + + +def save_sub_res_hook(layer, input, output): + layer.res_dict[layer.res_name] = output + + +def set_identity(parent_layer: nn.Layer, + layer_name: str, + layer_index_list: str=None) -> bool: + """set the layer specified by layer_name and layer_index_list to Indentity. + + Args: + parent_layer (nn.Layer): The parent layer of target layer specified by layer_name and layer_index_list. + layer_name (str): The name of target layer to be set to Indentity. + layer_index_list (str, optional): The index of target layer to be set to Indentity in parent_layer. Defaults to None. + + Returns: + bool: True if successfully, False otherwise. 
+ """ + + stop_after = False + for sub_layer_name in parent_layer._sub_layers: + if stop_after: + parent_layer._sub_layers[sub_layer_name] = Identity() + continue + if sub_layer_name == layer_name: + stop_after = True + + if layer_index_list and stop_after: + layer_container = parent_layer._sub_layers[layer_name] + for num, layer_index in enumerate(layer_index_list): + stop_after = False + for i in range(num): + layer_container = layer_container[layer_index_list[i]] + for sub_layer_index in layer_container._sub_layers: + if stop_after: + parent_layer._sub_layers[layer_name][ + sub_layer_index] = Identity() + continue + if layer_index == sub_layer_index: + stop_after = True + + return stop_after + + +def parse_pattern_str(pattern: str, parent_layer: nn.Layer) -> Union[ + None, List[Dict[str, Union[nn.Layer, str, None]]]]: + """parse the string type pattern. + + Args: + pattern (str): The pattern to discribe layer. + parent_layer (nn.Layer): The root layer relative to the pattern. + + Returns: + Union[None, List[Dict[str, Union[nn.Layer, str, None]]]]: None if failed. If successfully, the members are layers parsed in order: + [ + {"layer": first layer, "name": first layer's name parsed, "index": first layer's index parsed if exist}, + {"layer": second layer, "name": second layer's name parsed, "index": second layer's index parsed if exist}, + ... + ] + """ + + pattern_list = pattern.split(".") + if not pattern_list: + msg = f"The pattern('{pattern}') is illegal. Please check and retry." + logger.warning(msg) + return None + + layer_list = [] + while len(pattern_list) > 0: + if '[' in pattern_list[0]: + target_layer_name = pattern_list[0].split('[')[0] + target_layer_index_list = list( + index.split(']')[0] + for index in pattern_list[0].split('[')[1:]) + else: + target_layer_name = pattern_list[0] + target_layer_index_list = None + + target_layer = getattr(parent_layer, target_layer_name, None) + + if target_layer is None: + msg = f"Not found layer named('{target_layer_name}') specifed in pattern('{pattern}')." + logger.warning(msg) + return None + + if target_layer_index_list: + for target_layer_index in target_layer_index_list: + if int(target_layer_index) < 0 or int( + target_layer_index) >= len(target_layer): + msg = f"Not found layer by index('{target_layer_index}') specifed in pattern('{pattern}'). The index should < {len(target_layer)} and > 0." 
+ logger.warning(msg) + return None + target_layer = target_layer[target_layer_index] + + layer_list.append({ + "layer": target_layer, + "name": target_layer_name, + "index_list": target_layer_index_list + }) + + pattern_list = pattern_list[1:] + parent_layer = target_layer + + return layer_list diff --git a/example/low_precision_training/ppcls/arch/backbone/legendary_models/__init__.py b/example/low_precision_training/ppcls/arch/backbone/legendary_models/__init__.py new file mode 100755 index 000000000..4a4d48e71 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/legendary_models/__init__.py @@ -0,0 +1,8 @@ +from .resnet import ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, ResNet18_vd, ResNet34_vd, ResNet50_vd, ResNet101_vd, ResNet152_vd +from .hrnet import HRNet_W18_C, HRNet_W30_C, HRNet_W32_C, HRNet_W40_C, HRNet_W44_C, HRNet_W48_C, HRNet_W64_C +from .mobilenet_v1 import MobileNetV1_x0_25, MobileNetV1_x0_5, MobileNetV1_x0_75, MobileNetV1 +from .mobilenet_v3 import MobileNetV3_small_x0_35, MobileNetV3_small_x0_5, MobileNetV3_small_x0_75, MobileNetV3_small_x1_0, MobileNetV3_small_x1_25, MobileNetV3_large_x0_35, MobileNetV3_large_x0_5, MobileNetV3_large_x0_75, MobileNetV3_large_x1_0, MobileNetV3_large_x1_25 +from .mobilenet_v4 import MobileNetV4_conv_small, MobileNetV4_conv_medium, MobileNetV4_conv_large, MobileNetV4_hybrid_medium, MobileNetV4_hybrid_large +from .inception_v3 import InceptionV3 +from .vgg import VGG11, VGG13, VGG16, VGG19 +from .pp_lcnet import PPLCNetBaseNet, PPLCNet_x0_25, PPLCNet_x0_35, PPLCNet_x0_5, PPLCNet_x0_75, PPLCNet_x1_0, PPLCNet_x1_5, PPLCNet_x2_0, PPLCNet_x2_5 diff --git a/example/low_precision_training/ppcls/arch/backbone/legendary_models/custom_devices_layers.py b/example/low_precision_training/ppcls/arch/backbone/legendary_models/custom_devices_layers.py new file mode 100755 index 000000000..547646b96 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/legendary_models/custom_devices_layers.py @@ -0,0 +1,37 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +__all__ = ["AdaptiveAvgPool2D"] + + +class AdaptiveAvgPool2D(nn.AdaptiveAvgPool2D): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if paddle.device.get_device().startswith("npu"): + self.device = "npu" + else: + self.device = None + + if isinstance(self._output_size, int) and self._output_size == 1: + self._gap = True + elif isinstance(self._output_size, tuple) and self._output_size[ + 0] == 1 and self._output_size[1] == 1: + self._gap = True + else: + self._gap = False + + def forward(self, x): + if self.device == "npu" and self._gap: + # Global Average Pooling + N, C, _, _ = x.shape + x_mean = paddle.mean(x, axis=[2, 3]) + x_mean = paddle.reshape(x_mean, [N, C, 1, 1]) + return x_mean + else: + return F.adaptive_avg_pool2d( + x, + output_size=self._output_size, + data_format=self._data_format, + name=self._name, ) diff --git a/example/low_precision_training/ppcls/arch/backbone/legendary_models/esnet.py b/example/low_precision_training/ppcls/arch/backbone/legendary_models/esnet.py new file mode 100755 index 000000000..8d2872eae --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/legendary_models/esnet.py @@ -0,0 +1,369 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function +import math +import paddle +from paddle import ParamAttr, reshape, transpose, concat, split +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D +from paddle.nn.initializer import KaimingNormal +from paddle.regularizer import L2Decay + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ESNet_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ESNet_x0_25_pretrained.pdparams", + "ESNet_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ESNet_x0_5_pretrained.pdparams", + "ESNet_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ESNet_x0_75_pretrained.pdparams", + "ESNet_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ESNet_x1_0_pretrained.pdparams", +} + +MODEL_STAGES_PATTERN = {"ESNet": ["blocks[2]", "blocks[9]", "blocks[12]"]} + +__all__ = list(MODEL_URLS.keys()) + + +def channel_shuffle(x, groups): + batch_size, num_channels, height, width = x.shape[0:4] + channels_per_group = num_channels // groups + x = reshape( + x=x, shape=[batch_size, groups, channels_per_group, height, width]) + x = transpose(x=x, perm=[0, 2, 1, 3, 4]) + x = reshape(x=x, shape=[batch_size, num_channels, height, width]) + return x + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + if_act=True): + super().__init__() + self.conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self.bn = BatchNorm( + out_channels, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.if_act = if_act + self.hardswish = nn.Hardswish() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + x = self.hardswish(x) + return x + + +class SEModule(TheseusLayer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = nn.Hardsigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class ESBlock1(TheseusLayer): + def __init__(self, 
in_channels, out_channels): + super().__init__() + self.pw_1_1 = ConvBNLayer( + in_channels=in_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1) + self.dw_1 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=3, + stride=1, + groups=out_channels // 2, + if_act=False) + self.se = SEModule(out_channels) + + self.pw_1_2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1) + + def forward(self, x): + x1, x2 = split( + x, num_or_sections=[x.shape[1] // 2, x.shape[1] // 2], axis=1) + x2 = self.pw_1_1(x2) + x3 = self.dw_1(x2) + x3 = concat([x2, x3], axis=1) + x3 = self.se(x3) + x3 = self.pw_1_2(x3) + x = concat([x1, x3], axis=1) + return channel_shuffle(x, 2) + + +class ESBlock2(TheseusLayer): + def __init__(self, in_channels, out_channels): + super().__init__() + + # branch1 + self.dw_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=2, + groups=in_channels, + if_act=False) + self.pw_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1) + # branch2 + self.pw_2_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels // 2, + kernel_size=1) + self.dw_2 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=3, + stride=2, + groups=out_channels // 2, + if_act=False) + self.se = SEModule(out_channels // 2) + self.pw_2_2 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=1) + self.concat_dw = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + groups=out_channels) + self.concat_pw = ConvBNLayer( + in_channels=out_channels, out_channels=out_channels, kernel_size=1) + + def forward(self, x): + x1 = self.dw_1(x) + x1 = self.pw_1(x1) + x2 = self.pw_2_1(x) + x2 = self.dw_2(x2) + x2 = self.se(x2) + x2 = self.pw_2_2(x2) + x = concat([x1, x2], axis=1) + x = self.concat_dw(x) + x = self.concat_pw(x) + return x + + +class ESNet(TheseusLayer): + def __init__(self, + stages_pattern, + class_num=1000, + scale=1.0, + dropout_prob=0.2, + class_expand=1280, + return_patterns=None, + return_stages=None): + super().__init__() + self.scale = scale + self.class_num = class_num + self.class_expand = class_expand + stage_repeats = [3, 7, 3] + stage_out_channels = [ + -1, 24, make_divisible(116 * scale), make_divisible(232 * scale), + make_divisible(464 * scale), 1024 + ] + + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=stage_out_channels[1], + kernel_size=3, + stride=2) + self.max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) + + block_list = [] + for stage_id, num_repeat in enumerate(stage_repeats): + for i in range(num_repeat): + if i == 0: + block = ESBlock2( + in_channels=stage_out_channels[stage_id + 1], + out_channels=stage_out_channels[stage_id + 2]) + else: + block = ESBlock1( + in_channels=stage_out_channels[stage_id + 2], + out_channels=stage_out_channels[stage_id + 2]) + block_list.append(block) + self.blocks = nn.Sequential(*block_list) + + self.conv2 = ConvBNLayer( + in_channels=stage_out_channels[-2], + out_channels=stage_out_channels[-1], + kernel_size=1) + + self.avg_pool = AdaptiveAvgPool2D(1) + + self.last_conv = Conv2D( + in_channels=stage_out_channels[-1], + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.hardswish = nn.Hardswish() + self.dropout = Dropout(p=dropout_prob, 
mode="downscale_in_infer") + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + self.fc = Linear(self.class_expand, self.class_num) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.conv1(x) + x = self.max_pool(x) + x = self.blocks(x) + x = self.conv2(x) + x = self.avg_pool(x) + x = self.last_conv(x) + x = self.hardswish(x) + x = self.dropout(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ESNet_x0_25(pretrained=False, use_ssld=False, **kwargs): + """ + ESNet_x0_25 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ESNet_x0_25` model depends on args. + """ + model = ESNet( + scale=0.25, stages_pattern=MODEL_STAGES_PATTERN["ESNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ESNet_x0_25"], use_ssld) + return model + + +def ESNet_x0_5(pretrained=False, use_ssld=False, **kwargs): + """ + ESNet_x0_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ESNet_x0_5` model depends on args. + """ + model = ESNet( + scale=0.5, stages_pattern=MODEL_STAGES_PATTERN["ESNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ESNet_x0_5"], use_ssld) + return model + + +def ESNet_x0_75(pretrained=False, use_ssld=False, **kwargs): + """ + ESNet_x0_75 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ESNet_x0_75` model depends on args. + """ + model = ESNet( + scale=0.75, stages_pattern=MODEL_STAGES_PATTERN["ESNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ESNet_x0_75"], use_ssld) + return model + + +def ESNet_x1_0(pretrained=False, use_ssld=False, **kwargs): + """ + ESNet_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ESNet_x1_0` model depends on args. 
+ """ + model = ESNet( + scale=1.0, stages_pattern=MODEL_STAGES_PATTERN["ESNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ESNet_x1_0"], use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/legendary_models/hrnet.py b/example/low_precision_training/ppcls/arch/backbone/legendary_models/hrnet.py new file mode 100755 index 000000000..fd6d32557 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/legendary_models/hrnet.py @@ -0,0 +1,797 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1908.07919 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn +from paddle import ParamAttr +from paddle.nn.functional import upsample +from paddle.nn.initializer import Uniform + +from ..base.theseus_layer import TheseusLayer, Identity +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "HRNet_W18_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W18_C_pretrained.pdparams", + "HRNet_W30_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W30_C_pretrained.pdparams", + "HRNet_W32_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W32_C_pretrained.pdparams", + "HRNet_W40_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W40_C_pretrained.pdparams", + "HRNet_W44_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W44_C_pretrained.pdparams", + "HRNet_W48_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W48_C_pretrained.pdparams", + "HRNet_W64_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W64_C_pretrained.pdparams" +} + +MODEL_STAGES_PATTERN = {"HRNet": ["st4"]} + +__all__ = list(MODEL_URLS.keys()) + + +def _create_act(act): + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act is None: + return Identity() + else: + raise RuntimeError( + "The activation function is not supported: {}".format(act)) + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act="relu"): + super().__init__() + + self.conv = nn.Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + bias_attr=False) + self.bn = nn.BatchNorm(num_filters, act=None) + self.act = _create_act(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + return x + + +class BottleneckBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + has_se, + stride=1, + downsample=False): + super().__init__() + + self.has_se = has_se + 
self.downsample = downsample + + self.conv1 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act="relu") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu") + self.conv3 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None) + + if self.downsample: + self.conv_down = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + act=None) + + if self.has_se: + self.se = SELayer( + num_channels=num_filters * 4, + num_filters=num_filters * 4, + reduction_ratio=16) + self.relu = nn.ReLU() + + def forward(self, x, res_dict=None): + residual = x + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + if self.downsample: + residual = self.conv_down(residual) + if self.has_se: + x = self.se(x) + x = paddle.add(x=residual, y=x) + x = self.relu(x) + return x + + +class BasicBlock(nn.Layer): + def __init__(self, num_channels, num_filters, has_se=False): + super().__init__() + + self.has_se = has_se + + self.conv1 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=1, + act="relu") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=1, + act=None) + + if self.has_se: + self.se = SELayer( + num_channels=num_filters, + num_filters=num_filters, + reduction_ratio=16) + self.relu = nn.ReLU() + + def forward(self, x): + residual = x + x = self.conv1(x) + x = self.conv2(x) + + if self.has_se: + x = self.se(x) + + x = paddle.add(x=residual, y=x) + x = self.relu(x) + return x + + +class SELayer(TheseusLayer): + def __init__(self, num_channels, num_filters, reduction_ratio): + super().__init__() + + self.avg_pool = nn.AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.fc_squeeze = nn.Linear( + num_channels, + med_ch, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + self.relu = nn.ReLU() + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.fc_excitation = nn.Linear( + med_ch, + num_filters, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + self.sigmoid = nn.Sigmoid() + + def forward(self, x, res_dict=None): + residual = x + x = self.avg_pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self.fc_squeeze(x) + x = self.relu(x) + x = self.fc_excitation(x) + x = self.sigmoid(x) + x = paddle.unsqueeze(x, axis=[2, 3]) + x = residual * x + return x + + +class Stage(TheseusLayer): + def __init__(self, num_modules, num_filters, has_se=False): + super().__init__() + + self._num_modules = num_modules + + self.stage_func_list = nn.LayerList() + for i in range(num_modules): + self.stage_func_list.append( + HighResolutionModule( + num_filters=num_filters, has_se=has_se)) + + def forward(self, x, res_dict=None): + x = x + for idx in range(self._num_modules): + x = self.stage_func_list[idx](x) + return x + + +class HighResolutionModule(TheseusLayer): + def __init__(self, num_filters, has_se=False): + super().__init__() + + self.basic_block_list = nn.LayerList() + + for i in range(len(num_filters)): + self.basic_block_list.append( + nn.Sequential(* [ + BasicBlock( + num_channels=num_filters[i], + num_filters=num_filters[i], + has_se=has_se) for j in range(4) + ])) + + self.fuse_func = FuseLayers( + in_channels=num_filters, out_channels=num_filters) + + def forward(self, x, res_dict=None): + out = [] + for 
idx, xi in enumerate(x): + basic_block_list = self.basic_block_list[idx] + for basic_block_func in basic_block_list: + xi = basic_block_func(xi) + out.append(xi) + out = self.fuse_func(out) + return out + + +class FuseLayers(TheseusLayer): + def __init__(self, in_channels, out_channels): + super().__init__() + + self._actual_ch = len(in_channels) + self._in_channels = in_channels + + self.residual_func_list = nn.LayerList() + self.relu = nn.ReLU() + for i in range(len(in_channels)): + for j in range(len(in_channels)): + if j > i: + self.residual_func_list.append( + ConvBNLayer( + num_channels=in_channels[j], + num_filters=out_channels[i], + filter_size=1, + stride=1, + act=None)) + elif j < i: + pre_num_filters = in_channels[j] + for k in range(i - j): + if k == i - j - 1: + self.residual_func_list.append( + ConvBNLayer( + num_channels=pre_num_filters, + num_filters=out_channels[i], + filter_size=3, + stride=2, + act=None)) + pre_num_filters = out_channels[i] + else: + self.residual_func_list.append( + ConvBNLayer( + num_channels=pre_num_filters, + num_filters=out_channels[j], + filter_size=3, + stride=2, + act="relu")) + pre_num_filters = out_channels[j] + + def forward(self, x, res_dict=None): + out = [] + residual_func_idx = 0 + for i in range(len(self._in_channels)): + residual = x[i] + for j in range(len(self._in_channels)): + if j > i: + xj = self.residual_func_list[residual_func_idx](x[j]) + residual_func_idx += 1 + + xj = upsample(xj, scale_factor=2**(j - i), mode="nearest") + residual = paddle.add(x=residual, y=xj) + elif j < i: + xj = x[j] + for k in range(i - j): + xj = self.residual_func_list[residual_func_idx](xj) + residual_func_idx += 1 + + residual = paddle.add(x=residual, y=xj) + + residual = self.relu(residual) + out.append(residual) + + return out + + +class LastClsOut(TheseusLayer): + def __init__(self, + num_channel_list, + has_se, + num_filters_list=[32, 64, 128, 256]): + super().__init__() + + self.func_list = nn.LayerList() + for idx in range(len(num_channel_list)): + self.func_list.append( + BottleneckBlock( + num_channels=num_channel_list[idx], + num_filters=num_filters_list[idx], + has_se=has_se, + downsample=True)) + + def forward(self, x, res_dict=None): + out = [] + for idx, xi in enumerate(x): + xi = self.func_list[idx](xi) + out.append(xi) + return out + + +class HRNet(TheseusLayer): + """ + HRNet + Args: + width: int=18. Base channel number of HRNet. + has_se: bool=False. If 'True', add se module to HRNet. + class_num: int=1000. Output num of last fc layer. + Returns: + model: nn.Layer. Specific HRNet model depends on args. 
+ """ + + def __init__(self, + stages_pattern, + width=18, + has_se=False, + class_num=1000, + return_patterns=None, + return_stages=None): + super().__init__() + + self.width = width + self.has_se = has_se + self._class_num = class_num + + channels_2 = [self.width, self.width * 2] + channels_3 = [self.width, self.width * 2, self.width * 4] + channels_4 = [ + self.width, self.width * 2, self.width * 4, self.width * 8 + ] + + self.conv_layer1_1 = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=3, + stride=2, + act="relu") + + self.conv_layer1_2 = ConvBNLayer( + num_channels=64, + num_filters=64, + filter_size=3, + stride=2, + act="relu") + + self.layer1 = nn.Sequential(* [ + BottleneckBlock( + num_channels=64 if i == 0 else 256, + num_filters=64, + has_se=has_se, + stride=1, + downsample=True if i == 0 else False) for i in range(4) + ]) + + self.conv_tr1_1 = ConvBNLayer( + num_channels=256, num_filters=width, filter_size=3) + self.conv_tr1_2 = ConvBNLayer( + num_channels=256, num_filters=width * 2, filter_size=3, stride=2) + + self.st2 = Stage( + num_modules=1, num_filters=channels_2, has_se=self.has_se) + + self.conv_tr2 = ConvBNLayer( + num_channels=width * 2, + num_filters=width * 4, + filter_size=3, + stride=2) + self.st3 = Stage( + num_modules=4, num_filters=channels_3, has_se=self.has_se) + + self.conv_tr3 = ConvBNLayer( + num_channels=width * 4, + num_filters=width * 8, + filter_size=3, + stride=2) + + self.st4 = Stage( + num_modules=3, num_filters=channels_4, has_se=self.has_se) + + # classification + num_filters_list = [32, 64, 128, 256] + self.last_cls = LastClsOut( + num_channel_list=channels_4, + has_se=self.has_se, + num_filters_list=num_filters_list) + + last_num_filters = [256, 512, 1024] + self.cls_head_conv_list = nn.LayerList() + for idx in range(3): + self.cls_head_conv_list.append( + ConvBNLayer( + num_channels=num_filters_list[idx] * 4, + num_filters=last_num_filters[idx], + filter_size=3, + stride=2)) + + self.conv_last = ConvBNLayer( + num_channels=1024, num_filters=2048, filter_size=1, stride=1) + + self.avg_pool = nn.AdaptiveAvgPool2D(1) + + stdv = 1.0 / math.sqrt(2048 * 1.0) + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + + self.fc = nn.Linear( + 2048, + class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.conv_layer1_1(x) + x = self.conv_layer1_2(x) + + x = self.layer1(x) + + tr1_1 = self.conv_tr1_1(x) + tr1_2 = self.conv_tr1_2(x) + x = self.st2([tr1_1, tr1_2]) + + tr2 = self.conv_tr2(x[-1]) + x.append(tr2) + x = self.st3(x) + + tr3 = self.conv_tr3(x[-1]) + x.append(tr3) + x = self.st4(x) + + x = self.last_cls(x) + + y = x[0] + for idx in range(3): + y = paddle.add(x[idx + 1], self.cls_head_conv_list[idx](y)) + + y = self.conv_last(y) + y = self.avg_pool(y) + y = self.flatten(y) + y = self.fc(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def HRNet_W18_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W18_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. 
+ If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W18_C` model depends on args. + """ + model = HRNet( + width=18, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W18_C"], use_ssld) + return model + + +def HRNet_W30_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W30_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W30_C` model depends on args. + """ + model = HRNet( + width=30, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W30_C"], use_ssld) + return model + + +def HRNet_W32_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W32_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W32_C` model depends on args. + """ + model = HRNet( + width=32, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W32_C"], use_ssld) + return model + + +def HRNet_W40_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W40_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W40_C` model depends on args. + """ + model = HRNet( + width=40, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W40_C"], use_ssld) + return model + + +def HRNet_W44_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W44_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W44_C` model depends on args. + """ + model = HRNet( + width=44, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W44_C"], use_ssld) + return model + + +def HRNet_W48_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W48_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W48_C` model depends on args. + """ + model = HRNet( + width=48, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W48_C"], use_ssld) + return model + + +def HRNet_W60_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W60_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. 
+ use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W60_C` model depends on args. + """ + model = HRNet( + width=60, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W60_C"], use_ssld) + return model + + +def HRNet_W64_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W64_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W64_C` model depends on args. + """ + model = HRNet( + width=64, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W64_C"], use_ssld) + return model + + +def SE_HRNet_W18_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W18_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W18_C` model depends on args. + """ + model = HRNet( + width=18, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W18_C"], use_ssld) + return model + + +def SE_HRNet_W30_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W30_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W30_C` model depends on args. + """ + model = HRNet( + width=30, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W30_C"], use_ssld) + return model + + +def SE_HRNet_W32_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W32_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W32_C` model depends on args. + """ + model = HRNet( + width=32, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W32_C"], use_ssld) + return model + + +def SE_HRNet_W40_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W40_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W40_C` model depends on args. + """ + model = HRNet( + width=40, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W40_C"], use_ssld) + return model + + +def SE_HRNet_W44_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W44_C + Args: + pretrained: bool=False or str. 
If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W44_C` model depends on args. + """ + model = HRNet( + width=44, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W44_C"], use_ssld) + return model + + +def SE_HRNet_W48_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W48_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W48_C` model depends on args. + """ + model = HRNet( + width=48, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W48_C"], use_ssld) + return model + + +def SE_HRNet_W60_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W60_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W60_C` model depends on args. + """ + model = HRNet( + width=60, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W60_C"], use_ssld) + return model + + +def SE_HRNet_W64_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W64_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W64_C` model depends on args. + """ + model = HRNet( + width=64, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W64_C"], use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/legendary_models/inception_v3.py b/example/low_precision_training/ppcls/arch/backbone/legendary_models/inception_v3.py new file mode 100755 index 000000000..b01887020 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/legendary_models/inception_v3.py @@ -0,0 +1,559 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
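+
+# Usage sketch (illustrative only, not part of the upstream file): every
+# backbone under legendary_models follows the same factory convention --
+# the factory function builds the nn.Layer, then `_load_pretrained` restores
+# weights from MODEL_URLS (pretrained=True), from a local file
+# (pretrained="<path>.pdparams"), or skips loading (pretrained=False).
+# Assuming example/low_precision_training is on PYTHONPATH:
+#
+#     import paddle
+#     from ppcls.arch.backbone.legendary_models.hrnet import HRNet_W18_C
+#
+#     model = HRNet_W18_C(pretrained=False, class_num=1000)
+#     logits = model(paddle.randn([1, 3, 224, 224]))  # shape: [1, 1000]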
+ +# reference: https://arxiv.org/abs/1512.00567v3 + +from __future__ import absolute_import, division, print_function +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "InceptionV3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/InceptionV3_pretrained.pdparams" +} + +MODEL_STAGES_PATTERN = { + "InceptionV3": [ + "inception_block_list[2]", "inception_block_list[3]", + "inception_block_list[7]", "inception_block_list[8]", + "inception_block_list[10]" + ] +} + +__all__ = MODEL_URLS.keys() +''' +InceptionV3 config: dict. + key: inception blocks of InceptionV3. + values: conv num in different blocks. +''' +NET_CONFIG = { + "inception_a": [[192, 256, 288], [32, 64, 64]], + "inception_b": [288], + "inception_c": [[768, 768, 768, 768], [128, 160, 160, 192]], + "inception_d": [768], + "inception_e": [1280, 2048] +} + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + padding=0, + groups=1, + act="relu"): + super().__init__() + self.act = act + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False) + self.bn = BatchNorm(num_filters) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.act: + x = self.relu(x) + return x + + +class InceptionStem(TheseusLayer): + def __init__(self): + super().__init__() + self.conv_1a_3x3 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act="relu") + self.conv_2a_3x3 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act="relu") + self.conv_2b_3x3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + padding=1, + act="relu") + + self.max_pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self.conv_3b_1x1 = ConvBNLayer( + num_channels=64, num_filters=80, filter_size=1, act="relu") + self.conv_4a_3x3 = ConvBNLayer( + num_channels=80, num_filters=192, filter_size=3, act="relu") + + def forward(self, x): + x = self.conv_1a_3x3(x) + x = self.conv_2a_3x3(x) + x = self.conv_2b_3x3(x) + x = self.max_pool(x) + x = self.conv_3b_1x1(x) + x = self.conv_4a_3x3(x) + x = self.max_pool(x) + return x + + +class InceptionA(TheseusLayer): + def __init__(self, num_channels, pool_features): + super().__init__() + self.branch1x1 = ConvBNLayer( + num_channels=num_channels, + num_filters=64, + filter_size=1, + act="relu") + self.branch5x5_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=48, + filter_size=1, + act="relu") + self.branch5x5_2 = ConvBNLayer( + num_channels=48, + num_filters=64, + filter_size=5, + padding=2, + act="relu") + + self.branch3x3dbl_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=64, + filter_size=1, + act="relu") + self.branch3x3dbl_2 = ConvBNLayer( + num_channels=64, + num_filters=96, + filter_size=3, + padding=1, + act="relu") + self.branch3x3dbl_3 = ConvBNLayer( + num_channels=96, + num_filters=96, + filter_size=3, + padding=1, + act="relu") + self.branch_pool = AvgPool2D( + kernel_size=3, stride=1, padding=1, exclusive=False) + self.branch_pool_conv = ConvBNLayer( + 
num_channels=num_channels, + num_filters=pool_features, + filter_size=1, + act="relu") + + def forward(self, x): + branch1x1 = self.branch1x1(x) + branch5x5 = self.branch5x5_1(x) + branch5x5 = self.branch5x5_2(branch5x5) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = self.branch_pool(x) + branch_pool = self.branch_pool_conv(branch_pool) + x = paddle.concat( + [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) + return x + + +class InceptionB(TheseusLayer): + def __init__(self, num_channels): + super().__init__() + self.branch3x3 = ConvBNLayer( + num_channels=num_channels, + num_filters=384, + filter_size=3, + stride=2, + act="relu") + self.branch3x3dbl_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=64, + filter_size=1, + act="relu") + self.branch3x3dbl_2 = ConvBNLayer( + num_channels=64, + num_filters=96, + filter_size=3, + padding=1, + act="relu") + self.branch3x3dbl_3 = ConvBNLayer( + num_channels=96, + num_filters=96, + filter_size=3, + stride=2, + act="relu") + self.branch_pool = MaxPool2D(kernel_size=3, stride=2) + + def forward(self, x): + branch3x3 = self.branch3x3(x) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = self.branch_pool(x) + + x = paddle.concat([branch3x3, branch3x3dbl, branch_pool], axis=1) + + return x + + +class InceptionC(TheseusLayer): + def __init__(self, num_channels, channels_7x7): + super().__init__() + self.branch1x1 = ConvBNLayer( + num_channels=num_channels, + num_filters=192, + filter_size=1, + act="relu") + + self.branch7x7_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=channels_7x7, + filter_size=1, + stride=1, + act="relu") + self.branch7x7_2 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=channels_7x7, + filter_size=(1, 7), + stride=1, + padding=(0, 3), + act="relu") + self.branch7x7_3 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=192, + filter_size=(7, 1), + stride=1, + padding=(3, 0), + act="relu") + + self.branch7x7dbl_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=channels_7x7, + filter_size=1, + act="relu") + self.branch7x7dbl_2 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=channels_7x7, + filter_size=(7, 1), + padding=(3, 0), + act="relu") + self.branch7x7dbl_3 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=channels_7x7, + filter_size=(1, 7), + padding=(0, 3), + act="relu") + self.branch7x7dbl_4 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=channels_7x7, + filter_size=(7, 1), + padding=(3, 0), + act="relu") + self.branch7x7dbl_5 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=192, + filter_size=(1, 7), + padding=(0, 3), + act="relu") + + self.branch_pool = AvgPool2D( + kernel_size=3, stride=1, padding=1, exclusive=False) + self.branch_pool_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=192, + filter_size=1, + act="relu") + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch7x7 = self.branch7x7_1(x) + branch7x7 = self.branch7x7_2(branch7x7) + branch7x7 = self.branch7x7_3(branch7x7) + + branch7x7dbl = self.branch7x7dbl_1(x) + branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) + + branch_pool = self.branch_pool(x) + branch_pool = 
self.branch_pool_conv(branch_pool) + + x = paddle.concat( + [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) + + return x + + +class InceptionD(TheseusLayer): + def __init__(self, num_channels): + super().__init__() + self.branch3x3_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=192, + filter_size=1, + act="relu") + self.branch3x3_2 = ConvBNLayer( + num_channels=192, + num_filters=320, + filter_size=3, + stride=2, + act="relu") + self.branch7x7x3_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=192, + filter_size=1, + act="relu") + self.branch7x7x3_2 = ConvBNLayer( + num_channels=192, + num_filters=192, + filter_size=(1, 7), + padding=(0, 3), + act="relu") + self.branch7x7x3_3 = ConvBNLayer( + num_channels=192, + num_filters=192, + filter_size=(7, 1), + padding=(3, 0), + act="relu") + self.branch7x7x3_4 = ConvBNLayer( + num_channels=192, + num_filters=192, + filter_size=3, + stride=2, + act="relu") + self.branch_pool = MaxPool2D(kernel_size=3, stride=2) + + def forward(self, x): + branch3x3 = self.branch3x3_1(x) + branch3x3 = self.branch3x3_2(branch3x3) + + branch7x7x3 = self.branch7x7x3_1(x) + branch7x7x3 = self.branch7x7x3_2(branch7x7x3) + branch7x7x3 = self.branch7x7x3_3(branch7x7x3) + branch7x7x3 = self.branch7x7x3_4(branch7x7x3) + + branch_pool = self.branch_pool(x) + + x = paddle.concat([branch3x3, branch7x7x3, branch_pool], axis=1) + return x + + +class InceptionE(TheseusLayer): + def __init__(self, num_channels): + super().__init__() + self.branch1x1 = ConvBNLayer( + num_channels=num_channels, + num_filters=320, + filter_size=1, + act="relu") + self.branch3x3_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=384, + filter_size=1, + act="relu") + self.branch3x3_2a = ConvBNLayer( + num_channels=384, + num_filters=384, + filter_size=(1, 3), + padding=(0, 1), + act="relu") + self.branch3x3_2b = ConvBNLayer( + num_channels=384, + num_filters=384, + filter_size=(3, 1), + padding=(1, 0), + act="relu") + + self.branch3x3dbl_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=448, + filter_size=1, + act="relu") + self.branch3x3dbl_2 = ConvBNLayer( + num_channels=448, + num_filters=384, + filter_size=3, + padding=1, + act="relu") + self.branch3x3dbl_3a = ConvBNLayer( + num_channels=384, + num_filters=384, + filter_size=(1, 3), + padding=(0, 1), + act="relu") + self.branch3x3dbl_3b = ConvBNLayer( + num_channels=384, + num_filters=384, + filter_size=(3, 1), + padding=(1, 0), + act="relu") + self.branch_pool = AvgPool2D( + kernel_size=3, stride=1, padding=1, exclusive=False) + self.branch_pool_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=192, + filter_size=1, + act="relu") + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch3x3 = self.branch3x3_1(x) + branch3x3 = [ + self.branch3x3_2a(branch3x3), + self.branch3x3_2b(branch3x3), + ] + branch3x3 = paddle.concat(branch3x3, axis=1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = [ + self.branch3x3dbl_3a(branch3x3dbl), + self.branch3x3dbl_3b(branch3x3dbl), + ] + branch3x3dbl = paddle.concat(branch3x3dbl, axis=1) + + branch_pool = self.branch_pool(x) + branch_pool = self.branch_pool_conv(branch_pool) + + x = paddle.concat( + [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + return x + + +class Inception_V3(TheseusLayer): + """ + Inception_V3 + Args: + config: dict. config of Inception_V3. + class_num: int=1000. The number of classes. + pretrained: (True or False) or path of pretrained_model. 
Whether to load the pretrained model. + Returns: + model: nn.Layer. Specific Inception_V3 model depends on args. + """ + + def __init__(self, + config, + stages_pattern, + class_num=1000, + return_patterns=None, + return_stages=None): + super().__init__() + + self.inception_a_list = config["inception_a"] + self.inception_c_list = config["inception_c"] + self.inception_b_list = config["inception_b"] + self.inception_d_list = config["inception_d"] + self.inception_e_list = config["inception_e"] + + self.inception_stem = InceptionStem() + + self.inception_block_list = nn.LayerList() + for i in range(len(self.inception_a_list[0])): + inception_a = InceptionA(self.inception_a_list[0][i], + self.inception_a_list[1][i]) + self.inception_block_list.append(inception_a) + + for i in range(len(self.inception_b_list)): + inception_b = InceptionB(self.inception_b_list[i]) + self.inception_block_list.append(inception_b) + + for i in range(len(self.inception_c_list[0])): + inception_c = InceptionC(self.inception_c_list[0][i], + self.inception_c_list[1][i]) + self.inception_block_list.append(inception_c) + + for i in range(len(self.inception_d_list)): + inception_d = InceptionD(self.inception_d_list[i]) + self.inception_block_list.append(inception_d) + + for i in range(len(self.inception_e_list)): + inception_e = InceptionE(self.inception_e_list[i]) + self.inception_block_list.append(inception_e) + + self.avg_pool = AdaptiveAvgPool2D(1) + self.dropout = Dropout(p=0.2, mode="downscale_in_infer") + stdv = 1.0 / math.sqrt(2048 * 1.0) + self.fc = Linear( + 2048, + class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr()) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.inception_stem(x) + for inception_block in self.inception_block_list: + x = inception_block(x) + x = self.avg_pool(x) + x = paddle.reshape(x, shape=[-1, 2048]) + x = self.dropout(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def InceptionV3(pretrained=False, use_ssld=False, **kwargs): + """ + InceptionV3 + Args: + pretrained: bool=false or str. if `true` load pretrained parameters, `false` otherwise. + if str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `InceptionV3` model + """ + model = Inception_V3( + NET_CONFIG, + stages_pattern=MODEL_STAGES_PATTERN["InceptionV3"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["InceptionV3"], use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/legendary_models/mobilenet_v1.py b/example/low_precision_training/ppcls/arch/backbone/legendary_models/mobilenet_v1.py new file mode 100755 index 000000000..4e6706382 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/legendary_models/mobilenet_v1.py @@ -0,0 +1,259 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1704.04861 + +from __future__ import absolute_import, division, print_function + +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, ReLU, Flatten +from paddle.nn import AdaptiveAvgPool2D +from paddle.nn.initializer import KaimingNormal + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MobileNetV1_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_x0_25_pretrained.pdparams", + "MobileNetV1_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_x0_5_pretrained.pdparams", + "MobileNetV1_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_x0_75_pretrained.pdparams", + "MobileNetV1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_pretrained.pdparams" +} + +MODEL_STAGES_PATTERN = { + "MobileNetV1": ["blocks[0]", "blocks[2]", "blocks[4]", "blocks[10]"] +} + +__all__ = MODEL_URLS.keys() + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + num_groups=1): + super().__init__() + + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + self.bn = BatchNorm(num_filters) + self.relu = ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class DepthwiseSeparable(TheseusLayer): + def __init__(self, num_channels, num_filters1, num_filters2, num_groups, + stride, scale): + super().__init__() + + self.depthwise_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=int(num_filters1 * scale), + filter_size=3, + stride=stride, + padding=1, + num_groups=int(num_groups * scale)) + + self.pointwise_conv = ConvBNLayer( + num_channels=int(num_filters1 * scale), + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0) + + def forward(self, x): + x = self.depthwise_conv(x) + x = self.pointwise_conv(x) + return x + + +class MobileNet(TheseusLayer): + """ + MobileNet + Args: + scale: float=1.0. The coefficient that controls the size of network parameters. + class_num: int=1000. The number of classes. + Returns: + model: nn.Layer. Specific MobileNet model depends on args. 
+ """ + + def __init__(self, + stages_pattern, + scale=1.0, + class_num=1000, + return_patterns=None, + return_stages=None): + super().__init__() + self.scale = scale + + self.conv = ConvBNLayer( + num_channels=3, + filter_size=3, + num_filters=int(32 * scale), + stride=2, + padding=1) + + #num_channels, num_filters1, num_filters2, num_groups, stride + self.cfg = [[int(32 * scale), 32, 64, 32, 1], + [int(64 * scale), 64, 128, 64, 2], + [int(128 * scale), 128, 128, 128, 1], + [int(128 * scale), 128, 256, 128, 2], + [int(256 * scale), 256, 256, 256, 1], + [int(256 * scale), 256, 512, 256, 2], + [int(512 * scale), 512, 512, 512, 1], + [int(512 * scale), 512, 512, 512, 1], + [int(512 * scale), 512, 512, 512, 1], + [int(512 * scale), 512, 512, 512, 1], + [int(512 * scale), 512, 512, 512, 1], + [int(512 * scale), 512, 1024, 512, 2], + [int(1024 * scale), 1024, 1024, 1024, 1]] + + self.blocks = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=params[0], + num_filters1=params[1], + num_filters2=params[2], + num_groups=params[3], + stride=params[4], + scale=scale) for params in self.cfg + ]) + + self.avg_pool = AdaptiveAvgPool2D(1) + self.flatten = Flatten(start_axis=1, stop_axis=-1) + + self.fc = Linear( + int(1024 * scale), + class_num, + weight_attr=ParamAttr(initializer=KaimingNormal())) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.conv(x) + x = self.blocks(x) + x = self.avg_pool(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MobileNetV1_x0_25(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV1_x0_25 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV1_x0_25` model depends on args. + """ + model = MobileNet( + scale=0.25, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV1"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1_x0_25"], + use_ssld) + return model + + +def MobileNetV1_x0_5(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV1_x0_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV1_x0_5` model depends on args. + """ + model = MobileNet( + scale=0.5, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV1"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1_x0_5"], + use_ssld) + return model + + +def MobileNetV1_x0_75(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV1_x0_75 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. 
Specific `MobileNetV1_x0_75` model depends on args. + """ + model = MobileNet( + scale=0.75, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV1"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1_x0_75"], + use_ssld) + return model + + +def MobileNetV1(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV1 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV1` model depends on args. + """ + model = MobileNet( + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV1"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1"], use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/legendary_models/mobilenet_v3.py b/example/low_precision_training/ppcls/arch/backbone/legendary_models/mobilenet_v3.py new file mode 100755 index 000000000..a50c2884d --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/legendary_models/mobilenet_v3.py @@ -0,0 +1,591 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
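+
+# Structural sketch (a simplified paraphrase, not the ResidualUnit class that
+# this file defines below): MobileNetV3 blocks follow the inverted-bottleneck
+# pattern -- 1x1 expand, k x k depthwise, optional SE, 1x1 linear projection,
+# with an identity shortcut only when stride == 1 and in_c == out_c. The
+# activation is fixed to Hardswish here for brevity; the real block takes it
+# as an argument:
+#
+#     import paddle.nn as nn
+#
+#     def inverted_residual(in_c, mid_c, out_c, k, stride):
+#         return nn.Sequential(
+#             nn.Conv2D(in_c, mid_c, 1, bias_attr=False),   # 1x1 expand
+#             nn.BatchNorm2D(mid_c),
+#             nn.Hardswish(),
+#             nn.Conv2D(mid_c, mid_c, k, stride=stride,
+#                       padding=(k - 1) // 2, groups=mid_c,
+#                       bias_attr=False),                   # k x k depthwise
+#             nn.BatchNorm2D(mid_c),
+#             nn.Hardswish(),
+#             nn.Conv2D(mid_c, out_c, 1, bias_attr=False),  # linear project
+#             nn.BatchNorm2D(out_c))                        # no activation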
+
+# reference: https://arxiv.org/abs/1905.02244
+
+from __future__ import absolute_import, division, print_function
+
+import paddle
+import paddle.nn as nn
+from paddle import ParamAttr
+from paddle.nn import BatchNorm, Conv2D, Dropout, Linear
+from paddle.regularizer import L2Decay
+
+from .custom_devices_layers import AdaptiveAvgPool2D
+from ..base.theseus_layer import TheseusLayer
+from ....utils.save_load import load_dygraph_pretrain
+
+MODEL_URLS = {
+    "MobileNetV3_small_x0_35":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x0_35_pretrained.pdparams",
+    "MobileNetV3_small_x0_5":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x0_5_pretrained.pdparams",
+    "MobileNetV3_small_x0_75":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x0_75_pretrained.pdparams",
+    "MobileNetV3_small_x1_0":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x1_0_pretrained.pdparams",
+    "MobileNetV3_small_x1_25":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x1_25_pretrained.pdparams",
+    "MobileNetV3_large_x0_35":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x0_35_pretrained.pdparams",
+    "MobileNetV3_large_x0_5":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x0_5_pretrained.pdparams",
+    "MobileNetV3_large_x0_75":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x0_75_pretrained.pdparams",
+    "MobileNetV3_large_x1_0":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_0_pretrained.pdparams",
+    "MobileNetV3_large_x1_25":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_25_pretrained.pdparams",
+}
+
+MODEL_STAGES_PATTERN = {
+    "MobileNetV3_small":
+    ["blocks[0]", "blocks[2]", "blocks[7]", "blocks[10]"],
+    "MobileNetV3_large":
+    ["blocks[0]", "blocks[2]", "blocks[5]", "blocks[11]", "blocks[14]"]
+}
+
+__all__ = list(MODEL_URLS.keys())
+
+# The "large" and "small" configs below correspond to MobileNetV3_large and
+# MobileNetV3_small respectively. Each config is a list whose elements (each
+# itself a list) describe one depthwise block as [k, exp, c, se, act, s]:
+# k: kernel_size +# exp: middle channel number in depthwise block +# c: output channel number in depthwise block +# se: whether to use SE block +# act: which activation to use +# s: stride in depthwise block +NET_CONFIG = { + "large": [ + # k, exp, c, se, act, s + [3, 16, 16, False, "relu", 1], + [3, 64, 24, False, "relu", 2], + [3, 72, 24, False, "relu", 1], + [5, 72, 40, True, "relu", 2], + [5, 120, 40, True, "relu", 1], + [5, 120, 40, True, "relu", 1], + [3, 240, 80, False, "hardswish", 2], + [3, 200, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 480, 112, True, "hardswish", 1], + [3, 672, 112, True, "hardswish", 1], + [5, 672, 160, True, "hardswish", 2], + [5, 960, 160, True, "hardswish", 1], + [5, 960, 160, True, "hardswish", 1], + ], + "small": [ + # k, exp, c, se, act, s + [3, 16, 16, True, "relu", 2], + [3, 72, 24, False, "relu", 2], + [3, 88, 24, False, "relu", 1], + [5, 96, 40, True, "hardswish", 2], + [5, 240, 40, True, "hardswish", 1], + [5, 240, 40, True, "hardswish", 1], + [5, 120, 48, True, "hardswish", 1], + [5, 144, 48, True, "hardswish", 1], + [5, 288, 96, True, "hardswish", 2], + [5, 576, 96, True, "hardswish", 1], + [5, 576, 96, True, "hardswish", 1], + ] +} +# first conv output channel number in MobileNetV3 +STEM_CONV_NUMBER = 16 +# last second conv output channel for "small" +LAST_SECOND_CONV_SMALL = 576 +# last second conv output channel for "large" +LAST_SECOND_CONV_LARGE = 960 +# last conv output channel number for "large" and "small" +LAST_CONV = 1280 + + +def _make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _create_act(act): + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act is None: + return None + else: + raise RuntimeError( + "The activation function is not supported: {}".format(act)) + + +class MobileNetV3(TheseusLayer): + """ + MobileNetV3 + Args: + config: list. MobileNetV3 depthwise blocks config. + scale: float=1.0. The coefficient that controls the size of network parameters. + class_num: int=1000. The number of classes. + inplanes: int=16. The output channel number of first convolution layer. + class_squeeze: int=960. The output channel number of penultimate convolution layer. + class_expand: int=1280. The output channel number of last convolution layer. + dropout_prob: float=0.2. Probability of setting units to zero. + Returns: + model: nn.Layer. Specific MobileNetV3 model depends on args. 
+ """ + + def __init__(self, + config, + stages_pattern, + scale=1.0, + class_num=1000, + inplanes=STEM_CONV_NUMBER, + class_squeeze=LAST_SECOND_CONV_LARGE, + class_expand=LAST_CONV, + dropout_prob=0.2, + return_patterns=None, + return_stages=None, + **kwargs): + super().__init__() + + self.cfg = config + self.scale = scale + self.inplanes = inplanes + self.class_squeeze = class_squeeze + self.class_expand = class_expand + self.class_num = class_num + + self.conv = ConvBNLayer( + in_c=3, + out_c=_make_divisible(self.inplanes * self.scale), + filter_size=3, + stride=2, + padding=1, + num_groups=1, + if_act=True, + act="hardswish") + + self.blocks = nn.Sequential(*[ + ResidualUnit( + in_c=_make_divisible(self.inplanes * self.scale if i == 0 else + self.cfg[i - 1][2] * self.scale), + mid_c=_make_divisible(self.scale * exp), + out_c=_make_divisible(self.scale * c), + filter_size=k, + stride=s, + use_se=se, + act=act) for i, (k, exp, c, se, act, s) in enumerate(self.cfg) + ]) + + self.last_second_conv = ConvBNLayer( + in_c=_make_divisible(self.cfg[-1][2] * self.scale), + out_c=_make_divisible(self.scale * self.class_squeeze), + filter_size=1, + stride=1, + padding=0, + num_groups=1, + if_act=True, + act="hardswish") + + self.avg_pool = AdaptiveAvgPool2D(1) + + self.last_conv = Conv2D( + in_channels=_make_divisible(self.scale * self.class_squeeze), + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + + self.hardswish = nn.Hardswish() + if dropout_prob is not None: + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + else: + self.dropout = None + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + + self.fc = Linear(self.class_expand, class_num) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.conv(x) + x = self.blocks(x) + x = self.last_second_conv(x) + x = self.avg_pool(x) + x = self.last_conv(x) + x = self.hardswish(x) + if self.dropout is not None: + x = self.dropout(x) + x = self.flatten(x) + x = self.fc(x) + + return x + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + in_c, + out_c, + filter_size, + stride, + padding, + num_groups=1, + if_act=True, + act=None): + super().__init__() + + self.conv = Conv2D( + in_channels=in_c, + out_channels=out_c, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + bias_attr=False) + self.bn = BatchNorm( + num_channels=out_c, + act=None, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.if_act = if_act + self.act = _create_act(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + x = self.act(x) + return x + + +class ResidualUnit(TheseusLayer): + def __init__(self, + in_c, + mid_c, + out_c, + filter_size, + stride, + use_se, + act=None): + super().__init__() + self.if_shortcut = stride == 1 and in_c == out_c + self.if_se = use_se + + self.expand_conv = ConvBNLayer( + in_c=in_c, + out_c=mid_c, + filter_size=1, + stride=1, + padding=0, + if_act=True, + act=act) + self.bottleneck_conv = ConvBNLayer( + in_c=mid_c, + out_c=mid_c, + filter_size=filter_size, + stride=stride, + padding=int((filter_size - 1) // 2), + num_groups=mid_c, + if_act=True, + act=act) + if self.if_se: + self.mid_se = SEModule(mid_c) + self.linear_conv = ConvBNLayer( + in_c=mid_c, + out_c=out_c, + filter_size=1, + stride=1, + padding=0, + if_act=False, + act=None) + + def forward(self, x): + identity = x 
+        x = self.expand_conv(x)
+        x = self.bottleneck_conv(x)
+        if self.if_se:
+            x = self.mid_se(x)
+        x = self.linear_conv(x)
+        if self.if_shortcut:
+            x = paddle.add(identity, x)
+        return x
+
+
+# nn.Hardsigmoid cannot pass "slope" and "offset" through to nn.functional.hardsigmoid, hence this wrapper.
+class Hardsigmoid(TheseusLayer):
+    def __init__(self, slope=0.2, offset=0.5):
+        super().__init__()
+        self.slope = slope
+        self.offset = offset
+
+    def forward(self, x):
+        return nn.functional.hardsigmoid(
+            x, slope=self.slope, offset=self.offset)
+
+
+class SEModule(TheseusLayer):
+    def __init__(self, channel, reduction=4):
+        super().__init__()
+        self.avg_pool = AdaptiveAvgPool2D(1)
+        self.conv1 = Conv2D(
+            in_channels=channel,
+            out_channels=channel // reduction,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+        self.relu = nn.ReLU()
+        self.conv2 = Conv2D(
+            in_channels=channel // reduction,
+            out_channels=channel,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+        self.hardsigmoid = Hardsigmoid(slope=0.2, offset=0.5)
+
+    def forward(self, x):
+        identity = x
+        x = self.avg_pool(x)
+        x = self.conv1(x)
+        x = self.relu(x)
+        x = self.conv2(x)
+        x = self.hardsigmoid(x)
+        return paddle.multiply(x=identity, y=x)
+
+
+def _load_pretrained(pretrained, model, model_url, use_ssld):
+    if pretrained is False:
+        pass
+    elif pretrained is True:
+        load_dygraph_pretrain(model, model_url, use_ssld=use_ssld)
+    elif isinstance(pretrained, str):
+        load_dygraph_pretrain(model, pretrained)
+    else:
+        raise RuntimeError(
+            "pretrained type is not available. Please use `string` or `boolean` type."
+        )
+
+
+def MobileNetV3_small_x0_35(pretrained=False, use_ssld=False, **kwargs):
+    """
+    MobileNetV3_small_x0_35
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV3_small_x0_35` model depends on args.
+    """
+    model = MobileNetV3(
+        config=NET_CONFIG["small"],
+        scale=0.35,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"],
+        class_squeeze=LAST_SECOND_CONV_SMALL,
+        **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x0_35"],
+                     use_ssld)
+    return model
+
+
+def MobileNetV3_small_x0_5(pretrained=False, use_ssld=False, **kwargs):
+    """
+    MobileNetV3_small_x0_5
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV3_small_x0_5` model depends on args.
+    """
+    model = MobileNetV3(
+        config=NET_CONFIG["small"],
+        scale=0.5,
+        stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"],
+        class_squeeze=LAST_SECOND_CONV_SMALL,
+        **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x0_5"],
+                     use_ssld)
+    return model
+
+
+def MobileNetV3_small_x0_75(pretrained=False, use_ssld=False, **kwargs):
+    """
+    MobileNetV3_small_x0_75
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV3_small_x0_75` model depends on args.
+ """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=0.75, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x0_75"], + use_ssld) + return model + + +def MobileNetV3_small_x1_0(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_small_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x1_0` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x1_0"], + use_ssld) + return model + + +def MobileNetV3_small_x1_25(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_small_x1_25 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x1_25` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=1.25, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x1_25"], + use_ssld) + return model + + +def MobileNetV3_large_x0_35(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_large_x0_35 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x0_35` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=0.35, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_LARGE, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x0_35"], + use_ssld) + return model + + +def MobileNetV3_large_x0_5(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_large_x0_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x0_5` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=0.5, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x0_5"], + use_ssld) + return model + + +def MobileNetV3_large_x0_75(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_large_x0_75 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. 
+ Returns: + model: nn.Layer. Specific `MobileNetV3_large_x0_75` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=0.75, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x0_75"], + use_ssld) + return model + + +def MobileNetV3_large_x1_0(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_large_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x1_0` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x1_0"], + use_ssld) + return model + + +def MobileNetV3_large_x1_25(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_large_x1_25 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x1_25` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=1.25, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x1_25"], + use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/legendary_models/mobilenet_v4.py b/example/low_precision_training/ppcls/arch/backbone/legendary_models/mobilenet_v4.py new file mode 100755 index 000000000..cad766c86 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/legendary_models/mobilenet_v4.py @@ -0,0 +1,836 @@ +# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
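+
+# Block-type codes used in NET_CONFIG below, dispatched in MobileNetV4.__init__:
+#   "cn"  -> ConvBnAct (plain conv + BN + activation)
+#   "er"  -> EdgeResidual (fused expand conv + pointwise-linear conv)
+#   "uir" -> UniversalInvertedResidual
+#   "mqa" -> MobileAttention (multi-query attention)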
+ +# reference: https://arxiv.org/abs/2404.10518 + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn import BatchNorm, Conv2D, Dropout, Linear, Identity, Flatten +from paddle.regularizer import L2Decay + +from .custom_devices_layers import AdaptiveAvgPool2D +from ..base.theseus_layer import TheseusLayer +from ..model_zoo.vision_transformer import DropPath +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MobileNetV4_conv_large": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV4_conv_large_pretrained.pdparams", + "MobileNetV4_conv_medium": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV4_conv_medium_pretrained.pdparams", + "MobileNetV4_conv_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV4_conv_small_pretrained.pdparams", + "MobileNetV4_hybrid_large": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV4_hybrid_large_pretrained.pdparams", + "MobileNetV4_hybrid_medium": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV4_hybrid_medium_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) +STEM_CONV_NUMBER = 32 +LAST_CONV = 1280 + +NET_CONFIG = { + "conv_small": [ + # stage 0, 112x112 + # type, out, kernal_size, act, stride + ["cn", 32, 3, "relu", 2], + ["cn", 32, 1, "relu", 1], + # stage 1, 56x56 + ["cn", 96, 3, "relu", 2], + ["cn", 64, 1, "relu", 1], + # stage 2, 28x28 + # type, out, mid_c, first_kernal_size, mid_kernal_size, act, stride + ["uir", 96, 192, 5, 5, "relu", 2], + ["uir", 96, 192, 0, 3, "relu", 1], + ["uir", 96, 192, 0, 3, "relu", 1], + ["uir", 96, 192, 0, 3, "relu", 1], + ["uir", 96, 192, 0, 3, "relu", 1], + ["uir", 96, 384, 3, 0, "relu", 1], + # stage 3, 14x14 + ["uir", 128, 576, 3, 3, "relu", 2], + ["uir", 128, 512, 5, 5, "relu", 1], + ["uir", 128, 512, 0, 5, "relu", 1], + ["uir", 128, 384, 0, 5, "relu", 1], + ["uir", 128, 512, 0, 3, "relu", 1], + ["uir", 128, 512, 0, 3, "relu", 1], + # stage 4, 7x7 + ["cn", 960, 1, "relu", 1], + ], + "conv_medium": [ + # stage 0, 112x112 + ["er", 48, 128, 3, "relu", 2], + # stage 1, 56x56 + ["uir", 80, 192, 3, 5, "relu", 2], + ["uir", 80, 160, 3, 3, "relu", 1], + # stage 2, 28x28 + ["uir", 160, 480, 3, 5, "relu", 2], + ["uir", 160, 640, 3, 3, "relu", 1], + ["uir", 160, 640, 3, 3, "relu", 1], + ["uir", 160, 640, 3, 5, "relu", 1], + ["uir", 160, 640, 3, 3, "relu", 1], + ["uir", 160, 640, 3, 0, "relu", 1], + ["uir", 160, 320, 0, 0, "relu", 1], + ["uir", 160, 640, 3, 0, "relu", 1], + # stage 3, 14x14 + ["uir", 256, 960, 5, 5, "relu", 2], + ["uir", 256, 1024, 5, 5, "relu", 1], + ["uir", 256, 1024, 3, 5, "relu", 1], + ["uir", 256, 1024, 3, 5, "relu", 1], + ["uir", 256, 1024, 0, 0, "relu", 1], + ["uir", 256, 1024, 3, 0, "relu", 1], + ["uir", 256, 512, 3, 5, "relu", 1], + ["uir", 256, 1024, 5, 5, "relu", 1], + ["uir", 256, 1024, 0, 0, "relu", 1], + ["uir", 256, 1024, 0, 0, "relu", 1], + ["uir", 256, 512, 5, 0, "relu", 1], + # stage 4, 7x7 + ["cn", 960, 1, "relu", 1], + ], + "conv_large": [ + # stem_size = 24 + ["er", 48, 96, 3, "relu", 2], + # stage 1, 56x56 + ["uir", 96, 192, 3, 5, "relu", 2], + ["uir", 96, 384, 3, 3, "relu", 1], + # stage 2, 28x28 in + ["uir", 192, 384, 3, 5, "relu", 2], + ["uir", 192, 768, 3, 3, "relu", 1], + ["uir", 192, 768, 3, 3, "relu", 1], + ["uir", 192, 
768, 3, 3, "relu", 1], + ["uir", 192, 768, 3, 5, "relu", 1], + ["uir", 192, 768, 5, 3, "relu", 1], + ["uir", 192, 768, 5, 3, "relu", 1], + ["uir", 192, 768, 5, 3, "relu", 1], + ["uir", 192, 768, 5, 3, "relu", 1], + ["uir", 192, 768, 5, 3, "relu", 1], + ["uir", 192, 768, 3, 0, "relu", 1], + # stage 3, 14x14 in + ["uir", 512, 768, 5, 5, "relu", 2], + ["uir", 512, 2048, 5, 5, "relu", 1], + ["uir", 512, 2048, 5, 5, "relu", 1], + ["uir", 512, 2048, 5, 5, "relu", 1], + ["uir", 512, 2048, 5, 0, "relu", 1], + ["uir", 512, 2048, 5, 3, "relu", 1], + ["uir", 512, 2048, 5, 0, "relu", 1], + ["uir", 512, 2048, 5, 0, "relu", 1], + ["uir", 512, 2048, 5, 3, "relu", 1], + ["uir", 512, 2048, 5, 5, "relu", 1], + ["uir", 512, 2048, 5, 0, "relu", 1], + ["uir", 512, 2048, 5, 0, "relu", 1], + ["uir", 512, 2048, 5, 0, "relu", 1], + # stage 4, 7x7 + ["cn", 960, 1, "relu", 1], + ], + "hybrid_medium": [ + # stem_size = 32 + ["er", 48, 128, 3, "relu", 2], + # stage 1, 56x56 + ["uir", 80, 192, 3, 5, "relu", 2], + ["uir", 80, 160, 3, 3, "relu", 1], + # stage 2, 28x28 + ["uir", 160, 480, 3, 5, "relu", 2], + ["uir", 160, 320, 0, 0, "relu", 1], + ["uir", 160, 640, 3, 3, "relu", 1], + ["uir", 160, 640, 3, 5, "relu", 1], + # type, out, kv_dim, kernal_size, kv_stride, act, stride + ["mqa", 160, 64, 3, 4, 2, "relu", 1], + ["uir", 160, 640, 3, 3, "relu", 1], + ["mqa", 160, 64, 3, 4, 2, "relu", 1], + ["uir", 160, 640, 3, 0, "relu", 1], + ["mqa", 160, 64, 3, 4, 2, "relu", 1], + ["uir", 160, 640, 3, 3, "relu", 1], + ["mqa", 160, 64, 3, 4, 2, "relu", 1], + ["uir", 160, 640, 3, 0, "relu", 1], + # stage 3, 14x14 + ["uir", 256, 960, 5, 5, "relu", 2], + ["uir", 256, 1024, 5, 5, "relu", 1], + ["uir", 256, 1024, 3, 5, "relu", 1], + ["uir", 256, 1024, 3, 5, "relu", 1], + ["uir", 256, 512, 0, 0, "relu", 1], + ["uir", 256, 512, 3, 5, "relu", 1], + ["uir", 256, 512, 0, 0, "relu", 1], + ["uir", 256, 1024, 0, 0, "relu", 1], + ["mqa", 256, 64, 3, 4, 1, "relu", 1], + ["uir", 256, 1024, 3, 0, "relu", 1], + ["mqa", 256, 64, 3, 4, 1, "relu", 1], + ["uir", 256, 1024, 5, 5, "relu", 1], + ["mqa", 256, 64, 3, 4, 1, "relu", 1], + ["uir", 256, 1024, 5, 0, "relu", 1], + ["mqa", 256, 64, 3, 4, 1, "relu", 1], + ["uir", 256, 1024, 5, 0, "relu", 1], + # stage 4, 7x7 + ["cn", 960, 1, "relu", 1], + ], + "hybrid_large": [ + # stem_size = 24 + ["er", 48, 96, 3, "gelu", 2], + # stage 1, 56x56 + ["uir", 96, 192, 3, 5, "gelu", 2], + ["uir", 96, 384, 3, 3, "gelu", 1], + # stage 2, 28x28 in + ["uir", 192, 384, 3, 5, "gelu", 2], + ["uir", 192, 768, 3, 3, "gelu", 1], + ["uir", 192, 768, 3, 3, "gelu", 1], + ["uir", 192, 768, 3, 3, "gelu", 1], + ["uir", 192, 768, 3, 5, "gelu", 1], + ["uir", 192, 768, 5, 3, "gelu", 1], + ["uir", 192, 768, 5, 3, "gelu", 1], + ["mqa", 192, 48, 3, 8, 2, "gelu", 1], + ["uir", 192, 768, 5, 3, "gelu", 1], + ["mqa", 192, 48, 3, 8, 2, "gelu", 1], + ["uir", 192, 768, 5, 3, "gelu", 1], + ["mqa", 192, 48, 3, 8, 2, "gelu", 1], + ["uir", 192, 768, 5, 3, "gelu", 1], + ["mqa", 192, 48, 3, 8, 2, "gelu", 1], + ["uir", 192, 768, 3, 0, "gelu", 1], + # stage 3, 14x14 + ["uir", 512, 768, 5, 5, "gelu", 2], + ["uir", 512, 2048, 5, 5, "gelu", 1], + ["uir", 512, 2048, 5, 5, "gelu", 1], + ["uir", 512, 2048, 5, 5, "gelu", 1], + ["uir", 512, 2048, 5, 0, "gelu", 1], + ["uir", 512, 2048, 5, 3, "gelu", 1], + ["uir", 512, 2048, 5, 0, "gelu", 1], + ["uir", 512, 2048, 5, 0, "gelu", 1], + ["uir", 512, 2048, 5, 3, "gelu", 1], + ["uir", 512, 2048, 5, 5, "gelu", 1], + ["mqa", 512, 64, 3, 8, 1, "gelu", 1], + ["uir", 512, 2048, 5, 0, "gelu", 1], + ["mqa", 512, 64, 3, 8, 1, 
"gelu", 1], + ["uir", 512, 2048, 5, 0, "gelu", 1], + ["mqa", 512, 64, 3, 8, 1, "gelu", 1], + ["uir", 512, 2048, 5, 0, "gelu", 1], + ["mqa", 512, 64, 3, 8, 1, "gelu", 1], + ["uir", 512, 2048, 5, 0, "gelu", 1], + # stage 4, 7x7 + ["cn", 960, 1, "gelu", 1], + ] +} + +MODEL_STAGES_PATTERN = { + "MobileNetV4_conv_small": + ["blocks[1]", "blocks[3]", "blocks[9]", "blocks[15]", "blocks[16]"], + "MobileNetV4_conv_medium": + ["blocks[0]", "blocks[2]", "blocks[10]", "blocks[21]", "blocks[22]"], + "MobileNetV4_conv_large": + ["blocks[0]", "blocks[2]", "blocks[13]", "blocks[26]", "blocks[27]"], + "MobileNetV4_hybrid_medium": + ["blocks[0]", "blocks[2]", "blocks[14]", "blocks[30]", "blocks[31]"], + "MobileNetV4_hybrid_large": + ["blocks[0]", "blocks[2]", "blocks[17]", "blocks[35]", "blocks[36]"], +} + + +def _make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _create_act(act): + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act == "gelu": + return nn.GELU(approximate=False) + elif act is None: + return None + else: + raise RuntimeError( + "The activation function is not supported: {}".format(act)) + + +class ConvBnAct(TheseusLayer): + def __init__(self, + in_c, + out_c, + filter_size, + stride, + padding, + num_groups=1, + drop_path_rate=0.0, + if_act=True, + act=None): + super().__init__() + + self.drop_path_rate = drop_path_rate + self.conv = Conv2D( + in_channels=in_c, + out_channels=out_c, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + bias_attr=False) + self.bn = BatchNorm( + num_channels=out_c, + act=None, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.if_act = if_act + if self.if_act: + self.act = _create_act(act) + if self.drop_path_rate > 0: + self.drop_path = DropPath(drop_path_rate) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + x = self.act(x) + if self.drop_path_rate > 0: + x = self.drop_path(x) + return x + + +class EdgeResidual(TheseusLayer): + def __init__(self, + in_c, + mid_c, + out_c, + filter_size, + stride, + drop_path_rate=0.0, + if_act=False, + act=None): + super(EdgeResidual, self).__init__() + + self.if_shortcut = stride == 1 and in_c == out_c + self.conv_exp = ConvBnAct( + in_c=in_c, + out_c=mid_c, + filter_size=filter_size, + stride=stride, + padding=int((filter_size - 1) // 2), + if_act=True, + act=act) + self.conv_pwl = ConvBnAct( + in_c=mid_c, + out_c=out_c, + filter_size=1, + stride=1, + padding=0, + if_act=False, + act=act) + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate else Identity() + + def forward(self, x): + identity = x + x = self.conv_exp(x) + x = self.conv_pwl(x) + if self.if_shortcut: + x = paddle.add(identity, self.drop_path(x)) + return x + + +class UniversalInvertedResidual(TheseusLayer): + def __init__(self, + in_c, + mid_c, + out_c, + filter_size, + stem_kernel_size=None, + stride=1, + drop_path_rate=0.0, + layer_scale_init_value=0.0, + if_act=False, + act=None): + super().__init__() + + self.if_shortcut = stride == 1 and in_c == out_c + self.layer_scale_init_value = layer_scale_init_value + if stem_kernel_size: + self.dw_start = ConvBnAct( + in_c=in_c, + out_c=in_c, + filter_size=stem_kernel_size, + stride=1, + padding=int((stem_kernel_size - 1) // 2), + num_groups=in_c, + if_act=False, + 
act=None) + else: + self.dw_start = Identity() + + self.pw_exp = ConvBnAct( + in_c=in_c, + out_c=mid_c, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + if_act=True, + act=act) + + if filter_size: + self.dw_mid = ConvBnAct( + in_c=mid_c, + out_c=mid_c, + filter_size=filter_size, + stride=stride, + padding=int((filter_size - 1) // 2), + num_groups=mid_c, + if_act=True, + act=act) + else: + self.dw_mid = Identity() + self.pw_proj = ConvBnAct( + in_c=mid_c, + out_c=out_c, + filter_size=1, + stride=1, + padding=0, + if_act=False, + act=None) + if layer_scale_init_value > 0.0: + self.layer_scale = LayerScale2D(out_c, layer_scale_init_value) + + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate else Identity() + + def forward(self, x): + identity = x + x = self.dw_start(x) + x = self.pw_exp(x) + x = self.dw_mid(x) + x = self.pw_proj(x) + if self.layer_scale_init_value > 0.0: + x = self.layer_scale(x) + if self.if_shortcut: + x = paddle.add(identity, self.drop_path(x)) + return x + + +class LayerScale2D(nn.Layer): + def __init__(self, dim, init_values=1e-05, inplace=False): + super().__init__() + self.inplace = inplace + self.gamma = paddle.create_parameter( + shape=[dim], + dtype='float32', + default_initializer=nn.initializer.Constant(init_values)) + + def forward(self, x): + gamma = self.gamma.reshape([1, -1, 1, 1]) + return (x.multiply_(y=paddle.to_tensor(gamma)) + if self.inplace else x * gamma) + + +class MobileAttention(nn.Layer): + def __init__(self, + in_c, + out_c, + filter_size=3, + stride=1, + num_head=4, + query_dim=256, + kv_dim=64, + kv_stride=1, + drop_path_rate=0.0, + attn_drop_rate=0.0, + dropout_prob=0.0, + layer_scale_init_value=0.0, + if_act=True, + act=None, + use_fused_attn=False): + super(MobileAttention, self).__init__() + + self.if_shortcut = stride == 1 and in_c == out_c + self.kv_stride = kv_stride + self.kv_dim = kv_dim + self.num_head = num_head + self.query_dim = query_dim + self.attn_drop_rate = attn_drop_rate + self.use_fused_attn = use_fused_attn + self.norm = BatchNorm( + num_channels=in_c, + act=None, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.query_proj = Conv2D( + in_channels=in_c, + out_channels=query_dim, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias_attr=False) + if kv_stride > 1: + self.key_down_proj = ConvBnAct( + in_c=in_c, + out_c=in_c, + filter_size=filter_size, + stride=kv_stride, + padding=int((filter_size - 1) // 2), + num_groups=in_c, + if_act=False) + self.value_down_proj = ConvBnAct( + in_c=in_c, + out_c=in_c, + filter_size=filter_size, + stride=kv_stride, + padding=int((filter_size - 1) // 2), + num_groups=in_c, + if_act=False) + self.key_proj = Conv2D( + in_channels=in_c, + out_channels=kv_dim, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias_attr=False) + self.value_proj = Conv2D( + in_channels=in_c, + out_channels=kv_dim, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias_attr=False) + self.proj = Conv2D( + in_channels=query_dim, + out_channels=out_c, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias_attr=False) + if not self.use_fused_attn: + self.scale = query_dim**-0.5 + self.softmax = nn.Softmax(-1) + self.attn_drop = Dropout(self.attn_drop_rate) + self.drop = Dropout(dropout_prob) + self.layer_scale_init_value = layer_scale_init_value + if layer_scale_init_value > 0.0: + self.layer_scale = LayerScale2D(out_c, layer_scale_init_value) + self.drop_path = (DropPath(drop_path_rate) + if drop_path_rate else 
Identity()) + + def forward(self, x, attn_mask=None): + identity = x + x = self.norm(x) + B, C, H, W = tuple(x.shape) + q = self.query_proj(x).reshape( + [B, self.num_head, self.query_dim // self.num_head, H * W]) + q = q.transpose([0, 3, 1, 2]) + if self.kv_stride > 1: + k = self.key_proj(self.key_down_proj(x)) + v = self.value_proj(self.value_down_proj(x)) + else: + k = self.key_proj(x) + v = self.value_proj(x) + k = k.reshape( + [B, self.kv_dim, 1, H // self.kv_stride * W // self.kv_stride]) + k = k.transpose([0, 3, 2, 1]) + v = v.reshape( + [B, self.kv_dim, 1, H // self.kv_stride * W // self.kv_stride]) + v = v.transpose([0, 3, 2, 1]) + if self.use_fused_attn: + attn = F.scaled_dot_product_attention( + query=q, + key=k, + value=v, + attn_mask=attn_mask, + dropout_p=self.attn_drop_rate if self.training else 0.0) + else: + q = q.transpose([0, 2, 1, 3]) * self.scale + v = v.transpose([0, 2, 1, 3]) + attn = q @ k.transpose([0, 2, 3, 1]) + attn = self.softmax(attn) + attn = self.attn_drop(attn) + attn = attn @ v + attn = attn.transpose([0, 2, 1, 3]) + + attn = attn.reshape([B, H, W, self.query_dim]) + x = self.proj(attn.transpose([0, 3, 1, 2])) + x = self.drop(x) + if self.layer_scale_init_value > 0.0: + x = self.layer_scale(x) + if self.if_shortcut: + x = paddle.add(identity, self.drop_path(x)) + return x + + +class MobileNetV4(TheseusLayer): + """ + MobileNetV4 + Args: + config: list. MobileNetV4 depthwise blocks config. + stages_pattern: list. The pattern of each stage blocks. + scale: float=1.0. The coefficient that controls the size of network parameters. + class_num: int=1000. The number of classes. + inplanes: int=32. The output channel number of first convolution layer. + act: str="relu". The activation function. + class_expand: int=960. The output channel number of last convolution layer. + drop_path_rate: float=0.0. Probability of dropping path. + drop_rate: float=0.0. Probability of setting units to zero. + Returns: + model: nn.Layer. Specific MobileNetV4 model depends on args. 
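+        Example:
+            # usage sketch; the MobileNetV4_* factory functions below are the
+            # intended entry points and fill these arguments in
+            model = MobileNetV4(
+                config=NET_CONFIG["conv_small"],
+                stages_pattern=MODEL_STAGES_PATTERN["MobileNetV4_conv_small"])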
+ """ + def __init__(self, + config, + stages_pattern, + scale=1.0, + class_num=1000, + inplanes=STEM_CONV_NUMBER, + class_expand=LAST_CONV, + act="relu", + drop_path_rate=0.0, + drop_rate=0.0, + layer_scale_init_value=0.0, + return_patterns=None, + return_stages=None, + use_fused_attn=False, + **kwargs): + super(MobileNetV4, self).__init__() + self.cfg = config + self.scale = scale + self.drop_path_rate = drop_path_rate + self.inplanes = inplanes + self.class_expand = class_expand + self.class_num = class_num + self.conv_stem = ConvBnAct( + in_c=3, + out_c=_make_divisible(self.inplanes * self.scale), + filter_size=3, + stride=2, + padding=1, + if_act=True, + act=act) + + blocks = [] + block_count = len(self.cfg) + for i in range(block_count): + type = self.cfg[i][0] + if type == "cn": + _, exp, k, act, s = self.cfg[i] + block = ConvBnAct( + in_c=_make_divisible(self.inplanes * self.scale if i == 0 + else self.cfg[i - 1][1] * self.scale), + out_c=_make_divisible(exp * self.scale), + filter_size=k, + stride=s, + padding=int((k - 1) // 2), + num_groups=1, + drop_path_rate=self.drop_path_rate * i / block_count, + if_act=True, + act=act) + elif type == "uir": + _, c, exp, k_start, k, act, s = self.cfg[i] + block = UniversalInvertedResidual( + in_c=_make_divisible(self.inplanes * self.scale if i == 0 + else self.cfg[i - 1][1] * self.scale), + mid_c=_make_divisible(self.scale * exp), + out_c=_make_divisible(self.scale * c), + filter_size=k, + stem_kernel_size=k_start, + stride=s, + drop_path_rate=self.drop_path_rate * i / block_count, + layer_scale_init_value=layer_scale_init_value, + if_act=True, + act=act) + elif type == "er": + _, c, exp, k, act, s = self.cfg[i] + block = EdgeResidual( + in_c=_make_divisible(self.inplanes * self.scale if i == 0 + else self.cfg[i - 1][1] * self.scale), + mid_c=_make_divisible(self.scale * exp), + out_c=_make_divisible(self.scale * c), + filter_size=k, + stride=s, + drop_path_rate=self.drop_path_rate * i / block_count, + if_act=True, + act=act) + elif type == "mqa": + # type, out,kv_dim, kernal_size, kv_stride, act, stride + _, c, dim, k, head, kv_stride, act, s = self.cfg[i] + block = MobileAttention( + in_c=_make_divisible(self.inplanes * self.scale if i == 0 + else self.cfg[i - 1][1] * self.scale), + out_c=_make_divisible(self.scale * c), + filter_size=k, + stride=s, + num_head=head, + query_dim=_make_divisible(self.scale * head * dim), + kv_dim=_make_divisible(self.scale * dim), + kv_stride=kv_stride, + drop_path_rate=self.drop_path_rate * i / block_count, + layer_scale_init_value=layer_scale_init_value, + if_act=True, + act=act, + use_fused_attn=use_fused_attn) + blocks.append(block) + self.blocks = nn.Sequential(*blocks) + self.global_pool = AdaptiveAvgPool2D(1) + self.conv_head = ConvBnAct( + in_c=_make_divisible(self.scale * self.cfg[-1][1]), + out_c=self.class_expand, + filter_size=1, + stride=1, + padding=0, + if_act=True, + act=act) + self.flatten = Flatten(start_axis=1, stop_axis=-1) + self.dropout = Dropout(drop_rate) + self.classifier = Linear(self.class_expand, + class_num) if class_num > 0 else Identity() + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward_features(self, x): + x = self.conv_stem(x) + x = self.blocks(x) + return x + + def forward_head(self, x, pre_logits=False): + x = self.global_pool(x) + x = self.conv_head(x) + x = self.flatten(x) + if pre_logits: + return x + return self.classifier(x) + + def forward(self, x): + x = self.forward_features(x) + x = 
self.forward_head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError("pretrained type is not available. ") + + +def MobileNetV4_conv_small(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV4_conv_small + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV4_conv_small` model depends on args. + """ + model = MobileNetV4( + config=NET_CONFIG["conv_small"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV4_conv_small"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV4_conv_small"], + use_ssld) + return model + + +def MobileNetV4_conv_medium(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV4_conv_medium + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV4_conv_medium` model depends on args. + """ + model = MobileNetV4( + config=NET_CONFIG["conv_medium"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV4_conv_medium"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV4_conv_medium"], + use_ssld) + return model + + +def MobileNetV4_conv_large(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV4_conv_large + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV4_conv_large` model depends on args. + """ + model = MobileNetV4( + config=NET_CONFIG["conv_large"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV4_conv_large"], + inplanes=24, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV4_conv_large"], + use_ssld) + return model + + +def MobileNetV4_hybrid_medium(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV4_hybrid_medium + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV4_hybrid_medium` model depends on args. + """ + model = MobileNetV4( + config=NET_CONFIG["hybrid_medium"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV4_hybrid_medium"], + layer_scale_init_value=1e-05, + **kwargs) + _load_pretrained(pretrained, model, + MODEL_URLS["MobileNetV4_hybrid_medium"], use_ssld) + return model + + +def MobileNetV4_hybrid_large(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV4_hybrid_large + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. 
+ Returns: + model: nn.Layer. Specific `MobileNetV4_hybrid_large` model depends on args. + """ + model = MobileNetV4( + config=NET_CONFIG["hybrid_large"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV4_hybrid_large"], + inplanes=24, + act="gelu", + layer_scale_init_value=1e-05, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV4_hybrid_large"], + use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_hgnet.py b/example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_hgnet.py new file mode 100755 index 000000000..04a936f85 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_hgnet.py @@ -0,0 +1,376 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import KaimingNormal, Constant +from paddle.nn import Conv2D, BatchNorm2D, ReLU, MaxPool2D +from .custom_devices_layers import AdaptiveAvgPool2D +from paddle.regularizer import L2Decay +from paddle import ParamAttr + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "PPHGNet_tiny": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNet_tiny_pretrained.pdparams", + "PPHGNet_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNet_small_pretrained.pdparams", + "PPHGNet_base": "" +} + +__all__ = list(MODEL_URLS.keys()) + +kaiming_normal_ = KaimingNormal() +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) 
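+
+# These initializers are applied in PPHGNet._init_weights() below: KaimingNormal
+# for Conv2D weights, ones/zeros for BatchNorm2D weight/bias, zeros for Linear bias.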
+ + +class ConvBNAct(TheseusLayer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + use_act=True): + super().__init__() + self.use_act = use_act + self.conv = Conv2D( + in_channels, + out_channels, + kernel_size, + stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias_attr=False) + self.bn = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + if self.use_act: + self.act = ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.use_act: + x = self.act(x) + return x + + +class ESEModule(TheseusLayer): + def __init__(self, channels): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv = Conv2D( + in_channels=channels, + out_channels=channels, + kernel_size=1, + stride=1, + padding=0) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv(x) + x = self.sigmoid(x) + return paddle.multiply(x=identity, y=x) + + +class HG_Block(TheseusLayer): + def __init__( + self, + in_channels, + mid_channels, + out_channels, + layer_num, + identity=False, ): + super().__init__() + self.identity = identity + + self.layers = nn.LayerList() + self.layers.append( + ConvBNAct( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=3, + stride=1)) + for _ in range(layer_num - 1): + self.layers.append( + ConvBNAct( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=3, + stride=1)) + + # feature aggregation + total_channels = in_channels + layer_num * mid_channels + self.aggregation_conv = ConvBNAct( + in_channels=total_channels, + out_channels=out_channels, + kernel_size=1, + stride=1) + self.att = ESEModule(out_channels) + + def forward(self, x): + identity = x + output = [] + output.append(x) + for layer in self.layers: + x = layer(x) + output.append(x) + x = paddle.concat(output, axis=1) + x = self.aggregation_conv(x) + x = self.att(x) + if self.identity: + x += identity + return x + + +class HG_Stage(TheseusLayer): + def __init__(self, + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + downsample=True): + super().__init__() + self.downsample = downsample + if downsample: + self.downsample = ConvBNAct( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=2, + groups=in_channels, + use_act=False) + + blocks_list = [] + blocks_list.append( + HG_Block( + in_channels, + mid_channels, + out_channels, + layer_num, + identity=False)) + for _ in range(block_num - 1): + blocks_list.append( + HG_Block( + out_channels, + mid_channels, + out_channels, + layer_num, + identity=True)) + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x): + if self.downsample: + x = self.downsample(x) + x = self.blocks(x) + return x + + +class PPHGNet(TheseusLayer): + """ + PPHGNet + Args: + stem_channels: list. Stem channel list of PPHGNet. + stage_config: dict. The configuration of each stage of PPHGNet. such as the number of channels, stride, etc. + layer_num: int. Number of layers of HG_Block. + use_last_conv: boolean. Whether to use a 1x1 convolutional layer before the classification layer. + class_expand: int=2048. Number of channels for the last 1x1 convolutional layer. + dropout_prob: float. Parameters of dropout, 0.0 means dropout is not used. + class_num: int=1000. The number of classes. + Returns: + model: nn.Layer. Specific PPHGNet model depends on args. 
+ """ + + def __init__(self, + stem_channels, + stage_config, + layer_num, + use_last_conv=True, + class_expand=2048, + dropout_prob=0.0, + class_num=1000, + **kwargs): + super().__init__() + self.use_last_conv = use_last_conv + self.class_expand = class_expand + + # stem + stem_channels.insert(0, 3) + self.stem = nn.Sequential(* [ + ConvBNAct( + in_channels=stem_channels[i], + out_channels=stem_channels[i + 1], + kernel_size=3, + stride=2 if i == 0 else 1) for i in range( + len(stem_channels) - 1) + ]) + self.pool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + # stages + self.stages = nn.LayerList() + for k in stage_config: + in_channels, mid_channels, out_channels, block_num, downsample = stage_config[ + k] + self.stages.append( + HG_Stage(in_channels, mid_channels, out_channels, block_num, + layer_num, downsample)) + + self.avg_pool = AdaptiveAvgPool2D(1) + if self.use_last_conv: + self.last_conv = Conv2D( + in_channels=out_channels, + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.act = nn.ReLU() + self.dropout = nn.Dropout( + p=dropout_prob, mode="downscale_in_infer") + + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + self.fc = nn.Linear(self.class_expand + if self.use_last_conv else out_channels, class_num) + + self._init_weights() + + def _init_weights(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight) + elif isinstance(m, (nn.BatchNorm2D)): + ones_(m.weight) + zeros_(m.bias) + elif isinstance(m, nn.Linear): + zeros_(m.bias) + + def forward(self, x): + x = self.stem(x) + x = self.pool(x) + + for stage in self.stages: + x = stage(x) + + x = self.avg_pool(x) + if self.use_last_conv: + x = self.last_conv(x) + x = self.act(x) + x = self.dropout(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def PPHGNet_tiny(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNet_tiny + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPHGNet_tiny` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, blocks, downsample + "stage1": [96, 96, 224, 1, False], + "stage2": [224, 128, 448, 1, True], + "stage3": [448, 160, 512, 2, True], + "stage4": [512, 192, 768, 1, True], + } + + model = PPHGNet( + stem_channels=[48, 48, 96], + stage_config=stage_config, + layer_num=5, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNet_tiny"], use_ssld) + return model + + +def PPHGNet_small(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNet_small + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPHGNet_small` model depends on args. 
+ """ + stage_config = { + # in_channels, mid_channels, out_channels, blocks, downsample + "stage1": [128, 128, 256, 1, False], + "stage2": [256, 160, 512, 1, True], + "stage3": [512, 192, 768, 2, True], + "stage4": [768, 224, 1024, 1, True], + } + + model = PPHGNet( + stem_channels=[64, 64, 128], + stage_config=stage_config, + layer_num=6, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNet_small"], use_ssld) + return model + + +def PPHGNet_base(pretrained=False, use_ssld=True, **kwargs): + """ + PPHGNet_base + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPHGNet_base` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, blocks, downsample + "stage1": [160, 192, 320, 1, False], + "stage2": [320, 224, 640, 2, True], + "stage3": [640, 256, 960, 3, True], + "stage4": [960, 288, 1280, 2, True], + } + + model = PPHGNet( + stem_channels=[96, 96, 160], + stage_config=stage_config, + layer_num=7, + dropout_prob=0.2, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNet_base"], use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_hgnet_v2.py b/example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_hgnet_v2.py new file mode 100755 index 000000000..3f554c82c --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_hgnet_v2.py @@ -0,0 +1,706 @@ +# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import KaimingNormal, Constant +from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D +from paddle.regularizer import L2Decay +from paddle import ParamAttr + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "PPHGNetV2_B0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNetV2_B0_ssld_pretrained.pdparams", + "PPHGNetV2_B1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNetV2_B1_ssld_pretrained.pdparams", + "PPHGNetV2_B2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNetV2_B2_ssld_pretrained.pdparams", + "PPHGNetV2_B3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNetV2_B3_ssld_pretrained.pdparams", + "PPHGNetV2_B4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNetV2_B4_ssld_pretrained.pdparams", + "PPHGNetV2_B5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNetV2_B5_ssld_pretrained.pdparams", + "PPHGNetV2_B6": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNetV2_B6_ssld_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +kaiming_normal_ = KaimingNormal() +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +class LearnableAffineBlock(TheseusLayer): + """ + Create a learnable affine block module. This module can significantly improve accuracy on smaller models. + + Args: + scale_value (float): The initial value of the scale parameter, default is 1.0. + bias_value (float): The initial value of the bias parameter, default is 0.0. + lr_mult (float): The learning rate multiplier, default is 1.0. + lab_lr (float): The learning rate, default is 0.01. + """ + + def __init__(self, + scale_value=1.0, + bias_value=0.0, + lr_mult=1.0, + lab_lr=0.01): + super().__init__() + self.scale = self.create_parameter( + shape=[1, ], + default_initializer=Constant(value=scale_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr)) + self.add_parameter("scale", self.scale) + self.bias = self.create_parameter( + shape=[1, ], + default_initializer=Constant(value=bias_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr)) + self.add_parameter("bias", self.bias) + + def forward(self, x): + return self.scale * x + self.bias + + +class ConvBNAct(TheseusLayer): + """ + ConvBNAct is a combination of convolution and batchnorm layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int): Size of the convolution kernel. Defaults to 3. + stride (int): Stride of the convolution. Defaults to 1. + padding (int/str): Padding or padding type for the convolution. Defaults to 1. + groups (int): Number of groups for the convolution. Defaults to 1. + use_act: (bool): Whether to use activation function. Defaults to True. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. 
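+        Example:
+            # sketch: a 3x3 stride-2 ConvBNAct halves the spatial resolution
+            conv = ConvBNAct(32, 64, kernel_size=3, stride=2, use_lab=True)
+            y = conv(paddle.rand([1, 32, 56, 56]))  # -> [1, 64, 28, 28]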
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + groups=1, + use_act=True, + use_lab=False, + lr_mult=1.0): + super().__init__() + self.use_act = use_act + self.use_lab = use_lab + self.conv = Conv2D( + in_channels, + out_channels, + kernel_size, + stride, + padding=padding + if isinstance(padding, str) else (kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=False) + self.bn = BatchNorm2D( + out_channels, + weight_attr=ParamAttr( + regularizer=L2Decay(0.0), learning_rate=lr_mult), + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), learning_rate=lr_mult)) + if self.use_act: + self.act = ReLU() + if self.use_lab: + self.lab = LearnableAffineBlock(lr_mult=lr_mult) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.use_act: + x = self.act(x) + if self.use_lab: + x = self.lab(x) + return x + + +class LightConvBNAct(TheseusLayer): + """ + LightConvBNAct is a combination of pw and dw layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int): Size of the depth-wise convolution kernel. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + use_lab=False, + lr_mult=1.0, + **kwargs): + super().__init__() + self.conv1 = ConvBNAct( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + use_act=False, + use_lab=use_lab, + lr_mult=lr_mult) + self.conv2 = ConvBNAct( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + groups=out_channels, + use_act=True, + use_lab=use_lab, + lr_mult=lr_mult) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + +class StemBlock(TheseusLayer): + """ + StemBlock for PP-HGNetV2. + + Args: + in_channels (int): Number of input channels. + mid_channels (int): Number of middle channels. + out_channels (int): Number of output channels. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. 
+ """ + + def __init__(self, + in_channels, + mid_channels, + out_channels, + use_lab=False, + lr_mult=1.0): + super().__init__() + self.stem1 = ConvBNAct( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=3, + stride=2, + use_lab=use_lab, + lr_mult=lr_mult) + self.stem2a = ConvBNAct( + in_channels=mid_channels, + out_channels=mid_channels // 2, + kernel_size=2, + stride=1, + padding="SAME", + use_lab=use_lab, + lr_mult=lr_mult) + self.stem2b = ConvBNAct( + in_channels=mid_channels // 2, + out_channels=mid_channels, + kernel_size=2, + stride=1, + padding="SAME", + use_lab=use_lab, + lr_mult=lr_mult) + self.stem3 = ConvBNAct( + in_channels=mid_channels * 2, + out_channels=mid_channels, + kernel_size=3, + stride=2, + use_lab=use_lab, + lr_mult=lr_mult) + self.stem4 = ConvBNAct( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult) + self.pool = nn.MaxPool2D( + kernel_size=2, stride=1, ceil_mode=True, padding="SAME") + + def forward(self, x): + x = self.stem1(x) + x2 = self.stem2a(x) + x2 = self.stem2b(x2) + x1 = self.pool(x) + x = paddle.concat([x1, x2], 1) + x = self.stem3(x) + x = self.stem4(x) + + return x + + +class HGV2_Block(TheseusLayer): + """ + HGV2_Block, the basic unit that constitutes the HGV2_Stage. + + Args: + in_channels (int): Number of input channels. + mid_channels (int): Number of middle channels. + out_channels (int): Number of output channels. + kernel_size (int): Size of the convolution kernel. Defaults to 3. + layer_num (int): Number of layers in the HGV2 block. Defaults to 6. + stride (int): Stride of the convolution. Defaults to 1. + padding (int/str): Padding or padding type for the convolution. Defaults to 1. + groups (int): Number of groups for the convolution. Defaults to 1. + use_act (bool): Whether to use activation function. Defaults to True. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. + """ + + def __init__(self, + in_channels, + mid_channels, + out_channels, + kernel_size=3, + layer_num=6, + identity=False, + light_block=True, + use_lab=False, + lr_mult=1.0): + super().__init__() + self.identity = identity + + self.layers = nn.LayerList() + block_type = "LightConvBNAct" if light_block else "ConvBNAct" + for i in range(layer_num): + self.layers.append( + eval(block_type)(in_channels=in_channels + if i == 0 else mid_channels, + out_channels=mid_channels, + stride=1, + kernel_size=kernel_size, + use_lab=use_lab, + lr_mult=lr_mult)) + # feature aggregation + total_channels = in_channels + layer_num * mid_channels + self.aggregation_squeeze_conv = ConvBNAct( + in_channels=total_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult) + self.aggregation_excitation_conv = ConvBNAct( + in_channels=out_channels // 2, + out_channels=out_channels, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult) + + def forward(self, x): + identity = x + output = [] + output.append(x) + for layer in self.layers: + x = layer(x) + output.append(x) + x = paddle.concat(output, axis=1) + x = self.aggregation_squeeze_conv(x) + x = self.aggregation_excitation_conv(x) + if self.identity: + x += identity + return x + + +class HGV2_Stage(TheseusLayer): + """ + HGV2_Stage, the basic unit that constitutes the PPHGNetV2. + + Args: + in_channels (int): Number of input channels. + mid_channels (int): Number of middle channels. 
+ out_channels (int): Number of output channels. + block_num (int): Number of blocks in the HGV2 stage. + layer_num (int): Number of layers in the HGV2 block. Defaults to 6. + is_downsample (bool): Whether to use downsampling operation. Defaults to False. + light_block (bool): Whether to use light block. Defaults to True. + kernel_size (int): Size of the convolution kernel. Defaults to 3. + use_lab (bool, optional): Whether to use the LAB operation. Defaults to False. + lr_mult (float, optional): Learning rate multiplier for the layer. Defaults to 1.0. + """ + + def __init__(self, + in_channels, + mid_channels, + out_channels, + block_num, + layer_num=6, + is_downsample=True, + light_block=True, + kernel_size=3, + use_lab=False, + lr_mult=1.0): + + super().__init__() + self.is_downsample = is_downsample + if self.is_downsample: + self.downsample = ConvBNAct( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=2, + groups=in_channels, + use_act=False, + use_lab=use_lab, + lr_mult=lr_mult) + + blocks_list = [] + for i in range(block_num): + blocks_list.append( + HGV2_Block( + in_channels=in_channels if i == 0 else out_channels, + mid_channels=mid_channels, + out_channels=out_channels, + kernel_size=kernel_size, + layer_num=layer_num, + identity=False if i == 0 else True, + light_block=light_block, + use_lab=use_lab, + lr_mult=lr_mult)) + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x): + if self.is_downsample: + x = self.downsample(x) + x = self.blocks(x) + return x + + +class PPHGNetV2(TheseusLayer): + """ + PPHGNetV2 + + Args: + stage_config (dict): Config for PPHGNetV2 stages. such as the number of channels, stride, etc. + stem_channels: (list): Number of channels of the stem of the PPHGNetV2. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + use_last_conv (bool): Whether to use the last conv layer as the output channel. Defaults to True. + class_expand (int): Number of channels for the last 1x1 convolutional layer. + drop_prob (float): Dropout probability for the last 1x1 convolutional layer. Defaults to 0.0. + class_num (int): The number of classes for the classification layer. Defaults to 1000. + lr_mult_list (list): Learning rate multiplier for the stages. Defaults to [1.0, 1.0, 1.0, 1.0, 1.0]. + Returns: + model: nn.Layer. Specific PPHGNetV2 model depends on args. 
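+        Example:
+            # usage sketch; the PPHGNetV2_B* factories below supply stage_config
+            model = PPHGNetV2_B0()
+            logits = model(paddle.rand([1, 3, 224, 224]))  # shape [1, 1000]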
+ """ + + def __init__(self, + stage_config, + stem_channels=[3, 32, 64], + use_lab=False, + use_last_conv=True, + class_expand=2048, + dropout_prob=0.0, + class_num=1000, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + **kwargs): + super().__init__() + self.use_lab = use_lab + self.use_last_conv = use_last_conv + self.class_expand = class_expand + self.class_num = class_num + + # stem + self.stem = StemBlock( + in_channels=stem_channels[0], + mid_channels=stem_channels[1], + out_channels=stem_channels[2], + use_lab=use_lab, + lr_mult=lr_mult_list[0]) + + # stages + self.stages = nn.LayerList() + for i, k in enumerate(stage_config): + in_channels, mid_channels, out_channels, block_num, is_downsample, light_block, kernel_size, layer_num = stage_config[ + k] + self.stages.append( + HGV2_Stage( + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + is_downsample, + light_block, + kernel_size, + use_lab, + lr_mult=lr_mult_list[i + 1])) + + self.avg_pool = AdaptiveAvgPool2D(1) + + if self.use_last_conv: + self.last_conv = Conv2D( + in_channels=out_channels, + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.act = ReLU() + if self.use_lab: + self.lab = LearnableAffineBlock() + self.dropout = nn.Dropout( + p=dropout_prob, mode="downscale_in_infer") + + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + self.fc = nn.Linear(self.class_expand if self.use_last_conv else + out_channels, self.class_num) + + self._init_weights() + + def _init_weights(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight) + elif isinstance(m, (nn.BatchNorm2D)): + ones_(m.weight) + zeros_(m.bias) + elif isinstance(m, nn.Linear): + zeros_(m.bias) + + def forward(self, x): + x = self.stem(x) + + for stage in self.stages: + x = stage(x) + x = self.avg_pool(x) + + if self.use_last_conv: + x = self.last_conv(x) + x = self.act(x) + if self.use_lab: + x = self.lab(x) + x = self.dropout(x) + x = self.flatten(x) + x = self.fc(x) + + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain( + model, + model_url, + use_ssld=use_ssld, + use_ssld_stage1_pretrained=True) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def PPHGNetV2_B0(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNetV2_B0 + Args: + pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld (bool) Whether using ssld pretrained model when pretrained is True. + Returns: + model: nn.Layer. Specific `PPHGNetV2_B0` model depends on args. 
+ """ + stage_config = { + # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num + "stage1": [16, 16, 64, 1, False, False, 3, 3], + "stage2": [64, 32, 256, 1, True, False, 3, 3], + "stage3": [256, 64, 512, 2, True, True, 5, 3], + "stage4": [512, 128, 1024, 1, True, True, 5, 3], + } + + model = PPHGNetV2( + stem_channels=[3, 16, 16], + stage_config=stage_config, + use_lab=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNetV2_B0"], use_ssld) + return model + + +def PPHGNetV2_B1(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNetV2_B1 + Args: + pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld (bool) Whether using ssld pretrained model when pretrained is True. + Returns: + model: nn.Layer. Specific `PPHGNetV2_B1` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 64, 1, False, False, 3, 3], + "stage2": [64, 48, 256, 1, True, False, 3, 3], + "stage3": [256, 96, 512, 2, True, True, 5, 3], + "stage4": [512, 192, 1024, 1, True, True, 5, 3], + } + + model = PPHGNetV2( + stem_channels=[3, 24, 32], + stage_config=stage_config, + use_lab=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNetV2_B1"], use_ssld) + return model + + +def PPHGNetV2_B2(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNetV2_B2 + Args: + pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld (bool) Whether using ssld pretrained model when pretrained is True. + Returns: + model: nn.Layer. Specific `PPHGNetV2_B2` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 96, 1, False, False, 3, 4], + "stage2": [96, 64, 384, 1, True, False, 3, 4], + "stage3": [384, 128, 768, 3, True, True, 5, 4], + "stage4": [768, 256, 1536, 1, True, True, 5, 4], + } + + model = PPHGNetV2( + stem_channels=[3, 24, 32], + stage_config=stage_config, + use_lab=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNetV2_B2"], use_ssld) + return model + + +def PPHGNetV2_B3(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNetV2_B3 + Args: + pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld (bool) Whether using ssld pretrained model when pretrained is True. + Returns: + model: nn.Layer. Specific `PPHGNetV2_B3` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 128, 1, False, False, 3, 5], + "stage2": [128, 64, 512, 1, True, False, 3, 5], + "stage3": [512, 128, 1024, 3, True, True, 5, 5], + "stage4": [1024, 256, 2048, 1, True, True, 5, 5], + } + + model = PPHGNetV2( + stem_channels=[3, 24, 32], + stage_config=stage_config, + use_lab=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNetV2_B3"], use_ssld) + return model + + +def PPHGNetV2_B4(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNetV2_B4 + Args: + pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. 
+ use_ssld (bool) Whether using ssld pretrained model when pretrained is True. + Returns: + model: nn.Layer. Specific `PPHGNetV2_B4` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num + "stage1": [48, 48, 128, 1, False, False, 3, 6], + "stage2": [128, 96, 512, 1, True, False, 3, 6], + "stage3": [512, 192, 1024, 3, True, True, 5, 6], + "stage4": [1024, 384, 2048, 1, True, True, 5, 6], + } + + model = PPHGNetV2( + stem_channels=[3, 32, 48], + stage_config=stage_config, + use_lab=False, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNetV2_B4"], use_ssld) + return model + + +def PPHGNetV2_B5(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNetV2_B5 + Args: + pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld (bool) Whether using ssld pretrained model when pretrained is True. + Returns: + model: nn.Layer. Specific `PPHGNetV2_B5` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num + "stage1": [64, 64, 128, 1, False, False, 3, 6], + "stage2": [128, 128, 512, 2, True, False, 3, 6], + "stage3": [512, 256, 1024, 5, True, True, 5, 6], + "stage4": [1024, 512, 2048, 2, True, True, 5, 6], + } + + model = PPHGNetV2( + stem_channels=[3, 32, 64], + stage_config=stage_config, + use_lab=False, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNetV2_B5"], use_ssld) + return model + + +def PPHGNetV2_B6(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNetV2_B6 + Args: + pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld (bool) Whether using ssld pretrained model when pretrained is True. + Returns: + model: nn.Layer. Specific `PPHGNetV2_B6` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num + "stage1": [96, 96, 192, 2, False, False, 3, 6], + "stage2": [192, 192, 512, 3, True, False, 3, 6], + "stage3": [512, 384, 1024, 6, True, True, 5, 6], + "stage4": [1024, 768, 2048, 3, True, True, 5, 6], + } + + model = PPHGNetV2( + stem_channels=[3, 48, 96], + stage_config=stage_config, + use_lab=False, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNetV2_B6"], use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_lcnet.py b/example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_lcnet.py new file mode 100755 index 000000000..cc672b0f4 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_lcnet.py @@ -0,0 +1,520 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
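+
+# PP-LCNet is a lightweight CPU backbone built from the depthwise-separable
+# blocks configured in NET_CONFIG below. A minimal usage sketch (illustrative
+# only; it assumes the `ppcls` package from this example is importable):
+#
+#     import paddle
+#     from ppcls.arch.backbone.legendary_models.pp_lcnet import PPLCNet_x1_0
+#
+#     model = PPLCNet_x1_0(pretrained=False, class_num=1000)
+#     logits = model(paddle.rand([1, 3, 224, 224]))  # shape: [1, 1000]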
+ +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.nn import BatchNorm2D, Conv2D, Dropout, Linear +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingNormal + +from .custom_devices_layers import AdaptiveAvgPool2D +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "PPLCNet_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_25_pretrained.pdparams", + "PPLCNet_x0_35": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_35_pretrained.pdparams", + "PPLCNet_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_5_pretrained.pdparams", + "PPLCNet_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_75_pretrained.pdparams", + "PPLCNet_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_0_pretrained.pdparams", + "PPLCNet_x1_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_5_pretrained.pdparams", + "PPLCNet_x2_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x2_0_pretrained.pdparams", + "PPLCNet_x2_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x2_5_pretrained.pdparams" +} + +MODEL_STAGES_PATTERN = { + "PPLCNet": ["blocks2", "blocks3", "blocks4", "blocks5", "blocks6"] +} + +__all__ = list(MODEL_URLS.keys()) + +# Each element(list) represents a depthwise block, which is composed of k, in_c, out_c, s, use_se. +# k: kernel_size +# in_c: input channel number in depthwise block +# out_c: output channel number in depthwise block +# s: stride in depthwise block +# use_se: whether to use SE block + +NET_CONFIG = { + # [k, in_c, out_c, s, use_se] + "blocks2": [[3, 16, 32, 1, False]], + "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], + "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], + "blocks5": [[3, 128, 256, 2, False], [5, 256, 256, 1, False], + [5, 256, 256, 1, False], [5, 256, 256, 1, False], + [5, 256, 256, 1, False], [5, 256, 256, 1, False]], + "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]] +} + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _create_act(act): + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act == "relu6": + return nn.ReLU6() + else: + raise RuntimeError( + "The activation function is not supported: {}".format(act)) + + +def _create_model_urls(model_scale): + model_scale_str = "PPLCNet_x" + str(model_scale).replace('.', '_') + if model_scale_str in MODEL_URLS: + return MODEL_URLS[model_scale_str] + else: + return None + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + num_groups=1, + lr_mult=1.0, + act="hardswish"): + super().__init__() + + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=num_groups, + weight_attr=ParamAttr( + initializer=KaimingNormal(), learning_rate=lr_mult), + bias_attr=False) + + self.bn = 
BatchNorm2D( + num_filters, + weight_attr=ParamAttr( + regularizer=L2Decay(0.0), learning_rate=lr_mult), + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), learning_rate=lr_mult)) + self.act = _create_act(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + return x + + +class DepthwiseSeparable(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + dw_size=3, + use_se=False, + lr_mult=1.0, + act="hardswish"): + super().__init__() + self.use_se = use_se + self.dw_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=num_channels, + filter_size=dw_size, + stride=stride, + num_groups=num_channels, + lr_mult=lr_mult, + act=act) + if use_se: + self.se = SEModule(num_channels, lr_mult=lr_mult) + self.pw_conv = ConvBNLayer( + num_channels=num_channels, + filter_size=1, + num_filters=num_filters, + stride=1, + lr_mult=lr_mult, + act=act) + + def forward(self, x): + x = self.dw_conv(x) + if self.use_se: + x = self.se(x) + x = self.pw_conv(x) + return x + + +class SEModule(TheseusLayer): + def __init__(self, channel, reduction=4, lr_mult=1.0): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult)) + self.relu = nn.ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult)) + self.hardsigmoid = nn.Hardsigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class PPLCNet(TheseusLayer): + def __init__(self, + scale=1.0, + class_num=1000, + dropout_prob=0.2, + class_expand=1280, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + stride_list=[2, 2, 2, 2, 2], + use_last_conv=True, + act="hardswish", + **kwargs): + super().__init__(**kwargs) + self.scale = scale + self.class_expand = class_expand + self.lr_mult_list = lr_mult_list + self.use_last_conv = use_last_conv + self.stride_list = stride_list + self.net_config = NET_CONFIG + if isinstance(self.lr_mult_list, str): + self.lr_mult_list = eval(self.lr_mult_list) + + assert isinstance(self.lr_mult_list, ( + list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len(self.lr_mult_list + ) == 6, "lr_mult_list length should be 6 but got {}".format( + len(self.lr_mult_list)) + + assert isinstance(self.stride_list, ( + list, tuple + )), "stride_list should be in (list, tuple) but got {}".format( + type(self.stride_list)) + assert len(self.stride_list + ) == 5, "stride_list length should be 5 but got {}".format( + len(self.stride_list)) + + for i, stride in enumerate(stride_list[1:]): + self.net_config["blocks{}".format(i + 3)][0][3] = stride + self.conv1 = ConvBNLayer( + num_channels=3, + filter_size=3, + num_filters=make_divisible(16 * scale), + stride=stride_list[0], + lr_mult=self.lr_mult_list[0], + act=act) + + self.blocks2 = nn.Sequential(*[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + lr_mult=self.lr_mult_list[1], + act=act) + for i, (k, in_c, out_c, s, se + ) in 
enumerate(self.net_config["blocks2"]) + ]) + + self.blocks3 = nn.Sequential(*[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + lr_mult=self.lr_mult_list[2], + act=act) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks3"]) + ]) + + self.blocks4 = nn.Sequential(*[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + lr_mult=self.lr_mult_list[3], + act=act) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks4"]) + ]) + + self.blocks5 = nn.Sequential(*[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + lr_mult=self.lr_mult_list[4], + act=act) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks5"]) + ]) + + self.blocks6 = nn.Sequential(*[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + lr_mult=self.lr_mult_list[5], + act=act) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks6"]) + ]) + + self.avg_pool = AdaptiveAvgPool2D(1) + if self.use_last_conv: + self.last_conv = Conv2D( + in_channels=make_divisible(self.net_config["blocks6"][-1][2] * + scale), + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.act = _create_act(act) + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + else: + self.last_conv = None + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + self.fc = Linear( + self.class_expand if self.use_last_conv else + make_divisible(self.net_config["blocks6"][-1][2] * scale), + class_num) + + def forward(self, x): + x = self.conv1(x) + + x = self.blocks2(x) + x = self.blocks3(x) + x = self.blocks4(x) + x = self.blocks5(x) + x = self.blocks6(x) + + x = self.avg_pool(x) + if self.last_conv is not None: + x = self.last_conv(x) + x = self.act(x) + x = self.dropout(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def PPLCNetBaseNet(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNetBaseNet + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. + """ + if "scale" in kwargs: + scale = kwargs["scale"] + kwargs.pop("scale") + else: + scale = 1.0 + + model = PPLCNet(scale=scale, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + model_url = _create_model_urls(scale) + _load_pretrained(pretrained, model, model_url, use_ssld) + return model + + +def PPLCNet_x0_25(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x0_25 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. 
Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x0_25` model depends on args. + """ + model = PPLCNet( + scale=0.25, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x0_25"], use_ssld) + return model + + +def PPLCNet_x0_35(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x0_35 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x0_35` model depends on args. + """ + model = PPLCNet( + scale=0.35, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x0_35"], use_ssld) + return model + + +def PPLCNet_x0_5(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x0_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x0_5` model depends on args. + """ + model = PPLCNet( + scale=0.5, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x0_5"], use_ssld) + return model + + +def PPLCNet_x0_75(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x0_75 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x0_75` model depends on args. + """ + model = PPLCNet( + scale=0.75, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x0_75"], use_ssld) + return model + + +def PPLCNet_x1_0(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x1_0` model depends on args. + """ + model = PPLCNet( + scale=1.0, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x1_0"], use_ssld) + return model + + +def PPLCNet_x1_5(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x1_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x1_5` model depends on args. + """ + model = PPLCNet( + scale=1.5, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x1_5"], use_ssld) + return model + + +def PPLCNet_x2_0(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x2_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. 
Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x2_0` model depends on args. + """ + model = PPLCNet( + scale=2.0, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x2_0"], use_ssld) + return model + + +def PPLCNet_x2_5(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x2_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x2_5` model depends on args. + """ + model = PPLCNet( + scale=2.5, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x2_5"], use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_lcnet_v2.py b/example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_lcnet_v2.py new file mode 100755 index 000000000..e44ef097a --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/legendary_models/pp_lcnet_v2.py @@ -0,0 +1,394 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
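+
+# PP-LCNetV2 trains each RepDepthwiseSeparable block with several parallel
+# depthwise branches (kernel sizes dw_size, dw_size - 2, ..., see use_rep
+# below) and folds them into a single depthwise conv for inference. A sketch
+# of the folding math implemented by re_parameterize()/_fuse_bn_tensor()
+# below, for a conv kernel W followed by BatchNorm with statistics (mu, var)
+# and affine parameters (gamma, beta):
+#
+#     W_fused = W * gamma / sqrt(var + eps)
+#     b_fused = beta - mu * gamma / sqrt(var + eps)
+#
+# The fused per-branch kernels are zero-padded to the largest kernel size and
+# summed, so one conv reproduces the multi-branch training-time output.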
+ +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm2D, Conv2D, Dropout, Linear +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingNormal + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "PPLCNetV2_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_small_pretrained.pdparams", + "PPLCNetV2_base": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_base_pretrained.pdparams", + "PPLCNetV2_large": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_large_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +NET_CONFIG = { + # in_channels, kernel_size, split_pw, use_rep, use_se, use_shortcut + "stage1": [64, 3, False, False, False, False], + "stage2": [128, 3, False, False, False, False], + "stage3": [256, 5, True, True, True, False], + "stage4": [512, 5, False, True, False, True], +} + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + use_act=True): + super().__init__() + self.use_act = use_act + self.conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self.bn = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + if self.use_act: + self.act = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.use_act: + x = self.act(x) + return x + + +class SEModule(TheseusLayer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = nn.Sigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class RepDepthwiseSeparable(TheseusLayer): + def __init__(self, + in_channels, + out_channels, + stride, + dw_size=3, + split_pw=False, + use_rep=False, + use_se=False, + use_shortcut=False): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.is_repped = False + + self.dw_size = dw_size + self.split_pw = split_pw + self.use_rep = use_rep + self.use_se = use_se + self.use_shortcut = True if use_shortcut and stride == 1 and in_channels == out_channels else False + + if self.use_rep: + self.dw_conv_list = nn.LayerList() + for kernel_size in range(self.dw_size, 0, -2): + if kernel_size == 1 and stride != 1: + continue + dw_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + 
kernel_size=kernel_size, + stride=stride, + groups=in_channels, + use_act=False) + self.dw_conv_list.append(dw_conv) + self.dw_conv = nn.Conv2D( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + padding=(dw_size - 1) // 2, + groups=in_channels) + else: + self.dw_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + groups=in_channels) + + self.act = nn.ReLU() + + if use_se: + self.se = SEModule(in_channels) + + if self.split_pw: + pw_ratio = 0.5 + self.pw_conv_1 = ConvBNLayer( + in_channels=in_channels, + kernel_size=1, + out_channels=int(out_channels * pw_ratio), + stride=1) + self.pw_conv_2 = ConvBNLayer( + in_channels=int(out_channels * pw_ratio), + kernel_size=1, + out_channels=out_channels, + stride=1) + else: + self.pw_conv = ConvBNLayer( + in_channels=in_channels, + kernel_size=1, + out_channels=out_channels, + stride=1) + + def forward(self, x): + if self.use_rep: + input_x = x + if self.is_repped: + x = self.act(self.dw_conv(x)) + else: + y = self.dw_conv_list[0](x) + for dw_conv in self.dw_conv_list[1:]: + y += dw_conv(x) + x = self.act(y) + else: + x = self.dw_conv(x) + + if self.use_se: + x = self.se(x) + if self.split_pw: + x = self.pw_conv_1(x) + x = self.pw_conv_2(x) + else: + x = self.pw_conv(x) + if self.use_shortcut: + x = x + input_x + return x + + def re_parameterize(self): + if self.use_rep: + self.is_repped = True + kernel, bias = self._get_equivalent_kernel_bias() + self.dw_conv.weight.set_value(kernel) + self.dw_conv.bias.set_value(bias) + + def _get_equivalent_kernel_bias(self): + kernel_sum = 0 + bias_sum = 0 + for dw_conv in self.dw_conv_list: + kernel, bias = self._fuse_bn_tensor(dw_conv) + kernel = self._pad_tensor(kernel, to_size=self.dw_size) + kernel_sum += kernel + bias_sum += bias + return kernel_sum, bias_sum + + def _fuse_bn_tensor(self, branch): + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + def _pad_tensor(self, tensor, to_size): + from_size = tensor.shape[-1] + if from_size == to_size: + return tensor + pad = (to_size - from_size) // 2 + return F.pad(tensor, [pad, pad, pad, pad]) + + +class PPLCNetV2(TheseusLayer): + def __init__(self, + scale, + depths, + class_num=1000, + dropout_prob=0, + use_last_conv=True, + class_expand=1280, + **kwargs): + super().__init__(**kwargs) + self.scale = scale + self.use_last_conv = use_last_conv + self.class_expand = class_expand + + self.stem = nn.Sequential(* [ + ConvBNLayer( + in_channels=3, + kernel_size=3, + out_channels=make_divisible(32 * scale), + stride=2), RepDepthwiseSeparable( + in_channels=make_divisible(32 * scale), + out_channels=make_divisible(64 * scale), + stride=1, + dw_size=3) + ]) + + # stages + self.stages = nn.LayerList() + for depth_idx, k in enumerate(NET_CONFIG): + in_channels, kernel_size, split_pw, use_rep, use_se, use_shortcut = NET_CONFIG[ + k] + self.stages.append( + nn.Sequential(* [ + RepDepthwiseSeparable( + in_channels=make_divisible((in_channels if i == 0 else + in_channels * 2) * scale), + out_channels=make_divisible(in_channels * 2 * scale), + stride=2 if i == 0 else 1, + dw_size=kernel_size, + split_pw=split_pw, + use_rep=use_rep, + use_se=use_se, + use_shortcut=use_shortcut) + for i in 
range(depths[depth_idx])
+                ]))
+
+        self.avg_pool = AdaptiveAvgPool2D(1)
+
+        if self.use_last_conv:
+            self.last_conv = Conv2D(
+                in_channels=make_divisible(NET_CONFIG["stage4"][0] * 2 *
+                                           scale),
+                out_channels=self.class_expand,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias_attr=False)
+            self.act = nn.ReLU()
+            self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer")
+
+        self.flatten = nn.Flatten(start_axis=1, stop_axis=-1)
+        in_features = self.class_expand if self.use_last_conv else make_divisible(
+            NET_CONFIG["stage4"][0] * 2 * scale)
+        self.fc = Linear(in_features, class_num)
+
+    def forward(self, x):
+        x = self.stem(x)
+        for stage in self.stages:
+            x = stage(x)
+        x = self.avg_pool(x)
+        if self.use_last_conv:
+            x = self.last_conv(x)
+            x = self.act(x)
+            x = self.dropout(x)
+        x = self.flatten(x)
+        x = self.fc(x)
+        return x
+
+
+def _load_pretrained(pretrained, model, model_url, use_ssld):
+    if pretrained is False:
+        pass
+    elif pretrained is True:
+        load_dygraph_pretrain(model, model_url, use_ssld=use_ssld)
+    elif isinstance(pretrained, str):
+        load_dygraph_pretrain(model, pretrained)
+    else:
+        raise RuntimeError(
+            "pretrained type is not available. Please use `string` or `boolean` type."
+        )
+
+
+def PPLCNetV2_small(pretrained=False, use_ssld=False, **kwargs):
+    """
+    PPLCNetV2_small
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether to use the distillation-pretrained model when pretrained=True.
+    Returns:
+        model: nn.Layer. Specific `PPLCNetV2_small` model depends on args.
+    """
+    model = PPLCNetV2(
+        scale=0.75, depths=[2, 2, 4, 2], dropout_prob=0.2, **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["PPLCNetV2_small"],
+                     use_ssld)
+    return model
+
+
+def PPLCNetV2_base(pretrained=False, use_ssld=False, **kwargs):
+    """
+    PPLCNetV2_base
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether to use the distillation-pretrained model when pretrained=True.
+    Returns:
+        model: nn.Layer. Specific `PPLCNetV2_base` model depends on args.
+    """
+    model = PPLCNetV2(
+        scale=1.0, depths=[2, 2, 6, 2], dropout_prob=0.2, **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["PPLCNetV2_base"], use_ssld)
+    return model
+
+
+def PPLCNetV2_large(pretrained=False, use_ssld=False, **kwargs):
+    """
+    PPLCNetV2_large
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether to use the distillation-pretrained model when pretrained=True.
+    Returns:
+        model: nn.Layer. Specific `PPLCNetV2_large` model depends on args.
+    """
+    model = PPLCNetV2(
+        scale=1.25, depths=[2, 2, 8, 2], dropout_prob=0.2, **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["PPLCNetV2_large"],
+                     use_ssld)
+    return model
diff --git a/example/low_precision_training/ppcls/arch/backbone/legendary_models/resnet.py b/example/low_precision_training/ppcls/arch/backbone/legendary_models/resnet.py
new file mode 100755
index 000000000..6176aa268
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/backbone/legendary_models/resnet.py
@@ -0,0 +1,649 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/pdf/1512.03385 + +from __future__ import absolute_import, division, print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, BatchNorm2D +from paddle.nn import MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +from paddle.regularizer import L2Decay +import math + +from .custom_devices_layers import AdaptiveAvgPool2D +from ....utils import logger +from ..base.theseus_layer import TheseusLayer +from ..base.dbb.dbb_block import DiverseBranchBlock +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ResNet18": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_pretrained.pdparams", + "ResNet18_dbb": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_dbb_pretrained.pdparams", + "ResNet18_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams", + "ResNet34": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_pretrained.pdparams", + "ResNet34_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_vd_pretrained.pdparams", + "ResNet50": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_pretrained.pdparams", + "ResNet50_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_vd_pretrained.pdparams", + "ResNet101": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_pretrained.pdparams", + "ResNet101_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_vd_pretrained.pdparams", + "ResNet152": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_pretrained.pdparams", + "ResNet152_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams", + "ResNet200_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams", +} + +MODEL_STAGES_PATTERN = { + "ResNet18": ["blocks[1]", "blocks[3]", "blocks[5]", "blocks[7]"], + "ResNet34": ["blocks[2]", "blocks[6]", "blocks[12]", "blocks[15]"], + "ResNet50": ["blocks[2]", "blocks[6]", "blocks[12]", "blocks[15]"], + "ResNet101": ["blocks[2]", "blocks[6]", "blocks[29]", "blocks[32]"], + "ResNet152": ["blocks[2]", "blocks[10]", "blocks[46]", "blocks[49]"], + "ResNet200": ["blocks[2]", "blocks[14]", "blocks[62]", "blocks[65]"] +} + +__all__ = MODEL_URLS.keys() +''' +ResNet config: dict. + key: depth of ResNet. + values: config's dict of specific model. + keys: + block_type: Two different blocks in ResNet, BasicBlock and BottleneckBlock are optional. + block_depth: The number of blocks in different stages in ResNet. + num_channels: The number of channels to enter the next stage. 
+''' +NET_CONFIG = { + "18": { + "block_type": "BasicBlock", + "block_depth": [2, 2, 2, 2], + "num_channels": [64, 64, 128, 256] + }, + "34": { + "block_type": "BasicBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 64, 128, 256] + }, + "50": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 256, 512, 1024] + }, + "101": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 23, 3], + "num_channels": [64, 256, 512, 1024] + }, + "152": { + "block_type": "BottleneckBlock", + "block_depth": [3, 8, 36, 3], + "num_channels": [64, 256, 512, 1024] + }, + "200": { + "block_type": "BottleneckBlock", + "block_depth": [3, 12, 48, 3], + "num_channels": [64, 256, 512, 1024] + }, +} + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + self.is_vd_mode = is_vd_mode + self.act = act + self.avg_pool = AvgPool2D( + kernel_size=2, + stride=stride, + padding="SAME", + ceil_mode=True, + data_format=data_format) + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=1 if is_vd_mode else stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=False, + data_format=data_format) + + self.bn = BatchNorm( + num_filters, + param_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + data_layout=data_format) + self.relu = nn.ReLU() + + def forward(self, x): + if self.is_vd_mode: + x = self.avg_pool(x) + x = self.conv(x) + x = self.bn(x) + if self.act: + x = self.relu(x) + return x + + +class BottleneckBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + layer=ConvBNLayer, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + self.conv0 = layer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = layer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv2 = layer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + lr_mult=lr_mult, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + + self.relu = nn.ReLU() + self.shortcut = shortcut + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + x = self.conv2(x) + + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class BasicBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + layer=ConvBNLayer, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.stride = stride + self.conv0 = layer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = layer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + lr_mult=lr_mult, + data_format=data_format) + if not 
shortcut:
+            self.short = ConvBNLayer(
+                num_channels=num_channels,
+                num_filters=num_filters,
+                filter_size=1,
+                stride=stride,
+                is_vd_mode=False if if_first else True,
+                lr_mult=lr_mult,
+                data_format=data_format)
+        self.shortcut = shortcut
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        identity = x
+        x = self.conv0(x)
+        x = self.conv1(x)
+        if self.shortcut:
+            short = identity
+        else:
+            short = self.short(identity)
+        x = paddle.add(x=x, y=short)
+        x = self.relu(x)
+        return x
+
+
+class ResNet(TheseusLayer):
+    """
+    ResNet
+    Args:
+        config: dict. config of ResNet.
+        version: str="vb". Different versions of ResNet; version "vd" generally performs better.
+        class_num: int=1000. The number of classes.
+        lr_mult_list: list. Control the learning rate of different stages.
+    Returns:
+        model: nn.Layer. Specific ResNet model depends on args.
+    """
+
+    def __init__(self,
+                 config,
+                 stages_pattern,
+                 version="vb",
+                 stem_act="relu",
+                 class_num=1000,
+                 lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
+                 stride_list=[2, 2, 2, 2, 2],
+                 data_format="NCHW",
+                 input_image_channel=3,
+                 return_patterns=None,
+                 return_stages=None,
+                 layer_type="ConvBNLayer",
+                 use_first_short_conv=True,
+                 **kargs):
+        super().__init__()
+
+        self.cfg = config
+        self.lr_mult_list = lr_mult_list
+        self.stride_list = stride_list
+        self.is_vd_mode = version == "vd"
+        self.class_num = class_num
+        self.num_filters = [64, 128, 256, 512]
+        self.block_depth = self.cfg["block_depth"]
+        self.block_type = self.cfg["block_type"]
+        self.num_channels = self.cfg["num_channels"]
+        self.channels_mult = 1 if self.num_channels[-1] == 256 else 4
+
+        if layer_type == "ConvBNLayer":
+            layer = ConvBNLayer
+        elif layer_type == "DiverseBranchBlock":
+            layer = DiverseBranchBlock
+        else:
+            raise ValueError(
+                "unsupported layer_type: {}".format(layer_type))
+
+        assert isinstance(self.lr_mult_list, (
+            list, tuple
+        )), "lr_mult_list should be in (list, tuple) but got {}".format(
+            type(self.lr_mult_list))
+        if len(self.lr_mult_list) != 5:
+            msg = "lr_mult_list length should be 5 but got {}, default lr_mult_list used".format(
+                len(self.lr_mult_list))
+            logger.warning(msg)
+            self.lr_mult_list = [1.0, 1.0, 1.0, 1.0, 1.0]
+
+        assert isinstance(self.stride_list, (
+            list, tuple
+        )), "stride_list should be in (list, tuple) but got {}".format(
+            type(self.stride_list))
+        assert len(self.stride_list
+                   ) == 5, "stride_list length should be 5 but got {}".format(
+                       len(self.stride_list))
+
+        self.stem_cfg = {
+            # num_channels, num_filters, filter_size, stride
+            "vb": [[input_image_channel, 64, 7, self.stride_list[0]]],
+            "vd": [[input_image_channel, 32, 3, self.stride_list[0]],
+                   [32, 32, 3, 1], [32, 64, 3, 1]]
+        }
+
+        self.stem = nn.Sequential(*[
+            ConvBNLayer(
+                num_channels=in_c,
+                num_filters=out_c,
+                filter_size=k,
+                stride=s,
+                act=stem_act,
+                lr_mult=self.lr_mult_list[0],
+                data_format=data_format)
+            for in_c, out_c, k, s in self.stem_cfg[version]
+        ])
+
+        self.max_pool = MaxPool2D(
+            kernel_size=3,
+            stride=stride_list[1],
+            padding=1,
+            data_format=data_format)
+        block_list = []
+        for block_idx in range(len(self.block_depth)):
+            # PaddleClas improvement: use a conv shortcut in the first block of each stage
+            shortcut = False
+            # official resnet_vb version: keep an identity shortcut in the first stage
+            if not use_first_short_conv and block_idx == 0:
+                shortcut = True
+            for i in range(self.block_depth[block_idx]):
+                block_list.append(globals()[self.block_type](
+                    num_channels=self.num_channels[block_idx] if i == 0 else
+                    self.num_filters[block_idx] * self.channels_mult,
+                    num_filters=self.num_filters[block_idx],
+                    stride=self.stride_list[block_idx + 1]
+                    if i == 0 and block_idx != 0 else 1,
shortcut=shortcut, + if_first=block_idx == i == 0 if version == "vd" else True, + layer=layer, + lr_mult=self.lr_mult_list[block_idx + 1], + data_format=data_format)) + shortcut = True + self.blocks = nn.Sequential(*block_list) + + self.avg_pool = AdaptiveAvgPool2D(1, data_format=data_format) + self.flatten = nn.Flatten() + self.avg_pool_channels = self.num_channels[-1] * 2 + stdv = 1.0 / math.sqrt(self.avg_pool_channels * 1.0) + self.fc = Linear( + self.avg_pool_channels, + self.class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + self.data_format = data_format + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + with paddle.static.amp.fp16_guard(): + return self._forward(x) + + def _forward(self, x): + if self.data_format == "NHWC": + x = paddle.transpose(x, [0, 2, 3, 1]) + x.stop_gradient = True + x = self.stem(x) + x = self.max_pool(x) + x = self.blocks(x) + x = self.avg_pool(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNet18(pretrained=False, + use_ssld=False, + layer_type="ConvBNLayer", + **kwargs): + """ + ResNet18 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["18"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet18"], + version="vb", + layer_type=layer_type, + **kwargs) + if layer_type == "DiverseBranchBlock": + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18_dbb"], + use_ssld) + else: + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18"], use_ssld) + return model + + +def ResNet18_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["18"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet18"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18_vd"], use_ssld) + return model + + +def ResNet34(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34` model depends on args. 
+ """ + model = ResNet( + config=NET_CONFIG["34"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet34"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34"], use_ssld) + return model + + +def ResNet34_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["34"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet34"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34_vd"], use_ssld) + return model + + +def ResNet50(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet50` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["50"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet50"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model + + +def ResNet50_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet50_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["50"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet50"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50_vd"], use_ssld) + return model + + +def ResNet101(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["101"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet101"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101"], use_ssld) + return model + + +def ResNet101_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["101"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet101"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101_vd"], use_ssld) + return model + + +def ResNet152(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. 
+ use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["152"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet152"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152"], use_ssld) + return model + + +def ResNet152_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["152"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet152"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152_vd"], use_ssld) + return model + + +def ResNet200_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet200_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet200_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["200"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet200"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet200_vd"], use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/legendary_models/swin_transformer.py b/example/low_precision_training/ppcls/arch/backbone/legendary_models/swin_transformer.py new file mode 100755 index 000000000..9a464e9dd --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/legendary_models/swin_transformer.py @@ -0,0 +1,1002 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
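+
+# Shape sketch for the window helpers defined below (illustrative only):
+# for x of shape [B, H, W, C] and an integer window_size w dividing H and W,
+#
+#     windows = window_partition(x, w)          # [B*(H//w)*(W//w), w, w, C]
+#     y = window_reverse(windows, w, H, W, C)   # [B, H, W, C], y == x
+#
+# Self-attention runs inside each w x w window; shifted windows (SW-MSA)
+# additionally use the mask built in SwinTransformerBlock.get_attn_mask().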
+ +# Code was based on https://github.com/microsoft/Swin-Transformer +# reference: https://arxiv.org/abs/2103.14030 + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import TruncatedNormal, Constant + +from ..model_zoo.vision_transformer import trunc_normal_, zeros_, ones_, to_2tuple, DropPath, Identity +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain +from ....utils import logger + + +MODEL_URLS = { + "SwinTransformer_tiny_patch4_window7_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_tiny_patch4_window7_224_pretrained.pdparams", + "SwinTransformer_small_patch4_window7_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_small_patch4_window7_224_pretrained.pdparams", + "SwinTransformer_base_patch4_window7_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_base_patch4_window7_224_pretrained.pdparams", + "SwinTransformer_base_patch4_window12_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_base_patch4_window12_384_pretrained.pdparams", + "SwinTransformer_large_patch4_window7_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_large_patch4_window7_224_pretrained.pdparams", + "SwinTransformer_large_patch4_window12_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_large_patch4_window12_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +Linear = nn.Linear + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +def check_support_fused_op(use_fused_linear): + if use_fused_linear: + if paddle.device.cuda.get_device_capability()[0] >= 8: + return True + else: + logger.warning("The current device don't support Fused OP! 
Using the general Linear instead.")
+    return False
+
+
+class Mlp(nn.Layer):
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+def pading_for_not_divisible(pixel_values,
+                             height,
+                             width,
+                             patch_size,
+                             format="BCHW",
+                             function="split"):
+    if isinstance(patch_size, int):
+        patch_size = (patch_size, patch_size)
+    if height % patch_size[0] == 0 and width % patch_size[1] == 0:
+        return pixel_values, (0, 0, 0, 0, 0, 0, 0, 0)
+    if function == "split":
+        pading_width = patch_size[1] - width % patch_size[1]
+        pading_height = patch_size[0] - height % patch_size[0]
+    elif function == "merge":
+        pading_width = width % 2
+        pading_height = height % 2
+    if format == "BCHW":
+        pad_index = (0, 0, 0, 0, 0, pading_height, 0, pading_width)
+    elif format == "BHWC":
+        pad_index = (0, 0, 0, pading_height, 0, pading_width, 0, 0)
+    else:
+        raise ValueError("invalid format: {}".format(format))
+
+    return F.pad(pixel_values, pad_index), pad_index
+
+
+def window_partition(x, window_size):
+    """
+    Args:
+        x: (B, H, W, C)
+        window_size (int): window size
+
+    Returns:
+        windows: (num_windows*B, window_size, window_size, C)
+    """
+    B, H, W, C = x.shape
+    x = x.reshape(
+        [B, H // window_size, window_size, W // window_size, window_size, C])
+    windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(
+        [-1, window_size, window_size, C])
+    return windows
+
+
+def window_reverse(windows, window_size, H, W, C):
+    """
+    Args:
+        windows: (num_windows*B, window_size, window_size, C)
+        window_size (int): Window size
+        H (int): Height of image
+        W (int): Width of image
+
+    Returns:
+        x: (B, H, W, C)
+    """
+    x = windows.reshape(
+        [-1, H // window_size, W // window_size, window_size, window_size, C])
+    x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C])
+    return x
+
+
+class WindowAttention(nn.Layer):
+    r""" Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both shifted and non-shifted windows.
+
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): The height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0., + proj_drop=0., + use_fused_attn=False): + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + # 2*Wh-1 * 2*Ww-1, nH + self.relative_position_bias_table = self.create_parameter( + shape=((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads), + default_initializer=zeros_) + self.add_parameter("relative_position_bias_table", + self.relative_position_bias_table) + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(self.window_size[0]) + coords_w = paddle.arange(self.window_size[1]) + coords = paddle.stack(paddle.meshgrid( + [coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + + coords_flatten_1 = coords_flatten.unsqueeze(axis=2) + coords_flatten_2 = coords_flatten.unsqueeze(axis=1) + relative_coords = coords_flatten_1 - coords_flatten_2 + + relative_coords = relative_coords.transpose( + [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[ + 0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + + self.register_buffer("relative_position_index", + relative_position_index) + + self.qkv = Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table) + self.softmax = nn.Softmax(axis=-1) + + self.use_fused_attn = use_fused_attn + + def eval(self, ): + # this is used to re-param swin for model export + relative_position_bias_table = self.relative_position_bias_table + window_size = self.window_size + index = self.relative_position_index.reshape([-1]) + + relative_position_bias = paddle.index_select( + relative_position_bias_table, index) + relative_position_bias = relative_position_bias.reshape([ + window_size[0] * window_size[1], window_size[0] * window_size[1], + -1 + ]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + relative_position_bias = relative_position_bias.unsqueeze(0) + self.register_buffer("relative_position_bias", relative_position_bias) + + def get_relative_position_bias(self): + if self.training or not hasattr(self, "relative_position_bias"): + index = self.relative_position_index.reshape([-1]) + + relative_position_bias = paddle.index_select( + self.relative_position_bias_table, index) + relative_position_bias = relative_position_bias.reshape([ + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], -1 + ]) # Wh*Ww,Wh*Ww,nH + + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + return relative_position_bias.unsqueeze(0) + else: + return self.relative_position_bias + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape( + [B_, N, 3, self.num_heads, C // self.num_heads]) + + if self.use_fused_attn: + qkv = qkv.transpose((2, 0, 1, 3, 4)) + q, k, 
v = qkv[0], qkv[1], qkv[2]
+            attn_mask = self.get_relative_position_bias()
+            if mask is not None:
+                nW = mask.shape[0]
+                mask = mask.reshape((1, nW, 1, N, N)).expand((B_ // nW, -1, self.num_heads, -1, -1))
+                attn_mask = attn_mask + mask.reshape((-1, self.num_heads, N, N))
+                attn_mask = attn_mask.expand((B_, -1, -1, -1))
+            attn = paddle.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.attn_drop.p if self.training else 0., attn_mask=attn_mask)
+        else:
+            qkv = qkv.transpose((2, 0, 3, 1, 4))
+            q, k, v = qkv[0], qkv[1], qkv[2]
+            q = q * self.scale
+            attn = paddle.mm(q, k.transpose([0, 1, 3, 2]))
+            attn = attn + self.get_relative_position_bias()
+
+            if mask is not None:
+                nW = mask.shape[0]
+                attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N
+                                     ]) + mask.unsqueeze(1).unsqueeze(0)
+                attn = attn.reshape([B_, self.num_heads, N, N])
+            attn = self.softmax(attn)
+            attn = self.attn_drop(attn)
+            attn = paddle.mm(attn, v)
+            attn = attn.transpose([0, 2, 1, 3])
+        x = attn.reshape([B_, N, C])
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+    def extra_repr(self):
+        return "dim={}, window_size={}, num_heads={}".format(
+            self.dim, self.window_size, self.num_heads)
+
+    def flops(self, N):
+        # calculate flops for 1 window with token length of N
+        flops = 0
+        # qkv = self.qkv(x)
+        flops += N * self.dim * 3 * self.dim
+        # attn = (q @ k.transpose(-2, -1))
+        flops += self.num_heads * N * (self.dim // self.num_heads) * N
+        # x = (attn @ v)
+        flops += self.num_heads * N * N * (self.dim // self.num_heads)
+        # x = self.proj(x)
+        flops += N * self.dim * self.dim
+        return flops
+
+
+class SwinTransformerBlock(nn.Layer):
+    r""" Swin Transformer Block.
+
+    Args:
+        dim (int): Number of input channels.
+        input_resolution (tuple[int]): Input resolution.
+        num_heads (int): Number of attention heads.
+        window_size (int): Window size.
+        shift_size (int): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
+        use_fused_attn (bool, optional): Whether to use fused scaled-dot-product attention. Default: False
+    """
+
+    def __init__(self,
+                 dim,
+                 input_resolution,
+                 num_heads,
+                 window_size=7,
+                 shift_size=0,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm,
+                 use_fused_attn=False):
+        super().__init__()
+        self.dim = dim
+        self.input_resolution = input_resolution
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim,
+            window_size=to_2tuple(self.window_size),
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            use_fused_attn=use_fused_attn)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=mlp_hidden_dim,
+                       act_layer=act_layer,
+                       drop=drop)
+        H, W = self.input_resolution
+        attn_mask = None
+
+        self.register_buffer("attn_mask", attn_mask)
+
+    def get_attn_mask(self, height, width, dtype):
+        if self.shift_size > 0:
+            # calculate attention mask for shifted window multihead self attention
+            img_mask = paddle.zeros((1, height, width, 1), dtype=dtype)
+            height_slices = (
+                slice(0, -self.window_size),
+                slice(-self.window_size, -self.shift_size),
+                slice(-self.shift_size, None), )
+            width_slices = (
+                slice(0, -self.window_size),
+                slice(-self.window_size, -self.shift_size),
+                slice(-self.shift_size, None), )
+            count = 0
+            for height_slice in height_slices:
+                for width_slice in width_slices:
+                    img_mask[:, height_slice, width_slice, :] = count
+                    count += 1
+
+            mask_windows = window_partition(img_mask, self.window_size)
+            mask_windows = mask_windows.reshape(
+                (-1, self.window_size * self.window_size))
+            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+            attn_mask = masked_fill(attn_mask, attn_mask != 0, float(-100.0))
+            attn_mask = masked_fill(attn_mask, attn_mask == 0, float(0.0))
+        else:
+            attn_mask = None
+        return attn_mask
+
+    def forward(self, x, input_dimensions):
+        H, W = input_dimensions
+        B, L, C = x.shape
+
+        shortcut = x
+        x = self.norm1(x)
+        x = x.reshape([B, H, W, C])
+
+        x, pad_values = pading_for_not_divisible(x, H, W, self.window_size,
+                                                 "BHWC")
+        _, height_pad, width_pad, _ = x.shape
+
+        # True if padding was added along H (pad_values[3]) or W (pad_values[5])
+        padding_state = pad_values[3] > 0 or pad_values[5] > 0
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_x = paddle.roll(
+                x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2))
+        else:
+            shifted_x = x
+
+        # partition windows
+        x_windows = window_partition(
+            shifted_x, self.window_size)  # nW*B, window_size, window_size, C
+        x_windows = x_windows.reshape(
+            [-1, self.window_size * self.window_size,
+             C])  # nW*B, window_size*window_size, C
+
+        # W-MSA/SW-MSA
+        # the mask depends on the padded resolution, so recompute it here
+        attn_mask = self.get_attn_mask(height_pad, width_pad, x.dtype)
+
+        attn_windows = self.attn(
+            x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C
+
+        # merge windows
+        attn_windows = attn_windows.reshape(
+            [-1, self.window_size, self.window_size, C])
+        shifted_x = window_reverse(attn_windows, self.window_size, height_pad,
+                                   width_pad, C)  # B H' W' C
+
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = paddle.roll(
+                shifted_x,
+                shifts=(self.shift_size, self.shift_size),
+                axis=(1, 2))
+        else:
+            x = shifted_x
+
+        if padding_state:
+            x = x[:, :H, :W, :]
+        x = x.reshape([B, H * W, C])
+
+        # FFN
+        x = shortcut + self.drop_path(x)
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+
+        return x
+
+    def extra_repr(self):
+        return "dim={}, input_resolution={}, num_heads={}, window_size={}, shift_size={}, mlp_ratio={}".format(
+            self.dim, self.input_resolution, self.num_heads, self.window_size,
+            self.shift_size, self.mlp_ratio)
+
+    def flops(self):
+        flops = 0
+        H, W = self.input_resolution
+        # norm1
+        flops += self.dim * H * W
+        # W-MSA/SW-MSA
+        nW = H * W / self.window_size / self.window_size
+        flops += nW * self.attn.flops(self.window_size * self.window_size)
+        # mlp
+        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
+        # norm2
+        flops += self.dim * H * W
+        return flops
+
+
+class PatchMerging(nn.Layer):
+    r""" Patch Merging Layer.
+ + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, input_dimensions): + """ + x: B, H*W, C + """ + H, W = input_dimensions + B, L, C = x.shape + x = x.reshape([B, H, W, C]) + x, _ = pading_for_not_divisible(x, H, W, 2, "BHWC", function="merge") + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = paddle.reshape(x, x.shape) + x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + + # x = x.reshape([B, H // 2, 2, W // 2, 2, C]) + # x = x.transpose((0, 1, 3, 4, 2, 5)) + + x = x.reshape([B, -1, 4 * C]) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + def extra_repr(self): + return "input_resolution={}, dim={}".format(self.input_resolution, + self.dim) + + def flops(self): + H, W = self.input_resolution + flops = H * W * self.dim + flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim + return flops + + +class BasicLayer(nn.Layer): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
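+        use_fused_attn (bool, optional): Whether to use paddle's fused scaled_dot_product_attention kernel. Default: False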
+ """ + + def __init__(self, + dim, + input_resolution, + depth, + num_heads, + window_size, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + use_fused_attn=False): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.LayerList([ + SwinTransformerBlock( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + use_fused_attn=use_fused_attn) for i in range(depth) + ]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample( + input_resolution, dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, input_dimensions): + H, W = input_dimensions + for blk in self.blocks: + x = blk(x, input_dimensions) + if self.downsample is not None: + H, W = (H + 1) // 2, (W + 1) // 2 + x = self.downsample(x, input_dimensions) + return x, (H, W) + + def extra_repr(self): + return "dim={}, input_resolution={}, depth={}".format( + self.dim, self.input_resolution, self.depth) + + def flops(self): + flops = 0 + for blk in self.blocks: + flops += blk.flops() + if self.downsample is not None: + flops += self.downsample.flops() + return flops + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + + Args: + img_size (int): Image size. Default: 224. + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Layer, optional): Normalization layer. Default: None + """ + + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + img_size[0] // patch_size[0], img_size[1] // patch_size[1] + ] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + B, C, H, W = x.shape + x, _ = pading_for_not_divisible(x, H, W, self.patch_size, "BCHW") + x = self.proj(x) + _, _, height, width = x.shape + output_dimensions = (height, width) + x = x.flatten(2).transpose([0, 2, 1]) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x, output_dimensions + + def flops(self): + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * ( + self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops + + +class SwinTransformer(TheseusLayer): + """ Swin Transformer + A PaddlePaddle impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + img_size (int | tuple(int)): Input image size. 
Default 224 + patch_size (int | tuple(int)): Patch size. Default: 4 + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + embed_dim (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False + """ + + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + class_num=1000, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + use_checkpoint=False, + **kwargs): + super(SwinTransformer, self).__init__() + + self.num_classes = num_classes = class_num + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2**(self.num_layers - 1)) + self.mlp_ratio = mlp_ratio + use_fused_attn = check_support_fused_op(kwargs.get('use_fused_attn', False)) + use_fused_linear = kwargs.get('use_fused_linear', False) + global Linear + Linear = paddle.incubate.nn.FusedLinear if use_fused_linear else nn.Linear + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = self.create_parameter( + shape=(1, num_patches, embed_dim), default_initializer=zeros_) + self.add_parameter("absolute_pos_embed", self.absolute_pos_embed) + trunc_normal_(self.absolute_pos_embed) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = np.linspace(0, drop_path_rate, + sum(depths)).tolist() # stochastic depth decay rule + + # build layers + self.layers = nn.LayerList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + input_resolution=(patches_resolution[0] // (2**i_layer), + patches_resolution[1] // (2**i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging + if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint, + 
use_fused_attn=use_fused_attn) + self.layers.append(layer) + + self.norm = norm_layer(self.num_features) + self.avgpool = nn.AdaptiveAvgPool1D(1) + + self.head = Linear( + self.num_features, + num_classes) if self.num_classes > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, (nn.Linear, paddle.incubate.nn.FusedLinear)): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + x, output_dimensions = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers: + x, output_dimensions = layer(x, output_dimensions) + + x = self.norm(x) # B L C + x = self.avgpool(x.transpose([0, 2, 1])) # B C 1 + x = paddle.flatten(x, 1) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + def flops(self): + flops = 0 + flops += self.patch_embed.flops() + for _, layer in enumerate(self.layers): + flops += layer.flops() + flops += self.num_features * self.patches_resolution[ + 0] * self.patches_resolution[1] // (2**self.num_layers) + flops += self.num_features * self.num_classes + return flops + + +def _load_pretrained(pretrained, + model, + model_url, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain( + model, + model_url, + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def SwinTransformer_tiny_patch4_window7_224( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + model = SwinTransformer( + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + drop_path_rate=0.2, # if imagenet22k or imagenet22kto1k, set drop_path_rate=0.1 + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformer_tiny_patch4_window7_224"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformer_small_patch4_window7_224( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + model = SwinTransformer( + embed_dim=96, + depths=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + drop_path_rate=0.3, # if imagenet22k or imagenet22kto1k, set drop_path_rate=0.2 + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformer_small_patch4_window7_224"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformer_base_patch4_window7_224( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + model = SwinTransformer( + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + drop_path_rate=0.5, # if imagenet22k or imagenet22kto1k, set drop_path_rate=0.2 + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformer_base_patch4_window7_224"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformer_base_patch4_window12_384( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + model = SwinTransformer( + img_size=384, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=12, + drop_path_rate=0.5, # if imagenet22k or imagenet22kto1k, set drop_path_rate=0.2 + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformer_base_patch4_window12_384"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformer_large_patch4_window7_224( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=True, + **kwargs): + model = SwinTransformer( + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=7, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformer_large_patch4_window7_224"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformer_large_patch4_window12_384( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=True, + **kwargs): + model = SwinTransformer( + img_size=384, + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, + **kwargs) + _load_pretrained( + pretrained, + model, + 
MODEL_URLS["SwinTransformer_large_patch4_window12_384"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/legendary_models/vgg.py b/example/low_precision_training/ppcls/arch/backbone/legendary_models/vgg.py new file mode 100755 index 000000000..8a0f0156e --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/legendary_models/vgg.py @@ -0,0 +1,261 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1409.1556 + +from __future__ import absolute_import, division, print_function + +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import MaxPool2D + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "VGG11": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG11_pretrained.pdparams", + "VGG13": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG13_pretrained.pdparams", + "VGG16": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG16_pretrained.pdparams", + "VGG19": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG19_pretrained.pdparams", +} + +MODEL_STAGES_PATTERN = { + "VGG": [ + "conv_block_1", "conv_block_2", "conv_block_3", "conv_block_4", + "conv_block_5" + ] +} + +__all__ = MODEL_URLS.keys() + +# VGG config +# key: VGG network depth +# value: conv num in different blocks +NET_CONFIG = { + 11: [1, 1, 2, 2, 2], + 13: [2, 2, 2, 2, 2], + 16: [2, 2, 3, 3, 3], + 19: [2, 2, 4, 4, 4] +} + + +class ConvBlock(TheseusLayer): + def __init__(self, input_channels, output_channels, groups): + super().__init__() + + self.groups = groups + self.conv1 = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + if groups == 2 or groups == 3 or groups == 4: + self.conv2 = Conv2D( + in_channels=output_channels, + out_channels=output_channels, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + if groups == 3 or groups == 4: + self.conv3 = Conv2D( + in_channels=output_channels, + out_channels=output_channels, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + if groups == 4: + self.conv4 = Conv2D( + in_channels=output_channels, + out_channels=output_channels, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + + self.max_pool = MaxPool2D(kernel_size=2, stride=2, padding=0) + self.relu = nn.ReLU() + + def forward(self, inputs): + x = self.conv1(inputs) + x = self.relu(x) + if self.groups == 2 or self.groups == 3 or self.groups == 4: + x = self.conv2(x) + x = self.relu(x) + if self.groups == 3 or self.groups == 4: + x = self.conv3(x) + x = self.relu(x) + if self.groups == 4: + x = 
self.conv4(x)
+            x = self.relu(x)
+        x = self.max_pool(x)
+        return x
+
+
+class VGGNet(TheseusLayer):
+    """
+    VGGNet
+    Args:
+        config: list. VGGNet config.
+        stop_grad_layers: int=0. The parameters of the first `stop_grad_layers` conv blocks will be set to `param.trainable = False`, i.e. frozen.
+        class_num: int=1000. The number of classes.
+    Returns:
+        model: nn.Layer. Specific VGG model depends on args.
+    """
+
+    def __init__(self,
+                 config,
+                 stages_pattern,
+                 stop_grad_layers=0,
+                 class_num=1000,
+                 return_patterns=None,
+                 return_stages=None):
+        super().__init__()
+
+        self.stop_grad_layers = stop_grad_layers
+
+        self.conv_block_1 = ConvBlock(3, 64, config[0])
+        self.conv_block_2 = ConvBlock(64, 128, config[1])
+        self.conv_block_3 = ConvBlock(128, 256, config[2])
+        self.conv_block_4 = ConvBlock(256, 512, config[3])
+        self.conv_block_5 = ConvBlock(512, 512, config[4])
+
+        self.relu = nn.ReLU()
+        self.flatten = nn.Flatten(start_axis=1, stop_axis=-1)
+
+        for idx, block in enumerate([
+                self.conv_block_1, self.conv_block_2, self.conv_block_3,
+                self.conv_block_4, self.conv_block_5
+        ]):
+            if self.stop_grad_layers >= idx + 1:
+                for param in block.parameters():
+                    param.trainable = False
+
+        self.drop = Dropout(p=0.5, mode="downscale_in_infer")
+        self.fc1 = Linear(7 * 7 * 512, 4096)
+        self.fc2 = Linear(4096, 4096)
+        self.fc3 = Linear(4096, class_num)
+
+        super().init_res(
+            stages_pattern,
+            return_patterns=return_patterns,
+            return_stages=return_stages)
+
+    def forward(self, inputs):
+        x = self.conv_block_1(inputs)
+        x = self.conv_block_2(x)
+        x = self.conv_block_3(x)
+        x = self.conv_block_4(x)
+        x = self.conv_block_5(x)
+        x = self.flatten(x)
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.relu(x)
+        x = self.drop(x)
+        x = self.fc3(x)
+        return x
+
+
+def _load_pretrained(pretrained, model, model_url, use_ssld):
+    if pretrained is False:
+        pass
+    elif pretrained is True:
+        load_dygraph_pretrain(model, model_url, use_ssld=use_ssld)
+    elif isinstance(pretrained, str):
+        load_dygraph_pretrain(model, pretrained)
+    else:
+        raise RuntimeError(
+            "pretrained type is not available. Please use `string` or `boolean` type."
+        )
+
+
+def VGG11(pretrained=False, use_ssld=False, **kwargs):
+    """
+    VGG11
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+    Returns:
+        model: nn.Layer. Specific `VGG11` model depends on args.
+    """
+    model = VGGNet(
+        config=NET_CONFIG[11],
+        stages_pattern=MODEL_STAGES_PATTERN["VGG"],
+        **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["VGG11"], use_ssld)
+    return model
+
+
+def VGG13(pretrained=False, use_ssld=False, **kwargs):
+    """
+    VGG13
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+    Returns:
+        model: nn.Layer. Specific `VGG13` model depends on args.
+    """
+    model = VGGNet(
+        config=NET_CONFIG[13],
+        stages_pattern=MODEL_STAGES_PATTERN["VGG"],
+        **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["VGG13"], use_ssld)
+    return model
+
+
+def VGG16(pretrained=False, use_ssld=False, **kwargs):
+    """
+    VGG16
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+ use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `VGG16` model depends on args. + """ + model = VGGNet( + config=NET_CONFIG[16], + stages_pattern=MODEL_STAGES_PATTERN["VGG"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["VGG16"], use_ssld) + return model + + +def VGG19(pretrained=False, use_ssld=False, **kwargs): + """ + VGG19 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `VGG19` model depends on args. + """ + model = VGGNet( + config=NET_CONFIG[19], + stages_pattern=MODEL_STAGES_PATTERN["VGG"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["VGG19"], use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/__init__.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/adaface_ir_net.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/adaface_ir_net.py new file mode 100755 index 000000000..47de152b6 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/adaface_ir_net.py @@ -0,0 +1,529 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
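+
+# Usage sketch (illustrative comment, not part of the upstream AdaFace code):
+# the factory functions at the bottom of this file build an IR/IR-SE backbone
+# that maps an aligned 112x112 face crop to a 512-d embedding, e.g.
+#
+#     import paddle
+#     from ppcls.arch.backbone.model_zoo.adaface_ir_net import AdaFace_IR_50
+#
+#     model = AdaFace_IR_50(input_size=(112, 112))
+#     faces = paddle.randn([4, 3, 112, 112])  # batch of aligned face crops
+#     embeddings = model(faces)               # shape: [4, 512]
+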
+# this code is based on AdaFace(https://github.com/mk-minchul/AdaFace)
+from collections import namedtuple
+import paddle
+import paddle.nn as nn
+from paddle.nn import Dropout
+from paddle.nn import MaxPool2D
+from paddle.nn import Sequential
+from paddle.nn import Conv2D, Linear
+from paddle.nn import BatchNorm1D, BatchNorm2D
+from paddle.nn import ReLU, Sigmoid
+from paddle.nn import Layer
+from paddle.nn import PReLU
+
+# from ppcls.arch.backbone.legendary_models.resnet import _load_pretrained
+
+
+class Flatten(Layer):
+    """ Flatten the input tensor to 2D
+    """
+
+    def forward(self, input):
+        return paddle.reshape(input, [input.shape[0], -1])
+
+
+class LinearBlock(Layer):
+    """ Convolution block without non-linear activation layer
+    """
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 kernel=(1, 1),
+                 stride=(1, 1),
+                 padding=(0, 0),
+                 groups=1):
+        super(LinearBlock, self).__init__()
+        self.conv = Conv2D(
+            in_c,
+            out_c,
+            kernel,
+            stride,
+            padding,
+            groups=groups,
+            weight_attr=nn.initializer.KaimingNormal(),
+            bias_attr=None)
+        weight_attr = paddle.ParamAttr(
+            regularizer=None, initializer=nn.initializer.Constant(value=1.0))
+        bias_attr = paddle.ParamAttr(
+            regularizer=None, initializer=nn.initializer.Constant(value=0.0))
+        self.bn = BatchNorm2D(
+            out_c, weight_attr=weight_attr, bias_attr=bias_attr)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        return x
+
+
+class GNAP(Layer):
+    """ Global Norm-Aware Pooling block
+    """
+
+    def __init__(self, in_c):
+        super(GNAP, self).__init__()
+        self.bn1 = BatchNorm2D(in_c, weight_attr=False, bias_attr=False)
+        self.pool = nn.AdaptiveAvgPool2D((1, 1))
+        self.bn2 = BatchNorm1D(in_c, weight_attr=False, bias_attr=False)
+
+    def forward(self, x):
+        x = self.bn1(x)
+        x_norm = paddle.norm(x, 2, 1, True)
+        x_norm_mean = paddle.mean(x_norm)
+        weight = x_norm_mean / x_norm
+        x = x * weight
+        x = self.pool(x)
+        x = x.reshape([x.shape[0], -1])  # flatten; torch-style x.view is not a Paddle API
+        feature = self.bn2(x)
+        return feature
+
+
+class GDC(Layer):
+    """ Global Depthwise Convolution block
+    """
+
+    def __init__(self, in_c, embedding_size):
+        super(GDC, self).__init__()
+        self.conv_6_dw = LinearBlock(
+            in_c,
+            in_c,
+            groups=in_c,
+            kernel=(7, 7),
+            stride=(1, 1),
+            padding=(0, 0))
+        self.conv_6_flatten = Flatten()
+        self.linear = Linear(
+            in_c,
+            embedding_size,
+            weight_attr=nn.initializer.KaimingNormal(),
+            bias_attr=False)
+        self.bn = BatchNorm1D(
+            embedding_size, weight_attr=False, bias_attr=False)
+
+    def forward(self, x):
+        x = self.conv_6_dw(x)
+        x = self.conv_6_flatten(x)
+        x = self.linear(x)
+        x = self.bn(x)
+        return x
+
+
+class SELayer(Layer):
+    """ SE block
+    """
+
+    def __init__(self, channels, reduction):
+        super(SELayer, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2D(1)
+        weight_attr = paddle.ParamAttr(
+            initializer=paddle.nn.initializer.XavierUniform())
+        self.fc1 = Conv2D(
+            channels,
+            channels // reduction,
+            kernel_size=1,
+            padding=0,
+            weight_attr=weight_attr,
+            bias_attr=False)
+
+        self.relu = ReLU()
+        self.fc2 = Conv2D(
+            channels // reduction,
+            channels,
+            kernel_size=1,
+            padding=0,
+            weight_attr=nn.initializer.KaimingNormal(),
+            bias_attr=False)
+
+        self.sigmoid = Sigmoid()
+
+    def forward(self, x):
+        module_input = x
+        x = self.avg_pool(x)
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.fc2(x)
+        x = self.sigmoid(x)
+
+        return module_input * x
+
+
+class BasicBlockIR(Layer):
+    """ BasicBlock for IRNet
+    """
+
+    def __init__(self, in_channel, depth, stride):
+        super(BasicBlockIR, self).__init__()
+
+        weight_attr = paddle.ParamAttr(
+            regularizer=None,
initializer=nn.initializer.Constant(value=1.0)) + bias_attr = paddle.ParamAttr( + regularizer=None, initializer=nn.initializer.Constant(value=0.0)) + if in_channel == depth: + self.shortcut_layer = MaxPool2D(1, stride) + else: + self.shortcut_layer = Sequential( + Conv2D( + in_channel, + depth, (1, 1), + stride, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False), + BatchNorm2D( + depth, weight_attr=weight_attr, bias_attr=bias_attr)) + self.res_layer = Sequential( + BatchNorm2D( + in_channel, weight_attr=weight_attr, bias_attr=bias_attr), + Conv2D( + in_channel, + depth, (3, 3), (1, 1), + 1, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False), + BatchNorm2D( + depth, weight_attr=weight_attr, bias_attr=bias_attr), + PReLU(depth), + Conv2D( + depth, + depth, (3, 3), + stride, + 1, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False), + BatchNorm2D( + depth, weight_attr=weight_attr, bias_attr=bias_attr)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + + return res + shortcut + + +class BottleneckIR(Layer): + """ BasicBlock with bottleneck for IRNet + """ + + def __init__(self, in_channel, depth, stride): + super(BottleneckIR, self).__init__() + reduction_channel = depth // 4 + weight_attr = paddle.ParamAttr( + regularizer=None, initializer=nn.initializer.Constant(value=1.0)) + bias_attr = paddle.ParamAttr( + regularizer=None, initializer=nn.initializer.Constant(value=0.0)) + if in_channel == depth: + self.shortcut_layer = MaxPool2D(1, stride) + else: + self.shortcut_layer = Sequential( + Conv2D( + in_channel, + depth, (1, 1), + stride, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False), + BatchNorm2D( + depth, weight_attr=weight_attr, bias_attr=bias_attr)) + self.res_layer = Sequential( + BatchNorm2D( + in_channel, weight_attr=weight_attr, bias_attr=bias_attr), + Conv2D( + in_channel, + reduction_channel, (1, 1), (1, 1), + 0, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False), + BatchNorm2D( + reduction_channel, + weight_attr=weight_attr, + bias_attr=bias_attr), + PReLU(reduction_channel), + Conv2D( + reduction_channel, + reduction_channel, (3, 3), (1, 1), + 1, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False), + BatchNorm2D( + reduction_channel, + weight_attr=weight_attr, + bias_attr=bias_attr), + PReLU(reduction_channel), + Conv2D( + reduction_channel, + depth, (1, 1), + stride, + 0, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False), + BatchNorm2D( + depth, weight_attr=weight_attr, bias_attr=bias_attr)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + + return res + shortcut + + +class BasicBlockIRSE(BasicBlockIR): + def __init__(self, in_channel, depth, stride): + super(BasicBlockIRSE, self).__init__(in_channel, depth, stride) + self.res_layer.add_sublayer("se_block", SELayer(depth, 16)) + + +class BottleneckIRSE(BottleneckIR): + def __init__(self, in_channel, depth, stride): + super(BottleneckIRSE, self).__init__(in_channel, depth, stride) + self.res_layer.add_sublayer("se_block", SELayer(depth, 16)) + + +class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])): + '''A named tuple describing a ResNet block.''' + + +def get_block(in_channel, depth, num_units, stride=2): + + return [Bottleneck(in_channel, depth, stride)] +\ + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)] + + +def get_blocks(num_layers): + if num_layers == 18: + blocks = [ + get_block( + in_channel=64, depth=64, 
num_units=2), get_block(
+                in_channel=64, depth=128, num_units=2), get_block(
+                    in_channel=128, depth=256, num_units=2), get_block(
+                        in_channel=256, depth=512, num_units=2)
+        ]
+    elif num_layers == 34:
+        blocks = [
+            get_block(
+                in_channel=64, depth=64, num_units=3), get_block(
+                    in_channel=64, depth=128, num_units=4), get_block(
+                        in_channel=128, depth=256, num_units=6), get_block(
+                            in_channel=256, depth=512, num_units=3)
+        ]
+    elif num_layers == 50:
+        blocks = [
+            get_block(
+                in_channel=64, depth=64, num_units=3), get_block(
+                    in_channel=64, depth=128, num_units=4), get_block(
+                        in_channel=128, depth=256, num_units=14), get_block(
+                            in_channel=256, depth=512, num_units=3)
+        ]
+    elif num_layers == 100:
+        blocks = [
+            get_block(
+                in_channel=64, depth=64, num_units=3), get_block(
+                    in_channel=64, depth=128, num_units=13), get_block(
+                        in_channel=128, depth=256, num_units=30), get_block(
+                            in_channel=256, depth=512, num_units=3)
+        ]
+    elif num_layers == 152:
+        blocks = [
+            get_block(
+                in_channel=64, depth=256, num_units=3), get_block(
+                    in_channel=256, depth=512, num_units=8), get_block(
+                        in_channel=512, depth=1024, num_units=36), get_block(
+                            in_channel=1024, depth=2048, num_units=3)
+        ]
+    elif num_layers == 200:
+        blocks = [
+            get_block(
+                in_channel=64, depth=256, num_units=3), get_block(
+                    in_channel=256, depth=512, num_units=24), get_block(
+                        in_channel=512, depth=1024, num_units=36), get_block(
+                            in_channel=1024, depth=2048, num_units=3)
+        ]
+
+    return blocks
+
+
+class Backbone(Layer):
+    def __init__(self, input_size, num_layers, mode='ir'):
+        """ Args:
+            input_size: input_size of backbone
+            num_layers: num_layers of backbone
+            mode: support ir or ir_se
+        """
+        super(Backbone, self).__init__()
+        assert input_size[0] in [112, 224], \
+            "input_size should be [112, 112] or [224, 224]"
+        assert num_layers in [18, 34, 50, 100, 152, 200], \
+            "num_layers should be 18, 34, 50, 100, 152 or 200"
+        assert mode in ['ir', 'ir_se'], \
+            "mode should be ir or ir_se"
+        weight_attr = paddle.ParamAttr(
+            regularizer=None, initializer=nn.initializer.Constant(value=1.0))
+        bias_attr = paddle.ParamAttr(
+            regularizer=None, initializer=nn.initializer.Constant(value=0.0))
+        self.input_layer = Sequential(
+            Conv2D(
+                3,
+                64, (3, 3),
+                1,
+                1,
+                weight_attr=nn.initializer.KaimingNormal(),
+                bias_attr=False),
+            BatchNorm2D(
+                64, weight_attr=weight_attr, bias_attr=bias_attr),
+            PReLU(64))
+        blocks = get_blocks(num_layers)
+        if num_layers <= 100:
+            if mode == 'ir':
+                unit_module = BasicBlockIR
+            elif mode == 'ir_se':
+                unit_module = BasicBlockIRSE
+            output_channel = 512
+        else:
+            if mode == 'ir':
+                unit_module = BottleneckIR
+            elif mode == 'ir_se':
+                unit_module = BottleneckIRSE
+            output_channel = 2048
+
+        if input_size[0] == 112:
+            self.output_layer = Sequential(
+                BatchNorm2D(
+                    output_channel,
+                    weight_attr=weight_attr,
+                    bias_attr=bias_attr),
+                Dropout(0.4),
+                Flatten(),
+                Linear(
+                    output_channel * 7 * 7,
+                    512,
+                    weight_attr=nn.initializer.KaimingNormal()),
+                BatchNorm1D(
+                    512, weight_attr=False, bias_attr=False))
+        else:
+            self.output_layer = Sequential(
+                BatchNorm2D(
+                    output_channel,
+                    weight_attr=weight_attr,
+                    bias_attr=bias_attr),
+                Dropout(0.4),
+                Flatten(),
+                Linear(
+                    output_channel * 14 * 14,
+                    512,
+                    weight_attr=nn.initializer.KaimingNormal()),
+                BatchNorm1D(
+                    512, weight_attr=False, bias_attr=False))
+
+        modules = []
+        for block in blocks:
+            for bottleneck in block:
+                modules.append(
+                    unit_module(bottleneck.in_channel, bottleneck.depth,
+                                bottleneck.stride))
+        self.body = Sequential(*modules)
+
+        # initialize_weights(self.modules())
+
+    def forward(self, x):
+
+        # the current code only supports one extra image per sample; it comes
+        # with an extra dimension for the number of extra images, which we
+        # just squeeze out for now
+        x = self.input_layer(x)
+
+        for idx, module in enumerate(self.body):
+            x = module(x)
+
+        x = self.output_layer(x)
+        # norm = paddle.norm(x, 2, 1, True)
+        # output = paddle.divide(x, norm)
+        # return output, norm
+        return x
+
+
+def AdaFace_IR_18(input_size=(112, 112)):
+    """ Constructs an ir-18 model.
+    """
+    model = Backbone(input_size, 18, 'ir')
+    return model
+
+
+def AdaFace_IR_34(input_size=(112, 112)):
+    """ Constructs an ir-34 model.
+    """
+    model = Backbone(input_size, 34, 'ir')
+
+    return model
+
+
+def AdaFace_IR_50(input_size=(112, 112)):
+    """ Constructs an ir-50 model.
+    """
+    model = Backbone(input_size, 50, 'ir')
+
+    return model
+
+
+def AdaFace_IR_101(input_size=(112, 112)):
+    """ Constructs an ir-101 model.
+    """
+    model = Backbone(input_size, 100, 'ir')
+
+    return model
+
+
+def AdaFace_IR_152(input_size=(112, 112)):
+    """ Constructs an ir-152 model.
+    """
+    model = Backbone(input_size, 152, 'ir')
+
+    return model
+
+
+def AdaFace_IR_200(input_size=(112, 112)):
+    """ Constructs an ir-200 model.
+    """
+    model = Backbone(input_size, 200, 'ir')
+
+    return model
+
+
+def AdaFace_IR_SE_50(input_size=(112, 112)):
+    """ Constructs an ir_se-50 model.
+    """
+    model = Backbone(input_size, 50, 'ir_se')
+
+    return model
+
+
+def AdaFace_IR_SE_101(input_size=(112, 112)):
+    """ Constructs an ir_se-101 model.
+    """
+    model = Backbone(input_size, 100, 'ir_se')
+
+    return model
+
+
+def AdaFace_IR_SE_152(input_size=(112, 112)):
+    """ Constructs an ir_se-152 model.
+    """
+    model = Backbone(input_size, 152, 'ir_se')
+
+    return model
+
+
+def AdaFace_IR_SE_200(input_size=(112, 112)):
+    """ Constructs an ir_se-200 model.
+    """
+    model = Backbone(input_size, 200, 'ir_se')
+
+    return model
diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/alexnet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/alexnet.py
new file mode 100755
index 000000000..7020d5db7
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/alexnet.py
@@ -0,0 +1,170 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
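+
+# Usage sketch (illustrative comment, not part of the original file): the
+# AlexNet factory defined below builds the model and can be exercised with a
+# dummy batch, e.g.
+#
+#     import paddle
+#     from ppcls.arch.backbone.model_zoo.alexnet import AlexNet
+#
+#     model = AlexNet(class_num=1000)
+#     x = paddle.randn([1, 3, 224, 224])
+#     logits = model(x)  # shape: [1, 1000]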
+ +# reference: https://proceedings.neurips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout, ReLU +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "AlexNet": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/AlexNet_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvPoolLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride, + padding, + stdv, + groups=1, + act=None, + name=None): + super(ConvPoolLayer, self).__init__() + + self.relu = ReLU() if act == "relu" else None + + self._conv = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr( + name=name + "_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name=name + "_offset", initializer=Uniform(-stdv, stdv))) + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + + def forward(self, inputs): + x = self._conv(inputs) + if self.relu is not None: + x = self.relu(x) + x = self._pool(x) + return x + + +class AlexNetDY(nn.Layer): + def __init__(self, class_num=1000): + super(AlexNetDY, self).__init__() + + stdv = 1.0 / math.sqrt(3 * 11 * 11) + self._conv1 = ConvPoolLayer( + 3, 64, 11, 4, 2, stdv, act="relu", name="conv1") + stdv = 1.0 / math.sqrt(64 * 5 * 5) + self._conv2 = ConvPoolLayer( + 64, 192, 5, 1, 2, stdv, act="relu", name="conv2") + stdv = 1.0 / math.sqrt(192 * 3 * 3) + self._conv3 = Conv2D( + 192, + 384, + 3, + stride=1, + padding=1, + weight_attr=ParamAttr( + name="conv3_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name="conv3_offset", initializer=Uniform(-stdv, stdv))) + stdv = 1.0 / math.sqrt(384 * 3 * 3) + self._conv4 = Conv2D( + 384, + 256, + 3, + stride=1, + padding=1, + weight_attr=ParamAttr( + name="conv4_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name="conv4_offset", initializer=Uniform(-stdv, stdv))) + stdv = 1.0 / math.sqrt(256 * 3 * 3) + self._conv5 = ConvPoolLayer( + 256, 256, 3, 1, 1, stdv, act="relu", name="conv5") + stdv = 1.0 / math.sqrt(256 * 6 * 6) + + self._drop1 = Dropout(p=0.5, mode="downscale_in_infer") + self._fc6 = Linear( + in_features=256 * 6 * 6, + out_features=4096, + weight_attr=ParamAttr( + name="fc6_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name="fc6_offset", initializer=Uniform(-stdv, stdv))) + + self._drop2 = Dropout(p=0.5, mode="downscale_in_infer") + self._fc7 = Linear( + in_features=4096, + out_features=4096, + weight_attr=ParamAttr( + name="fc7_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name="fc7_offset", initializer=Uniform(-stdv, stdv))) + self._fc8 = Linear( + in_features=4096, + out_features=class_num, + weight_attr=ParamAttr( + name="fc8_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name="fc8_offset", initializer=Uniform(-stdv, stdv))) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + x = self._conv3(x) + x = F.relu(x) + x = self._conv4(x) + x = F.relu(x) + x = self._conv5(x) + x = paddle.flatten(x, start_axis=1, stop_axis=-1) + x = self._drop1(x) + x = self._fc6(x) + x = F.relu(x) + x = 
self._drop2(x) + x = self._fc7(x) + x = F.relu(x) + x = self._fc8(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def AlexNet(pretrained=False, use_ssld=False, **kwargs): + model = AlexNetDY(**kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["AlexNet"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/cae.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/cae.py new file mode 100755 index 000000000..ac0f044bb --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/cae.py @@ -0,0 +1,860 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was heavily based on https://github.com/PaddlePaddle/VIMER/blob/main/CAE/models/modeling_finetune.py +# reference: https://arxiv.org/abs/2202.03026 + +import collections +from itertools import repeat +import math +import numpy as np +from functools import partial + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ....utils.download import get_weights_path_from_url + +MODEL_URLS = { + "cae_base_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/cae_base_patch16_224_pretrained.pdparams", + "cae_large_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/cae_large_patch16_224_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return x + return tuple(repeat(x, n)) + + return parse + + +def trunc_normal_(tensor, mean=0., std=1.): + nn.initializer.TruncatedNormal(mean=mean, std=std)(tensor) + + +def drop_path(x, drop_prob: float=0., training: bool=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if drop_prob == 0. 
or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0], ) + (1, ) * (
+        x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
+    random_tensor.floor_()  # binarize
+    output = x / keep_prob * random_tensor
+    return output
+
+
+class DropPath(nn.Layer):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    """
+
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return 'p={}'.format(self.drop_prob)
+
+
+class Mlp(nn.Layer):
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features, bias_attr=True)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features, bias_attr=True)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        # x = self.drop(x)
+        # commented out here, following the original BERT implementation
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class Attention(nn.Layer):
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 window_size=None,
+                 attn_head_dim=None):
+        super().__init__()
+
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        if attn_head_dim is not None:
+            head_dim = attn_head_dim
+        all_head_dim = head_dim * self.num_heads
+        self.scale = qk_scale or head_dim**-0.5
+
+        self.zeros_ = nn.initializer.Constant(value=0.)
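+        # Note (explanatory comment, not in the upstream source): the qkv
+        # projection created below omits the fused bias; when qkv_bias is
+        # True, separate q and v biases are created instead and a zero k bias
+        # is concatenated at forward time (BEiT's decomposed qkv bias).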
+ + self.qkv = nn.Linear(dim, all_head_dim * 3, bias_attr=False) + if qkv_bias: + self.q_bias = self.create_parameter( + [all_head_dim], default_initializer=self.zeros_) + self.v_bias = self.create_parameter( + [all_head_dim], default_initializer=self.zeros_) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = self.create_parameter( + [self.num_relative_distance, num_heads], + default_initializer=self.zeros_) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid( + [coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose( + [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[ + 0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + paddle.zeros((window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum( + -1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", + relative_position_index) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim, bias_attr=True) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, rel_pos_bias=None): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + k_bias = paddle.zeros_like(self.v_bias) + k_bias.stop_gradient = True + qkv_bias = paddle.concat((self.q_bias, k_bias, self.v_bias)) + # qkv = self.qkv(x).reshape([B, N, 3, self.num_heads, C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + qkv = F.linear(x=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape([B, N, 3, self.num_heads, -1]).transpose( + [2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @k.transpose([0, 1, 3, 2])) + + if self.relative_position_bias_table is not None: + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn @v).transpose([0, 2, 1, 3]).reshape([B, N, -1]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + init_values=None, 
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm,
+                 window_size=None,
+                 attn_head_dim=None):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            window_size=window_size,
+            attn_head_dim=attn_head_dim)
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=mlp_hidden_dim,
+                       act_layer=act_layer,
+                       drop=drop)
+
+        # guard against init_values=None (the default), which would otherwise
+        # raise a TypeError on the comparison
+        if init_values is not None and init_values > 0:
+            self.gamma_1 = self.create_parameter(
+                [dim],
+                default_initializer=nn.initializer.Constant(value=init_values))
+            self.gamma_2 = self.create_parameter(
+                [dim],
+                default_initializer=nn.initializer.Constant(value=init_values))
+        else:
+            self.gamma_1, self.gamma_2 = None, None
+
+    def forward(self, x, rel_pos_bias=None):
+        if self.gamma_1 is None:
+            x = x + self.drop_path(
+                self.attn(
+                    self.norm1(x), rel_pos_bias=rel_pos_bias))
+            x = x + self.drop_path(self.mlp(self.norm2(x)))
+        else:
+            x = x + self.drop_path(self.gamma_1 * self.attn(
+                self.norm1(x), rel_pos_bias=rel_pos_bias))
+            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+        return x
+
+
+class PatchEmbed(nn.Layer):
+    """ Image to Patch Embedding
+    """
+
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+        super().__init__()
+        to_2tuple = _ntuple(2)
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] //
+                                                        patch_size[0])
+        self.patch_shape = (img_size[0] // patch_size[0],
+                            img_size[1] // patch_size[1])
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_patches = num_patches
+        self.in_chans = in_chans
+        self.out_chans = embed_dim
+        self.proj = nn.Conv2D(
+            in_chans,
+            embed_dim,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias_attr=True)
+
+    def forward(self, x, **kwargs):
+        B, C, H, W = x.shape
+        # FIXME look at relaxing size constraints
+        assert H == self.img_size[0] and W == self.img_size[1], \
+            f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+        x = self.proj(x).flatten(2).transpose([0, 2, 1])
+        return x
+
+    def _init_weights(self):
+        fan_out = self.out_chans
+        fan_in = self.patch_size[0] * self.patch_size[1] * self.in_chans
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.XavierUniform(fan_in, fan_out))  # MAE
+        bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0))
+        return weight_attr, bias_attr
+
+
+class RelativePositionBias(nn.Layer):
+    def __init__(self, window_size, num_heads):
+        super().__init__()
+        self.window_size = window_size
+        self.num_relative_distance = (2 * window_size[0] - 1) * (
+            2 * window_size[1] - 1) + 3
+        self.zeros_ = nn.initializer.Constant(value=0.)
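+        # Note (explanatory comment, not in the upstream source): the table
+        # has (2*Wh-1)*(2*Ww-1) entries for the in-window relative offsets,
+        # plus 3 extra entries for cls-to-token, token-to-cls and cls-to-cls.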
+        self.relative_position_bias_table = self.create_parameter(
+            [self.num_relative_distance, num_heads],
+            default_initializer=self.zeros_)  # 2*Wh-1 * 2*Ww-1, nH
+        # cls to token & token to cls & cls to cls
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = paddle.arange(window_size[0])
+        coords_w = paddle.arange(window_size[1])
+        coords = paddle.stack(paddle.meshgrid(
+            [coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :,
+                                         None] - coords_flatten[:,
+                                                                None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.transpose(
+            [1, 2, 0])  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+        relative_position_index = \
+            paddle.zeros((window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
+        relative_position_index[1:, 1:] = relative_coords.sum(
+            -1)  # Wh*Ww, Wh*Ww
+        relative_position_index[0, 0:] = self.num_relative_distance - 3
+        relative_position_index[0:, 0] = self.num_relative_distance - 2
+        relative_position_index[0, 0] = self.num_relative_distance - 1
+
+        self.register_buffer("relative_position_index",
+                             relative_position_index)
+
+    def forward(self):
+        relative_position_bias = \
+            self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([
+                self.window_size[0] * self.window_size[1] + 1,
+                self.window_size[0] * self.window_size[1] + 1, -1])  # Wh*Ww,Wh*Ww,nH
+        return relative_position_bias.transpose([2, 0, 1])  # nH, Wh*Ww, Wh*Ww
+
+
+def get_sinusoid_encoding_table(n_position, d_hid, token=False):
+    ''' Sinusoid position encoding table '''
+
+    def get_position_angle_vec(position):
+        return [
+            position / np.power(10000, 2 * (hid_j // 2) / d_hid)
+            for hid_j in range(d_hid)
+        ]
+
+    sinusoid_table = np.array(
+        [get_position_angle_vec(pos_i) for pos_i in range(n_position)])
+    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
+    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
+
+    if token:
+        sinusoid_table = np.concatenate(
+            [sinusoid_table, np.zeros([1, d_hid])], axis=0)
+
+    return paddle.to_tensor(sinusoid_table).unsqueeze(0)
+
+
+class VisionTransformer(nn.Layer):
+    """ Vision Transformer with support for patch or hybrid CNN input stage
+    """
+
+    def __init__(self,
+                 img_size=224,
+                 patch_size=16,
+                 in_chans=3,
+                 class_num=1000,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 norm_layer=nn.LayerNorm,
+                 init_values=None,
+                 use_abs_pos_emb=True,
+                 use_rel_pos_bias=False,
+                 use_shared_rel_pos_bias=False,
+                 use_mean_pooling=True,
+                 init_scale=0.001,
+                 lin_probe=False,
+                 sin_pos_emb=True,
+                 args=None):
+        super().__init__()
+        self.class_num = class_num
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.use_mean_pooling = use_mean_pooling
+
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim)
+        num_patches = self.patch_embed.num_patches
+
+        self.zeros_ = nn.initializer.Constant(value=0.)
+        self.ones_ = nn.initializer.Constant(value=1.)
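+
+        # a learnable cls token is prepended to the patch sequence; position
+        # information is then added either as a learned absolute table
+        # (use_abs_pos_emb), a frozen 2D sin-cos table (sin_pos_emb), or not
+        # at all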
+ + self.cls_token = self.create_parameter( + [1, 1, embed_dim], default_initializer=self.zeros_) + + self.use_abs_pos_emb = use_abs_pos_emb + if use_abs_pos_emb: + self.pos_embed = self.create_parameter( + [1, num_patches + 1, embed_dim], + default_initializer=self.zeros_) + elif sin_pos_emb: + # sine-cosine positional embeddings is on the way + self.pos_embed = self.create_parameter( + [1, num_patches + 1, embed_dim], + default_initializer=self.zeros_) + self.pos_embed.set_value( + self.build_2d_sincos_position_embedding(embed_dim)) + self.pos_embed.stop_gradient = True # fixed sin-cos embedding + else: + self.pos_embed = None + + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias( + window_size=self.patch_embed.patch_shape, num_heads=num_heads) + else: + self.rel_pos_bias = None + + dpr = [x.item() for x in paddle.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.use_rel_pos_bias = use_rel_pos_bias + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.patch_shape + if use_rel_pos_bias else None) for i in range(depth) + ]) + self.norm = nn.Identity() if use_mean_pooling else norm_layer( + embed_dim) + + self.lin_probe = lin_probe + # NOTE: batch norm + if lin_probe: + # TODO + from models.lincls_bn import LP_BatchNorm + self.fc_norm = LP_BatchNorm(embed_dim, affine=False) + else: + if use_mean_pooling: + self.fc_norm = norm_layer(embed_dim) + else: + self.fc_norm = None + self.head = nn.Linear(embed_dim, + class_num) if class_num > 0 else nn.Identity() + + if self.pos_embed is not None and use_abs_pos_emb: + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + # trunc_normal_(self.mask_token, std=.02) + trunc_normal_(self.head.weight, std=.02) + self.apply(self._init_weights) + self.fix_init_weight() + + self.head.weight.set_value(self.head.weight * init_scale) + self.head.bias.set_value(self.head.bias * init_scale) + + def build_2d_sincos_position_embedding(self, + embed_dim=768, + temperature=10000.): + h, w = self.patch_embed.patch_shape + grid_w = paddle.arange(w, dtype=paddle.float32) + grid_h = paddle.arange(h, dtype=paddle.float32) + grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) + assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' + pos_dim = embed_dim // 4 + omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim + omega = 1. 
/ (temperature**omega)
+        out_w = paddle.einsum('m,d->md', grid_w.flatten(), omega)
+        out_h = paddle.einsum('m,d->md', grid_h.flatten(), omega)
+        pos_emb = paddle.concat(
+            [
+                paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
+                paddle.cos(out_h)
+            ],
+            axis=1)[None, :, :]
+
+        # if not self.use_mean_pooling:
+        pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32)
+        pos_emb = paddle.concat([pe_token, pos_emb], axis=1)
+        return pos_emb
+
+    def fix_init_weight(self):
+        def rescale(param, layer_id):
+            param.set_value(param / math.sqrt(2.0 * layer_id))
+
+        for layer_id, layer in enumerate(self.blocks):
+            rescale(layer.attn.proj.weight, layer_id + 1)
+            rescale(layer.mlp.fc2.weight, layer_id + 1)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                self.zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            self.zeros_(m.bias)
+            self.ones_(m.weight)
+
+    def get_num_layers(self):
+        return len(self.blocks)
+
+    def no_weight_decay(self):
+        return {'pos_embed', 'cls_token'}
+
+    def get_classifier(self):
+        return self.head
+
+    def reset_classifier(self, class_num, global_pool=''):
+        self.class_num = class_num
+        self.head = nn.Linear(self.embed_dim,
+                              class_num) if class_num > 0 else nn.Identity()
+
+    def forward_features(self, x, is_train=True):
+        x = self.patch_embed(x)
+        batch_size, seq_len, _ = x.shape
+
+        cls_tokens = self.cls_token.expand([
+            batch_size, -1, -1
+        ]).astype(x.dtype)  # stole cls_tokens impl from Phil Wang, thanks
+        x = paddle.concat((cls_tokens, x), axis=1)
+        if self.pos_embed is not None:
+            # both the learned absolute table and the fixed sin-cos table are
+            # added the same way, detached so the table itself receives no
+            # gradient here
+            x = x + self.pos_embed.expand(
+                [batch_size, -1, -1]).astype(x.dtype).clone().detach()
+
+        x = self.pos_drop(x)
+
+        rel_pos_bias = self.rel_pos_bias(
+        ) if self.rel_pos_bias is not None else None
+        for blk in self.blocks:
+            x = blk(x, rel_pos_bias=rel_pos_bias)
+
+        x = self.norm(x)
+        if self.fc_norm is not None:
+            t = x[:, 1:, :]
+            if self.lin_probe:
+                if self.use_mean_pooling:
+                    return self.fc_norm(t.mean(1), is_train=is_train)
+                else:
+                    return self.fc_norm(x[:, 0], is_train=is_train)
+            else:
+                return self.fc_norm(t.mean(1))
+        else:
+            return x[:, 0]
+
+    def forward(self, x, is_train=True):
+        x = self.forward_features(x, is_train)
+        x = self.head(x)
+        return x
+
+
+def _enable_linear_eval(model):
+    zeros_ = nn.initializer.Constant(value=0.)
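+    # linear probing: freeze every parameter except the classification head
+    # and its fc_norm, then re-initialize the head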
+ normal_ = nn.initializer.Normal(mean=0.0, std=0.01) + linear_keyword = 'head' + head_norm = 'fc_norm' + requires_grad = [] + for name, param in model.named_parameters(): + if name not in [ + '%s.weight' % linear_keyword, '%s.bias' % linear_keyword + ] and head_norm not in name: + param.stop_gradient = True + else: + requires_grad.append(name) + # init the fc layer + normal_(getattr(model, linear_keyword).weight) + zeros_(getattr(model, linear_keyword).bias) + + return + + +def _load_pretrained(pretrained, + pretrained_url, + model, + model_keys, + model_ema_configs, + use_abs_pos_emb, + use_rel_pos_bias, + use_ssld=False): + if pretrained is False: + return + elif pretrained is True: + local_weight_path = get_weights_path_from_url(pretrained_url).replace( + ".pdparams", "") + checkpoint = paddle.load(local_weight_path + ".pdparams") + elif isinstance(pretrained, str): + checkpoint = paddle.load(pretrained + ".pdparams") + + checkpoint_model = None + for model_key in model_keys.split('|'): + if model_key in checkpoint: + checkpoint_model = checkpoint[model_key] + break + + if checkpoint_model is None: + checkpoint_model = checkpoint + state_dict = model.state_dict() + all_keys = list(checkpoint_model.keys()) + # NOTE: remove all decoder keys + all_keys = [key for key in all_keys if key.startswith('encoder.')] + for key in all_keys: + new_key = key.replace('encoder.', '') + checkpoint_model[new_key] = checkpoint_model[key] + checkpoint_model.pop(key) + + for key in list(checkpoint_model.keys()): + if key.startswith('regressor_and_decoder.'): + checkpoint_model.pop(key) + if key.startswith('teacher_network.'): + checkpoint_model.pop(key) + + # NOTE: replace norm with fc_norm + for key in list(checkpoint_model.keys()): + if key.startswith('norm.'): + new_key = key.replace('norm.', 'fc_norm.') + checkpoint_model[new_key] = checkpoint_model[key] + checkpoint_model.pop(key) + + for k in ['head.weight', 'head.bias']: + if k in checkpoint_model and checkpoint_model[k].shape != state_dict[ + k].shape: + del checkpoint_model[k] + + if model.use_rel_pos_bias and "rel_pos_bias.relative_position_bias_table" in checkpoint_model: + num_layers = model.get_num_layers() + rel_pos_bias = checkpoint_model[ + "rel_pos_bias.relative_position_bias_table"] + for i in range(num_layers): + checkpoint_model["blocks.%d.attn.relative_position_bias_table" % + i] = rel_pos_bias.clone() + + checkpoint_model.pop("rel_pos_bias.relative_position_bias_table") + + all_keys = list(checkpoint_model.keys()) + + for key in all_keys: + if "relative_position_index" in key: + checkpoint_model.pop(key) + + if "relative_position_bias_table" in key and use_rel_pos_bias: + rel_pos_bias = checkpoint_model[key] + src_num_pos, num_attn_heads = rel_pos_bias.shape + dst_num_pos, _ = model.state_dict()[key].shape + dst_patch_shape = model.patch_embed.patch_shape + if dst_patch_shape[0] != dst_patch_shape[1]: + raise NotImplementedError() + num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * ( + dst_patch_shape[1] * 2 - 1) + src_size = int((src_num_pos - num_extra_tokens)**0.5) + dst_size = int((dst_num_pos - num_extra_tokens)**0.5) + if src_size != dst_size: + extra_tokens = rel_pos_bias[-num_extra_tokens:, :] + rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] + + def geometric_progression(a, r, n): + return a * (1.0 - r**n) / (1.0 - r) + + left, right = 1.01, 1.5 + while right - left > 1e-6: + q = (left + right) / 2.0 + gp = geometric_progression(1, q, src_size // 2) + if gp > dst_size // 2: + right = q + else: + left = q + + 
dis = []
+            # geometric grid of source coordinates: step sizes grow by a
+            # factor q away from the center so that src_size // 2 steps
+            # span dst_size // 2 positions
+            cur = 1
+            for i in range(src_size // 2):
+                dis.append(cur)
+                cur += q**(i + 1)
+
+            r_ids = [-_ for _ in reversed(dis)]
+
+            x = r_ids + [0] + dis
+            y = r_ids + [0] + dis
+
+            t = dst_size // 2.0
+            dx = np.arange(-t, t + 0.1, 1.0)
+            dy = np.arange(-t, t + 0.1, 1.0)
+
+            all_rel_pos_bias = []
+
+            for i in range(num_attn_heads):
+                z = rel_pos_bias[:, i].reshape(
+                    [src_size, src_size]).astype('float32').numpy()
+                f = interpolate.interp2d(x, y, z, kind='cubic')
+                all_rel_pos_bias.append(
+                    paddle.to_tensor(f(dx, dy)).astype('float32').reshape(
+                        [-1, 1]))
+
+            rel_pos_bias = paddle.concat(all_rel_pos_bias, axis=-1)
+
+            new_rel_pos_bias = paddle.concat(
+                (rel_pos_bias, extra_tokens), axis=0)
+            checkpoint_model[key] = new_rel_pos_bias
+
+    # interpolate position embedding
+    if 'pos_embed' in checkpoint_model and use_abs_pos_emb:
+        pos_embed_checkpoint = checkpoint_model['pos_embed']
+        embedding_size = pos_embed_checkpoint.shape[-1]
+        num_patches = model.patch_embed.num_patches
+        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
+        # height (== width) for the checkpoint position embedding
+        orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens)**
+                        0.5)
+        # height (== width) for the new position embedding
+        new_size = int(num_patches**0.5)
+        # class_token and dist_token are kept unchanged
+        if orig_size != new_size:
+            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
+            # only the position tokens are interpolated
+            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
+            pos_tokens = pos_tokens.reshape(
+                [-1, orig_size, orig_size,
+                 embedding_size]).transpose([0, 3, 1, 2])
+            pos_tokens = paddle.nn.functional.interpolate(
+                pos_tokens,
+                size=(new_size, new_size),
+                mode='bicubic',
+                align_corners=False)
+            pos_tokens = pos_tokens.transpose([0, 2, 3, 1]).flatten(1, 2)
+            new_pos_embed = paddle.concat((extra_tokens, pos_tokens), axis=1)
+            checkpoint_model['pos_embed'] = new_pos_embed
+    msg = model.set_state_dict(checkpoint_model)
+
+    model_without_ddp = model
+    n_parameters = sum(p.numel() for p in model.parameters()
+                       if not p.stop_gradient).item()
+
+    return
+
+
+def cae_base_patch16_224(pretrained=True, use_ssld=False, **kwargs):
+    config = kwargs.copy()
+    enable_linear_eval = config.pop('enable_linear_eval')
+    model_keys = config.pop('model_key')
+    model_ema_configs = config.pop('model_ema')
+    use_abs_pos_emb = config.get('use_abs_pos_emb', False)
+    use_rel_pos_bias = config.get('use_rel_pos_bias', True)
+    if 'pretrained' in config:
+        pretrained = config.pop('pretrained')
+
+    model = VisionTransformer(
+        patch_size=16,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(
+            nn.LayerNorm, epsilon=1e-6),
+        **config)
+
+    if enable_linear_eval:
+        _enable_linear_eval(model)
+
+    _load_pretrained(
+        pretrained,
+        MODEL_URLS["cae_base_patch16_224"],
+        model,
+        model_keys,
+        model_ema_configs,
+        use_abs_pos_emb,
+        use_rel_pos_bias,
+        use_ssld=False)
+
+    return model
+
+
+def cae_large_patch16_224(pretrained=True, use_ssld=False, **kwargs):
+    config = kwargs.copy()
+    enable_linear_eval = config.pop('enable_linear_eval')
+    model_keys = config.pop('model_key')
+    model_ema_configs = config.pop('model_ema')
+    use_abs_pos_emb = config.get('use_abs_pos_emb', False)
+    use_rel_pos_bias = config.get('use_rel_pos_bias', True)
+    if 'pretrained' in config:
+        pretrained = config.pop('pretrained')
+
+    model = VisionTransformer(
+        patch_size=16,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(
+            nn.LayerNorm, epsilon=1e-6),
+
**config) + + if enable_linear_eval: + _enable_linear_eval(model) + + _load_pretrained( + pretrained, + MODEL_URLS["cae_large_patch16_224"], + model, + model_keys, + model_ema_configs, + use_abs_pos_emb, + use_rel_pos_bias, + use_ssld=False) + + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/convnext.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/convnext.py new file mode 100755 index 000000000..3773fac56 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/convnext.py @@ -0,0 +1,282 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Code was heavily based on https://github.com/facebookresearch/ConvNeXt + +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ConvNeXt_tiny": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ConvNeXt_tiny_pretrained.pdparams", + "ConvNeXt_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ConvNeXt_small_pretrained.pdparams", + "ConvNeXt_base_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ConvNeXt_base_224_pretrained.pdparams", + "ConvNeXt_base_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ConvNeXt_base_384_pretrained.pdparams", + "ConvNeXt_large_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ConvNeXt_large_224_pretrained.pdparams", + "ConvNeXt_large_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ConvNeXt_large_384_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + +trunc_normal_ = TruncatedNormal(std=.02) +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class ChannelsFirstLayerNorm(nn.Layer): + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. 
channels_last corresponds to inputs with + shape (batch_size, height, width, channels) while channels_first corresponds to inputs + with shape (batch_size, channels, height, width). + """ + + def __init__(self, normalized_shape, epsilon=1e-5): + super().__init__() + self.weight = self.create_parameter( + shape=[normalized_shape], default_initializer=ones_) + self.bias = self.create_parameter( + shape=[normalized_shape], default_initializer=zeros_) + self.epsilon = epsilon + self.normalized_shape = [normalized_shape] + + def forward(self, x): + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / paddle.sqrt(s + self.epsilon) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +class Block(nn.Layer): + r""" ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PyTorch + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + """ + + def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + self.dwconv = nn.Conv2D( + dim, dim, 7, padding=3, groups=dim) # depthwise conv + self.norm = nn.LayerNorm(dim, epsilon=1e-6) + # pointwise/1x1 convs, implemented with linear layers + self.pwconv1 = nn.Linear(dim, 4 * dim) + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + if layer_scale_init_value > 0: + self.gamma = self.create_parameter( + shape=[dim], + default_initializer=Constant(value=layer_scale_init_value)) + else: + self.gamma = None + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x = x.transpose([0, 2, 3, 1]) # (N, C, H, W) -> (N, H, W, C) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.transpose([0, 3, 1, 2]) # (N, H, W, C) -> (N, C, H, W) + + x = input + self.drop_path(x) + return x + + +class ConvNeXt(nn.Layer): + r""" ConvNeXt + A PaddlePaddle impl of : `A ConvNet for the 2020s` - + https://arxiv.org/pdf/2201.03545.pdf + + Args: + in_chans (int): Number of input image channels. Default: 3 + class_num (int): Number of classes for classification head. Default: 1000 + depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] + dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] + drop_path_rate (float): Stochastic depth rate. Default: 0. + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. 
+ """ + + def __init__(self, + in_chans=3, + class_num=1000, + depths=[3, 3, 9, 3], + dims=[96, 192, 384, 768], + drop_path_rate=0., + layer_scale_init_value=1e-6, + head_init_scale=1.): + super().__init__() + + # stem and 3 intermediate downsampling conv layers + self.downsample_layers = nn.LayerList() + stem = nn.Sequential( + nn.Conv2D( + in_chans, dims[0], 4, stride=4), + ChannelsFirstLayerNorm( + dims[0], epsilon=1e-6)) + self.downsample_layers.append(stem) + for i in range(3): + downsample_layer = nn.Sequential( + ChannelsFirstLayerNorm( + dims[i], epsilon=1e-6), + nn.Conv2D( + dims[i], dims[i + 1], 2, stride=2), ) + self.downsample_layers.append(downsample_layer) + + # 4 feature resolution stages, each consisting of multiple residual blocks + self.stages = nn.LayerList() + dp_rates = [ + x.item() for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] + cur = 0 + for i in range(4): + stage = nn.Sequential(* [ + Block( + dim=dims[i], + drop_path=dp_rates[cur + j], + layer_scale_init_value=layer_scale_init_value) + for j in range(depths[i]) + ]) + self.stages.append(stage) + cur += depths[i] + + self.norm = nn.LayerNorm(dims[-1], epsilon=1e-6) # final norm layer + self.head = nn.Linear(dims[-1], class_num) + + self.apply(self._init_weights) + self.head.weight.set_value(self.head.weight * head_init_scale) + self.head.bias.set_value(self.head.bias * head_init_scale) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2D, nn.Linear)): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def forward_features(self, x): + for i in range(4): + x = self.downsample_layers[i](x) + x = self.stages[i](x) + # global average pooling, (N, C, H, W) -> (N, C) + return self.norm(x.mean([-2, -1])) + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def ConvNeXt_tiny(pretrained=False, use_ssld=False, **kwargs): + model = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ConvNeXt_tiny"], use_ssld=use_ssld) + return model + + +def ConvNeXt_small(pretrained=False, use_ssld=False, **kwargs): + model = ConvNeXt(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ConvNeXt_small"], use_ssld=use_ssld) + return model + + +def ConvNeXt_base_224(pretrained=False, use_ssld=False, **kwargs): + model = ConvNeXt( + depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ConvNeXt_base_224"], use_ssld=use_ssld) + return model + + +def ConvNeXt_base_384(pretrained=False, use_ssld=False, **kwargs): + model = ConvNeXt( + depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ConvNeXt_base_384"], use_ssld=use_ssld) + return model + + +def ConvNeXt_large_224(pretrained=False, use_ssld=False, **kwargs): + model = ConvNeXt( + depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ConvNeXt_large_224"], use_ssld=use_ssld) + return model + + +def ConvNeXt_large_384(pretrained=False, use_ssld=False, **kwargs): + model = ConvNeXt( + depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ConvNeXt_large_384"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/cspnet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/cspnet.py new file mode 100755 index 000000000..c62206328 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/cspnet.py @@ -0,0 +1,377 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
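+
+# A cross-stage-partial (CSP) stage, as implemented by CrossStage below,
+# widens the features with a 1x1 conv, splits them in two along the channel
+# axis, sends only one half through the residual DarkBlocks, and re-joins
+# the two halves with 1x1 transition convs. Keeping half of the channels on
+# an identity path is what reduces the duplicated gradient information
+# described in the CSPNet paper.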
+ +# Code was heavily based on https://github.com/rwightman/pytorch-image-models +# reference: https://arxiv.org/abs/1911.11929 + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "CSPDarkNet53": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CSPDarkNet53_pretrained.pdparams" +} + +MODEL_CFGS = { + "CSPDarkNet53": dict( + stem=dict( + out_chs=32, kernel_size=3, stride=1, pool=''), + stage=dict( + out_chs=(64, 128, 256, 512, 1024), + depth=(1, 2, 8, 8, 4), + stride=(2, ) * 5, + exp_ratio=(2., ) + (1., ) * 4, + bottle_ratio=(0.5, ) + (1.0, ) * 4, + block_ratio=(1., ) + (0.5, ) * 4, + down_growth=True, )) +} + +__all__ = ['CSPDarkNet53' + ] # model_registry will add each entrypoint fn to this + + +class ConvBnAct(nn.Layer): + def __init__(self, + input_channels, + output_channels, + kernel_size=1, + stride=1, + padding=None, + dilation=1, + groups=1, + act_layer=nn.LeakyReLU, + norm_layer=nn.BatchNorm2D): + super().__init__() + if padding is None: + padding = (kernel_size - 1) // 2 + self.conv = nn.Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + weight_attr=ParamAttr(), + bias_attr=False) + + self.bn = norm_layer(num_features=output_channels) + self.act = act_layer() + + def forward(self, inputs): + x = self.conv(inputs) + x = self.bn(x) + if self.act is not None: + x = self.act(x) + return x + + +def create_stem(in_chans=3, + out_chs=32, + kernel_size=3, + stride=2, + pool='', + act_layer=None, + norm_layer=None): + stem = nn.Sequential() + if not isinstance(out_chs, (tuple, list)): + out_chs = [out_chs] + assert len(out_chs) + in_c = in_chans + for i, out_c in enumerate(out_chs): + conv_name = f'conv{i + 1}' + stem.add_sublayer( + conv_name, + ConvBnAct( + in_c, + out_c, + kernel_size, + stride=stride if i == 0 else 1, + act_layer=act_layer, + norm_layer=norm_layer)) + in_c = out_c + last_conv = conv_name + if pool: + stem.add_sublayer( + 'pool', nn.MaxPool2D( + kernel_size=3, stride=2, padding=1)) + return stem, dict( + num_chs=in_c, reduction=stride, module='.'.join(['stem', last_conv])) + + +class DarkBlock(nn.Layer): + def __init__(self, + in_chs, + out_chs, + dilation=1, + bottle_ratio=0.5, + groups=1, + act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2D, + attn_layer=None, + drop_block=None): + super(DarkBlock, self).__init__() + mid_chs = int(round(out_chs * bottle_ratio)) + ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer) + self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs) + self.conv2 = ConvBnAct( + mid_chs, + out_chs, + kernel_size=3, + dilation=dilation, + groups=groups, + **ckwargs) + + def forward(self, x): + shortcut = x + x = self.conv1(x) + x = self.conv2(x) + x = x + shortcut + return x + + +class CrossStage(nn.Layer): + def __init__(self, + in_chs, + out_chs, + stride, + dilation, + depth, + block_ratio=1., + bottle_ratio=1., + exp_ratio=1., + groups=1, + first_dilation=None, + down_growth=False, + cross_linear=False, + block_dpr=None, + block_fn=DarkBlock, + **block_kwargs): + super(CrossStage, self).__init__() + first_dilation = first_dilation or dilation + down_chs = out_chs if down_growth else in_chs + exp_chs = int(round(out_chs * exp_ratio)) + block_out_chs = int(round(out_chs * block_ratio)) + conv_kwargs = dict( + act_layer=block_kwargs.get('act_layer'), + 
norm_layer=block_kwargs.get('norm_layer')) + + if stride != 1 or first_dilation != dilation: + self.conv_down = ConvBnAct( + in_chs, + down_chs, + kernel_size=3, + stride=stride, + dilation=first_dilation, + groups=groups, + **conv_kwargs) + prev_chs = down_chs + else: + self.conv_down = None + prev_chs = in_chs + + self.conv_exp = ConvBnAct( + prev_chs, exp_chs, kernel_size=1, **conv_kwargs) + prev_chs = exp_chs // 2 # output of conv_exp is always split in two + + self.blocks = nn.Sequential() + for i in range(depth): + self.blocks.add_sublayer( + str(i), + block_fn(prev_chs, block_out_chs, dilation, bottle_ratio, + groups, **block_kwargs)) + prev_chs = block_out_chs + + # transition convs + self.conv_transition_b = ConvBnAct( + prev_chs, exp_chs // 2, kernel_size=1, **conv_kwargs) + self.conv_transition = ConvBnAct( + exp_chs, out_chs, kernel_size=1, **conv_kwargs) + + def forward(self, x): + if self.conv_down is not None: + x = self.conv_down(x) + x = self.conv_exp(x) + split = x.shape[1] // 2 + xs, xb = x[:, :split], x[:, split:] + xb = self.blocks(xb) + xb = self.conv_transition_b(xb) + out = self.conv_transition(paddle.concat([xs, xb], axis=1)) + return out + + +class DarkStage(nn.Layer): + def __init__(self, + in_chs, + out_chs, + stride, + dilation, + depth, + block_ratio=1., + bottle_ratio=1., + groups=1, + first_dilation=None, + block_fn=DarkBlock, + block_dpr=None, + **block_kwargs): + super().__init__() + first_dilation = first_dilation or dilation + + self.conv_down = ConvBnAct( + in_chs, + out_chs, + kernel_size=3, + stride=stride, + dilation=first_dilation, + groups=groups, + act_layer=block_kwargs.get('act_layer'), + norm_layer=block_kwargs.get('norm_layer')) + + prev_chs = out_chs + block_out_chs = int(round(out_chs * block_ratio)) + self.blocks = nn.Sequential() + for i in range(depth): + self.blocks.add_sublayer( + str(i), + block_fn(prev_chs, block_out_chs, dilation, bottle_ratio, + groups, **block_kwargs)) + prev_chs = block_out_chs + + def forward(self, x): + x = self.conv_down(x) + x = self.blocks(x) + return x + + +def _cfg_to_stage_args(cfg, curr_stride=2, output_stride=32): + # get per stage args for stage and containing blocks, calculate strides to meet target output_stride + num_stages = len(cfg['depth']) + if 'groups' not in cfg: + cfg['groups'] = (1, ) * num_stages + if 'down_growth' in cfg and not isinstance(cfg['down_growth'], + (list, tuple)): + cfg['down_growth'] = (cfg['down_growth'], ) * num_stages + stage_strides = [] + stage_dilations = [] + stage_first_dilations = [] + dilation = 1 + for cfg_stride in cfg['stride']: + stage_first_dilations.append(dilation) + if curr_stride >= output_stride: + dilation *= cfg_stride + stride = 1 + else: + stride = cfg_stride + curr_stride *= stride + stage_strides.append(stride) + stage_dilations.append(dilation) + cfg['stride'] = stage_strides + cfg['dilation'] = stage_dilations + cfg['first_dilation'] = stage_first_dilations + stage_args = [ + dict(zip(cfg.keys(), values)) for values in zip(*cfg.values()) + ] + return stage_args + + +class CSPNet(nn.Layer): + def __init__(self, + cfg, + in_chans=3, + class_num=1000, + output_stride=32, + global_pool='avg', + drop_rate=0., + act_layer=nn.LeakyReLU, + norm_layer=nn.BatchNorm2D, + zero_init_last_bn=True, + stage_fn=CrossStage, + block_fn=DarkBlock): + super().__init__() + self.class_num = class_num + self.drop_rate = drop_rate + assert output_stride in (8, 16, 32) + layer_args = dict(act_layer=act_layer, norm_layer=norm_layer) + + # Construct the stem + self.stem, 
stem_feat_info = create_stem(in_chans, **cfg['stem'], + **layer_args) + self.feature_info = [stem_feat_info] + prev_chs = stem_feat_info['num_chs'] + curr_stride = stem_feat_info[ + 'reduction'] # reduction does not include pool + if cfg['stem']['pool']: + curr_stride *= 2 + + # Construct the stages + per_stage_args = _cfg_to_stage_args( + cfg['stage'], curr_stride=curr_stride, output_stride=output_stride) + self.stages = nn.LayerList() + for i, sa in enumerate(per_stage_args): + self.stages.add_sublayer( + str(i), + stage_fn( + prev_chs, **sa, **layer_args, block_fn=block_fn)) + prev_chs = sa['out_chs'] + curr_stride *= sa['stride'] + self.feature_info += [ + dict( + num_chs=prev_chs, + reduction=curr_stride, + module=f'stages.{i}') + ] + + # Construct the head + self.num_features = prev_chs + + self.pool = nn.AdaptiveAvgPool2D(1) + self.flatten = nn.Flatten(1) + self.fc = nn.Linear( + prev_chs, + class_num, + weight_attr=ParamAttr(), + bias_attr=ParamAttr()) + + def forward(self, x): + x = self.stem(x) + for stage in self.stages: + x = stage(x) + x = self.pool(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def CSPDarkNet53(pretrained=False, use_ssld=False, **kwargs): + model = CSPNet(MODEL_CFGS["CSPDarkNet53"], block_fn=DarkBlock, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["CSPDarkNet53"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/cswin_transformer.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/cswin_transformer.py new file mode 100755 index 000000000..9d6d26cbe --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/cswin_transformer.py @@ -0,0 +1,651 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
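+
+# Cross-shaped window self-attention, as implemented by LePEAttention and
+# CSwinBlock below: in the first three stages the heads are split into two
+# groups, one attending within horizontal stripes and one within vertical
+# stripes of width split_size, and their outputs are concatenated. LePE adds
+# a depthwise conv over V as a locally-enhanced positional encoding. In the
+# last stage the stripes cover the whole feature map, i.e. global attention.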
+ +# Code was based on https://github.com/BR-IDL/PaddleViT/blob/develop/image_classification/CSwin/cswin.py +# reference: https://arxiv.org/abs/2107.00652 + +import copy +import numpy as np +import paddle +import paddle.nn as nn +from .vision_transformer import trunc_normal_, zeros_, ones_, to_2tuple, DropPath, Identity + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "CSWinTransformer_tiny_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CSWinTransformer_tiny_224_pretrained.pdparams", + "CSWinTransformer_small_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CSWinTransformer_small_224_pretrained.pdparams", + "CSWinTransformer_base_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CSWinTransformer_base_224_pretrained.pdparams", + "CSWinTransformer_large_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CSWinTransformer_large_224_pretrained.pdparams", + "CSWinTransformer_base_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CSWinTransformer_base_384_pretrained.pdparams", + "CSWinTransformer_large_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CSWinTransformer_large_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class PatchEmbedding(nn.Layer): + """CSwin Patch Embedding + This patch embedding has a 7x7 conv + layernorm, the output tensor + is reshaped to [Batch, H*W, embed_dim]. Note that the patch is applied + by a conv with overlap (using patch_stride). + Args: + patch_stride: int, patch stride size, default: 4 + in_channels: int, number of channels of input image, default: 3 + embed_dim: int, output feature dimension, default: 96 + """ + + def __init__(self, patch_stride=4, in_channels=3, embed_dim=96): + super().__init__() + self.patch_embed = nn.Conv2D( + in_channels=in_channels, + out_channels=embed_dim, + kernel_size=7, + stride=patch_stride, + padding=2) + + self.norm = nn.LayerNorm(embed_dim) + + def forward(self, x): + x = self.patch_embed( + x) # [batch, embed_dim, h, w], h = w = image_size / 4 + x = x.flatten(start_axis=2, stop_axis=-1) # [batch, embed_dim, h*w] + x = x.transpose([0, 2, 1]) # [batch, h*w, embed_dim] + x = self.norm(x) + return x + + +class Mlp(nn.Layer): + """ MLP module + Impl using nn.Linear and activation is GELU, dropout is applied. 
+ Ops: fc -> act -> dropout -> fc -> dropout + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + + def __init__(self, in_features, hidden_features, dropout): + super().__init__() + self.fc1 = nn.Linear(in_features, hidden_features) + self.fc2 = nn.Linear(hidden_features, in_features) + self.act = nn.GELU() + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +def img2windows(img, h_split, w_split): + """Convert input tensor into split stripes + Args: + img: tensor, image tensor with shape [B, C, H, W] + h_split: int, splits width in height direction + w_split: int, splits width in width direction + Returns: + out: tensor, splitted image + """ + B, C, H, W = img.shape + out = img.reshape([B, C, H // h_split, h_split, W // w_split, w_split]) + out = out.transpose( + [0, 2, 4, 3, 5, 1]) # [B, H//h_split, W//w_split, h_split, w_split, C] + out = out.reshape([-1, h_split * w_split, + C]) # [B, H//h_split, W//w_split, h_split*w_split, C] + return out + + +def windows2img(img_splits, h_split, w_split, img_h, img_w): + """Convert splitted stripes back + Args: + img_splits: tensor, image tensor with shape [B, C, H, W] + h_split: int, splits width in height direction + w_split: int, splits width in width direction + img_h: int, original tensor height + img_w: int, original tensor width + Returns: + img: tensor, original tensor + """ + B = paddle.to_tensor(img_splits.shape[0] // + (img_h // h_split * img_w // w_split), "int32") + img = img_splits.reshape([ + B, img_h // h_split, img_w // w_split, h_split, w_split, + img_splits.shape[-1] + ]) + img = img.transpose( + [0, 1, 3, 2, 4, + 5]) #[B,img_h//h_split, h_split, img_w//w_split, w_split,C] + img = img.reshape( + [B, img_h, img_w, img_splits.shape[-1]]) # [B, img_h, img_w, C] + return img + + +class LePEAttention(nn.Layer): + """Cross Shaped Window self-attention with Locally enhanced positional encoding""" + + def __init__(self, + dim, + resolution, + h_split=7, + w_split=7, + num_heads=8, + attention_dropout=0., + dropout=0., + qk_scale=None): + super().__init__() + self.dim = dim + self.resolution = resolution + self.num_heads = num_heads + self.dim_head = dim // num_heads + self.scale = qk_scale or self.dim_head**-0.5 + self.h_split = h_split + self.w_split = w_split + + self.get_v = nn.Conv2D( + in_channels=dim, + out_channels=dim, + kernel_size=3, + stride=1, + padding=1, + groups=dim) + + self.softmax = nn.Softmax(axis=-1) + self.attn_dropout = nn.Dropout(attention_dropout) + + def im2cswin(self, x): + B, HW, C = x.shape + H = W = int(np.sqrt(HW)) + x = x.transpose([0, 2, 1]) # [B, C, H*W] + x = x.reshape([B, C, H, W]) # [B, C, H, W] + x = img2windows(x, self.h_split, self.w_split) + x = x.reshape( + [-1, self.h_split * self.w_split, self.num_heads, self.dim_head]) + x = x.transpose([0, 2, 1, 3]) + return x + + def get_lepe(self, x, func): + """Locally Enhanced Positional Encoding (LePE) + This module applies a depthwise conv on V and returns the lepe + Args: + x: tensor, the input tensor V + func: nn.Layer, a depth wise conv of kernel 3 stride 1 and padding 1 + """ + B, HW, C = x.shape + H = W = int(np.sqrt(HW)) + h_split = self.h_split + w_split = self.w_split + + x = x.transpose([0, 2, 1]) # [B, C, H*W] + x = x.reshape([B, C, H, W]) # [B, C, H, W] + x = x.reshape([B, C, H // h_split, h_split, W // w_split, w_split]) + x = x.transpose( + [0, 2, 
4, 1, 3,
+             5])  # [B, H//h_split, W//w_split, C, h_split, w_split]
+        x = x.reshape(
+            [-1, C, h_split,
+             w_split])  # [B*(H//h_split)*(W//w_split), C, h_split, w_split]
+
+        lepe = func(x)  # depthwise conv does not change shape
+        #lepe = lepe.reshape([-1, self.num_heads, C // self.num_heads, h_split * w_split])
+        lepe = lepe.reshape(
+            [-1, self.num_heads, self.dim_head, h_split * w_split])
+        lepe = lepe.transpose(
+            [0, 1, 3, 2])  # [B, num_heads, h_split*w_split, dim_head]
+
+        x = x.reshape([-1, self.num_heads, self.dim_head, h_split * w_split])
+        x = x.transpose(
+            [0, 1, 3, 2])  # [B, num_heads, h_split*w_split, dim_head]
+        return x, lepe
+
+    def forward(self, q, k, v):
+        B, HW, C = q.shape
+        H = W = self.resolution
+        q = self.im2cswin(q)
+        k = self.im2cswin(k)
+        v, lepe = self.get_lepe(v, self.get_v)
+
+        q = q * self.scale
+        attn = paddle.matmul(q, k, transpose_y=True)
+        attn = self.softmax(attn)
+        attn = self.attn_dropout(attn)
+
+        z = paddle.matmul(attn, v)
+        z = z + lepe
+        z = z.transpose([0, 2, 1, 3])
+        z = z.reshape([-1, self.h_split * self.w_split, C])
+
+        z = windows2img(z, self.h_split, self.w_split, H, W)
+        z = z.reshape([B, z.shape[1] * z.shape[2], C])
+        return z
+
+
+class CSwinBlock(nn.Layer):
+    """CSwin Block
+    CSwin block contains a LePE attention module, a linear projection,
+    an MLP layer, and the related norm layers. In the first 3 stages, the
+    LePE attention modules use 2 branches, where horizontal and
+    vertical split stripes are used for self attention and a concat
+    op is applied to combine the outputs. The last stage does not
+    have branches in LePE attention.
+    Args:
+        dim: int, input feature dimension
+        input_resolution: int, input feature spatial size.
+        num_heads: int, num of attention heads in current stage
+        split_size: int, the split size in current stage
+        mlp_ratio: float, mlp ratio, mlp_hidden_dim = mlp_ratio * mlp_in_dim, default: 4.
+        qkv_bias: bool, if set True, qkv projection will have bias, default: True
+        qk_scale: float, if set, replace the orig qk_scale (dim_head ** -0.5), default: None
+        dropout: float, dropout rate for linear projection, default: 0
+        attention_dropout: float, dropout rate for attention, default: 0
+        droppath: float, drop path rate, default: 0
+        split_heads: bool, if True, split heads is applied (True for 1,2,3 stages), default: True
+    """
+
+    def __init__(self,
+                 dim,
+                 input_resolution,
+                 num_heads,
+                 split_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attention_dropout=0.,
+                 dropout=0.,
+                 droppath=0.,
+                 split_heads=True):
+        super().__init__()
+        self.dim = dim
+        # NOTE: here assume image_h == image_w
+        self.input_resolution = (input_resolution, input_resolution)
+        self.num_heads = num_heads
+        self.dim_head = dim // num_heads
+        self.mlp_ratio = mlp_ratio
+        self.split_size = split_size
+        self.norm1 = nn.LayerNorm(dim)
+        self.qkv = nn.Linear(
+            dim, dim * 3, bias_attr=None if qkv_bias else False)
+        self.attns = nn.LayerList()
+        self.split_heads = split_heads
+
+        num_branches = 2 if split_heads else 1
+        if split_heads:  # first 3 stages
+            splits = [self.input_resolution[0],
+                      self.split_size]  # horizontal splits
+        else:  # last stage
+            splits = [self.input_resolution[0], self.input_resolution[0]]
+        for _ in range(num_branches):
+            attn = LePEAttention(
+                dim=dim // num_branches,
+                resolution=input_resolution,
+                h_split=splits[0],
+                w_split=splits[1],
+                num_heads=num_heads // num_branches,
+                qk_scale=qk_scale,
+                attention_dropout=attention_dropout,
+                dropout=dropout)
+            self.attns.append(copy.deepcopy(attn))
+            # switch splits from horizontal to vertical
+            # NOTE: may need to change for different H and W
+            splits[0], splits[1] = splits[1], splits[0]
+
+        self.proj = nn.Linear(dim, dim)
+        self.drop_path = DropPath(droppath) if droppath > 0.
else Identity() + + self.norm2 = nn.LayerNorm(dim) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio), + dropout=dropout) + + def chunk_qkv(self, x, chunks=1, axis=-1): + x = x.chunk(chunks, axis=axis) + return x + + def forward(self, x): + H, W = self.input_resolution + B, HW, C = x.shape + # cswin attention + h = x + x = self.norm1(x) + qkv = self.qkv(x).chunk(3, axis=-1) # qkv is a tuple of [q, k, v] + + if self.split_heads: + q, k, v = map(self.chunk_qkv, qkv, + (2, 2, 2)) # map requries list/tuple inputs + else: + q, k, v = map(lambda x: [x], qkv) + + if self.split_heads: # first 3 stages + h_attn = self.attns[0](q[0], k[0], v[0]) + w_attn = self.attns[1](q[1], k[1], v[1]) + attn = paddle.concat([h_attn, w_attn], axis=2) + else: # last stage + attn = self.attns[0](q[0], k[0], v[0]) + attn = self.proj(attn) + attn = self.drop_path(attn) + x = h + attn + # mlp + residual + h = x + x = self.norm2(x) + x = self.mlp(x) + x = self.drop_path(x) + x = h + x + return x + + +class MergeBlock(nn.Layer): + def __init__(self, dim_in, dim_out): + super().__init__() + self.conv = nn.Conv2D( + in_channels=dim_in, + out_channels=dim_out, + kernel_size=3, + stride=2, + padding=1) + self.norm = nn.LayerNorm(dim_out) + + def forward(self, x): + B, HW, C = x.shape + H = W = int(np.sqrt(HW)) + x = x.transpose([0, 2, 1]) # [B, C, HW] + x = x.reshape([B, C, H, W]) # [B, C, H, W] + x = self.conv(x) + new_shape = [x.shape[0], x.shape[1], + x.shape[2] * x.shape[3]] # [B, C', H*W] + x = x.reshape(new_shape) # [B, C', H*W] + x = x.transpose([0, 2, 1]) # [B, H*W, C'] + x = self.norm(x) + return x + + +class CSwinStage(nn.Layer): + """ CSwin Stage, each stage contains multi blocks + CSwin has 4 stages, the first 3 stages are using head split. The last + stage does not have head split. There is a merge block between each + 2 stages. + Args: + dim: int, input feature dimension + depth: int, number of blocks in current stage + num_heads: int, num of attention heads in current stage + split_size: int, the split size in current stage + mlp_ratio: float, mlp ratio, mlp_hidden_dim = mlp_ratio * mlp_in_dim, default: 4. 
+ qkv_bias: bool, if set True, qkv projection will have bias, default: True + qk_scale: float, if set, replace the orig qk_scale (dim_head ** -0.5), default: None + dropout: float, dropout rate for linear projection, default: 0 + attention_dropout: float, dropout rate for attention, default: 0 + droppath: float, drop path rate, default: 0 + last_stage: bool, if current stage is the last stage, default: False + """ + + def __init__(self, + dim, + input_resolution, + depth, + num_heads, + split_size, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + dropout=0., + attention_dropout=0., + droppath=0., + last_stage=False): + super().__init__() + self.blocks = nn.LayerList() + for i in range(depth): + block = CSwinBlock( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + split_size=split_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attention_dropout=attention_dropout, + dropout=dropout, + droppath=droppath[i] + if isinstance(droppath, list) else droppath, + split_heads=not last_stage) + self.blocks.append(copy.deepcopy(block)) + # last stage does not need merge layer + self.merge = MergeBlock( + dim_in=dim, dim_out=dim * 2) if not last_stage else Identity() + + def forward(self, x): + for block in self.blocks: + x = block(x) + x = self.merge(x) + return x + + +class CSwinTransformer(nn.Layer): + """CSwin Transformer class + Args: + image_size: int, input image size, default: 224 + patch_stride: int, stride for patch embedding, default: 4 + in_channels: int, num of channels of input image, default: 3 + num_classes: int, num of classes, default: 1000 + embed_dim: int, embedding dim (patch embed out dim), default: 96 + depths: list/tuple(int), number of blocks in each stage, default: [2, 4, 32, 2] + splits: list/tuple(int), the split number in each stage, default: [1, 2, 7, 7] + num_heads: list/tuple(int), num of attention heads in each stage, default: [4, 8, 16, 32] + mlp_ratio: float, mlp ratio, mlp_hidden_dim = mlp_ratio * mlp_in_dim, default: 4. 
+ qkv_bias: bool, if set True, qkv projection will have bias, default: True + qk_scale: float, if set, replace the orig qk_scale (dim_head ** -0.5), default: None + dropout: float, dropout rate for linear projection, default: 0 + attention_dropout: float, dropout rate for attention, default: 0 + droppath: float, drop path rate, default: 0 + """ + + def __init__(self, + image_size=224, + patch_stride=4, + in_channels=3, + class_num=1000, + embed_dim=96, + depths=[2, 4, 32, 2], + splits=[1, 2, 7, 7], + num_heads=[4, 8, 16, 32], + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + dropout=0., + attention_dropout=0., + droppath=0.): + super().__init__() + # token embedding + self.patch_embedding = PatchEmbedding( + patch_stride=patch_stride, + in_channels=in_channels, + embed_dim=embed_dim) + # drop path decay by stage + depth_decay = [ + x.item() for x in paddle.linspace(0, droppath, sum(depths)) + ] + dim = embed_dim + resolution = image_size // 4 + self.stages = nn.LayerList() + num_stages = len(depths) + # construct CSwin stages: each stage has multiple blocks + for stage_idx in range(num_stages): + stage = CSwinStage( + dim=dim, + input_resolution=resolution, + depth=depths[stage_idx], + num_heads=num_heads[stage_idx], + split_size=splits[stage_idx], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + dropout=dropout, + attention_dropout=attention_dropout, + droppath=depth_decay[sum(depths[:stage_idx]):sum( + depths[:stage_idx + 1])], + last_stage=stage_idx == num_stages - 1) + self.stages.append(stage) + if stage_idx != num_stages - 1: + dim = dim * 2 + resolution = resolution // 2 + # last norm and classification head layers + self.norm = nn.LayerNorm(dim) + self.head = nn.Linear(dim, class_num) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + x = self.patch_embedding(x) + for stage in self.stages: + x = stage(x) + x = self.norm(x) + return paddle.mean(x, axis=1) + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def CSWinTransformer_tiny_224(pretrained=False, use_ssld=False, **kwargs): + model = CSwinTransformer( + image_size=224, + embed_dim=64, + depths=[1, 2, 21, 1], + splits=[1, 2, 7, 7], + num_heads=[2, 4, 8, 16], + droppath=0.2, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CSWinTransformer_tiny_224"], + use_ssld=use_ssld) + return model + + +def CSWinTransformer_small_224(pretrained=False, use_ssld=False, **kwargs): + model = CSwinTransformer( + image_size=224, + embed_dim=64, + depths=[2, 4, 32, 2], + splits=[1, 2, 7, 7], + num_heads=[2, 4, 8, 16], + droppath=0.4, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CSWinTransformer_small_224"], + use_ssld=use_ssld) + return model + + +def CSWinTransformer_base_224(pretrained=False, use_ssld=False, **kwargs): + model = CSwinTransformer( + image_size=224, + embed_dim=96, + depths=[2, 4, 32, 2], + splits=[1, 2, 7, 7], + num_heads=[4, 8, 16, 32], + droppath=0.5, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CSWinTransformer_base_224"], + use_ssld=use_ssld) + return model + + +def CSWinTransformer_base_384(pretrained=False, use_ssld=False, **kwargs): + model = CSwinTransformer( + image_size=384, + embed_dim=96, + depths=[2, 4, 32, 2], + splits=[1, 2, 12, 12], + num_heads=[4, 8, 16, 32], + droppath=0.5, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CSWinTransformer_base_384"], + use_ssld=use_ssld) + return model + + +def CSWinTransformer_large_224(pretrained=False, use_ssld=False, **kwargs): + model = CSwinTransformer( + image_size=224, + embed_dim=144, + depths=[2, 4, 32, 2], + splits=[1, 2, 7, 7], + num_heads=[6, 12, 24, 24], + droppath=0.5, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CSWinTransformer_large_224"], + use_ssld=use_ssld) + return model + + +def CSWinTransformer_large_384(pretrained=False, use_ssld=False, **kwargs): + model = CSwinTransformer( + image_size=384, + embed_dim=144, + depths=[2, 4, 32, 2], + splits=[1, 2, 12, 12], + num_heads=[6, 12, 24, 24], + droppath=0.5, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CSWinTransformer_large_384"], + use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/cvt.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/cvt.py new file mode 100755 index 000000000..0092fb3d7 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/cvt.py @@ -0,0 +1,723 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
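+
+# CvT replaces the linear q/k/v projections of a plain ViT with
+# "convolutional projections" (depthwise conv + BN, see Attention below) and
+# can subsample k/v with stride_kv to cut the attention cost; patch
+# embedding is likewise an overlapping convolution, so no explicit
+# positional encoding is used.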
+# +# Code was heavily based on https://github.com/microsoft/CvT +# reference: https://arxiv.org/abs/2103.15808 + +import paddle +import paddle.nn as nn +from paddle.nn.initializer import XavierUniform, TruncatedNormal, Constant + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "CvT_13_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CvT_13_224_pretrained.pdparams", + "CvT_13_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CvT_13_384_pretrained.pdparams", + "CvT_21_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CvT_21_224_pretrained.pdparams", + "CvT_21_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CvT_21_384_pretrained.pdparams", + "CvT_W24_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CvT_W24_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +xavier_uniform_ = XavierUniform() +trunc_normal_ = TruncatedNormal(std=.02) +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self): + return f'drop_prob={self.drop_prob:.3f}' + + +def rearrange(x, pattern, **axes_lengths): + if 'b (h w) c -> b c h w' == pattern: + b, _, c = x.shape + h, w = axes_lengths.pop('h', -1), axes_lengths.pop('w', -1) + return x.transpose([0, 2, 1]).reshape([b, c, h, w]) + if 'b c h w -> b (h w) c' == pattern: + b, c, h, w = x.shape + return x.reshape([b, c, h * w]).transpose([0, 2, 1]) + if 'b t (h d) -> b h t d' == pattern: + b, t, h_d = x.shape + h = axes_lengths['h'] + return x.reshape([b, t, h, h_d // h]).transpose([0, 2, 1, 3]) + if 'b h t d -> b t (h d)' == pattern: + b, h, t, d = x.shape + return x.transpose([0, 2, 1, 3]).reshape([b, t, h * d]) + + raise NotImplementedError( + f"Rearrangement '{pattern}' has not been implemented.") + + +class Rearrange(nn.Layer): + def __init__(self, pattern, **axes_lengths): + super().__init__() + self.pattern = pattern + self.axes_lengths = axes_lengths + + def forward(self, x): + return rearrange(x, self.pattern, **self.axes_lengths) + + def extra_repr(self): + return self.pattern + + +class QuickGELU(nn.Layer): + def forward(self, x): + return x * nn.functional.sigmoid(1.702 * x) + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim_in, + dim_out, + num_heads, + qkv_bias=False, + attn_drop=0., + proj_drop=0., + method='dw_bn', + kernel_size=3, + stride_kv=1, + stride_q=1, + padding_kv=1, + padding_q=1, + with_cls_token=True, + **kwargs): + super().__init__() + self.stride_kv = stride_kv + self.stride_q = stride_q + self.dim = dim_out + self.num_heads = num_heads + # head_dim = self.qkv_dim // num_heads + self.scale = dim_out**-0.5 + self.with_cls_token = with_cls_token + + self.conv_proj_q = self._build_projection( + dim_in, dim_out, kernel_size, padding_q, stride_q, 'linear' + if method == 'avg' else method) + self.conv_proj_k = self._build_projection( + dim_in, dim_out, kernel_size, padding_kv, stride_kv, method) + self.conv_proj_v = self._build_projection( + dim_in, dim_out, kernel_size, padding_kv, stride_kv, method) + + self.proj_q = nn.Linear(dim_in, dim_out, bias_attr=qkv_bias) + self.proj_k = nn.Linear(dim_in, dim_out, bias_attr=qkv_bias) + self.proj_v = nn.Linear(dim_in, dim_out, bias_attr=qkv_bias) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim_out, dim_out) + self.proj_drop = nn.Dropout(proj_drop) + + def _build_projection(self, dim_in, dim_out, kernel_size, padding, stride, + method): + if method == 'dw_bn': + proj = nn.Sequential( + ('conv', nn.Conv2D( + dim_in, + dim_in, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias_attr=False, + groups=dim_in)), ('bn', nn.BatchNorm2D(dim_in)), + ('rearrage', Rearrange('b c h w -> b (h w) c'))) + elif method == 'avg': + proj = nn.Sequential( + ('avg', nn.AvgPool2D( + kernel_size=kernel_size, + 
stride=stride, + padding=padding, + ceil_mode=True)), + ('rearrage', Rearrange('b c h w -> b (h w) c'))) + elif method == 'linear': + proj = None + else: + raise ValueError('Unknown method ({})'.format(method)) + + return proj + + def forward_conv(self, x, h, w): + if self.with_cls_token: + cls_token, x = paddle.split(x, [1, h * w], 1) + + x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w) + + if self.conv_proj_q is not None: + q = self.conv_proj_q(x) + else: + q = rearrange(x, 'b c h w -> b (h w) c') + + if self.conv_proj_k is not None: + k = self.conv_proj_k(x) + else: + k = rearrange(x, 'b c h w -> b (h w) c') + + if self.conv_proj_v is not None: + v = self.conv_proj_v(x) + else: + v = rearrange(x, 'b c h w -> b (h w) c') + + if self.with_cls_token: + q = paddle.concat((cls_token, q), axis=1) + k = paddle.concat((cls_token, k), axis=1) + v = paddle.concat((cls_token, v), axis=1) + + return q, k, v + + def forward(self, x, h, w): + if (self.conv_proj_q is not None or self.conv_proj_k is not None or + self.conv_proj_v is not None): + q, k, v = self.forward_conv(x, h, w) + + q = rearrange(self.proj_q(q), 'b t (h d) -> b h t d', h=self.num_heads) + k = rearrange(self.proj_k(k), 'b t (h d) -> b h t d', h=self.num_heads) + v = rearrange(self.proj_v(v), 'b t (h d) -> b h t d', h=self.num_heads) + + attn_score = (q @k.transpose([0, 1, 3, 2])) * self.scale + attn = nn.functional.softmax(attn_score, axis=-1) + attn = self.attn_drop(attn) + + x = attn @v + x = rearrange(x, 'b h t d -> b t (h d)') + + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Block(nn.Layer): + def __init__(self, + dim_in, + dim_out, + num_heads, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + **kwargs): + super().__init__() + + self.with_cls_token = kwargs['with_cls_token'] + + self.norm1 = norm_layer(dim_in) + self.attn = Attention(dim_in, dim_out, num_heads, qkv_bias, attn_drop, + drop, **kwargs) + + self.drop_path = DropPath(drop_path) \ + if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim_out) + + dim_mlp_hidden = int(dim_out * mlp_ratio) + self.mlp = Mlp(in_features=dim_out, + hidden_features=dim_mlp_hidden, + act_layer=act_layer, + drop=drop) + + def forward(self, x, h, w): + x = x + self.drop_path(self.attn(self.norm1(x), h, w)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class ConvEmbed(nn.Layer): + def __init__(self, + patch_size=7, + in_chans=3, + embed_dim=64, + stride=4, + padding=2, + norm_layer=None): + super().__init__() + self.patch_size = patch_size + + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=stride, + padding=padding) + self.norm = norm_layer(embed_dim) if norm_layer else None + + def forward(self, x): + x = self.proj(x) + + B, C, H, W = x.shape + x = rearrange(x, 'b c h w -> b (h w) c') + if self.norm: + x = self.norm(x) + x = rearrange(x, 'b (h w) c -> b c h w', h=H, w=W) + + return x + + +class VisionTransformer(nn.Layer): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + + def __init__(self, + patch_size=16, + patch_stride=16, + patch_padding=0, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + init='trunc_norm', + **kwargs): + super().__init__() + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.rearrage = None + + self.patch_embed = ConvEmbed( + # img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + stride=patch_stride, + padding=patch_padding, + embed_dim=embed_dim, + norm_layer=norm_layer) + + with_cls_token = kwargs['with_cls_token'] + if with_cls_token: + self.cls_token = self.create_parameter( + shape=[1, 1, embed_dim], default_initializer=trunc_normal_) + else: + self.cls_token = None + + self.pos_drop = nn.Dropout(p=drop_rate) + dpr = [x.item() for x in paddle.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + + blocks = [] + for j in range(depth): + blocks.append( + Block( + dim_in=embed_dim, + dim_out=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[j], + act_layer=act_layer, + norm_layer=norm_layer, + **kwargs)) + self.blocks = nn.LayerList(blocks) + + if init == 'xavier': + self.apply(self._init_weights_xavier) + else: + self.apply(self._init_weights_trunc_normal) + + def _init_weights_trunc_normal(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2D)): + zeros_(m.bias) + ones_(m.weight) + + def _init_weights_xavier(self, m): + if isinstance(m, nn.Linear): + xavier_uniform_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2D)): + zeros_(m.bias) + ones_(m.weight) + + def forward(self, x): + x = self.patch_embed(x) + B, C, H, W = x.shape + + x = rearrange(x, 'b c h w -> b (h w) c') + + cls_tokens = None + if self.cls_token is not None: + # stole cls_tokens impl from Phil Wang, thanks + cls_tokens = self.cls_token.expand([B, -1, -1]) + x = paddle.concat((cls_tokens, x), axis=1) + + x = self.pos_drop(x) + + for i, blk in enumerate(self.blocks): + x = blk(x, H, W) + + if self.cls_token is not None: + cls_tokens, x = paddle.split(x, [1, H * W], 1) + x = rearrange(x, 'b (h w) c -> b c h w', h=H, w=W) + + return x, cls_tokens + 
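+
+# Illustrative only (not part of the upstream CvT code): each VisionTransformer
+# above is one CvT stage and returns (feature_map, cls_tokens). A minimal
+# sketch, assuming the stage-0 settings of the CvT_13_224 spec further down
+# (the variable names are hypothetical):
+#
+#   stage = VisionTransformer(
+#       patch_size=7, patch_stride=4, patch_padding=2, in_chans=3,
+#       embed_dim=64, depth=1, num_heads=1, qkv_bias=True,
+#       with_cls_token=False, method='dw_bn', kernel_size=3,
+#       padding_q=1, padding_kv=1, stride_q=1, stride_kv=2)
+#   feat, cls = stage(paddle.randn([1, 3, 224, 224]))
+#   # feat: [1, 64, 56, 56]; cls is None because this stage has no class token.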
+ +class ConvolutionalVisionTransformer(nn.Layer): + def __init__(self, + in_chans=3, + class_num=1000, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + init='trunc_norm', + spec=None): + super().__init__() + self.class_num = class_num + + self.num_stages = spec['NUM_STAGES'] + for i in range(self.num_stages): + kwargs = { + 'patch_size': spec['PATCH_SIZE'][i], + 'patch_stride': spec['PATCH_STRIDE'][i], + 'patch_padding': spec['PATCH_PADDING'][i], + 'embed_dim': spec['DIM_EMBED'][i], + 'depth': spec['DEPTH'][i], + 'num_heads': spec['NUM_HEADS'][i], + 'mlp_ratio': spec['MLP_RATIO'][i], + 'qkv_bias': spec['QKV_BIAS'][i], + 'drop_rate': spec['DROP_RATE'][i], + 'attn_drop_rate': spec['ATTN_DROP_RATE'][i], + 'drop_path_rate': spec['DROP_PATH_RATE'][i], + 'with_cls_token': spec['CLS_TOKEN'][i], + 'method': spec['QKV_PROJ_METHOD'][i], + 'kernel_size': spec['KERNEL_QKV'][i], + 'padding_q': spec['PADDING_Q'][i], + 'padding_kv': spec['PADDING_KV'][i], + 'stride_kv': spec['STRIDE_KV'][i], + 'stride_q': spec['STRIDE_Q'][i], + } + + stage = VisionTransformer( + in_chans=in_chans, + init=init, + act_layer=act_layer, + norm_layer=norm_layer, + **kwargs) + setattr(self, f'stage{i}', stage) + + in_chans = spec['DIM_EMBED'][i] + + dim_embed = spec['DIM_EMBED'][-1] + self.norm = norm_layer(dim_embed) + self.cls_token = spec['CLS_TOKEN'][-1] + + # Classifier head + self.head = nn.Linear(dim_embed, + class_num) if class_num > 0 else nn.Identity() + trunc_normal_(self.head.weight) + + bound = 1 / dim_embed**.5 + nn.initializer.Uniform(-bound, bound)(self.head.bias) + + def no_weight_decay(self): + layers = set() + for i in range(self.num_stages): + layers.add(f'stage{i}.pos_embed') + layers.add(f'stage{i}.cls_token') + return layers + + def forward_features(self, x): + for i in range(self.num_stages): + x, cls_tokens = getattr(self, f'stage{i}')(x) + + if self.cls_token: + x = self.norm(cls_tokens) + x = paddle.squeeze(x, axis=1) + else: + x = rearrange(x, 'b c h w -> b (h w) c') + x = self.norm(x) + x = paddle.mean(x, axis=1) + + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +def _load_pretrained(pretrained, + model, + model_url, + use_ssld=False, + use_imagenet22kto1k_pretrained=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain( + model, + model_url, + use_ssld=use_ssld, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def CvT_13_224(pretrained=False, use_ssld=False, **kwargs): + msvit_spec = dict( + INIT='trunc_norm', + NUM_STAGES=3, + PATCH_SIZE=[7, 3, 3], + PATCH_STRIDE=[4, 2, 2], + PATCH_PADDING=[2, 1, 1], + DIM_EMBED=[64, 192, 384], + NUM_HEADS=[1, 3, 6], + DEPTH=[1, 2, 10], + MLP_RATIO=[4.0, 4.0, 4.0], + ATTN_DROP_RATE=[0.0, 0.0, 0.0], + DROP_RATE=[0.0, 0.0, 0.0], + DROP_PATH_RATE=[0.0, 0.0, 0.1], + QKV_BIAS=[True, True, True], + CLS_TOKEN=[False, False, True], + POS_EMBED=[False, False, False], + QKV_PROJ_METHOD=['dw_bn', 'dw_bn', 'dw_bn'], + KERNEL_QKV=[3, 3, 3], + PADDING_KV=[1, 1, 1], + STRIDE_KV=[2, 2, 2], + PADDING_Q=[1, 1, 1], + STRIDE_Q=[1, 1, 1]) + model = ConvolutionalVisionTransformer( + in_chans=3, + act_layer=QuickGELU, + init=msvit_spec.get('INIT', 'trunc_norm'), + spec=msvit_spec, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["CvT_13_224"], use_ssld=use_ssld) + return model + + +def CvT_13_384(pretrained=False, + use_ssld=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + msvit_spec = dict( + INIT='trunc_norm', + NUM_STAGES=3, + PATCH_SIZE=[7, 3, 3], + PATCH_STRIDE=[4, 2, 2], + PATCH_PADDING=[2, 1, 1], + DIM_EMBED=[64, 192, 384], + NUM_HEADS=[1, 3, 6], + DEPTH=[1, 2, 10], + MLP_RATIO=[4.0, 4.0, 4.0], + ATTN_DROP_RATE=[0.0, 0.0, 0.0], + DROP_RATE=[0.0, 0.0, 0.0], + DROP_PATH_RATE=[0.0, 0.0, 0.1], + QKV_BIAS=[True, True, True], + CLS_TOKEN=[False, False, True], + POS_EMBED=[False, False, False], + QKV_PROJ_METHOD=['dw_bn', 'dw_bn', 'dw_bn'], + KERNEL_QKV=[3, 3, 3], + PADDING_KV=[1, 1, 1], + STRIDE_KV=[2, 2, 2], + PADDING_Q=[1, 1, 1], + STRIDE_Q=[1, 1, 1]) + model = ConvolutionalVisionTransformer( + in_chans=3, + act_layer=QuickGELU, + init=msvit_spec.get('INIT', 'trunc_norm'), + spec=msvit_spec, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CvT_13_384"], + use_ssld=use_ssld, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def CvT_21_224(pretrained=False, use_ssld=False, **kwargs): + msvit_spec = dict( + INIT='trunc_norm', + NUM_STAGES=3, + PATCH_SIZE=[7, 3, 3], + PATCH_STRIDE=[4, 2, 2], + PATCH_PADDING=[2, 1, 1], + DIM_EMBED=[64, 192, 384], + NUM_HEADS=[1, 3, 6], + DEPTH=[1, 4, 16], + MLP_RATIO=[4.0, 4.0, 4.0], + ATTN_DROP_RATE=[0.0, 0.0, 0.0], + DROP_RATE=[0.0, 0.0, 0.0], + DROP_PATH_RATE=[0.0, 0.0, 0.1], + QKV_BIAS=[True, True, True], + CLS_TOKEN=[False, False, True], + POS_EMBED=[False, False, False], + QKV_PROJ_METHOD=['dw_bn', 'dw_bn', 'dw_bn'], + KERNEL_QKV=[3, 3, 3], + PADDING_KV=[1, 1, 1], + STRIDE_KV=[2, 2, 2], + PADDING_Q=[1, 1, 1], + STRIDE_Q=[1, 1, 1]) + model = ConvolutionalVisionTransformer( + in_chans=3, + act_layer=QuickGELU, + init=msvit_spec.get('INIT', 'trunc_norm'), + spec=msvit_spec, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["CvT_21_224"], use_ssld=use_ssld) + return model + + +def CvT_21_384(pretrained=False, + use_ssld=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + msvit_spec = dict( + INIT='trunc_norm', + NUM_STAGES=3, + PATCH_SIZE=[7, 3, 3], + PATCH_STRIDE=[4, 2, 2], + PATCH_PADDING=[2, 1, 1], + DIM_EMBED=[64, 192, 384], + NUM_HEADS=[1, 3, 6], + DEPTH=[1, 4, 16], + MLP_RATIO=[4.0, 4.0, 4.0], + ATTN_DROP_RATE=[0.0, 0.0, 0.0], + DROP_RATE=[0.0, 0.0, 0.0], + DROP_PATH_RATE=[0.0, 0.0, 0.1], + QKV_BIAS=[True, True, True], + CLS_TOKEN=[False, False, True], + POS_EMBED=[False, False, False], + QKV_PROJ_METHOD=['dw_bn', 'dw_bn', 'dw_bn'], + KERNEL_QKV=[3, 3, 3], + PADDING_KV=[1, 1, 1], + STRIDE_KV=[2, 2, 2], + PADDING_Q=[1, 1, 1], + 
STRIDE_Q=[1, 1, 1]) + model = ConvolutionalVisionTransformer( + in_chans=3, + act_layer=QuickGELU, + init=msvit_spec.get('INIT', 'trunc_norm'), + spec=msvit_spec, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CvT_21_384"], + use_ssld=use_ssld, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def CvT_W24_384(pretrained=False, use_ssld=False, **kwargs): + msvit_spec = dict( + INIT='trunc_norm', + NUM_STAGES=3, + PATCH_SIZE=[7, 3, 3], + PATCH_STRIDE=[4, 2, 2], + PATCH_PADDING=[2, 1, 1], + DIM_EMBED=[192, 768, 1024], + NUM_HEADS=[3, 12, 16], + DEPTH=[2, 2, 20], + MLP_RATIO=[4.0, 4.0, 4.0], + ATTN_DROP_RATE=[0.0, 0.0, 0.0], + DROP_RATE=[0.0, 0.0, 0.0], + DROP_PATH_RATE=[0.0, 0.0, 0.3], + QKV_BIAS=[True, True, True], + CLS_TOKEN=[False, False, True], + POS_EMBED=[False, False, False], + QKV_PROJ_METHOD=['dw_bn', 'dw_bn', 'dw_bn'], + KERNEL_QKV=[3, 3, 3], + PADDING_KV=[1, 1, 1], + STRIDE_KV=[2, 2, 2], + PADDING_Q=[1, 1, 1], + STRIDE_Q=[1, 1, 1]) + model = ConvolutionalVisionTransformer( + in_chans=3, + act_layer=QuickGELU, + init=msvit_spec.get('INIT', 'trunc_norm'), + spec=msvit_spec, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CvT_W24_384"], + use_ssld=use_ssld, + use_imagenet22kto1k_pretrained=True) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/darknet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/darknet.py new file mode 100755 index 000000000..2474c1587 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/darknet.py @@ -0,0 +1,199 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
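+
+# DarkNet53 (defined below) stacks residual stages of [1, 2, 8, 8, 4]
+# BasicBlocks; in each BasicBlock a 1x1 conv halves the channels, a 3x3 conv
+# restores them, and the result is added back onto the block input. A minimal,
+# assumed usage sketch (the input size is illustrative; pooling is adaptive):
+#
+#   model = DarkNet53(pretrained=False, class_num=1000)
+#   logits = model(paddle.randn([1, 3, 256, 256]))  # -> [1, 1000]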
+ +# reference: https://arxiv.org/abs/1804.02767 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "DarkNet53": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DarkNet53_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride, + padding, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=padding, + weight_attr=ParamAttr(name=name + ".conv.weights"), + bias_attr=False) + + bn_name = name + ".bn" + self._bn = BatchNorm( + num_channels=output_channels, + act="relu", + param_attr=ParamAttr(name=bn_name + ".scale"), + bias_attr=ParamAttr(name=bn_name + ".offset"), + moving_mean_name=bn_name + ".mean", + moving_variance_name=bn_name + ".var") + + def forward(self, inputs): + x = self._conv(inputs) + x = self._bn(x) + return x + + +class BasicBlock(nn.Layer): + def __init__(self, input_channels, output_channels, name=None): + super(BasicBlock, self).__init__() + + self._conv1 = ConvBNLayer( + input_channels, output_channels, 1, 1, 0, name=name + ".0") + self._conv2 = ConvBNLayer( + output_channels, output_channels * 2, 3, 1, 1, name=name + ".1") + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + return paddle.add(x=inputs, y=x) + + +class DarkNet(nn.Layer): + def __init__(self, class_num=1000): + super(DarkNet, self).__init__() + + self.stages = [1, 2, 8, 8, 4] + self._conv1 = ConvBNLayer(3, 32, 3, 1, 1, name="yolo_input") + self._conv2 = ConvBNLayer( + 32, 64, 3, 2, 1, name="yolo_input.downsample") + + self._basic_block_01 = BasicBlock(64, 32, name="stage.0.0") + self._downsample_0 = ConvBNLayer( + 64, 128, 3, 2, 1, name="stage.0.downsample") + + self._basic_block_11 = BasicBlock(128, 64, name="stage.1.0") + self._basic_block_12 = BasicBlock(128, 64, name="stage.1.1") + self._downsample_1 = ConvBNLayer( + 128, 256, 3, 2, 1, name="stage.1.downsample") + + self._basic_block_21 = BasicBlock(256, 128, name="stage.2.0") + self._basic_block_22 = BasicBlock(256, 128, name="stage.2.1") + self._basic_block_23 = BasicBlock(256, 128, name="stage.2.2") + self._basic_block_24 = BasicBlock(256, 128, name="stage.2.3") + self._basic_block_25 = BasicBlock(256, 128, name="stage.2.4") + self._basic_block_26 = BasicBlock(256, 128, name="stage.2.5") + self._basic_block_27 = BasicBlock(256, 128, name="stage.2.6") + self._basic_block_28 = BasicBlock(256, 128, name="stage.2.7") + self._downsample_2 = ConvBNLayer( + 256, 512, 3, 2, 1, name="stage.2.downsample") + + self._basic_block_31 = BasicBlock(512, 256, name="stage.3.0") + self._basic_block_32 = BasicBlock(512, 256, name="stage.3.1") + self._basic_block_33 = BasicBlock(512, 256, name="stage.3.2") + self._basic_block_34 = BasicBlock(512, 256, name="stage.3.3") + self._basic_block_35 = BasicBlock(512, 256, name="stage.3.4") + self._basic_block_36 = BasicBlock(512, 256, name="stage.3.5") + self._basic_block_37 = BasicBlock(512, 256, name="stage.3.6") + self._basic_block_38 = BasicBlock(512, 256, name="stage.3.7") + self._downsample_3 = ConvBNLayer( + 512, 1024, 3, 
2, 1, name="stage.3.downsample") + + self._basic_block_41 = BasicBlock(1024, 512, name="stage.4.0") + self._basic_block_42 = BasicBlock(1024, 512, name="stage.4.1") + self._basic_block_43 = BasicBlock(1024, 512, name="stage.4.2") + self._basic_block_44 = BasicBlock(1024, 512, name="stage.4.3") + + self._pool = AdaptiveAvgPool2D(1) + + stdv = 1.0 / math.sqrt(1024.0) + self._out = Linear( + 1024, + class_num, + weight_attr=ParamAttr( + name="fc_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + + x = self._basic_block_01(x) + x = self._downsample_0(x) + + x = self._basic_block_11(x) + x = self._basic_block_12(x) + x = self._downsample_1(x) + + x = self._basic_block_21(x) + x = self._basic_block_22(x) + x = self._basic_block_23(x) + x = self._basic_block_24(x) + x = self._basic_block_25(x) + x = self._basic_block_26(x) + x = self._basic_block_27(x) + x = self._basic_block_28(x) + x = self._downsample_2(x) + + x = self._basic_block_31(x) + x = self._basic_block_32(x) + x = self._basic_block_33(x) + x = self._basic_block_34(x) + x = self._basic_block_35(x) + x = self._basic_block_36(x) + x = self._basic_block_37(x) + x = self._basic_block_38(x) + x = self._downsample_3(x) + + x = self._basic_block_41(x) + x = self._basic_block_42(x) + x = self._basic_block_43(x) + x = self._basic_block_44(x) + + x = self._pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self._out(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def DarkNet53(pretrained=False, use_ssld=False, **kwargs): + model = DarkNet(**kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DarkNet53"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/densenet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/densenet.py new file mode 100755 index 000000000..314579a88 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/densenet.py @@ -0,0 +1,346 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
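+
+# Channel bookkeeping, worked through for the DenseNet121 spec below
+# (num_init_features=64, growth_rate=32, block_config=[6, 12, 24, 16]);
+# each DenseLayer appends growth_rate channels and each TransitionLayer
+# halves them:
+#   block 1: 64  + 6 * 32  = 256  -> transition -> 128
+#   block 2: 128 + 12 * 32 = 512  -> transition -> 256
+#   block 3: 256 + 24 * 32 = 1024 -> transition -> 512
+#   block 4: 512 + 16 * 32 = 1024 -> final BN + classifier input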
+ +# reference: https://arxiv.org/abs/1608.06993 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "DenseNet121": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet121_pretrained.pdparams", + "DenseNet161": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet161_pretrained.pdparams", + "DenseNet169": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet169_pretrained.pdparams", + "DenseNet201": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet201_pretrained.pdparams", + "DenseNet264": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet264_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class BNACConvLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + pad=0, + groups=1, + act="relu", + name=None): + super(BNACConvLayer, self).__init__() + + self._batch_norm = BatchNorm( + num_channels, + act=act, + param_attr=ParamAttr(name=name + '_bn_scale'), + bias_attr=ParamAttr(name + '_bn_offset'), + moving_mean_name=name + '_bn_mean', + moving_variance_name=name + '_bn_variance') + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=pad, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + + def forward(self, input): + y = self._batch_norm(input) + y = self._conv(y) + return y + + +class DenseLayer(nn.Layer): + def __init__(self, num_channels, growth_rate, bn_size, dropout, name=None): + super(DenseLayer, self).__init__() + self.dropout = dropout + + self.bn_ac_func1 = BNACConvLayer( + num_channels=num_channels, + num_filters=bn_size * growth_rate, + filter_size=1, + pad=0, + stride=1, + name=name + "_x1") + + self.bn_ac_func2 = BNACConvLayer( + num_channels=bn_size * growth_rate, + num_filters=growth_rate, + filter_size=3, + pad=1, + stride=1, + name=name + "_x2") + + if dropout: + self.dropout_func = Dropout(p=dropout, mode="downscale_in_infer") + + def forward(self, input): + conv = self.bn_ac_func1(input) + conv = self.bn_ac_func2(conv) + if self.dropout: + conv = self.dropout_func(conv) + conv = paddle.concat([input, conv], axis=1) + return conv + + +class DenseBlock(nn.Layer): + def __init__(self, + num_channels, + num_layers, + bn_size, + growth_rate, + dropout, + name=None): + super(DenseBlock, self).__init__() + self.dropout = dropout + + self.dense_layer_func = [] + + pre_channel = num_channels + for layer in range(num_layers): + self.dense_layer_func.append( + self.add_sublayer( + "{}_{}".format(name, layer + 1), + DenseLayer( + num_channels=pre_channel, + growth_rate=growth_rate, + bn_size=bn_size, + dropout=dropout, + name=name + '_' + str(layer + 1)))) + pre_channel = pre_channel + growth_rate + + def forward(self, input): + conv = input + for func in self.dense_layer_func: + conv = func(conv) + return conv + + +class TransitionLayer(nn.Layer): + def __init__(self, num_channels, num_output_features, name=None): + super(TransitionLayer, self).__init__() + + self.conv_ac_func = BNACConvLayer( + num_channels=num_channels, 
+ num_filters=num_output_features, + filter_size=1, + pad=0, + stride=1, + name=name) + + self.pool2d_avg = AvgPool2D(kernel_size=2, stride=2, padding=0) + + def forward(self, input): + y = self.conv_ac_func(input) + y = self.pool2d_avg(y) + return y + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + pad=0, + groups=1, + act="relu", + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=pad, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=name + '_bn_scale'), + bias_attr=ParamAttr(name + '_bn_offset'), + moving_mean_name=name + '_bn_mean', + moving_variance_name=name + '_bn_variance') + + def forward(self, input): + y = self._conv(input) + y = self._batch_norm(y) + return y + + +class DenseNet(nn.Layer): + def __init__(self, layers=60, bn_size=4, dropout=0, class_num=1000): + super(DenseNet, self).__init__() + + supported_layers = [121, 161, 169, 201, 264] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + densenet_spec = { + 121: (64, 32, [6, 12, 24, 16]), + 161: (96, 48, [6, 12, 36, 24]), + 169: (64, 32, [6, 12, 32, 32]), + 201: (64, 32, [6, 12, 48, 32]), + 264: (64, 32, [6, 12, 64, 48]) + } + num_init_features, growth_rate, block_config = densenet_spec[layers] + + self.conv1_func = ConvBNLayer( + num_channels=3, + num_filters=num_init_features, + filter_size=7, + stride=2, + pad=3, + act='relu', + name="conv1") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_config = block_config + + self.dense_block_func_list = [] + self.transition_func_list = [] + pre_num_channels = num_init_features + num_features = num_init_features + for i, num_layers in enumerate(block_config): + self.dense_block_func_list.append( + self.add_sublayer( + "db_conv_{}".format(i + 2), + DenseBlock( + num_channels=pre_num_channels, + num_layers=num_layers, + bn_size=bn_size, + growth_rate=growth_rate, + dropout=dropout, + name='conv' + str(i + 2)))) + + num_features = num_features + num_layers * growth_rate + pre_num_channels = num_features + + if i != len(block_config) - 1: + self.transition_func_list.append( + self.add_sublayer( + "tr_conv{}_blk".format(i + 2), + TransitionLayer( + num_channels=pre_num_channels, + num_output_features=num_features // 2, + name='conv' + str(i + 2) + "_blk"))) + pre_num_channels = num_features // 2 + num_features = num_features // 2 + + self.batch_norm = BatchNorm( + num_features, + act="relu", + param_attr=ParamAttr(name='conv5_blk_bn_scale'), + bias_attr=ParamAttr(name='conv5_blk_bn_offset'), + moving_mean_name='conv5_blk_bn_mean', + moving_variance_name='conv5_blk_bn_variance') + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + stdv = 1.0 / math.sqrt(num_features * 1.0) + + self.out = Linear( + num_features, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, input): + conv = self.conv1_func(input) + conv = self.pool2d_max(conv) + + for i, num_layers in enumerate(self.block_config): + conv = self.dense_block_func_list[i](conv) + if i != len(self.block_config) - 1: + conv = self.transition_func_list[i](conv) + + conv = self.batch_norm(conv) + y = 
self.pool2d_avg(conv) + y = paddle.flatten(y, start_axis=1, stop_axis=-1) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def DenseNet121(pretrained=False, use_ssld=False, **kwargs): + model = DenseNet(layers=121, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DenseNet121"], use_ssld=use_ssld) + return model + + +def DenseNet161(pretrained=False, use_ssld=False, **kwargs): + model = DenseNet(layers=161, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DenseNet161"], use_ssld=use_ssld) + return model + + +def DenseNet169(pretrained=False, use_ssld=False, **kwargs): + model = DenseNet(layers=169, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DenseNet169"], use_ssld=use_ssld) + return model + + +def DenseNet201(pretrained=False, use_ssld=False, **kwargs): + model = DenseNet(layers=201, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DenseNet201"], use_ssld=use_ssld) + return model + + +def DenseNet264(pretrained=False, use_ssld=False, **kwargs): + model = DenseNet(layers=264, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DenseNet264"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/distilled_vision_transformer.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/distilled_vision_transformer.py new file mode 100755 index 000000000..1805b1d03 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/distilled_vision_transformer.py @@ -0,0 +1,273 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
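+
+# DeiT adds a learnable distillation token next to the class token; at
+# inference the implementation below averages the two classifier heads:
+# (head(x_cls) + head_dist(x_dist)) / 2. A minimal, assumed usage sketch:
+#
+#   model = DeiT_base_distilled_patch16_224(pretrained=False)
+#   logits = model(paddle.randn([1, 3, 224, 224]))  # -> [1, 1000]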
+
+# Code was heavily based on https://github.com/facebookresearch/deit
+# reference: https://arxiv.org/abs/2012.12877
+
+import paddle
+import paddle.nn as nn
+from .vision_transformer import VisionTransformer, Identity, trunc_normal_, zeros_
+
+from ....utils.save_load import load_dygraph_pretrain
+
+MODEL_URLS = {
+    "DeiT_tiny_patch16_224":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_tiny_patch16_224_pretrained.pdparams",
+    "DeiT_small_patch16_224":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_small_patch16_224_pretrained.pdparams",
+    "DeiT_base_patch16_224":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_base_patch16_224_pretrained.pdparams",
+    "DeiT_tiny_distilled_patch16_224":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_tiny_distilled_patch16_224_pretrained.pdparams",
+    "DeiT_small_distilled_patch16_224":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_small_distilled_patch16_224_pretrained.pdparams",
+    "DeiT_base_distilled_patch16_224":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_base_distilled_patch16_224_pretrained.pdparams",
+    "DeiT_base_patch16_384":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_base_patch16_384_pretrained.pdparams",
+    "DeiT_base_distilled_patch16_384":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_base_distilled_patch16_384_pretrained.pdparams",
+}
+
+__all__ = list(MODEL_URLS.keys())
+
+
+class DistilledVisionTransformer(VisionTransformer):
+    def __init__(self,
+                 img_size=224,
+                 patch_size=16,
+                 class_num=1000,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4,
+                 qkv_bias=False,
+                 norm_layer='nn.LayerNorm',
+                 epsilon=1e-5,
+                 **kwargs):
+        super().__init__(
+            img_size=img_size,
+            patch_size=patch_size,
+            class_num=class_num,
+            embed_dim=embed_dim,
+            depth=depth,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            norm_layer=norm_layer,
+            epsilon=epsilon,
+            **kwargs)
+        self.pos_embed = self.create_parameter(
+            shape=(1, self.patch_embed.num_patches + 2, self.embed_dim),
+            default_initializer=zeros_)
+        self.add_parameter("pos_embed", self.pos_embed)
+
+        self.dist_token = self.create_parameter(
+            shape=(1, 1, self.embed_dim), default_initializer=zeros_)
+        self.add_parameter("dist_token", self.dist_token)
+
+        self.head_dist = nn.Linear(
+            self.embed_dim,
+            self.class_num) if self.class_num > 0 else Identity()
+
+        trunc_normal_(self.dist_token)
+        trunc_normal_(self.pos_embed)
+        self.head_dist.apply(self._init_weights)
+
+    def forward_features(self, x):
+        B = x.shape[0]
+        x = self.patch_embed(x)
+
+        cls_tokens = self.cls_token.expand((B, -1, -1)).astype(x.dtype)
+        dist_token = self.dist_token.expand((B, -1, -1)).astype(x.dtype)
+        x = paddle.concat((cls_tokens, dist_token, x), axis=1)
+
+        x = x + self.pos_embed
+        x = self.pos_drop(x)
+
+        for blk in self.blocks:
+            x = blk(x)
+
+        x = self.norm(x)
+        return x[:, 0], x[:, 1]
+
+    def forward(self, x):
+        x, x_dist = self.forward_features(x)
+        x = self.head(x)
+        x_dist = self.head_dist(x_dist)
+        return (x + x_dist) / 2
+
+
+def _load_pretrained(pretrained, model, model_url, use_ssld=False):
+    if pretrained is False:
+        pass
+    elif pretrained is True:
+        load_dygraph_pretrain(model, model_url, use_ssld=use_ssld)
+    elif isinstance(pretrained, str):
+        load_dygraph_pretrain(model, pretrained)
+    else:
+        raise RuntimeError(
+            "pretrained type is not available. Please use `string` or `boolean` type."
+ ) + + +def DeiT_tiny_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=192, + depth=12, + num_heads=3, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_tiny_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_small_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_small_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_base_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_tiny_distilled_patch16_224(pretrained=False, use_ssld=False, + **kwargs): + model = DistilledVisionTransformer( + patch_size=16, + embed_dim=192, + depth=12, + num_heads=3, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_tiny_distilled_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_small_distilled_patch16_224(pretrained=False, + use_ssld=False, + **kwargs): + model = DistilledVisionTransformer( + patch_size=16, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_small_distilled_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_base_distilled_patch16_224(pretrained=False, use_ssld=False, + **kwargs): + model = DistilledVisionTransformer( + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_base_distilled_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_base_patch16_384(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + img_size=384, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_base_patch16_384"], + use_ssld=use_ssld) + return model + + +def DeiT_base_distilled_patch16_384(pretrained=False, use_ssld=False, + **kwargs): + model = DistilledVisionTransformer( + img_size=384, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_base_distilled_patch16_384"], + use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/dla.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/dla.py new file mode 100755 index 000000000..03a73bcfb --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/dla.py @@ -0,0 +1,529 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Code was based on https://github.com/ucbdrive/dla
+# reference: https://arxiv.org/abs/1707.06484
+
+import math
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddle.nn.initializer import Normal, Constant
+
+from ..base.theseus_layer import Identity
+from ....utils.save_load import load_dygraph_pretrain
+
+MODEL_URLS = {
+    "DLA34":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA34_pretrained.pdparams",
+    "DLA46_c":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA46_c_pretrained.pdparams",
+    "DLA46x_c":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA46x_c_pretrained.pdparams",
+    "DLA60":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA60_pretrained.pdparams",
+    "DLA60x":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA60x_pretrained.pdparams",
+    "DLA60x_c":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA60x_c_pretrained.pdparams",
+    "DLA102":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA102_pretrained.pdparams",
+    "DLA102x":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA102x_pretrained.pdparams",
+    "DLA102x2":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA102x2_pretrained.pdparams",
+    "DLA169":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA169_pretrained.pdparams"
+}
+
+__all__ = list(MODEL_URLS.keys())
+
+zeros_ = Constant(value=0.)
+ones_ = Constant(value=1.)
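+
+# The module-level zeros_/ones_ constants are reused by DLA.__init__'s weight
+# init loop (BatchNorm weight -> 1, bias -> 0; Conv2D weights get a He-style
+# Normal(0, sqrt(2/n)) init). A minimal, assumed usage sketch:
+#
+#   model = DLA34(pretrained=False, class_num=1000)
+#   logits = model(paddle.randn([1, 3, 224, 224]))  # -> [1, 1000]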
+ + +class DlaBasic(nn.Layer): + def __init__(self, inplanes, planes, stride=1, dilation=1, **cargs): + super(DlaBasic, self).__init__() + self.conv1 = nn.Conv2D( + inplanes, + planes, + kernel_size=3, + stride=stride, + padding=dilation, + bias_attr=False, + dilation=dilation) + self.bn1 = nn.BatchNorm2D(planes) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=1, + padding=dilation, + bias_attr=False, + dilation=dilation) + self.bn2 = nn.BatchNorm2D(planes) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += residual + out = self.relu(out) + + return out + + +class DlaBottleneck(nn.Layer): + expansion = 2 + + def __init__(self, + inplanes, + outplanes, + stride=1, + dilation=1, + cardinality=1, + base_width=64): + super(DlaBottleneck, self).__init__() + self.stride = stride + mid_planes = int( + math.floor(outplanes * (base_width / 64)) * cardinality) + mid_planes = mid_planes // self.expansion + + self.conv1 = nn.Conv2D( + inplanes, mid_planes, kernel_size=1, bias_attr=False) + self.bn1 = nn.BatchNorm2D(mid_planes) + self.conv2 = nn.Conv2D( + mid_planes, + mid_planes, + kernel_size=3, + stride=stride, + padding=dilation, + bias_attr=False, + dilation=dilation, + groups=cardinality) + self.bn2 = nn.BatchNorm2D(mid_planes) + self.conv3 = nn.Conv2D( + mid_planes, outplanes, kernel_size=1, bias_attr=False) + self.bn3 = nn.BatchNorm2D(outplanes) + self.relu = nn.ReLU() + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class DlaRoot(nn.Layer): + def __init__(self, in_channels, out_channels, kernel_size, residual): + super(DlaRoot, self).__init__() + self.conv = nn.Conv2D( + in_channels, + out_channels, + 1, + stride=1, + bias_attr=False, + padding=(kernel_size - 1) // 2) + self.bn = nn.BatchNorm2D(out_channels) + self.relu = nn.ReLU() + self.residual = residual + + def forward(self, *x): + children = x + x = self.conv(paddle.concat(x, 1)) + x = self.bn(x) + if self.residual: + x += children[0] + x = self.relu(x) + + return x + + +class DlaTree(nn.Layer): + def __init__(self, + levels, + block, + in_channels, + out_channels, + stride=1, + dilation=1, + cardinality=1, + base_width=64, + level_root=False, + root_dim=0, + root_kernel_size=1, + root_residual=False): + super(DlaTree, self).__init__() + if root_dim == 0: + root_dim = 2 * out_channels + if level_root: + root_dim += in_channels + + self.downsample = nn.MaxPool2D( + stride, stride=stride) if stride > 1 else Identity() + self.project = Identity() + cargs = dict( + dilation=dilation, cardinality=cardinality, base_width=base_width) + + if levels == 1: + self.tree1 = block(in_channels, out_channels, stride, **cargs) + self.tree2 = block(out_channels, out_channels, 1, **cargs) + if in_channels != out_channels: + self.project = nn.Sequential( + nn.Conv2D( + in_channels, + out_channels, + kernel_size=1, + stride=1, + bias_attr=False), + nn.BatchNorm2D(out_channels)) + else: + cargs.update( + dict( + root_kernel_size=root_kernel_size, + root_residual=root_residual)) + self.tree1 = DlaTree( + levels - 1, + block, + in_channels, + out_channels, + 
stride, + root_dim=0, + **cargs) + self.tree2 = DlaTree( + levels - 1, + block, + out_channels, + out_channels, + root_dim=root_dim + out_channels, + **cargs) + + if levels == 1: + self.root = DlaRoot(root_dim, out_channels, root_kernel_size, + root_residual) + + self.level_root = level_root + self.root_dim = root_dim + self.levels = levels + + def forward(self, x, residual=None, children=None): + children = [] if children is None else children + bottom = self.downsample(x) + residual = self.project(bottom) + + if self.level_root: + children.append(bottom) + x1 = self.tree1(x, residual) + + if self.levels == 1: + x2 = self.tree2(x1) + x = self.root(x2, x1, *children) + else: + children.append(x1) + x = self.tree2(x1, children=children) + return x + + +class DLA(nn.Layer): + def __init__(self, + levels, + channels, + in_chans=3, + cardinality=1, + base_width=64, + block=DlaBottleneck, + residual_root=False, + drop_rate=0.0, + class_num=1000, + with_pool=True): + super(DLA, self).__init__() + self.channels = channels + self.class_num = class_num + self.with_pool = with_pool + self.cardinality = cardinality + self.base_width = base_width + self.drop_rate = drop_rate + + self.base_layer = nn.Sequential( + nn.Conv2D( + in_chans, + channels[0], + kernel_size=7, + stride=1, + padding=3, + bias_attr=False), + nn.BatchNorm2D(channels[0]), + nn.ReLU()) + + self.level0 = self._make_conv_level(channels[0], channels[0], + levels[0]) + self.level1 = self._make_conv_level( + channels[0], channels[1], levels[1], stride=2) + + cargs = dict( + cardinality=cardinality, + base_width=base_width, + root_residual=residual_root) + + self.level2 = DlaTree( + levels[2], + block, + channels[1], + channels[2], + 2, + level_root=False, + **cargs) + self.level3 = DlaTree( + levels[3], + block, + channels[2], + channels[3], + 2, + level_root=True, + **cargs) + self.level4 = DlaTree( + levels[4], + block, + channels[3], + channels[4], + 2, + level_root=True, + **cargs) + self.level5 = DlaTree( + levels[5], + block, + channels[4], + channels[5], + 2, + level_root=True, + **cargs) + + self.feature_info = [ + # rare to have a meaningful stride 1 level + dict( + num_chs=channels[0], reduction=1, module='level0'), + dict( + num_chs=channels[1], reduction=2, module='level1'), + dict( + num_chs=channels[2], reduction=4, module='level2'), + dict( + num_chs=channels[3], reduction=8, module='level3'), + dict( + num_chs=channels[4], reduction=16, module='level4'), + dict( + num_chs=channels[5], reduction=32, module='level5'), + ] + + self.num_features = channels[-1] + + if with_pool: + self.global_pool = nn.AdaptiveAvgPool2D(1) + + if class_num > 0: + self.fc = nn.Conv2D(self.num_features, class_num, 1) + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + normal_ = Normal(mean=0.0, std=math.sqrt(2. 
/ n)) + normal_(m.weight) + elif isinstance(m, nn.BatchNorm2D): + ones_(m.weight) + zeros_(m.bias) + + def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): + modules = [] + for i in range(convs): + modules.extend([ + nn.Conv2D( + inplanes, + planes, + kernel_size=3, + stride=stride if i == 0 else 1, + padding=dilation, + bias_attr=False, + dilation=dilation), nn.BatchNorm2D(planes), nn.ReLU() + ]) + inplanes = planes + return nn.Sequential(*modules) + + def forward_features(self, x): + x = self.base_layer(x) + + x = self.level0(x) + x = self.level1(x) + x = self.level2(x) + x = self.level3(x) + x = self.level4(x) + x = self.level5(x) + + return x + + def forward(self, x): + x = self.forward_features(x) + + if self.with_pool: + x = self.global_pool(x) + + if self.drop_rate > 0.: + x = F.dropout(x, p=self.drop_rate, training=self.training) + + if self.class_num > 0: + x = self.fc(x) + x = x.flatten(1) + + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def DLA34(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 2, 1), + channels=(16, 32, 64, 128, 256, 512), + block=DlaBasic, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA34"]) + return model + + +def DLA46_c(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 2, 1), + channels=(16, 32, 64, 64, 128, 256), + block=DlaBottleneck, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA46_c"]) + return model + + +def DLA46x_c(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 2, 1), + channels=(16, 32, 64, 64, 128, 256), + block=DlaBottleneck, + cardinality=32, + base_width=4, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA46x_c"]) + return model + + +def DLA60(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 3, 1), + channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottleneck, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA60"]) + return model + + +def DLA60x(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 3, 1), + channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottleneck, + cardinality=32, + base_width=4, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA60x"]) + return model + + +def DLA60x_c(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 3, 1), + channels=(16, 32, 64, 64, 128, 256), + block=DlaBottleneck, + cardinality=32, + base_width=4, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA60x_c"]) + return model + + +def DLA102(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 3, 4, 1), + channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottleneck, + residual_root=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA102"]) + return model + + +def DLA102x(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 3, 4, 1), + channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottleneck, + cardinality=32, + base_width=4, + residual_root=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA102x"]) + return model + + +def DLA102x2(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 3, 4, 1), + channels=(16, 32, 128, 256, 512, 1024), + 
block=DlaBottleneck, + cardinality=64, + base_width=4, + residual_root=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA102x2"]) + return model + + +def DLA169(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 2, 3, 5, 1), + channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottleneck, + residual_root=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA169"]) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/dpn.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/dpn.py new file mode 100755 index 000000000..ef322dea9 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/dpn.py @@ -0,0 +1,453 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1707.01629 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import sys +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "DPN68": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DPN68_pretrained.pdparams", + "DPN92": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DPN92_pretrained.pdparams", + "DPN98": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DPN98_pretrained.pdparams", + "DPN107": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DPN107_pretrained.pdparams", + "DPN131": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DPN131_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + pad=0, + groups=1, + act="relu", + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=pad, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=name + '_bn_scale'), + bias_attr=ParamAttr(name + '_bn_offset'), + moving_mean_name=name + '_bn_mean', + moving_variance_name=name + '_bn_variance') + + def forward(self, input): + y = self._conv(input) + y = self._batch_norm(y) + return y + + +class BNACConvLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + pad=0, + groups=1, + act="relu", + name=None): + super(BNACConvLayer, self).__init__() + self.num_channels = num_channels + + self._batch_norm = BatchNorm( + num_channels, + act=act, + param_attr=ParamAttr(name=name + '_bn_scale'), + bias_attr=ParamAttr(name + '_bn_offset'), + 
moving_mean_name=name + '_bn_mean', + moving_variance_name=name + '_bn_variance') + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=pad, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + + def forward(self, input): + y = self._batch_norm(input) + y = self._conv(y) + return y + + +class DualPathFactory(nn.Layer): + def __init__(self, + num_channels, + num_1x1_a, + num_3x3_b, + num_1x1_c, + inc, + G, + _type='normal', + name=None): + super(DualPathFactory, self).__init__() + + self.num_1x1_c = num_1x1_c + self.inc = inc + self.name = name + + kw = 3 + kh = 3 + pw = (kw - 1) // 2 + ph = (kh - 1) // 2 + + # type + if _type == 'proj': + key_stride = 1 + self.has_proj = True + elif _type == 'down': + key_stride = 2 + self.has_proj = True + elif _type == 'normal': + key_stride = 1 + self.has_proj = False + else: + print("not implemented now!!!") + sys.exit(1) + + data_in_ch = sum(num_channels) if isinstance(num_channels, + list) else num_channels + + if self.has_proj: + self.c1x1_w_func = BNACConvLayer( + num_channels=data_in_ch, + num_filters=num_1x1_c + 2 * inc, + filter_size=(1, 1), + pad=(0, 0), + stride=(key_stride, key_stride), + name=name + "_match") + + self.c1x1_a_func = BNACConvLayer( + num_channels=data_in_ch, + num_filters=num_1x1_a, + filter_size=(1, 1), + pad=(0, 0), + name=name + "_conv1") + + self.c3x3_b_func = BNACConvLayer( + num_channels=num_1x1_a, + num_filters=num_3x3_b, + filter_size=(kw, kh), + pad=(pw, ph), + stride=(key_stride, key_stride), + groups=G, + name=name + "_conv2") + + self.c1x1_c_func = BNACConvLayer( + num_channels=num_3x3_b, + num_filters=num_1x1_c + inc, + filter_size=(1, 1), + pad=(0, 0), + name=name + "_conv3") + + def forward(self, input): + # PROJ + if isinstance(input, list): + data_in = paddle.concat([input[0], input[1]], axis=1) + else: + data_in = input + + if self.has_proj: + c1x1_w = self.c1x1_w_func(data_in) + data_o1, data_o2 = paddle.split( + c1x1_w, num_or_sections=[self.num_1x1_c, 2 * self.inc], axis=1) + else: + data_o1 = input[0] + data_o2 = input[1] + + c1x1_a = self.c1x1_a_func(data_in) + c3x3_b = self.c3x3_b_func(c1x1_a) + c1x1_c = self.c1x1_c_func(c3x3_b) + + c1x1_c1, c1x1_c2 = paddle.split( + c1x1_c, num_or_sections=[self.num_1x1_c, self.inc], axis=1) + + # OUTPUTS + summ = paddle.add(x=data_o1, y=c1x1_c1) + dense = paddle.concat([data_o2, c1x1_c2], axis=1) + # tensor, channels + return [summ, dense] + + +class DPN(nn.Layer): + def __init__(self, layers=68, class_num=1000): + super(DPN, self).__init__() + + self._class_num = class_num + + args = self.get_net_args(layers) + bws = args['bw'] + inc_sec = args['inc_sec'] + rs = args['r'] + k_r = args['k_r'] + k_sec = args['k_sec'] + G = args['G'] + init_num_filter = args['init_num_filter'] + init_filter_size = args['init_filter_size'] + init_padding = args['init_padding'] + + self.k_sec = k_sec + + self.conv1_x_1_func = ConvBNLayer( + num_channels=3, + num_filters=init_num_filter, + filter_size=init_filter_size, + stride=2, + pad=init_padding, + act='relu', + name="conv1") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + num_channel_dpn = init_num_filter + + self.dpn_func_list = [] + #conv2 - conv5 + match_list, num = [], 0 + for gc in range(4): + bw = bws[gc] + inc = inc_sec[gc] + R = (k_r * bw) // rs[gc] + if gc == 0: + _type1 = 'proj' + _type2 = 'normal' + match = 1 + else: + _type1 = 'down' + _type2 = 'normal' + match = match + k_sec[gc - 1] + 
match_list.append(match) + self.dpn_func_list.append( + self.add_sublayer( + "dpn{}".format(match), + DualPathFactory( + num_channels=num_channel_dpn, + num_1x1_a=R, + num_3x3_b=R, + num_1x1_c=bw, + inc=inc, + G=G, + _type=_type1, + name="dpn" + str(match)))) + num_channel_dpn = [bw, 3 * inc] + + for i_ly in range(2, k_sec[gc] + 1): + num += 1 + if num in match_list: + num += 1 + self.dpn_func_list.append( + self.add_sublayer( + "dpn{}".format(num), + DualPathFactory( + num_channels=num_channel_dpn, + num_1x1_a=R, + num_3x3_b=R, + num_1x1_c=bw, + inc=inc, + G=G, + _type=_type2, + name="dpn" + str(num)))) + + num_channel_dpn = [ + num_channel_dpn[0], num_channel_dpn[1] + inc + ] + + out_channel = sum(num_channel_dpn) + + self.conv5_x_x_bn = BatchNorm( + num_channels=sum(num_channel_dpn), + act="relu", + param_attr=ParamAttr(name='final_concat_bn_scale'), + bias_attr=ParamAttr('final_concat_bn_offset'), + moving_mean_name='final_concat_bn_mean', + moving_variance_name='final_concat_bn_variance') + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + stdv = 0.01 + + self.out = Linear( + out_channel, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, input): + conv1_x_1 = self.conv1_x_1_func(input) + convX_x_x = self.pool2d_max(conv1_x_1) + + dpn_idx = 0 + for gc in range(4): + convX_x_x = self.dpn_func_list[dpn_idx](convX_x_x) + dpn_idx += 1 + for i_ly in range(2, self.k_sec[gc] + 1): + convX_x_x = self.dpn_func_list[dpn_idx](convX_x_x) + dpn_idx += 1 + + conv5_x_x = paddle.concat(convX_x_x, axis=1) + conv5_x_x = self.conv5_x_x_bn(conv5_x_x) + + y = self.pool2d_avg(conv5_x_x) + y = paddle.flatten(y, start_axis=1, stop_axis=-1) + y = self.out(y) + return y + + def get_net_args(self, layers): + if layers == 68: + k_r = 128 + G = 32 + k_sec = [3, 4, 12, 3] + inc_sec = [16, 32, 32, 64] + bw = [64, 128, 256, 512] + r = [64, 64, 64, 64] + init_num_filter = 10 + init_filter_size = 3 + init_padding = 1 + elif layers == 92: + k_r = 96 + G = 32 + k_sec = [3, 4, 20, 3] + inc_sec = [16, 32, 24, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 64 + init_filter_size = 7 + init_padding = 3 + elif layers == 98: + k_r = 160 + G = 40 + k_sec = [3, 6, 20, 3] + inc_sec = [16, 32, 32, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 96 + init_filter_size = 7 + init_padding = 3 + elif layers == 107: + k_r = 200 + G = 50 + k_sec = [4, 8, 20, 3] + inc_sec = [20, 64, 64, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 128 + init_filter_size = 7 + init_padding = 3 + elif layers == 131: + k_r = 160 + G = 40 + k_sec = [4, 8, 28, 3] + inc_sec = [16, 32, 32, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 128 + init_filter_size = 7 + init_padding = 3 + else: + raise NotImplementedError + net_arg = { + 'k_r': k_r, + 'G': G, + 'k_sec': k_sec, + 'inc_sec': inc_sec, + 'bw': bw, + 'r': r + } + net_arg['init_num_filter'] = init_num_filter + net_arg['init_filter_size'] = init_filter_size + net_arg['init_padding'] = init_padding + + return net_arg + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. 
Please use `string` or `boolean` type."
+        )
+
+
+def DPN68(pretrained=False, use_ssld=False, **kwargs):
+    model = DPN(layers=68, **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["DPN68"], use_ssld=use_ssld)
+    return model
+
+
+def DPN92(pretrained=False, use_ssld=False, **kwargs):
+    model = DPN(layers=92, **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["DPN92"], use_ssld=use_ssld)
+    return model
+
+
+def DPN98(pretrained=False, use_ssld=False, **kwargs):
+    model = DPN(layers=98, **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["DPN98"], use_ssld=use_ssld)
+    return model
+
+
+def DPN107(pretrained=False, use_ssld=False, **kwargs):
+    model = DPN(layers=107, **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["DPN107"], use_ssld=use_ssld)
+    return model
+
+
+def DPN131(pretrained=False, use_ssld=False, **kwargs):
+    model = DPN(layers=131, **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["DPN131"], use_ssld=use_ssld)
+    return model
diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/dsnet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/dsnet.py
new file mode 100755
index 000000000..20b28c2b0
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/dsnet.py
@@ -0,0 +1,701 @@
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
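+
+# DSNet pairs an intra-patch convolutional stream with an inter-patch
+# self-attention stream computed at reduced resolution; each MixBlock below
+# splits its channels between the two streams and fuses them again through
+# bidirectional cross-attention.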
+
+# reference: https://arxiv.org/abs/2105.14734v4
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from .vision_transformer import to_2tuple, zeros_, ones_, VisionTransformer, Identity
+from collections import OrderedDict
+from functools import partial
+from paddle.nn.initializer import TruncatedNormal, Constant, Normal
+
+from ....utils.save_load import load_dygraph_pretrain
+
+# shared truncated-normal initializer used by MixVisionTransformer._init_weights
+trunc_normal_ = TruncatedNormal(std=.02)
+
+MODEL_URLS = {
+    "DSNet_tiny":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DSNet_tiny_pretrained.pdparams",
+    "DSNet_small":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DSNet_small_pretrained.pdparams",
+    "DSNet_base":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DSNet_base_pretrained.pdparams",
+}
+
+__all__ = list(MODEL_URLS.keys())
+
+
+class Mlp(nn.Layer):
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Conv2D(in_features, hidden_features, 1)
+        self.act = act_layer()
+        self.fc2 = nn.Conv2D(hidden_features, out_features, 1)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class DWConv(nn.Layer):
+    def __init__(self, dim=768):
+        super(DWConv, self).__init__()
+        self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, groups=dim, bias_attr=True)
+
+    def forward(self, x):
+        x = self.dwconv(x)
+        return x
+
+
+class DWConvMlp(nn.Layer):
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Conv2D(in_features, hidden_features, 1)
+        self.dwconv = DWConv(hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Conv2D(hidden_features, out_features, 1)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.dwconv(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+def drop_path(x, drop_prob=0., training=False):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
+    """
+    if drop_prob == 0. or not training:
+        return x
+    keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
+    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
+    random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
+    random_tensor = paddle.floor(random_tensor)  # binarize
+    output = x.divide(keep_prob) * random_tensor
+    return output
+
+
+class DropPath(nn.Layer):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
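+
+    During training each sample keeps its residual branch with probability
+    1 - drop_prob; kept branches are rescaled by 1 / keep_prob so that the
+    output's expected value is unchanged at evaluation time.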
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.attn_drop = nn.Dropout(attn_drop) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + C = int(C // 3) + qkv = x.reshape( + (B, N, 3, self.num_heads, C // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((B, N, C)) + x = self.proj_drop(x) + return x + + +class Cross_Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.attn_drop = nn.Dropout(attn_drop) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, tokens_q, memory_k, memory_v, shape=None): + assert shape is not None + attn = (tokens_q.matmul(memory_k.transpose((0, 1, 3, 2)))) * self.scale + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(memory_v)).transpose((0, 2, 1, 3)).reshape( + (shape[0], shape[1], shape[2])) + x = self.proj_drop(x) + return x + + +class MixBlock(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + downsample=2, + conv_ffn=False): + super().__init__() + self.pos_embed = nn.Conv2D(dim, dim, 3, padding=1, groups=dim) + self.dim = dim + self.norm1 = nn.BatchNorm2D(dim) + self.conv1 = nn.Conv2D(dim, dim, 1) + self.conv2 = nn.Conv2D(dim, dim, 1) + self.dim_conv = int(dim * 0.5) + self.dim_sa = dim - self.dim_conv + self.norm_conv1 = nn.BatchNorm2D(self.dim_conv) + self.norm_sa1 = nn.LayerNorm(self.dim_sa) + self.conv = nn.Conv2D( + self.dim_conv, self.dim_conv, 3, padding=1, groups=self.dim_conv) + self.channel_up = nn.Linear(self.dim_sa, 3 * self.dim_sa) + self.cross_channel_up_conv = nn.Conv2D(self.dim_conv, + 3 * self.dim_conv, 1) + self.cross_channel_up_sa = nn.Linear(self.dim_sa, 3 * self.dim_sa) + self.fuse_channel_conv = nn.Linear(self.dim_conv, self.dim_conv) + self.fuse_channel_sa = nn.Linear(self.dim_sa, self.dim_sa) + self.num_heads = num_heads + self.attn = Attention( + self.dim_sa, + num_heads=self.num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=0.1, + proj_drop=drop) + self.cross_attn = Cross_Attention( + self.dim_sa, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=0.1, + proj_drop=drop) + self.norm_conv2 = nn.BatchNorm2D(self.dim_conv) + self.norm_sa2 = nn.LayerNorm(self.dim_sa) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = nn.BatchNorm2D(dim) + self.downsample = downsample + mlp_hidden_dim = int(dim * mlp_ratio) + if conv_ffn: + self.mlp = DWConvMlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + else: + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x): + x = x + self.pos_embed(x) + _, _, H, W = x.shape + residual = x + x = self.norm1(x) + x = self.conv1(x) + + qkv = x[:, :self.dim_sa, :] + conv = x[:, self.dim_sa:, :, :] + residual_conv = conv + conv = residual_conv + self.conv(self.norm_conv1(conv)) + + sa = F.interpolate( + qkv, + size=(H // self.downsample, W // self.downsample), + mode='bilinear') + B, _, H_down, W_down = sa.shape + sa = sa.flatten(2).transpose([0, 2, 1]) + residual_sa = sa + sa = self.norm_sa1(sa) + sa = self.channel_up(sa) + sa = residual_sa + self.attn(sa) + + # cross attention + residual_conv_co = conv + residual_sa_co = sa + conv_qkv = self.cross_channel_up_conv(self.norm_conv2(conv)) + conv_qkv = conv_qkv.flatten(2).transpose([0, 2, 1]) + + sa_qkv = self.cross_channel_up_sa(self.norm_sa2(sa)) + + B_conv, N_conv, C_conv = conv_qkv.shape + C_conv = int(C_conv // 3) + conv_qkv = conv_qkv.reshape((B_conv, N_conv, 3, self.num_heads, + C_conv // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + conv_q, conv_k, conv_v = conv_qkv[0], conv_qkv[1], conv_qkv[2] + + B_sa, N_sa, C_sa = sa_qkv.shape + C_sa = int(C_sa // 3) + sa_qkv = sa_qkv.reshape( + (B_sa, N_sa, 3, self.num_heads, C_sa // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + sa_q, sa_k, sa_v = sa_qkv[0], sa_qkv[1], sa_qkv[2] + + # sa -> conv + conv = self.cross_attn( + conv_q, sa_k, sa_v, shape=(B_conv, N_conv, C_conv)) + conv = self.fuse_channel_conv(conv) + conv = conv.reshape((B, H, W, C_conv)).transpose((0, 3, 1, 2)) + conv = residual_conv_co + conv + + # conv -> sa + sa = self.cross_attn(sa_q, conv_k, conv_v, shape=(B_sa, N_sa, C_sa)) + sa = residual_sa_co + self.fuse_channel_sa(sa) + sa = sa.reshape((B, H_down, W_down, C_sa)).transpose((0, 3, 1, 2)) + sa = F.interpolate(sa, size=(H, W), mode='bilinear') + x = paddle.concat([conv, sa], axis=1) + x = residual + self.drop_path(self.conv2(x)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // + patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x) + return x + + +class OverlapPatchEmbed(nn.Layer): + """ Image to Overlapping Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=7, + stride=4, + in_chans=3, + embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + self.H, self.W = img_size[0] // patch_size[0], img_size[ + 1] // patch_size[1] + self.num_patches = self.H * self.W + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=stride, + padding=(patch_size[0] // 2, patch_size[1] // 2)) + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
+ x = self.proj(x) + return x + + +class MixVisionTransformer(nn.Layer): + """ Mixed Vision Transformer for DSNet + A PaddlePaddle impl of : `Dual-stream Network for Visual Recognition` - https://arxiv.org/abs/2105.14734v4 + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dim=[64, 128, 320, 512], + depth=[2, 2, 4, 1], + num_heads=[1, 2, 5, 8], + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + representation_size=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer=None, + overlap_embed=False, + conv_ffn=False): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + class_num (int): number of classes for classification head + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + qk_scale (float): override default qk scale of head_dim ** -0.5 if set + representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + norm_layer: (nn.Layer): normalization layer + overlap_embed (bool): enable overlapped patch embedding if True + conv_ffn (bool): enable depthwise convolution for mlp if True + """ + super().__init__() + self.class_num = class_num + self.num_features = self.embed_dim = embed_dim + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + downsamples = [8, 4, 2, 2] + if overlap_embed: + self.patch_embed1 = OverlapPatchEmbed( + img_size=img_size, + patch_size=7, + stride=4, + in_chans=in_chans, + embed_dim=embed_dim[0]) + self.patch_embed2 = OverlapPatchEmbed( + img_size=img_size // 4, + patch_size=3, + stride=2, + in_chans=embed_dim[0], + embed_dim=embed_dim[1]) + self.patch_embed3 = OverlapPatchEmbed( + img_size=img_size // 8, + patch_size=3, + stride=2, + in_chans=embed_dim[1], + embed_dim=embed_dim[2]) + self.patch_embed4 = OverlapPatchEmbed( + img_size=img_size // 16, + patch_size=3, + stride=2, + in_chans=embed_dim[2], + embed_dim=embed_dim[3]) + else: + self.patch_embed1 = PatchEmbed( + img_size=img_size, + patch_size=4, + in_chans=in_chans, + embed_dim=embed_dim[0]) + self.patch_embed2 = PatchEmbed( + img_size=img_size // 4, + patch_size=2, + in_chans=embed_dim[0], + embed_dim=embed_dim[1]) + self.patch_embed3 = PatchEmbed( + img_size=img_size // 8, + patch_size=2, + in_chans=embed_dim[1], + embed_dim=embed_dim[2]) + self.patch_embed4 = PatchEmbed( + img_size=img_size // 16, + patch_size=2, + in_chans=embed_dim[2], + embed_dim=embed_dim[3]) + + self.pos_drop = nn.Dropout(p=drop_rate) + self.mixture = False + dpr = [ + x.item() for x in paddle.linspace(0, drop_path_rate, sum(depth)) + ] + self.blocks1 = nn.LayerList([ + MixBlock( + dim=embed_dim[0], + num_heads=num_heads[0], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + downsample=downsamples[0], + conv_ffn=conv_ffn) for i in range(depth[0]) + ]) + + self.blocks2 = nn.LayerList([ + MixBlock( + dim=embed_dim[1], + num_heads=num_heads[1], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + 
downsample=downsamples[1],
+                conv_ffn=conv_ffn) for i in range(depth[1])
+        ])
+
+        self.blocks3 = nn.LayerList([
+            MixBlock(
+                dim=embed_dim[2],
+                num_heads=num_heads[2],
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                downsample=downsamples[2],
+                conv_ffn=conv_ffn) for i in range(depth[2])
+        ])
+
+        if self.mixture:
+            # plain transformer blocks; Block takes neither downsample nor
+            # conv_ffn arguments
+            self.blocks4 = nn.LayerList([
+                Block(
+                    dim=embed_dim[3],
+                    num_heads=16,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_scale=qk_scale,
+                    drop=drop_rate,
+                    attn_drop=attn_drop_rate,
+                    drop_path=dpr[i],
+                    norm_layer=norm_layer) for i in range(depth[3])
+            ])
+            self.norm = norm_layer(embed_dim[-1])
+        else:
+            self.blocks4 = nn.LayerList([
+                MixBlock(
+                    dim=embed_dim[3],
+                    num_heads=num_heads[3],
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_scale=qk_scale,
+                    drop=drop_rate,
+                    attn_drop=attn_drop_rate,
+                    drop_path=dpr[i],
+                    norm_layer=norm_layer,
+                    downsample=downsamples[3],
+                    conv_ffn=conv_ffn) for i in range(depth[3])
+            ])
+            self.norm = nn.BatchNorm2D(embed_dim[-1])
+
+        # Representation layer
+        if representation_size:
+            self.num_features = representation_size
+            self.pre_logits = nn.Sequential(
+                OrderedDict([('fc', nn.Linear(embed_dim[-1],
+                                              representation_size)),
+                             ('act', nn.Tanh())]))
+        else:
+            self.pre_logits = Identity()
+
+        # Classifier head
+        self.head = nn.Linear(embed_dim[-1],
+                              class_num) if class_num > 0 else Identity()
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if m.bias is not None:
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+
+    def get_classifier(self):
+        return self.head
+
+    def reset_classifier(self, class_num, global_pool=''):
+        self.class_num = class_num
+        self.head = nn.Linear(self.embed_dim[-1],
+                              class_num) if class_num > 0 else Identity()
+
+    def forward_features(self, x):
+        B = x.shape[0]
+        x = self.patch_embed1(x)
+        x = self.pos_drop(x)
+        for blk in self.blocks1:
+            x = blk(x)
+        x = self.patch_embed2(x)
+        for blk in self.blocks2:
+            x = blk(x)
+        x = self.patch_embed3(x)
+        for blk in self.blocks3:
+            x = blk(x)
+        x = self.patch_embed4(x)
+        if self.mixture:
+            x = x.flatten(2).transpose([0, 2, 1])
+        for blk in self.blocks4:
+            x = blk(x)
+        x = self.norm(x)
+        x = self.pre_logits(x)
+        return x
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        if self.mixture:
+            x = x.mean(1)
+        else:
+            x = x.flatten(2).mean(-1)
+        x = self.head(x)
+        return x
+
+
+def _load_pretrained(pretrained, model, model_url, use_ssld=False):
+    if pretrained is False:
+        pass
+    elif pretrained is True:
+        load_dygraph_pretrain(model, model_url, use_ssld=use_ssld)
+    elif isinstance(pretrained, str):
+        load_dygraph_pretrain(model, pretrained)
+    else:
+        raise RuntimeError(
+            "pretrained type is not available. Please use `string` or `boolean` type."
+        )
+
+
+def DSNet_tiny(pretrained=False, use_ssld=False, **kwargs):
+    model = MixVisionTransformer(
+        patch_size=16,
+        depth=[2, 2, 4, 1],
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(
+            nn.LayerNorm, eps=1e-6),
+        **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["DSNet_tiny"], use_ssld=use_ssld)
+    return model
+
+
+def DSNet_small(pretrained=False, use_ssld=False, **kwargs):
+    model = MixVisionTransformer(
+        patch_size=16,
+        depth=[3, 4, 8, 3],
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(
+            nn.LayerNorm, eps=1e-6),
+        **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["DSNet_small"], use_ssld=use_ssld)
+    return model
+
+
+def DSNet_base(pretrained=False, use_ssld=False, **kwargs):
+    model = MixVisionTransformer(
+        patch_size=16,
+        depth=[3, 4, 28, 3],
+        mlp_ratio=4,
+        qkv_bias=True,
+        norm_layer=partial(
+            nn.LayerNorm, eps=1e-6),
+        **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["DSNet_base"], use_ssld=use_ssld)
+    return model
diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/efficientnet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/efficientnet.py
new file mode 100755
index 000000000..9217efd4f
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/efficientnet.py
@@ -0,0 +1,1028 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
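+
+# Note: the per-variant width/depth coefficients below drive compound
+# scaling. As an illustration (values from efficientnet_params): B2 has
+# width 1.1 and depth 1.2, so round_filters(32, ...) maps the 32-channel
+# stem to 32 * 1.1 = 35.2 -> 32 (nearest multiple of depth_divisor=8),
+# while round_repeats(2, ...) gives ceil(2 * 1.2) = 3 repeats.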
+ +# Code was based on https://github.com/lukemelas/EfficientNet-PyTorch +# reference: https://arxiv.org/abs/1905.11946 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +import math +import collections +import re +import copy + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "EfficientNetB0_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB0_small_pretrained.pdparams", + "EfficientNetB0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB0_pretrained.pdparams", + "EfficientNetB1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB1_pretrained.pdparams", + "EfficientNetB2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB2_pretrained.pdparams", + "EfficientNetB3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB3_pretrained.pdparams", + "EfficientNetB4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB4_pretrained.pdparams", + "EfficientNetB5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB5_pretrained.pdparams", + "EfficientNetB6": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB6_pretrained.pdparams", + "EfficientNetB7": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB7_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +GlobalParams = collections.namedtuple('GlobalParams', [ + 'batch_norm_momentum', + 'batch_norm_epsilon', + 'dropout_rate', + 'num_classes', + 'width_coefficient', + 'depth_coefficient', + 'depth_divisor', + 'depth_trunc', + 'min_depth', + 'drop_connect_rate', +]) + +BlockArgs = collections.namedtuple('BlockArgs', [ + 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', + 'expand_ratio', 'id_skip', 'stride', 'se_ratio' +]) + +GlobalParams.__new__.__defaults__ = (None, ) * len(GlobalParams._fields) +BlockArgs.__new__.__defaults__ = (None, ) * len(BlockArgs._fields) + + +def efficientnet_params(model_name): + """ Map EfficientNet model name to parameter coefficients. """ + params_dict = { + # Coefficients: width,depth,resolution,dropout + 'efficientnet-b0-small': (1.0, 1.0, 224, 0.2), + 'efficientnet-b0': (1.0, 1.0, 224, 0.2), + 'efficientnet-b1': (1.0, 1.1, 240, 0.2), + 'efficientnet-b2': (1.1, 1.2, 260, 0.3), + 'efficientnet-b3': (1.2, 1.4, 300, 0.3), + 'efficientnet-b4': (1.4, 1.8, 380, 0.4), + 'efficientnet-b5': (1.6, 2.2, 456, 0.4), + 'efficientnet-b6': (1.8, 2.6, 528, 0.5), + 'efficientnet-b7': (2.0, 3.1, 600, 0.5), + } + return params_dict[model_name] + + +def efficientnet(width_coefficient=None, + depth_coefficient=None, + dropout_rate=0.2, + drop_connect_rate=0.2): + """ Get block arguments according to parameter and coefficients. 
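+
+    Each entry of blocks_args below encodes one stage, e.g.
+    'r2_k5_s22_e6_i24_o40_se0.25' reads: 2 repeats, kernel size 5,
+    stride 2, expand ratio 6, 24 input filters, 40 output filters,
+    SE ratio 0.25 (see BlockDecoder for the exact grammar).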
""" + blocks_args = [ + 'r1_k3_s11_e1_i32_o16_se0.25', + 'r2_k3_s22_e6_i16_o24_se0.25', + 'r2_k5_s22_e6_i24_o40_se0.25', + 'r3_k3_s22_e6_i40_o80_se0.25', + 'r3_k5_s11_e6_i80_o112_se0.25', + 'r4_k5_s22_e6_i112_o192_se0.25', + 'r1_k3_s11_e6_i192_o320_se0.25', + ] + blocks_args = BlockDecoder.decode(blocks_args) + + global_params = GlobalParams( + batch_norm_momentum=0.99, + batch_norm_epsilon=1e-3, + dropout_rate=dropout_rate, + drop_connect_rate=drop_connect_rate, + num_classes=1000, + width_coefficient=width_coefficient, + depth_coefficient=depth_coefficient, + depth_divisor=8, + depth_trunc='ceil', + min_depth=None) + + return blocks_args, global_params + + +def get_model_params(model_name, override_params): + """ Get the block args and global params for a given model """ + if model_name.startswith('efficientnet'): + w, d, _, p = efficientnet_params(model_name) + blocks_args, global_params = efficientnet( + width_coefficient=w, depth_coefficient=d, dropout_rate=p) + else: + raise NotImplementedError('model name is not pre-defined: %s' % + model_name) + if override_params: + global_params = global_params._replace(**override_params) + return blocks_args, global_params + + +def round_filters(filters, global_params): + """ Calculate and round number of filters based on depth multiplier. """ + multiplier = global_params.width_coefficient + if not multiplier: + return filters + divisor = global_params.depth_divisor + min_depth = global_params.min_depth + filters *= multiplier + min_depth = min_depth or divisor + new_filters = max(min_depth, + int(filters + divisor / 2) // divisor * divisor) + if new_filters < 0.9 * filters: # prevent rounding by more than 10% + new_filters += divisor + return int(new_filters) + + +def round_repeats(repeats, global_params): + """ Round number of filters based on depth multiplier. """ + multiplier = global_params.depth_coefficient + if not multiplier: + return repeats + if global_params.depth_trunc == 'round': + return max(1, round(multiplier * repeats)) + else: + return int(math.ceil(multiplier * repeats)) + + +class BlockDecoder(object): + """ + Block Decoder, straight from the official TensorFlow repository. + """ + + @staticmethod + def _decode_block_string(block_string): + """ Gets a block through a string notation of arguments. 
""" + assert isinstance(block_string, str) + + ops = block_string.split('_') + options = {} + for op in ops: + splits = re.split(r'(\d.*)', op) + if len(splits) >= 2: + key, value = splits[:2] + options[key] = value + + # Check stride + cond_1 = ('s' in options and len(options['s']) == 1) + cond_2 = ((len(options['s']) == 2) and + (options['s'][0] == options['s'][1])) + assert (cond_1 or cond_2) + + return BlockArgs( + kernel_size=int(options['k']), + num_repeat=int(options['r']), + input_filters=int(options['i']), + output_filters=int(options['o']), + expand_ratio=int(options['e']), + id_skip=('noskip' not in block_string), + se_ratio=float(options['se']) if 'se' in options else None, + stride=[int(options['s'][0])]) + + @staticmethod + def _encode_block_string(block): + """Encodes a block to a string.""" + args = [ + 'r%d' % block.num_repeat, 'k%d' % block.kernel_size, 's%d%d' % + (block.strides[0], block.strides[1]), 'e%s' % block.expand_ratio, + 'i%d' % block.input_filters, 'o%d' % block.output_filters + ] + if 0 < block.se_ratio <= 1: + args.append('se%s' % block.se_ratio) + if block.id_skip is False: + args.append('noskip') + return '_'.join(args) + + @staticmethod + def decode(string_list): + """ + Decode a list of string notations to specify blocks in the network. + + string_list: list of strings, each string is a notation of block + return + list of BlockArgs namedtuples of block args + """ + assert isinstance(string_list, list) + blocks_args = [] + for block_string in string_list: + blocks_args.append(BlockDecoder._decode_block_string(block_string)) + return blocks_args + + @staticmethod + def encode(blocks_args): + """ + Encodes a list of BlockArgs to a list of strings. + + :param blocks_args: a list of BlockArgs namedtuples of block args + :return: a list of strings, each string is a notation of block + """ + block_strings = [] + for block in blocks_args: + block_strings.append(BlockDecoder._encode_block_string(block)) + return block_strings + + +def initial_type(name, use_bias=False): + param_attr = ParamAttr(name=name + "_weights") + if use_bias: + bias_attr = ParamAttr(name=name + "_offset") + else: + bias_attr = False + return param_attr, bias_attr + + +def init_batch_norm_layer(name="batch_norm"): + param_attr = ParamAttr(name=name + "_scale") + bias_attr = ParamAttr(name=name + "_offset") + return param_attr, bias_attr + + +def init_fc_layer(name="fc"): + param_attr = ParamAttr(name=name + "_weights") + bias_attr = ParamAttr(name=name + "_offset") + return param_attr, bias_attr + + +def cal_padding(img_size, stride, filter_size, dilation=1): + """Calculate padding size.""" + if img_size % stride == 0: + out_size = max(filter_size - stride, 0) + else: + out_size = max(filter_size - (img_size % stride), 0) + return out_size // 2, out_size - out_size // 2 + + +inp_shape = { + "b0_small": [224, 112, 112, 56, 28, 14, 14, 7], + "b0": [224, 112, 112, 56, 28, 14, 14, 7], + "b1": [240, 120, 120, 60, 30, 15, 15, 8], + "b2": [260, 130, 130, 65, 33, 17, 17, 9], + "b3": [300, 150, 150, 75, 38, 19, 19, 10], + "b4": [380, 190, 190, 95, 48, 24, 24, 12], + "b5": [456, 228, 228, 114, 57, 29, 29, 15], + "b6": [528, 264, 264, 132, 66, 33, 33, 17], + "b7": [600, 300, 300, 150, 75, 38, 38, 19] +} + + +def _drop_connect(inputs, prob, is_test): + if is_test: + output = inputs + else: + keep_prob = 1.0 - prob + inputs_shape = inputs.shape + random_tensor = keep_prob + paddle.rand( + shape=[inputs_shape[0], 1, 1, 1]) + binary_tensor = paddle.floor(random_tensor) + output = paddle.multiply(inputs, 
binary_tensor) / keep_prob + return output + + +class Conv2ds(TheseusLayer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride=1, + padding=0, + groups=None, + name="conv2d", + act=None, + use_bias=False, + padding_type=None, + model_name=None, + cur_stage=None): + super(Conv2ds, self).__init__() + assert act in [None, "swish", "sigmoid"] + self.act = act + + param_attr, bias_attr = initial_type(name=name, use_bias=use_bias) + + def get_padding(filter_size, stride=1, dilation=1): + padding = ((stride - 1) + dilation * (filter_size - 1)) // 2 + return padding + + self.need_crop = False + if padding_type == "SAME": + inps = 1 if model_name == None and cur_stage == None else inp_shape[ + model_name][cur_stage] + top_padding, bottom_padding = cal_padding(inps, stride, + filter_size) + left_padding, right_padding = cal_padding(inps, stride, + filter_size) + height_padding = bottom_padding + width_padding = right_padding + if top_padding != bottom_padding or left_padding != right_padding: + height_padding = top_padding + stride + width_padding = left_padding + stride + self.need_crop = True + padding = [height_padding, width_padding] + elif padding_type == "VALID": + height_padding = 0 + width_padding = 0 + padding = [height_padding, width_padding] + elif padding_type == "DYNAMIC": + padding = get_padding(filter_size, stride) + else: + padding = padding_type + + groups = 1 if groups is None else groups + self._conv = Conv2D( + input_channels, + output_channels, + filter_size, + groups=groups, + stride=stride, + # act=act, + padding=padding, + weight_attr=param_attr, + bias_attr=bias_attr) + + def forward(self, inputs): + x = self._conv(inputs) + if self.act == "swish": + x = F.swish(x) + elif self.act == "sigmoid": + x = F.sigmoid(x) + + if self.need_crop: + x = x[:, :, 1:, 1:] + return x + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + input_channels, + filter_size, + output_channels, + stride=1, + num_groups=1, + global_params=None, + padding_type="SAME", + conv_act=None, + bn_act="swish", + use_bn=True, + use_bias=False, + name=None, + conv_name=None, + bn_name=None, + model_name=None, + cur_stage=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2ds( + input_channels=input_channels, + output_channels=output_channels, + filter_size=filter_size, + stride=stride, + groups=num_groups, + act=conv_act, + padding_type=padding_type, + name=conv_name, + use_bias=use_bias, + model_name=model_name, + cur_stage=cur_stage) + self.use_bn = use_bn + if use_bn is True: + bn_name = name + bn_name + param_attr, bias_attr = init_batch_norm_layer(bn_name) + momentum = global_params.batch_norm_momentum + epsilon = global_params.batch_norm_epsilon + + self._bn = BatchNorm( + num_channels=output_channels, + act=bn_act, + momentum=momentum, + epsilon=epsilon, + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + "_variance", + param_attr=param_attr, + bias_attr=bias_attr) + + def forward(self, inputs): + if self.use_bn: + x = self._conv(inputs) + x = self._bn(x) + return x + else: + return self._conv(inputs) + + +class ExpandConvNorm(TheseusLayer): + def __init__(self, + input_channels, + block_args, + global_params, + padding_type, + name=None, + model_name=None, + cur_stage=None): + super(ExpandConvNorm, self).__init__() + + self.oup = block_args.input_filters * block_args.expand_ratio + self.expand_ratio = block_args.expand_ratio + + if self.expand_ratio != 1: + self._conv = ConvBNLayer( + input_channels, + 1, + self.oup, + 
global_params=global_params, + bn_act=None, + padding_type=padding_type, + name=name, + conv_name=name + "_expand_conv", + bn_name="_bn0", + model_name=model_name, + cur_stage=cur_stage) + + def forward(self, inputs): + if self.expand_ratio != 1: + return self._conv(inputs) + else: + return inputs + + +class DepthwiseConvNorm(TheseusLayer): + def __init__(self, + input_channels, + block_args, + global_params, + padding_type, + name=None, + model_name=None, + cur_stage=None): + super(DepthwiseConvNorm, self).__init__() + + self.k = block_args.kernel_size + self.s = block_args.stride + if isinstance(self.s, list) or isinstance(self.s, tuple): + self.s = self.s[0] + oup = block_args.input_filters * block_args.expand_ratio + + self._conv = ConvBNLayer( + input_channels, + self.k, + oup, + self.s, + num_groups=input_channels, + global_params=global_params, + bn_act=None, + padding_type=padding_type, + name=name, + conv_name=name + "_depthwise_conv", + bn_name="_bn1", + model_name=model_name, + cur_stage=cur_stage) + + def forward(self, inputs): + return self._conv(inputs) + + +class ProjectConvNorm(TheseusLayer): + def __init__(self, + input_channels, + block_args, + global_params, + padding_type, + name=None, + model_name=None, + cur_stage=None): + super(ProjectConvNorm, self).__init__() + + self.final_oup = block_args.output_filters + + self._conv = ConvBNLayer( + input_channels, + 1, + self.final_oup, + global_params=global_params, + bn_act=None, + padding_type=padding_type, + name=name, + conv_name=name + "_project_conv", + bn_name="_bn2", + model_name=model_name, + cur_stage=cur_stage) + + def forward(self, inputs): + return self._conv(inputs) + + +class SEBlock(TheseusLayer): + def __init__(self, + input_channels, + num_squeezed_channels, + oup, + padding_type, + name=None, + model_name=None, + cur_stage=None): + super(SEBlock, self).__init__() + + self._pool = AdaptiveAvgPool2D(1) + self._conv1 = Conv2ds( + input_channels, + num_squeezed_channels, + 1, + use_bias=True, + padding_type=padding_type, + act="swish", + name=name + "_se_reduce") + + self._conv2 = Conv2ds( + num_squeezed_channels, + oup, + 1, + act="sigmoid", + use_bias=True, + padding_type=padding_type, + name=name + "_se_expand") + + def forward(self, inputs): + x = self._pool(inputs) + x = self._conv1(x) + x = self._conv2(x) + out = paddle.multiply(inputs, x) + return out + + +class MbConvBlock(TheseusLayer): + def __init__(self, + input_channels, + block_args, + global_params, + padding_type, + use_se, + name=None, + drop_connect_rate=None, + model_name=None, + cur_stage=None): + super(MbConvBlock, self).__init__() + + oup = block_args.input_filters * block_args.expand_ratio + self.block_args = block_args + self.has_se = use_se and (block_args.se_ratio is not None) and ( + 0 < block_args.se_ratio <= 1) + self.id_skip = block_args.id_skip + self.expand_ratio = block_args.expand_ratio + self.drop_connect_rate = drop_connect_rate + + if self.expand_ratio != 1: + self._ecn = ExpandConvNorm( + input_channels, + block_args, + global_params, + padding_type=padding_type, + name=name, + model_name=model_name, + cur_stage=cur_stage) + + self._dcn = DepthwiseConvNorm( + input_channels * block_args.expand_ratio, + block_args, + global_params, + padding_type=padding_type, + name=name, + model_name=model_name, + cur_stage=cur_stage) + + if self.has_se: + num_squeezed_channels = max( + 1, int(block_args.input_filters * block_args.se_ratio)) + self._se = SEBlock( + input_channels * block_args.expand_ratio, + num_squeezed_channels, + oup, + 
padding_type=padding_type, + name=name, + model_name=model_name, + cur_stage=cur_stage) + + self._pcn = ProjectConvNorm( + input_channels * block_args.expand_ratio, + block_args, + global_params, + padding_type=padding_type, + name=name, + model_name=model_name, + cur_stage=cur_stage) + + self.final_oup = self._pcn.final_oup + + def forward(self, inputs): + x = inputs + if self.expand_ratio != 1: + x = self._ecn(x) + x = F.swish(x) + + x = self._dcn(x) + x = F.swish(x) + if self.has_se: + x = self._se(x) + x = self._pcn(x) + + if self.id_skip and \ + self.block_args.stride == 1 and \ + self.block_args.input_filters == self.block_args.output_filters: + if self.drop_connect_rate: + x = _drop_connect(x, self.drop_connect_rate, not self.training) + x = paddle.add(x, inputs) + return x + + +class ConvStemNorm(TheseusLayer): + def __init__(self, + input_channels, + padding_type, + _global_params, + name=None, + model_name=None, + fix_stem=False, + cur_stage=None): + super(ConvStemNorm, self).__init__() + + output_channels = 32 if fix_stem else round_filters(32, _global_params) + self._conv = ConvBNLayer( + input_channels, + filter_size=3, + output_channels=output_channels, + stride=2, + global_params=_global_params, + bn_act=None, + padding_type=padding_type, + name="", + conv_name="_conv_stem", + bn_name="_bn0", + model_name=model_name, + cur_stage=cur_stage) + + def forward(self, inputs): + return self._conv(inputs) + + +class ExtractFeatures(TheseusLayer): + def __init__(self, + input_channels, + _block_args, + _global_params, + padding_type, + use_se, + model_name=None, + fix_stem=False): + super(ExtractFeatures, self).__init__() + + self._global_params = _global_params + + self._conv_stem = ConvStemNorm( + input_channels, + padding_type=padding_type, + _global_params=_global_params, + model_name=model_name, + fix_stem=fix_stem, + cur_stage=0) + + self.block_args_copy = copy.deepcopy(_block_args) + idx = 0 + block_size = 0 + for block_arg in self.block_args_copy: + block_arg = block_arg._replace( + input_filters=round_filters(block_arg.input_filters, + _global_params), + output_filters=round_filters(block_arg.output_filters, + _global_params), + num_repeat=round_repeats(block_arg.num_repeat, _global_params)) + block_size += 1 + for _ in range(block_arg.num_repeat - 1): + block_size += 1 + + self.final_oup = None + self.conv_seq = [] + cur_stage = 1 + for block_idx, block_args in enumerate(_block_args): + if not (fix_stem and block_idx == 0): + block_args = block_args._replace(input_filters=round_filters( + block_args.input_filters, _global_params)) + block_args = block_args._replace( + output_filters=round_filters(block_args.output_filters, + _global_params), + num_repeat=round_repeats(block_args.num_repeat, + _global_params)) + + drop_connect_rate = self._global_params.drop_connect_rate + if drop_connect_rate: + drop_connect_rate *= float(idx) / block_size + + _mc_block = self.add_sublayer( + "_blocks." + str(idx) + ".", + MbConvBlock( + block_args.input_filters, + block_args=block_args, + global_params=_global_params, + padding_type=padding_type, + use_se=use_se, + name="_blocks." 
+ str(idx) + ".", + drop_connect_rate=drop_connect_rate, + model_name=model_name, + cur_stage=cur_stage)) + self.conv_seq.append(_mc_block) + self.final_oup = _mc_block.final_oup + idx += 1 + if block_args.num_repeat > 1: + block_args = block_args._replace( + input_filters=block_args.output_filters, stride=1) + for _ in range(block_args.num_repeat - 1): + drop_connect_rate = self._global_params.drop_connect_rate + if drop_connect_rate: + drop_connect_rate *= float(idx) / block_size + _mc_block = self.add_sublayer( + "block." + str(idx) + ".", + MbConvBlock( + block_args.input_filters, + block_args, + global_params=_global_params, + padding_type=padding_type, + use_se=use_se, + name="_blocks." + str(idx) + ".", + drop_connect_rate=drop_connect_rate, + model_name=model_name, + cur_stage=cur_stage)) + self.conv_seq.append(_mc_block) + self.final_oup = _mc_block.final_oup + idx += 1 + cur_stage += 1 + + def forward(self, inputs): + x = self._conv_stem(inputs) + x = F.swish(x) + for _mc_block in self.conv_seq: + x = _mc_block(x) + return x + + +class EfficientNet(TheseusLayer): + def __init__(self, + block_args, + global_params, + name="b0", + padding_type="SAME", + use_se=True, + fix_stem=False, + num_features=None, + class_num=1000): + super(EfficientNet, self).__init__() + + self.name = name + self.fix_stem = fix_stem + self._block_args = block_args + self._global_params = global_params + self.padding_type = padding_type + self.use_se = use_se + + self._ef = ExtractFeatures( + 3, + self._block_args, + self._global_params, + self.padding_type, + self.use_se, + model_name=self.name, + fix_stem=self.fix_stem) + + output_channels = num_features or round_filters(1280, + self._global_params) + self._conv = ConvBNLayer( + self._ef.final_oup, + 1, + output_channels, + global_params=self._global_params, + bn_act="swish", + padding_type=self.padding_type, + name="", + conv_name="_conv_head", + bn_name="_bn1", + model_name=self.name, + cur_stage=7) + self._pool = AdaptiveAvgPool2D(1) + + if self._global_params.dropout_rate: + self._drop = Dropout( + p=self._global_params.dropout_rate, mode="upscale_in_train") + + param_attr, bias_attr = init_fc_layer("_fc") + self._fc = Linear( + output_channels, + class_num, + weight_attr=param_attr, + bias_attr=bias_attr) + + def forward(self, inputs): + x = self._ef(inputs) + x = self._conv(x) + x = self._pool(x) + if self._global_params.dropout_rate: + x = self._drop(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self._fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def EfficientNetB0_small(padding_type='DYNAMIC', + override_params=None, + use_se=False, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b0-small", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b0', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB0_small"]) + return model + + +def EfficientNetB0(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b0", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b0', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB0"]) + return model + + +def EfficientNetB1(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b1", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b1', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB1"]) + return model + + +def EfficientNetB2(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b2", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b2', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB2"]) + return model + + +def EfficientNetB3(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b3", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b3', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB3"]) + return model + + +def EfficientNetB4(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b4", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b4', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB4"]) + return model + + +def EfficientNetB5(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b5", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b5', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB5"]) + return model + + +def EfficientNetB6(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b6", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b6', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB6"]) + return model + + +def EfficientNetB7(padding_type='SAME', + override_params=None, + use_se=True, + 
pretrained=False,
+                   use_ssld=False,
+                   **kwargs):
+    block_args, global_params = get_model_params("efficientnet-b7",
+                                                 override_params)
+    model = EfficientNet(
+        block_args,
+        global_params,
+        name='b7',
+        padding_type=padding_type,
+        use_se=use_se,
+        **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB7"])
+    return model
diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/efficientnet_v2.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/efficientnet_v2.py
new file mode 100755
index 000000000..f620d895d
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/efficientnet_v2.py
@@ -0,0 +1,994 @@
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Code was based on https://github.com/lukemelas/EfficientNet-PyTorch
+# reference: https://arxiv.org/abs/2104.00298
+
+import math
+import re
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.nn.initializer import Constant, Normal, Uniform
+from paddle.regularizer import L2Decay
+
+from ....utils.config import AttrDict
+from ....utils.save_load import load_dygraph_pretrain
+
+MODEL_URLS = {
+    "EfficientNetV2_S":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetV2_S_pretrained.pdparams",
+    "EfficientNetV2_M":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetV2_M_pretrained.pdparams",
+    "EfficientNetV2_L":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetV2_L_pretrained.pdparams",
+    "EfficientNetV2_XL":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetV2_XL_pretrained.pdparams",
+}
+
+__all__ = list(MODEL_URLS.keys())
+
+inp_shape = {
+    "efficientnetv2-s": [384, 192, 192, 96, 48, 24, 24, 12],
+    "efficientnetv2-m": [384, 192, 192, 96, 48, 24, 24, 12],
+    "efficientnetv2-l": [384, 192, 192, 96, 48, 24, 24, 12],
+    "efficientnetv2-xl": [384, 192, 192, 96, 48, 24, 24, 12],
+}
+
+
+def cal_padding(img_size, stride, kernel_size):
+    """Calculate padding size."""
+    if img_size % stride == 0:
+        out_size = max(kernel_size - stride, 0)
+    else:
+        out_size = max(kernel_size - (img_size % stride), 0)
+    return out_size // 2, out_size - out_size // 2
+
+
+class Conv2ds(nn.Layer):
+    """Customized Conv2D with TensorFlow's padding style
+
+    Args:
+        input_channels (int): input channels
+        output_channels (int): output channels
+        kernel_size (int): filter size
+        stride (int, optional): stride. Defaults to 1.
+        padding (int, optional): padding. Defaults to 0.
+        groups (int, optional): groups. Defaults to None.
+        act (str, optional): act. Defaults to None.
+        use_bias (bool, optional): whether to add a bias. Defaults to None.
+        padding_type (str, optional): padding_type. Defaults to None.
+        model_name (str, optional): model name. Defaults to None.
+        cur_stage (int, optional): current stage. Defaults to None.
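+
+        Note: with padding_type="SAME", the expected input resolution of the
+        current stage (inp_shape[model_name][cur_stage]) is used to emulate
+        TensorFlow's SAME padding. If the required padding is asymmetric,
+        the layer over-pads by one stride and crops the extra top row and
+        left column in forward (need_crop).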
+ + Returns: + nn.Layer: Customed Conv2D instance + """ + + def __init__(self, + input_channels: int, + output_channels: int, + kernel_size: int, + stride=1, + padding=0, + groups=None, + act=None, + use_bias=None, + padding_type=None, + model_name=None, + cur_stage=None): + super(Conv2ds, self).__init__() + assert act in [None, "swish", "sigmoid"] + self._act = act + + def get_padding(kernel_size, stride=1, dilation=1): + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding + + inps = inp_shape[model_name][cur_stage] + self.need_crop = False + if padding_type == "SAME": + top_padding, bottom_padding = cal_padding(inps, stride, + kernel_size) + left_padding, right_padding = cal_padding(inps, stride, + kernel_size) + height_padding = bottom_padding + width_padding = right_padding + if top_padding != bottom_padding or left_padding != right_padding: + height_padding = top_padding + stride + width_padding = left_padding + stride + self.need_crop = True + padding = [height_padding, width_padding] + elif padding_type == "VALID": + height_padding = 0 + width_padding = 0 + padding = [height_padding, width_padding] + elif padding_type == "DYNAMIC": + padding = get_padding(kernel_size, stride) + else: + padding = padding_type + + groups = 1 if groups is None else groups + self._conv = nn.Conv2D( + input_channels, + output_channels, + kernel_size, + groups=groups, + stride=stride, + padding=padding, + weight_attr=None, + bias_attr=use_bias + if not use_bias else ParamAttr(regularizer=L2Decay(0.0))) + + def forward(self, inputs): + x = self._conv(inputs) + if self._act == "swish": + x = F.swish(x) + elif self._act == "sigmoid": + x = F.sigmoid(x) + + if self.need_crop: + x = x[:, :, 1:, 1:] + return x + + +class BlockDecoder(object): + """Block Decoder for readability.""" + + def _decode_block_string(self, block_string): + """Gets a block through a string notation of arguments.""" + assert isinstance(block_string, str) + ops = block_string.split('_') + options = AttrDict() + for op in ops: + splits = re.split(r'(\d.*)', op) + if len(splits) >= 2: + key, value = splits[:2] + options[key] = value + + t = AttrDict( + kernel_size=int(options['k']), + num_repeat=int(options['r']), + in_channels=int(options['i']), + out_channels=int(options['o']), + expand_ratio=int(options['e']), + se_ratio=float(options['se']) if 'se' in options else None, + strides=int(options['s']), + conv_type=int(options['c']) if 'c' in options else 0, ) + return t + + def _encode_block_string(self, block): + """Encodes a block to a string.""" + args = [ + 'r%d' % block.num_repeat, + 'k%d' % block.kernel_size, + 's%d' % block.strides, + 'e%s' % block.expand_ratio, + 'i%d' % block.in_channels, + 'o%d' % block.out_channels, + 'c%d' % block.conv_type, + 'f%d' % block.fused_conv, + ] + if block.se_ratio > 0 and block.se_ratio <= 1: + args.append('se%s' % block.se_ratio) + return '_'.join(args) + + def decode(self, string_list): + """Decodes a list of string notations to specify blocks inside the network. + + Args: + string_list: a list of strings, each string is a notation of block. + + Returns: + A list of namedtuples to represent blocks arguments. + """ + assert isinstance(string_list, list) + blocks_args = [] + for block_string in string_list: + blocks_args.append(self._decode_block_string(block_string)) + return blocks_args + + def encode(self, blocks_args): + """Encodes a list of Blocks to a list of strings. + + Args: + blocks_args: A list of namedtuples to represent blocks arguments. 
+ Returns: + a list of strings, each string is a notation of block. + """ + block_strings = [] + for block in blocks_args: + block_strings.append(self._encode_block_string(block)) + return block_strings + + +#################### EfficientNet V2 configs #################### +v2_base_block = [ # The baseline config for v2 models. + "r1_k3_s1_e1_i32_o16_c1", + "r2_k3_s2_e4_i16_o32_c1", + "r2_k3_s2_e4_i32_o48_c1", + "r3_k3_s2_e4_i48_o96_se0.25", + "r5_k3_s1_e6_i96_o112_se0.25", + "r8_k3_s2_e6_i112_o192_se0.25", +] + +v2_s_block = [ # about base * (width1.4, depth1.8) + "r2_k3_s1_e1_i24_o24_c1", + "r4_k3_s2_e4_i24_o48_c1", + "r4_k3_s2_e4_i48_o64_c1", + "r6_k3_s2_e4_i64_o128_se0.25", + "r9_k3_s1_e6_i128_o160_se0.25", + "r15_k3_s2_e6_i160_o256_se0.25", +] + +v2_m_block = [ # about base * (width1.6, depth2.2) + "r3_k3_s1_e1_i24_o24_c1", + "r5_k3_s2_e4_i24_o48_c1", + "r5_k3_s2_e4_i48_o80_c1", + "r7_k3_s2_e4_i80_o160_se0.25", + "r14_k3_s1_e6_i160_o176_se0.25", + "r18_k3_s2_e6_i176_o304_se0.25", + "r5_k3_s1_e6_i304_o512_se0.25", +] + +v2_l_block = [ # about base * (width2.0, depth3.1) + "r4_k3_s1_e1_i32_o32_c1", + "r7_k3_s2_e4_i32_o64_c1", + "r7_k3_s2_e4_i64_o96_c1", + "r10_k3_s2_e4_i96_o192_se0.25", + "r19_k3_s1_e6_i192_o224_se0.25", + "r25_k3_s2_e6_i224_o384_se0.25", + "r7_k3_s1_e6_i384_o640_se0.25", +] + +v2_xl_block = [ # only for 21k pretraining. + "r4_k3_s1_e1_i32_o32_c1", + "r8_k3_s2_e4_i32_o64_c1", + "r8_k3_s2_e4_i64_o96_c1", + "r16_k3_s2_e4_i96_o192_se0.25", + "r24_k3_s1_e6_i192_o256_se0.25", + "r32_k3_s2_e6_i256_o512_se0.25", + "r8_k3_s1_e6_i512_o640_se0.25", +] +efficientnetv2_params = { + # params: (block, width, depth, dropout) + "efficientnetv2-s": + (v2_s_block, 1.0, 1.0, np.linspace(0.1, 0.3, 4).tolist()), + "efficientnetv2-m": (v2_m_block, 1.0, 1.0, 0.3), + "efficientnetv2-l": (v2_l_block, 1.0, 1.0, 0.4), + "efficientnetv2-xl": (v2_xl_block, 1.0, 1.0, 0.4), +} + + +def efficientnetv2_config(model_name: str): + """EfficientNetV2 model config.""" + block, width, depth, dropout = efficientnetv2_params[model_name] + + cfg = AttrDict(model=AttrDict( + model_name=model_name, + blocks_args=BlockDecoder().decode(block), + width_coefficient=width, + depth_coefficient=depth, + dropout_rate=dropout, + feature_size=1280, + bn_momentum=0.9, + bn_epsilon=1e-3, + depth_divisor=8, + min_depth=8, + act_fn="silu", + survival_prob=0.8, + local_pooling=False, + conv_dropout=0, + num_classes=1000)) + return cfg + + +def get_model_config(model_name: str): + """Main entry for model name to config.""" + if model_name.startswith("efficientnetv2-"): + return efficientnetv2_config(model_name) + raise ValueError(f"Unknown model_name {model_name}") + + +################################################################################ + + +def round_filters(filters, + width_coefficient, + depth_divisor, + min_depth, + skip=False): + """Round number of filters based on depth multiplier.""" + multiplier = width_coefficient + divisor = depth_divisor + min_depth = min_depth + if skip or not multiplier: + return filters + + filters *= multiplier + min_depth = min_depth or divisor + new_filters = max(min_depth, + int(filters + divisor / 2) // divisor * divisor) + return int(new_filters) + + +def round_repeats(repeats, multiplier, skip=False): + """Round number of filters based on depth multiplier.""" + if skip or not multiplier: + return repeats + return int(math.ceil(multiplier * repeats)) + + +def activation_fn(act_fn: str): + """Customized non-linear activation type.""" + if not act_fn: + return nn.Silu() + elif 
act_fn in ("silu", "swish"): + return nn.Swish() + elif act_fn == "relu": + return nn.ReLU() + elif act_fn == "relu6": + return nn.ReLU6() + elif act_fn == "elu": + return nn.ELU() + elif act_fn == "leaky_relu": + return nn.LeakyReLU() + elif act_fn == "selu": + return nn.SELU() + elif act_fn == "mish": + return nn.Mish() + else: + raise ValueError("Unsupported act_fn {}".format(act_fn)) + + +def drop_path(x, training=False, survival_prob=1.0): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if not training: + return x + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + keep_prob = paddle.to_tensor(survival_prob, dtype=x.dtype) + random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class SE(nn.Layer): + """Squeeze-and-excitation layer. + + Args: + local_pooling (bool): local_pooling + act_fn (str): act_fn + in_channels (int): in_channels + se_channels (int): se_channels + out_channels (int): out_channels + cur_stage (int): cur_stage + padding_type (str): padding_type + model_name (str): model_name + """ + + def __init__(self, + local_pooling: bool, + act_fn: str, + in_channels: int, + se_channels: int, + out_channels: int, + cur_stage: int, + padding_type: str, + model_name: str): + super(SE, self).__init__() + + self._local_pooling = local_pooling + self._act = activation_fn(act_fn) + + # Squeeze and Excitation layer. + self._se_reduce = Conv2ds( + in_channels, + se_channels, + 1, + stride=1, + padding_type=padding_type, + model_name=model_name, + cur_stage=cur_stage) + self._se_expand = Conv2ds( + se_channels, + out_channels, + 1, + stride=1, + padding_type=padding_type, + model_name=model_name, + cur_stage=cur_stage) + + def forward(self, x): + if self._local_pooling: + se_tensor = F.adaptive_avg_pool2d(x, output_size=1) + else: + se_tensor = paddle.mean(x, axis=[2, 3], keepdim=True) + se_tensor = self._se_expand(self._act(self._se_reduce(se_tensor))) + return F.sigmoid(se_tensor) * x + + +class MBConvBlock(nn.Layer): + """A class of MBConv: Mobile Inverted Residual Bottleneck. 
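+
+    Expands channels with a 1x1 conv, applies a depthwise conv and optional
+    squeeze-and-excitation, then projects back with a 1x1 conv; a skip
+    connection with stochastic depth is used when stride is 1 and the
+    in/out channel counts match.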
+ + Args: + se_ratio (int): se_ratio + in_channels (int): in_channels + expand_ratio (int): expand_ratio + kernel_size (int): kernel_size + strides (int): strides + out_channels (int): out_channels + bn_momentum (float): bn_momentum + bn_epsilon (float): bn_epsilon + local_pooling (bool): local_pooling + conv_dropout (float): conv_dropout + cur_stage (int): cur_stage + padding_type (str): padding_type + model_name (str): model_name + """ + + def __init__(self, + se_ratio: int, + in_channels: int, + expand_ratio: int, + kernel_size: int, + strides: int, + out_channels: int, + bn_momentum: float, + bn_epsilon: float, + local_pooling: bool, + conv_dropout: float, + cur_stage: int, + padding_type: str, + model_name: str): + super(MBConvBlock, self).__init__() + + self.se_ratio = se_ratio + self.in_channels = in_channels + self.expand_ratio = expand_ratio + self.kernel_size = kernel_size + self.strides = strides + self.out_channels = out_channels + + self.bn_momentum = bn_momentum + self.bn_epsilon = bn_epsilon + + self._local_pooling = local_pooling + self.act_fn = None + self.conv_dropout = conv_dropout + + self._act = activation_fn(None) + self._has_se = (self.se_ratio is not None and 0 < self.se_ratio <= 1) + """Builds block according to the arguments.""" + expand_channels = self.in_channels * self.expand_ratio + kernel_size = self.kernel_size + + # Expansion phase. Called if not using fused convolutions and expansion + # phase is necessary. + if self.expand_ratio != 1: + self._expand_conv = Conv2ds( + self.in_channels, + expand_channels, + 1, + stride=1, + use_bias=False, + padding_type=padding_type, + model_name=model_name, + cur_stage=cur_stage) + self._norm0 = nn.BatchNorm2D( + expand_channels, + self.bn_momentum, + self.bn_epsilon, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + # Depth-wise convolution phase. Called if not using fused convolutions. + self._depthwise_conv = Conv2ds( + expand_channels, + expand_channels, + kernel_size, + padding=kernel_size // 2, + stride=self.strides, + groups=expand_channels, + use_bias=False, + padding_type=padding_type, + model_name=model_name, + cur_stage=cur_stage) + + self._norm1 = nn.BatchNorm2D( + expand_channels, + self.bn_momentum, + self.bn_epsilon, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + if self._has_se: + num_reduced_filters = max(1, int(self.in_channels * self.se_ratio)) + self._se = SE(self._local_pooling, None, expand_channels, + num_reduced_filters, expand_channels, cur_stage, + padding_type, model_name) + else: + self._se = None + + # Output phase. + self._project_conv = Conv2ds( + expand_channels, + self.out_channels, + 1, + stride=1, + use_bias=False, + padding_type=padding_type, + model_name=model_name, + cur_stage=cur_stage) + self._norm2 = nn.BatchNorm2D( + self.out_channels, + self.bn_momentum, + self.bn_epsilon, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.drop_out = nn.Dropout(self.conv_dropout) + + def residual(self, inputs, x, survival_prob): + if (self.strides == 1 and self.in_channels == self.out_channels): + # Apply only if skip connection presents. + if survival_prob: + x = drop_path(x, self.training, survival_prob) + x = paddle.add(x, inputs) + + return x + + def forward(self, inputs, survival_prob=None): + """Implementation of call(). + + Args: + inputs: the inputs tensor. + survival_prob: float, between 0 to 1, drop connect rate. 
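+                When the skip connection applies (stride 1, matching in/out
+                channels), the residual branch is dropped with probability
+                1 - survival_prob during training.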
+ + Returns: + A output tensor. + """ + x = inputs + if self.expand_ratio != 1: + x = self._act(self._norm0(self._expand_conv(x))) + + x = self._act(self._norm1(self._depthwise_conv(x))) + + if self.conv_dropout and self.expand_ratio > 1: + x = self.drop_out(x) + + if self._se: + x = self._se(x) + + x = self._norm2(self._project_conv(x)) + x = self.residual(inputs, x, survival_prob) + + return x + + +class FusedMBConvBlock(MBConvBlock): + """Fusing the proj conv1x1 and depthwise_conv into a conv2d.""" + + def __init__(self, se_ratio, in_channels, expand_ratio, kernel_size, + strides, out_channels, bn_momentum, bn_epsilon, local_pooling, + conv_dropout, cur_stage, padding_type, model_name): + """Builds block according to the arguments.""" + super(MBConvBlock, self).__init__() + self.se_ratio = se_ratio + self.in_channels = in_channels + self.expand_ratio = expand_ratio + self.kernel_size = kernel_size + self.strides = strides + self.out_channels = out_channels + + self.bn_momentum = bn_momentum + self.bn_epsilon = bn_epsilon + + self._local_pooling = local_pooling + self.act_fn = None + self.conv_dropout = conv_dropout + + self._act = activation_fn(None) + self._has_se = (self.se_ratio is not None and 0 < self.se_ratio <= 1) + + expand_channels = self.in_channels * self.expand_ratio + kernel_size = self.kernel_size + if self.expand_ratio != 1: + # Expansion phase: + self._expand_conv = Conv2ds( + self.in_channels, + expand_channels, + kernel_size, + padding=kernel_size // 2, + stride=self.strides, + use_bias=False, + padding_type=padding_type, + model_name=model_name, + cur_stage=cur_stage) + self._norm0 = nn.BatchNorm2D( + expand_channels, + self.bn_momentum, + self.bn_epsilon, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + if self._has_se: + num_reduced_filters = max(1, int(self.in_channels * self.se_ratio)) + self._se = SE(self._local_pooling, None, expand_channels, + num_reduced_filters, expand_channels, cur_stage, + padding_type, model_name) + else: + self._se = None + + # Output phase: + self._project_conv = Conv2ds( + expand_channels, + self.out_channels, + 1 if (self.expand_ratio != 1) else kernel_size, + padding=(1 if (self.expand_ratio != 1) else kernel_size) // 2, + stride=1 if (self.expand_ratio != 1) else self.strides, + use_bias=False, + padding_type=padding_type, + model_name=model_name, + cur_stage=cur_stage) + self._norm1 = nn.BatchNorm2D( + self.out_channels, + self.bn_momentum, + self.bn_epsilon, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.drop_out = nn.Dropout(conv_dropout) + + def forward(self, inputs, survival_prob=None): + """Implementation of call(). + + Args: + inputs: the inputs tensor. + training: boolean, whether the model is constructed for training. + survival_prob: float, between 0 to 1, drop connect rate. + + Returns: + A output tensor. + """ + x = inputs + if self.expand_ratio != 1: + x = self._act(self._norm0(self._expand_conv(x))) + + if self.conv_dropout and self.expand_ratio > 1: + x = self.drop_out(x) + + if self._se: + x = self._se(x) + + x = self._norm1(self._project_conv(x)) + if self.expand_ratio == 1: + x = self._act(x) # add act if no expansion. 
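+        # NOTE: the fused block folds the depthwise conv into the strided
+        # kxk expand conv above, hence no separate depthwise stage here.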
+
+        x = self.residual(inputs, x, survival_prob)
+        return x
+
+
+class Stem(nn.Layer):
+    """Stem layer at the beginning of the network."""
+
+    def __init__(self, width_coefficient, depth_divisor, min_depth, skip,
+                 bn_momentum, bn_epsilon, act_fn, stem_channels, cur_stage,
+                 padding_type, model_name):
+        super(Stem, self).__init__()
+        self._conv_stem = Conv2ds(
+            3,
+            round_filters(stem_channels, width_coefficient, depth_divisor,
+                          min_depth, skip),
+            3,
+            padding=1,
+            stride=2,
+            use_bias=False,
+            padding_type=padding_type,
+            model_name=model_name,
+            cur_stage=cur_stage)
+        self._norm = nn.BatchNorm2D(
+            round_filters(stem_channels, width_coefficient, depth_divisor,
+                          min_depth, skip),
+            bn_momentum,
+            bn_epsilon,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        self._act = activation_fn(act_fn)
+
+    def forward(self, inputs):
+        return self._act(self._norm(self._conv_stem(inputs)))
+
+
+class Head(nn.Layer):
+    """Head layer for network outputs."""
+
+    def __init__(self,
+                 in_channels,
+                 feature_size,
+                 bn_momentum,
+                 bn_epsilon,
+                 act_fn,
+                 dropout_rate,
+                 local_pooling,
+                 width_coefficient,
+                 depth_divisor,
+                 min_depth,
+                 skip=False):
+        super(Head, self).__init__()
+        self.in_channels = in_channels
+        self.feature_size = feature_size
+        self.bn_momentum = bn_momentum
+        self.bn_epsilon = bn_epsilon
+        self.dropout_rate = dropout_rate
+        self._local_pooling = local_pooling
+        self._conv_head = nn.Conv2D(
+            in_channels,
+            round_filters(self.feature_size or 1280, width_coefficient,
+                          depth_divisor, min_depth, skip),
+            kernel_size=1,
+            stride=1,
+            bias_attr=False)
+        self._norm = nn.BatchNorm2D(
+            round_filters(self.feature_size or 1280, width_coefficient,
+                          depth_divisor, min_depth, skip),
+            self.bn_momentum,
+            self.bn_epsilon,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        self._act = activation_fn(act_fn)
+
+        self._avg_pooling = nn.AdaptiveAvgPool2D(output_size=1)
+        # No fc layer is attached inside the head by default; forward() only
+        # uses it when one has been set, so keep an explicit placeholder.
+        self._fc = None
+
+        if isinstance(self.dropout_rate,
+                      (list, tuple)) or self.dropout_rate > 0:
+            self._dropout = nn.Dropout(self.dropout_rate[0] if isinstance(
+                self.dropout_rate, (list, tuple)) else self.dropout_rate)
+        else:
+            self._dropout = None
+
+    def forward(self, x):
+        """Call the layer."""
+        outputs = self._act(self._norm(self._conv_head(x)))
+
+        if self._local_pooling:
+            outputs = F.adaptive_avg_pool2d(outputs, output_size=1)
+            if self._dropout:
+                outputs = self._dropout(outputs)
+            if self._fc:
+                outputs = paddle.squeeze(outputs, axis=[2, 3])
+                outputs = self._fc(outputs)
+        else:
+            outputs = self._avg_pooling(outputs)
+            if self._dropout:
+                outputs = self._dropout(outputs)
+        return paddle.flatten(outputs, start_axis=1)
+
+
+class EfficientNetV2(nn.Layer):
+    """EfficientNetV2 model.
+
+    Reference: https://arxiv.org/abs/1807.11626
+    """
+
+    def __init__(self,
+                 model_name,
+                 blocks_args=None,
+                 mconfig=None,
+                 include_top=True,
+                 class_num=1000,
+                 padding_type="SAME"):
+        """Initializes an `EfficientNetV2` instance.
+
+        Args:
+            model_name: A string of model name.
+            mconfig: A dict of model configurations or a string of hparams.
+        Raises:
+            ValueError: when blocks_args is not specified as a list.
+        """
+        super(EfficientNetV2, self).__init__()
+        self.blocks_args = blocks_args
+        self.mconfig = mconfig
+        """Builds a model."""
+        self._blocks = nn.LayerList()
+
+        cur_stage = 0
+        # Stem part.
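+        # The stem is a strided 3x3 conv + BN + activation mapping the RGB
+        # input to the width of the first block.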
+ self._stem = Stem( + self.mconfig.width_coefficient, + self.mconfig.depth_divisor, + self.mconfig.min_depth, + False, + self.mconfig.bn_momentum, + self.mconfig.bn_epsilon, + self.mconfig.act_fn, + stem_channels=self.blocks_args[0].in_channels, + cur_stage=cur_stage, + padding_type=padding_type, + model_name=model_name) + cur_stage += 1 + + # Builds blocks. + for block_args in self.blocks_args: + assert block_args.num_repeat > 0 + # Update block input and output filters based on depth multiplier. + in_channels = round_filters( + block_args.in_channels, self.mconfig.width_coefficient, + self.mconfig.depth_divisor, self.mconfig.min_depth, False) + out_channels = round_filters( + block_args.out_channels, self.mconfig.width_coefficient, + self.mconfig.depth_divisor, self.mconfig.min_depth, False) + + repeats = round_repeats(block_args.num_repeat, + self.mconfig.depth_coefficient) + block_args.update( + dict( + in_channels=in_channels, + out_channels=out_channels, + num_repeat=repeats)) + + # The first block needs to take care of stride and filter size increase. + conv_block = { + 0: MBConvBlock, + 1: FusedMBConvBlock + }[block_args.conv_type] + self._blocks.append( + conv_block(block_args.se_ratio, block_args.in_channels, + block_args.expand_ratio, block_args.kernel_size, + block_args.strides, block_args.out_channels, + self.mconfig.bn_momentum, self.mconfig.bn_epsilon, + self.mconfig.local_pooling, self.mconfig. + conv_dropout, cur_stage, padding_type, model_name)) + if block_args.num_repeat > 1: # rest of blocks with the same block_arg + block_args.in_channels = block_args.out_channels + block_args.strides = 1 + for _ in range(block_args.num_repeat - 1): + self._blocks.append( + conv_block( + block_args.se_ratio, block_args.in_channels, + block_args.expand_ratio, block_args.kernel_size, + block_args.strides, block_args.out_channels, + self.mconfig.bn_momentum, self.mconfig.bn_epsilon, + self.mconfig.local_pooling, self.mconfig.conv_dropout, + cur_stage, padding_type, model_name)) + cur_stage += 1 + + # Head part. + self._head = Head( + self.blocks_args[-1].out_channels, self.mconfig.feature_size, + self.mconfig.bn_momentum, self.mconfig.bn_epsilon, + self.mconfig.act_fn, self.mconfig.dropout_rate, + self.mconfig.local_pooling, self.mconfig.width_coefficient, + self.mconfig.depth_divisor, self.mconfig.min_depth, False) + + # top part for classification + if include_top and class_num: + self._fc = nn.Linear( + self.mconfig.feature_size, + class_num, + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + else: + self._fc = None + + # initialize weight + def _init_weights(m): + if isinstance(m, nn.Conv2D): + out_filters, in_channels, kernel_height, kernel_width = m.weight.shape + if in_channels == 1 and out_filters > in_channels: + out_filters = in_channels + fan_out = int(kernel_height * kernel_width * out_filters) + Normal(mean=0.0, std=np.sqrt(2.0 / fan_out))(m.weight) + elif isinstance(m, nn.Linear): + init_range = 1.0 / np.sqrt(m.weight.shape[1]) + Uniform(-init_range, init_range)(m.weight) + Constant(0.0)(m.bias) + + self.apply(_init_weights) + + def forward(self, inputs): + # Calls Stem layers + outputs = self._stem(inputs) + # print(f"stem: {outputs.mean().item():.10f}") + + # Calls blocks. 
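+        # Stochastic depth: the survival probability decays linearly with the
+        # block index, from 1.0 at the first block towards the configured
+        # `survival_prob` at the last one.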
+        for idx, block in enumerate(self._blocks):
+            survival_prob = self.mconfig.survival_prob
+            if survival_prob:
+                drop_rate = 1.0 - survival_prob
+                survival_prob = 1.0 - drop_rate * float(idx) / len(
+                    self._blocks)
+            outputs = block(outputs, survival_prob=survival_prob)
+
+        # Head to obtain the final feature.
+        outputs = self._head(outputs)
+        # Calls final dense layers and returns logits.
+        if self._fc:
+            outputs = self._fc(outputs)
+
+        return outputs
+
+
+def _load_pretrained(pretrained, model, model_url, use_ssld=False):
+    if pretrained is False:
+        pass
+    elif pretrained is True:
+        load_dygraph_pretrain(model, model_url, use_ssld=use_ssld)
+    elif isinstance(pretrained, str):
+        load_dygraph_pretrain(model, pretrained)
+    else:
+        raise RuntimeError(
+            "pretrained type is not available. Please use `string` or `boolean` type."
+        )
+
+
+def EfficientNetV2_S(include_top=True, pretrained=False, **kwargs):
+    """Get a V2 model instance.
+
+    Returns:
+        nn.Layer: A single model instance
+    """
+    model_name = "efficientnetv2-s"
+    model_config = efficientnetv2_config(model_name)
+    model = EfficientNetV2(model_name, model_config.model.blocks_args,
+                           model_config.model, include_top, **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetV2_S"])
+    return model
+
+
+def EfficientNetV2_M(include_top=True, pretrained=False, **kwargs):
+    """Get a V2 model instance.
+
+    Returns:
+        nn.Layer: A single model instance
+    """
+    model_name = "efficientnetv2-m"
+    model_config = efficientnetv2_config(model_name)
+    model = EfficientNetV2(model_name, model_config.model.blocks_args,
+                           model_config.model, include_top, **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetV2_M"])
+    return model
+
+
+def EfficientNetV2_L(include_top=True, pretrained=False, **kwargs):
+    """Get a V2 model instance.
+
+    Returns:
+        nn.Layer: A single model instance
+    """
+    model_name = "efficientnetv2-l"
+    model_config = efficientnetv2_config(model_name)
+    model = EfficientNetV2(model_name, model_config.model.blocks_args,
+                           model_config.model, include_top, **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetV2_L"])
+    return model
+
+
+def EfficientNetV2_XL(include_top=True, pretrained=False, **kwargs):
+    """Get a V2 model instance.
+
+    Returns:
+        nn.Layer: A single model instance
+    """
+    model_name = "efficientnetv2-xl"
+    model_config = efficientnetv2_config(model_name)
+    model = EfficientNetV2(model_name, model_config.model.blocks_args,
+                           model_config.model, include_top, **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetV2_XL"])
+    return model
diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/fasternet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/fasternet.py
new file mode 100755
index 000000000..4389d7c29
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/fasternet.py
@@ -0,0 +1,399 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +# reference: https://arxiv.org/abs/2303.03667 + +import os +import math +import copy +import warnings + +import paddle +import paddle.nn as nn + +from .vision_transformer import trunc_normal_, zeros_, ones_ +from ....utils.save_load import load_dygraph_pretrain +from ..model_zoo.vision_transformer import DropPath + +MODEL_URLS = { + "FasterNet_T0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/FasterNet_T0_pretrained.pdparams", + "FasterNet_T1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/FasterNet_T1_pretrained.pdparams", + "FasterNet_T2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/FasterNet_T2_pretrained.pdparams", + "FasterNet_S": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/FasterNet_S_pretrained.pdparams", + "FasterNet_M": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/FasterNet_M_pretrained.pdparams", + "FasterNet_L": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/FasterNet_L_pretrained.pdparams", +} + +__all__ = MODEL_URLS.keys() + +NET_CONFIG = { + "FasterNet_T0": + [3, 40, [1, 2, 8, 2], 2, 4, 4, 4, 2, 2, True, 1280, 0.0, 0, 'BN', 'GELU'], + "FasterNet_T1": [ + 3, 64, [1, 2, 8, 2], 2, 4, 4, 4, 2, 2, True, 1280, 0.02, 0, 'BN', + 'GELU' + ], + "FasterNet_T2": [ + 3, 96, [1, 2, 8, 2], 2, 4, 4, 4, 2, 2, True, 1280, 0.05, 0, 'BN', + 'RELU' + ], + "FasterNet_S": [ + 3, 128, [1, 2, 13, 2], 2, 4, 4, 4, 2, 2, True, 1280, 0.1, 0, 'BN', + 'RELU' + ], + "FasterNet_M": [ + 3, 144, [3, 4, 18, 3], 2, 4, 4, 4, 2, 2, True, 1280, 0.2, 0, 'BN', + 'RELU' + ], + "FasterNet_L": [ + 3, 192, [3, 4, 18, 3], 2, 4, 4, 4, 2, 2, True, 1280, 0.3, 0, 'BN', + 'RELU' + ], +} + + +class PartialConv(nn.Layer): + def __init__(self, dim: int, n_div: int, forward: str): + super().__init__() + self.dim_conv3 = dim // n_div + self.dim_untouched = dim - self.dim_conv3 + self.partial_conv3 = nn.Conv2D( + in_channels=self.dim_conv3, + out_channels=self.dim_conv3, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + if forward == 'slicing': + self.forward = self.forward_slicing + elif forward == 'split_cat': + self.forward = self.forward_split_cat + else: + raise NotImplementedError( + f"Forward method '{forward}' is not implemented.") + + def forward_slicing(self, x): + x = x.clone() + x[:, :self.dim_conv3, :, :] = self.partial_conv3( + x[:, :self.dim_conv3, :, :]) + return x + + def forward_split_cat(self, x): + x1, x2 = paddle.split( + x=x, num_or_sections=[self.dim_conv3, self.dim_untouched], axis=1) + x1 = self.partial_conv3(x1) + x = paddle.concat(x=(x1, x2), axis=1) + return x + + +class MLPBlock(nn.Layer): + def __init__(self, dim, n_div, mlp_ratio, drop_path, + layer_scale_init_value, act_layer, norm_layer, pconv_fw_type): + super().__init__() + self.dim = dim + self.mlp_ratio = mlp_ratio + if drop_path > 0.: + self.drop_path = DropPath(drop_path) + else: + self.drop_path = nn.Identity() + self.n_div = n_div + mlp_hidden_dim = int(dim * mlp_ratio) + mlp_layer = [ + nn.Conv2D( + in_channels=dim, + out_channels=mlp_hidden_dim, + kernel_size=1, + bias_attr=False), norm_layer(mlp_hidden_dim), act_layer(), + nn.Conv2D( + in_channels=mlp_hidden_dim, + out_channels=dim, + kernel_size=1, + bias_attr=False) + ] + self.mlp = nn.Sequential(*mlp_layer) + self.spatial_mixing = PartialConv(dim, n_div, pconv_fw_type) + if layer_scale_init_value > 0: + self.layer_scale = ( + paddle.base.framework.EagerParamBase.from_tensor( + tensor=layer_scale_init_value * paddle.ones(shape=dim), + trainable=True)) + self.forward = 
self.forward_layer_scale + else: + self.forward = self.forward + + def forward(self, x): + shortcut = x + x = self.spatial_mixing(x) + x = shortcut + self.drop_path(self.mlp(x)) + return x + + def forward_layer_scale(self, x): + shortcut = x + x = self.spatial_mixing(x) + x = shortcut + self.drop_path( + self.layer_scale.unsqueeze(axis=-1).unsqueeze(axis=-1) * + self.mlp(x)) + return x + + +class BasicStage(nn.Layer): + def __init__(self, dim, depth, n_div, mlp_ratio, drop_path, + layer_scale_init_value, norm_layer, act_layer, pconv_fw_type): + super().__init__() + blocks_list = [ + MLPBlock( + dim=dim, + n_div=n_div, + mlp_ratio=mlp_ratio, + drop_path=drop_path[i], + layer_scale_init_value=layer_scale_init_value, + norm_layer=norm_layer, + act_layer=act_layer, + pconv_fw_type=pconv_fw_type) for i in range(depth) + ] + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x): + x = self.blocks(x) + return x + + +class PatchEmbed(nn.Layer): + def __init__(self, patch_size, patch_stride, in_chans, embed_dim, + norm_layer): + super().__init__() + self.proj = nn.Conv2D( + in_channels=in_chans, + out_channels=embed_dim, + kernel_size=patch_size, + stride=patch_stride, + bias_attr=False) + if norm_layer is not None: + self.norm = norm_layer(embed_dim, momentum=0.1) + else: + self.norm = nn.Identity() + + def forward(self, x): + x = self.norm(self.proj(x)) + return x + + +class PatchMerging(nn.Layer): + def __init__(self, patch_size_t, patch_stride_t, dim, norm_layer): + super().__init__() + self.reduction = nn.Conv2D( + in_channels=dim, + out_channels=2 * dim, + kernel_size=patch_size_t, + stride=patch_stride_t, + bias_attr=False) + if norm_layer is not None: + self.norm = norm_layer(2 * dim) + else: + self.norm = nn.Identity() + + def forward(self, x): + x = self.norm(self.reduction(x)) + return x + + +class FasterNet(nn.Layer): + """ + FasterNet + Args: + in_chans: int=3. Number of input channels. Default value is 3. + embed_dim: int=96. The dimension of embedding. Default value is 96. + depths: tuple=(1, 2, 8, 2). The depth of each stage. Default value is (1, 2, 8, 2). + mlp_ratio: float=2.0. The ratio of hidden dimension to embedding dimension. Default value is 2.0. + n_div: int=4. The number of divisions in the spatial dimension. Default value is 4. + patch_size: int=4. The size of patch. Default value is 4. + patch_stride: int=4. The stride of patch. Default value is 4. + patch_size_t: int=2. The size of patch for merging. Default value is 2. + patch_stride_t: int=2. The stride of patch for merging. Default value is 2. + patch_norm: bool=True. Whether to use patch normalization. Default value is True. + feature_dim: int=1280. The dimension of feature. Default value is 1280. + drop_path_rate: float=0.1. The drop path rate. Default value is 0.1. + layer_scale_init_value: float=0.0. The initial value of layer scale. Default value is 0.0. + norm_layer: str='BN'. The type of normalization layer. Default value is 'BN'. + act_layer: str='RELU'. The type of activation layer. Default value is 'RELU'. + class_num: int=1000. The number of classes. Default value is 1000. + fork_feat: bool=False. Whether to return feature maps. Default value is False. + pretrained: str=None. The path of pretrained model. Default value is None. + pconv_fw_type: str='split_cat'. The type of partial convolution forward. Default value is 'split_cat'. + scale: float=1.0. The coefficient that controls the size of network parameters. + Returns: + model: nn.Layer. Specific FasterNet model depends on args. 
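+    Note:
+        Spatial mixing in each MLPBlock uses a partial convolution (PConv)
+        that convolves only `dim // n_div` of the channels and passes the
+        rest through untouched, which is the main FLOPs saving of FasterNet.
+    Example:
+        # minimal usage sketch; FasterNet_T0 is defined at the end of this file
+        model = FasterNet_T0(pretrained=False)
+        logits = model(paddle.rand([1, 3, 224, 224]))  # shape: [1, 1000]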
+ """ + def __init__(self, + in_chans=3, + embed_dim=96, + depths=(1, 2, 8, 2), + mlp_ratio=2.0, + n_div=4, + patch_size=4, + patch_stride=4, + patch_size_t=2, + patch_stride_t=2, + patch_norm=True, + feature_dim=1280, + drop_path_rate=0.1, + layer_scale_init_value=0, + norm_layer='BN', + act_layer='RELU', + class_num=1000, + fork_feat=False, + pretrained=None, + pconv_fw_type='split_cat', + **kwargs): + super().__init__() + if norm_layer == 'BN': + norm_layer = nn.BatchNorm2D + else: + raise NotImplementedError + if act_layer == 'GELU': + act_layer = nn.GELU + elif act_layer == 'RELU': + act_layer = nn.ReLU + else: + raise NotImplementedError + if not fork_feat: + self.class_num = class_num + self.num_stages = len(depths) + self.embed_dim = embed_dim + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2**(self.num_stages - 1)) + self.mlp_ratio = mlp_ratio + self.depths = depths + self.patch_embed = PatchEmbed( + patch_size=patch_size, + patch_stride=patch_stride, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + + dpr = [ + x.item() + for x in paddle.linspace( + start=0, stop=drop_path_rate, num=sum(depths)) + ] + stages_list = [] + for i_stage in range(self.num_stages): + stage = BasicStage( + dim=int(embed_dim * 2**i_stage), + n_div=n_div, + depth=depths[i_stage], + mlp_ratio=self.mlp_ratio, + drop_path=dpr[sum(depths[:i_stage]):sum(depths[:i_stage + 1])], + layer_scale_init_value=layer_scale_init_value, + norm_layer=norm_layer, + act_layer=act_layer, + pconv_fw_type=pconv_fw_type) + stages_list.append(stage) + if i_stage < self.num_stages - 1: + stages_list.append( + PatchMerging( + patch_size_t=patch_size_t, + patch_stride_t=patch_stride_t, + dim=int(embed_dim * 2**i_stage), + norm_layer=norm_layer)) + self.stages = nn.Sequential(*stages_list) + self.avgpool_pre_head = nn.Sequential( + nn.AdaptiveAvgPool2D(output_size=1), + nn.Conv2D( + in_channels=self.num_features, + out_channels=feature_dim, + kernel_size=1, + bias_attr=False), + act_layer()) + self.head = (nn.Linear( + in_features=feature_dim, out_features=class_num) + if class_num > 0 else nn.Identity()) + self.apply(self.cls_init_weights) + + + def cls_init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, (nn.Conv1D, nn.Conv2D)): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, (nn.LayerNorm, nn.GroupNorm)): + zeros_(m.bias) + ones_(m.weight) + + def forward(self, x): + x = self.patch_embed(x) + x = self.stages(x) + x = self.avgpool_pre_head(x) + x = paddle.flatten(x=x, start_axis=1) + x = self.head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def FasterNet_T0(pretrained=False, use_ssld=False, **kwargs): + model = FasterNet(*NET_CONFIG["FasterNet_T0"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["FasterNet_T0"], use_ssld) + return model + + +def FasterNet_T1(pretrained=False, use_ssld=False, **kwargs): + model = FasterNet(*NET_CONFIG["FasterNet_T1"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["FasterNet_T1"], use_ssld) + return model + + +def FasterNet_T2(pretrained=False, use_ssld=False, **kwargs): + model = FasterNet(*NET_CONFIG["FasterNet_T2"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["FasterNet_T2"], use_ssld) + return model + + +def FasterNet_S(pretrained=False, use_ssld=False, **kwargs): + model = FasterNet(*NET_CONFIG["FasterNet_S"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["FasterNet_S"], use_ssld) + return model + + +def FasterNet_M(pretrained=False, use_ssld=False, **kwargs): + model = FasterNet(*NET_CONFIG["FasterNet_M"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["FasterNet_M"], use_ssld) + return model + + +def FasterNet_L(pretrained=False, use_ssld=False, **kwargs): + model = FasterNet(*NET_CONFIG["FasterNet_L"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["FasterNet_L"], use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/foundation_vit.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/foundation_vit.py new file mode 100755 index 000000000..35146960c --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/foundation_vit.py @@ -0,0 +1,1261 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
+# reference: https://arxiv.org/abs/2010.11929
+
+from collections.abc import Callable, Iterable
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import sys
+from paddle.nn.initializer import TruncatedNormal, Constant, Normal, Assign
+
+from ....utils import logger
+from ....utils.save_load import load_dygraph_pretrain
+from ..base.theseus_layer import TheseusLayer
+
+MODEL_URLS = {
+    "CLIP_vit_base_patch32_224":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/CLIP_vit_base_patch32_224.pdparams",
+    "CLIP_vit_base_patch16_224":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/CLIP_vit_base_patch16_224.pdparams",
+    "CLIP_vit_large_patch14_336":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/CLIP_vit_large_patch14_336.pdparams",
+    "CLIP_vit_large_patch14_224":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/CLIP_vit_large_patch14_224.pdparams",
+    "BEiTv2_vit_base_patch16_224":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/BEiTv2_vit_base_patch16_224.pdparams",
+    "BEiTv2_vit_large_patch16_224":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/BEiTv2_vit_large_patch16_224.pdparams",
+    "CAE_vit_base_patch16_224":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/CAE_vit_base_patch16_224.pdparams",
+    'EVA_vit_giant_patch14':
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/EVA_vit_giant_patch14.pdparams",
+    "MOCOV3_vit_small":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/MOCOV3_vit_small.pdparams",
+    "MOCOV3_vit_base":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/MOCOV3_vit_base.pdparams",
+    "MAE_vit_huge_patch14":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/MAE_vit_huge_patch14.pdparams",
+    "MAE_vit_large_patch16":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/MAE_vit_large_patch16.pdparams",
+    "MAE_vit_base_patch16":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/MAE_vit_base_patch16.pdparams",
+}
+
+
+def check_support_fused_op(use_fused_linear):
+    if use_fused_linear:
+        if paddle.device.cuda.get_device_capability()[0] >= 8:
+            return True
+        else:
+            logger.warning(
+                "The current device doesn't support Fused OP! Using the general Linear instead."
+            )
+    return False
+
+
+def resize_pos_embed(pos_embed,
+                     src_shape,
+                     dst_shape,
+                     mode='bicubic',
+                     num_extra_tokens=1):
+    """Resize pos_embed weights.
+
+    Args:
+        pos_embed (paddle.Tensor): Position embedding weights with shape
+            [1, L, C].
+        src_shape (tuple): The resolution of downsampled origin training
+            image, in format (H, W).
+        dst_shape (tuple): The resolution of downsampled new training
+            image, in format (H, W).
+        mode (str): Algorithm used for upsampling. Choose one from 'nearest',
+            'linear', 'bilinear', 'bicubic' and 'trilinear'.
+            Defaults to 'bicubic'.
+        num_extra_tokens (int): The number of extra tokens, such as cls_token.
+            Defaults to 1.
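+
+    Only the patch-grid part of the embedding is interpolated; the first
+    `num_extra_tokens` tokens (e.g. the class token) are kept unchanged and
+    concatenated back in front of the resized weights.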
+
+    Returns:
+        paddle.Tensor: The resized pos_embed of shape [1, L_new, C]
+    """
+    if src_shape[0] == dst_shape[0] and src_shape[1] == dst_shape[1]:
+        return pos_embed
+    assert pos_embed.ndim == 3, 'shape of pos_embed must be [1, L, C]'
+    _, L, C = pos_embed.shape
+    src_h, src_w = src_shape
+    assert L == src_h * src_w + num_extra_tokens, \
+        f"The length of `pos_embed` ({L}) doesn't match the expected " \
+        f'shape ({src_h}*{src_w}+{num_extra_tokens}). Please check the' \
+        '`img_size` argument.'
+    extra_tokens = pos_embed[:, :num_extra_tokens]
+
+    src_weight = pos_embed[:, num_extra_tokens:]
+    src_weight = src_weight.reshape([-1, src_h, src_w, C]).transpose(
+        [0, 3, 1, 2])
+
+    # The cubic interpolate algorithm only accepts float32
+    dst_weight = paddle.nn.functional.interpolate(
+        paddle.cast(src_weight, paddle.float32),
+        size=dst_shape,
+        align_corners=False,
+        mode=mode)
+    dst_weight = paddle.flatten(dst_weight, 2).transpose([0, 2, 1])
+    dst_weight = paddle.cast(dst_weight, src_weight.dtype)
+
+    return paddle.concat((extra_tokens, dst_weight), axis=1)
+
+
+def pading_for_not_divisible(pixel_values,
+                             height,
+                             width,
+                             patch_size,
+                             format="BCHW",
+                             function="split"):
+    if isinstance(patch_size, int):
+        patch_size = (patch_size, patch_size)
+    if height % patch_size[0] == 0 and width % patch_size[1] == 0:
+        return pixel_values, None
+    if function == "split":
+        pading_width = patch_size[1] - width % patch_size[1]
+        pading_height = patch_size[0] - height % patch_size[0]
+    elif function == "merge":
+        pading_width = width % 2
+        pading_height = height % 2
+    else:
+        raise ValueError(
+            f"invalid function '{function}', expected 'split' or 'merge'")
+    if format == "BCHW":
+        pad_index = (0, 0, 0, 0, 0, pading_height, 0, pading_width)
+    elif format == "BHWC":
+        pad_index = (0, 0, 0, pading_height, 0, pading_width, 0, 0)
+    else:
+        raise ValueError(
+            f"invalid format '{format}', expected 'BCHW' or 'BHWC'")
+
+    return paddle.nn.functional.pad(pixel_values, pad_index), pad_index
+
+
+__all__ = list(MODEL_URLS.keys())
+
+_model_size = None
+_model_diff = None
+
+_CLIP_diff = {
+    'add_layer_norm_before_encoder': [
+        'vit_base_patch32_224', 'vit_base_patch16_224',
+        'vit_large_patch14_336', 'vit_large_patch14_224'
+    ],
+    'add_relative_position_bias_in_msa': [],
+    'add_shared_rel_pos_bias': [],
+    'add_mul_gamma_to_msa_mlp': [],
+    'remove_cls_token': [],
+    'remove_abs_pos_emb': [],
+    'replace_mlp_GELU': [],
+    'head': {
+        'fc_norm': [],
+        'return_all_tokens': [],
+        'return_patch_tokens': [],
+        'return_tokens_mean': ['vit_base_patch16_224'],
+    },
+    'remove_cls_token_in_forward': ['vit_base_patch16_224'],
+}
+
+
+_MOCOV3_diff = {
+    'add_layer_norm_before_encoder': [],
+    'add_relative_position_bias_in_msa': [],
+    'add_shared_rel_pos_bias': [],
+    'add_mul_gamma_to_msa_mlp': [],
+    'remove_cls_token': [],
+    'remove_abs_pos_emb': [],
+    'replace_mlp_GELU': [],
+    'head': {
+        'fc_norm': [],
+        'return_all_tokens': [],
+        'return_patch_tokens': [],
+        'return_tokens_mean': [],
+    },
+    'remove_cls_token_in_forward': [],
+}
+
+_CoCa_diff = {
+    'add_layer_norm_before_encoder': [],
+    'add_relative_position_bias_in_msa': [],
+    'add_shared_rel_pos_bias': [],
+    'add_mul_gamma_to_msa_mlp': [],
+    'remove_cls_token': [],
+    'remove_abs_pos_emb': [],
+    'replace_mlp_GELU': [],
+    'head': {
+        'fc_norm': [],
+        'return_all_tokens': [],
+        'return_patch_tokens': [],
+        'return_tokens_mean': [],
+    },
+    'remove_cls_token_in_forward': [],
+}
+
+_BEiTv2_diff = {
+    'add_layer_norm_before_encoder': [],
+    'add_relative_position_bias_in_msa':
+    ['vit_base_patch16_224', 'vit_large_patch16_224'],
+    'add_shared_rel_pos_bias': [],
+    'add_mul_gamma_to_msa_mlp':
+    ['vit_base_patch16_224',
'vit_large_patch16_224'], + 'remove_cls_token': [], + 'remove_abs_pos_emb': ['vit_base_patch16_224', 'vit_large_patch16_224'], + 'replace_mlp_GELU': [], + 'head': { + 'fc_norm': [], + 'return_all_tokens': [], + 'return_patch_tokens': [], + 'return_tokens_mean': [], + }, + 'remove_cls_token_in_forward': [], +} + +_CAE_diff = { + 'add_layer_norm_before_encoder': [], + 'add_relative_position_bias_in_msa': ['vit_base_patch16_224'], + 'add_shared_rel_pos_bias': [], + 'add_mul_gamma_to_msa_mlp': ['vit_base_patch16_224'], + 'remove_cls_token': [], + 'remove_abs_pos_emb': [], + 'replace_mlp_GELU': [], + 'head': { + 'fc_norm': [], # 3 x 197 x 786 + 'return_all_tokens': [], # 3 x 197 x 1000 + 'return_patch_tokens': [], # 3 x 196 x 1000 + 'return_tokens_mean': [], + }, + 'remove_cls_token_in_forward': [], +} + +_EVA_diff = { + 'add_layer_norm_before_encoder': [], + 'add_relative_position_bias_in_msa': [], + 'add_shared_rel_pos_bias': [], + 'add_mul_gamma_to_msa_mlp': [], + 'remove_cls_token': [], + 'remove_abs_pos_emb': [], + 'replace_mlp_GELU': [], + 'head': { + 'fc_norm': ['vit_huge_patch14'], + 'return_all_tokens': [], + 'return_patch_tokens': [], + 'return_tokens_mean': [], + }, + 'remove_cls_token_in_forward': [], +} + +_MAE_diff = { + 'add_layer_norm_before_encoder': [], + 'add_relative_position_bias_in_msa': [], + 'add_shared_rel_pos_bias': [], + 'add_mul_gamma_to_msa_mlp': [], + 'remove_cls_token': [], + 'remove_abs_pos_emb': [], + 'replace_mlp_GELU': [], + 'head': { + 'fc_norm': ['vit_huge_patch14'], + 'return_all_tokens': [], + 'return_patch_tokens': [], + 'return_tokens_mean': [], + }, + 'remove_cls_token_in_forward': [], +} + +trunc_normal_ = TruncatedNormal(std=.02) +normal_ = Normal +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def to_2tuple(x): + return tuple([x] * 2) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(TheseusLayer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
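+
+    During training, each sample's residual branch is zeroed with probability
+    `drop_prob` and the surviving paths are rescaled by 1 / (1 - drop_prob),
+    so the expected value of the output is unchanged.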
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class QuickGELU(TheseusLayer): + def forward(self, x): + return x * nn.functional.sigmoid(1.702 * x) + + +class Mlp(TheseusLayer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0., + Linear=nn.Linear): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = Linear(in_features, hidden_features) + self.act = act_layer() if _model_size not in _model_diff[ + 'replace_mlp_GELU'] else QuickGELU() + self.fc2 = Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(TheseusLayer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + model_name=None, + window_size=None, + use_fused_attn=False, + Linear=nn.Linear): + super().__init__() + self._model_name = model_name + + if _model_size in _model_diff['add_relative_position_bias_in_msa']: + assert isinstance( + window_size, Iterable + ), f'window_size must be iterable, should not be {type(window_size)}' + self.window_size = window_size + self._register_relative_position_index( + window_size=window_size, + num_heads=num_heads, ) + + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.use_fused_attn = use_fused_attn + # TODO: support mask + if use_fused_attn: + if hasattr(self, 'relative_position_bias_table') or (_model_size in _model_diff['add_shared_rel_pos_bias'] and rel_pos_bias is not None): + logger.warning("The fused attn don't support `relative_position` yet, so fused attn will not be used.") + self.use_fused_attn = False + + def _register_relative_position_index( + self, + window_size, + num_heads, ): + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = self.create_parameter( + [self.num_relative_distance, num_heads], + default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid( + [coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose( + [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + paddle.zeros((window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum( + -1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + 
relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", + relative_position_index) + + def forward(self, x, rel_pos_bias=None): + # B= x.shape[0] + N, C = x.shape[1], x.shape[2] + qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // self.num_heads)) + + if not self.use_fused_attn: + qkv = qkv.transpose((2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + if hasattr(self, 'relative_position_bias_table'): + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if _model_size in _model_diff[ + 'add_shared_rel_pos_bias'] and rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn).matmul(v) + attn = attn.transpose((0, 2, 1, 3)) + else: + qkv = qkv.transpose((2, 0, 1, 3, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + # TODO: support mask + attn = paddle.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.attn_drop.p if self.training else 0.) + + x = attn.reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(TheseusLayer): + def __init__(self, + dim, + num_heads, + model_name, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + init_values=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + window_size=None, + use_fused_attn=False, + use_fused_linear=False): + super().__init__() + global _model_size + global _model_diff + self._model_name = model_name + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + Linear = paddle.incubate.nn.FusedLinear if use_fused_linear else nn.Linear + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + model_name=self._model_name, + window_size=window_size, + use_fused_attn=use_fused_attn, + Linear=Linear) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + + if _model_size in _model_diff['add_mul_gamma_to_msa_mlp']: + self.gamma_1 = self.create_parameter( + [dim], + default_initializer=nn.initializer.Constant(value=init_values)) + self.gamma_2 = self.create_parameter( + [dim], + default_initializer=nn.initializer.Constant(value=init_values)) + else: + self.gamma_1 = None + self.gamma_2 = None + + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + Linear=Linear) + + def forward(self, x, rel_pos_bias=None): + if self.gamma_1 is not None: + x = x + self.drop_path(self.gamma_1 * self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + else: + atten_result = self.drop_path( + self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + atten_result + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class RelativePositionBias(TheseusLayer): + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = self.create_parameter( + [self.num_relative_distance, num_heads], + default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid( + [coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose( + [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + paddle.zeros((window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum( + -1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", + relative_position_index) + + # trunc_normal_(self.relative_position_bias_table, std=.02) + + def forward(self): + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww + + +class PatchEmbed(TheseusLayer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + conv_bias=False): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * \ + (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = 
patch_size + self.num_patches = num_patches + if conv_bias: + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + else: + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_size, + bias_attr=False) + + def forward(self, x): + B, C, H, W = x.shape + x, _ = pading_for_not_divisible(x, H, W, patch_size=self.patch_size) + + x = self.proj(x) + _, _, H, W = x.shape + + x = x.flatten(2).transpose((0, 2, 1)) + return x, (H, W) + + +class Head(TheseusLayer): + def __init__(self, embed_dim, class_num, norm_layer, model_size, setting): + super().__init__() + self.model_size = model_size + self.setting = setting + + self.fc_norm = eval(norm_layer)( + embed_dim, + epsilon=1e-5) if model_size in setting['fc_norm'] else None + self.return_all_tokens = model_size in setting['return_all_tokens'] + self.return_patch_tokens = model_size in setting['return_patch_tokens'] + self.return_tokens_mean = model_size in setting['return_tokens_mean'] + + self.fc_head = nn.Linear(embed_dim, + class_num) if class_num > 0 else Identity() + + def forward(self, x): + if self.fc_norm is not None: + if self.return_all_tokens: + x = self.fc_norm(x) + else: + t = x[:, 1:] + if self.return_patch_tokens: + x = self.fc_norm(t) + else: + x = self.fc_norm(t.mean(1)) + elif isinstance(self.fc_head, Identity): + if self.return_all_tokens: + x = x + elif self.return_patch_tokens: + x = x[:, 1:] + elif self.return_tokens_mean: + x = x.mean(1) + else: + x = x[:, 0] + else: + x = x + return self.fc_head(x) + + +class VisionTransformer(TheseusLayer): + """ Vision Transformer with support for patch input + """ + + def __init__(self, + model_name, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dim=768, + output_dim=512, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + image_project=False, + conv_bias=False, + feature_frame=False, + hugging_face_framework=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer='nn.LayerNorm', + epsilon=1e-5, + head_init_scale=0.001, + **kwargs): + super().__init__() + global _model_diff + global _model_size + _model_split = model_name.split('_') + self.model_name = _model_split[0] + self.feature_frame = feature_frame + self.model_size = '_'.join(_model_split[1:]) + _model_size = self.model_size + _model_diff = eval(f'_{self.model_name}_diff') + + self.class_num = class_num + self.return_embed = kwargs.get('return_embed', False) + self.return_mean_embed = kwargs.get('return_mean_embed', False) and self.return_embed + self.num_features = self.embed_dim = embed_dim + use_fused_attn = check_support_fused_op(kwargs.get('use_fused_attn', False)) + use_fused_linear = check_support_fused_op(kwargs.get('use_fused_linear', False)) + _img_size = to_2tuple(img_size) + _patch_size = to_2tuple(patch_size) + self.window_size = (_img_size[0] // _patch_size[0], + _img_size[1] // _patch_size[1]) + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + conv_bias=conv_bias) + num_patches = self.patch_embed.num_patches + + if _model_size in _model_diff['add_shared_rel_pos_bias']: + self.rel_pos_bias = RelativePositionBias( + window_size=self.window_size, num_heads=num_heads) + + #self.ln_pre = nn.LayerNorm(embed_dim) if _model_size in _model_diff[ + # 'add_layer_norm_before_encoder'] else nn.Identity() + + if _model_size in _model_diff['remove_cls_token'] or self.feature_frame: + self.pos_embed = 
self.create_parameter( + shape=(1, num_patches, embed_dim), default_initializer=zeros_) + self.cls_token = None + else: + self.pos_embed = self.create_parameter( + shape=(1, num_patches + 1, embed_dim), + default_initializer=zeros_) + self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), default_initializer=zeros_) + self.add_parameter("cls_token", self.cls_token) + + self.pos_drop = nn.Dropout(p=drop_rate) + # for LaClip + if image_project: + image_projection = self.create_parameter( + shape=(img_size, embed_dim), + default_initializer=Assign( + paddle.empty((img_size, embed_dim)))) + self.add_parameter("image_projection", image_projection) + else: + self.image_projection = None + self.hugging_face_framework = hugging_face_framework + #for path size hugging face plan + if hugging_face_framework: + self.ln_pre = nn.LayerNorm(embed_dim) + self.add_parameter("pos_embed", self.pos_embed) + else: + self.ln_pre = nn.Identity() if _model_size not in _model_diff[ + 'add_layer_norm_before_encoder'] else nn.LayerNorm(embed_dim) + if _model_size in _model_diff['remove_abs_pos_emb']: + self.pos_embed = None + else: + self.add_parameter("pos_embed", self.pos_embed) + + #proj + proj = self.create_parameter( + shape=(embed_dim, ), + default_initializer=Assign((embed_dim**-0.5) * paddle.randn(( + (embed_dim, output_dim))))) + self.add_parameter("proj", proj) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + model_name=self.model_name, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon, + window_size=self.window_size, + use_fused_attn=use_fused_attn, + use_fused_linear=use_fused_linear) for i in range(depth) + ]) + + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + + self.head = Identity() if self.return_embed else Head( + embed_dim, class_num, norm_layer, self.model_size, + _model_diff['head']) + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed) + if not _model_size in _model_diff['remove_cls_token'] and self.feature_frame == False: + trunc_normal_(self.cls_token) + + self.apply(self._init_weights) + + if feature_frame: + self.feature = nn.Sequential( + nn.Linear(embed_dim * self.patch_embed.num_patches, embed_dim,bias_attr=False), + nn.BatchNorm1D(embed_dim, epsilon=2e-5), + nn.Linear(embed_dim, output_dim, bias_attr=False), + nn.BatchNorm1D(output_dim, epsilon=2e-5)) + self.pos_drop = Identity() + self.cls_token = None + self.image_projection = Identity() + self.proj = None + self.ln_pre = Identity() + + if head_init_scale != 1: + if not self.return_embed and class_num > 0: + self.head.fc_head.weight.set_value( + self.head.fc_head.weight * + paddle.to_tensor(head_init_scale)) + self.head.fc_head.bias.set_value( + self.head.fc_head.bias * paddle.to_tensor(head_init_scale)) + else: + logger.warning( + "Because the head or head.fc_head of ViT is Identity() class, the argument head_init_scale is invalid." 
+ ) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def get_num_layers(self): + return len(self.blocks) + + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def forward_features(self, x): + B = x.shape[0] + x, output_dimensions = self.patch_embed(x) + if not _model_size in _model_diff['remove_cls_token'] and (self.feature_frame==False): + cls_tokens = self.cls_token.expand((B, -1, -1)) + x = paddle.concat((cls_tokens, x), axis=1) + + if self.pos_embed is not None: + x = x + resize_pos_embed(self.pos_embed, self.window_size, + output_dimensions) + + x = self.ln_pre(x) + x = self.pos_drop(x) + rel_pos_bias = self.rel_pos_bias() if hasattr(self, + 'rel_pos_bias') else None + for blk in self.blocks: + x = blk(x, rel_pos_bias=rel_pos_bias) + + if _model_size in _model_diff['remove_cls_token_in_forward']: + x = x[:, 1:, :] + if self.hugging_face_framework or self.return_embed == False: + pooled, token = x[:, 0], x[:, 1:] + else: + pooled = x + x = self.norm(pooled) + return x + + def forward(self, x): + x = self.forward_features(x) + + if self.feature_frame: + B, L, C = x.shape + x = paddle.reshape(x,[B, -1]) + x = self.feature(x) + + x = self.head(x) + + if self.proj is not None and isinstance(self.head,Identity): + x = x @self.proj + if self.return_mean_embed: + x = x.mean(1) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def CLIP_vit_base_patch32_224(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + img_size=224, + patch_size=32, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-5, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def React_vit_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model_name = "React_vit_base_patch16_224" + model = VisionTransformer( + model_name=model_name.replace("React","CLIP"), + img_size=224, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + hugging_face_framework=True, + epsilon=1e-5, + **kwargs, ) + return model + + +def React_vit_base_patch32_224(pretrained=False, use_ssld=False, **kwargs): + model_name = "React_vit_base_patch32_224" + model = VisionTransformer( + model_name=model_name.replace("React","CLIP"), + img_size=224, + patch_size=32, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + hugging_face_framework=True, + epsilon=1e-5, + **kwargs, ) + return model + + +def LaCLIP_vit_base_patch32_224(pretrained=False, use_ssld=False, **kwargs): + model_name = "LaCLIP_vit_base_patch32_224" + model = VisionTransformer( + model_name=model_name.replace("LaCLIP","CLIP"), + img_size=224, + patch_size=32, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + hugging_face_framework=True, + epsilon=1e-5, + **kwargs, ) + + return model + + +def LaCLIP_vit_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model_name = "LaCLIP_vit_base_patch16_224" + model = VisionTransformer( + model_name=model_name.replace("LaCLIP","CLIP"), + img_size=224, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + hugging_face_framework=True, + epsilon=1e-5, + **kwargs, ) + + return model + + +def Unicom_vit_base_patch32_224(pretrained=False, use_ssld=False, **kwargs): + model_name = "Unicom_vit_base_patch32_224" + model = VisionTransformer( + model_name=model_name.replace("Unicom","CLIP"), + img_size=224, + patch_size=32, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + conv_bias=True, + feature_frame=True, + hugging_face_framework=False, + image_project=False, + epsilon=1e-5, + **kwargs, ) + + return model + + +def Unicom_vit_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model_name = "Unicom_vit_base_patch16_224" + model = VisionTransformer( + model_name=model_name.replace("Unicom","CLIP"), + img_size=224, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + hugging_face_framework=False, + image_project=False, + feature_frame=True, + conv_bias=True, + epsilon=1e-5, + **kwargs, ) + + return model + + +def CLIP_vit_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + img_size=224, + patch_size=16, +
embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-5, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def CLIP_vit_large_patch14_336(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + img_size=336, + patch_size=14, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-5, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def CLIP_vit_large_patch14_224(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + img_size=224, + patch_size=14, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-5, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def BEiTv2_vit_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + img_size=224, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def BEiTv2_vit_large_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + img_size=224, + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def MOCOV3_vit_small(pretrained=False, use_ssld=False, **kwargs): + """ + vit small in mocov3 + """ + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + patch_size=16, + embed_dim=384, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def MOCOV3_vit_base(pretrained=False, use_ssld=False, **kwargs): + """ + vit base in mocov3 + """ + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def MAE_vit_base_patch16(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def MAE_vit_large_patch16(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def MAE_vit_huge_patch14(pretrained=False, use_ssld=False, **kwargs): + 
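+    # MAE ViT-Huge: patch size 14, width 1280, depth 32, 16 heads. As with
+    # the other builders in this file, the function name itself (read back
+    # via sys._getframe().f_code.co_name) selects both the MODEL_URLS entry
+    # and the `_<model>_diff` behaviour table, so renaming a builder would
+    # silently change which pretrained weights and model tweaks are applied.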
model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + patch_size=14, + embed_dim=1280, + depth=32, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def EVA_vit_giant_patch14(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + patch_size=14, + embed_dim=1408, + depth=40, + num_heads=16, + init_values=None, + mlp_ratio=4.3637, + qkv_bias=True, + class_num=0, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def CAE_vit_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + img_size=224, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/ghostnet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/ghostnet.py new file mode 100755 index 000000000..8a3960827 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/ghostnet.py @@ -0,0 +1,364 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
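+
+# A minimal usage sketch for the foundation ViT builders above (illustrative
+# only; it assumes example/low_precision_training is on PYTHONPATH so the
+# ppcls imports resolve):
+#
+#     import paddle
+#     from ppcls.arch.backbone.model_zoo.foundation_vit import (
+#         CLIP_vit_base_patch16_224)
+#
+#     model = CLIP_vit_base_patch16_224(pretrained=False)
+#     x = paddle.randn([1, 3, 224, 224])
+#     logits = model(x)  # [1, class_num]; class_num defaults to 1000
+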
+ +# Code was based on https://github.com/huawei-noah/CV-Backbones/tree/master/ghostnet_pytorch +# reference: https://arxiv.org/abs/1911.11907 + +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, AdaptiveAvgPool2D, Linear +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Uniform, KaimingNormal + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "GhostNet_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/GhostNet_x0_5_pretrained.pdparams", + "GhostNet_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/GhostNet_x1_0_pretrained.pdparams", + "GhostNet_x1_3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/GhostNet_x1_3_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act="relu", + name=None): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr( + initializer=KaimingNormal(), name=name + "_weights"), + bias_attr=False) + bn_name = name + "_bn" + + self._batch_norm = BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr( + name=bn_name + "_scale", regularizer=L2Decay(0.0)), + bias_attr=ParamAttr( + name=bn_name + "_offset", regularizer=L2Decay(0.0)), + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + "_variance") + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class SEBlock(nn.Layer): + def __init__(self, num_channels, reduction_ratio=4, name=None): + super(SEBlock, self).__init__() + self.pool2d_gap = AdaptiveAvgPool2D(1) + self._num_channels = num_channels + stdv = 1.0 / math.sqrt(num_channels * 1.0) + med_ch = num_channels // reduction_ratio + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_1_weights"), + bias_attr=ParamAttr(name=name + "_1_offset")) + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_channels, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_2_weights"), + bias_attr=ParamAttr(name=name + "_2_offset")) + + def forward(self, inputs): + pool = self.pool2d_gap(inputs) + pool = paddle.squeeze(pool, axis=[2, 3]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = paddle.clip(x=excitation, min=0, max=1) + excitation = paddle.unsqueeze(excitation, axis=[2, 3]) + out = paddle.multiply(inputs, excitation) + return out + + +class GhostModule(nn.Layer): + def __init__(self, + in_channels, + output_channels, + kernel_size=1, + ratio=2, + dw_size=3, + stride=1, + relu=True, + name=None): + super(GhostModule, self).__init__() + init_channels = int(math.ceil(output_channels / ratio)) + new_channels = int(init_channels * (ratio - 1)) + self.primary_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=init_channels, + kernel_size=kernel_size, + stride=stride, + groups=1, + act="relu" if relu else None, + name=name + "_primary_conv") + self.cheap_operation = ConvBNLayer( + in_channels=init_channels, + out_channels=new_channels, + kernel_size=dw_size, + stride=1, + groups=init_channels, 
+ act="relu" if relu else None, + name=name + "_cheap_operation") + + def forward(self, inputs): + x = self.primary_conv(inputs) + y = self.cheap_operation(x) + out = paddle.concat([x, y], axis=1) + return out + + +class GhostBottleneck(nn.Layer): + def __init__(self, + in_channels, + hidden_dim, + output_channels, + kernel_size, + stride, + use_se, + name=None): + super(GhostBottleneck, self).__init__() + self._stride = stride + self._use_se = use_se + self._num_channels = in_channels + self._output_channels = output_channels + self.ghost_module_1 = GhostModule( + in_channels=in_channels, + output_channels=hidden_dim, + kernel_size=1, + stride=1, + relu=True, + name=name + "_ghost_module_1") + if stride == 2: + self.depthwise_conv = ConvBNLayer( + in_channels=hidden_dim, + out_channels=hidden_dim, + kernel_size=kernel_size, + stride=stride, + groups=hidden_dim, + act=None, + name=name + + "_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. + ) + if use_se: + self.se_block = SEBlock(num_channels=hidden_dim, name=name + "_se") + self.ghost_module_2 = GhostModule( + in_channels=hidden_dim, + output_channels=output_channels, + kernel_size=1, + relu=False, + name=name + "_ghost_module_2") + if stride != 1 or in_channels != output_channels: + self.shortcut_depthwise = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + stride=stride, + groups=in_channels, + act=None, + name=name + + "_shortcut_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. + ) + self.shortcut_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=output_channels, + kernel_size=1, + stride=1, + groups=1, + act=None, + name=name + "_shortcut_conv") + + def forward(self, inputs): + x = self.ghost_module_1(inputs) + if self._stride == 2: + x = self.depthwise_conv(x) + if self._use_se: + x = self.se_block(x) + x = self.ghost_module_2(x) + if self._stride == 1 and self._num_channels == self._output_channels: + shortcut = inputs + else: + shortcut = self.shortcut_depthwise(inputs) + shortcut = self.shortcut_conv(shortcut) + return paddle.add(x=x, y=shortcut) + + +class GhostNet(nn.Layer): + def __init__(self, scale, class_num=1000): + super(GhostNet, self).__init__() + self.cfgs = [ + # k, t, c, SE, s + [3, 16, 16, 0, 1], + [3, 48, 24, 0, 2], + [3, 72, 24, 0, 1], + [5, 72, 40, 1, 2], + [5, 120, 40, 1, 1], + [3, 240, 80, 0, 2], + [3, 200, 80, 0, 1], + [3, 184, 80, 0, 1], + [3, 184, 80, 0, 1], + [3, 480, 112, 1, 1], + [3, 672, 112, 1, 1], + [5, 672, 160, 1, 2], + [5, 960, 160, 0, 1], + [5, 960, 160, 1, 1], + [5, 960, 160, 0, 1], + [5, 960, 160, 1, 1] + ] + self.scale = scale + output_channels = int(self._make_divisible(16 * self.scale, 4)) + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=output_channels, + kernel_size=3, + stride=2, + groups=1, + act="relu", + name="conv1") + # build inverted residual blocks + idx = 0 + self.ghost_bottleneck_list = [] + for k, exp_size, c, use_se, s in self.cfgs: + in_channels = output_channels + output_channels = int(self._make_divisible(c * self.scale, 4)) + hidden_dim = int(self._make_divisible(exp_size * self.scale, 4)) + ghost_bottleneck = self.add_sublayer( + name="_ghostbottleneck_" + str(idx), + sublayer=GhostBottleneck( + in_channels=in_channels, + hidden_dim=hidden_dim, + output_channels=output_channels, + kernel_size=k, + stride=s, + use_se=use_se, + name="_ghostbottleneck_" + str(idx))) + self.ghost_bottleneck_list.append(ghost_bottleneck) + idx += 1 + # build last 
several layers + in_channels = output_channels + output_channels = int(self._make_divisible(exp_size * self.scale, 4)) + self.conv_last = ConvBNLayer( + in_channels=in_channels, + out_channels=output_channels, + kernel_size=1, + stride=1, + groups=1, + act="relu", + name="conv_last") + self.pool2d_gap = AdaptiveAvgPool2D(1) + in_channels = output_channels + self._fc0_output_channels = 1280 + self.fc_0 = ConvBNLayer( + in_channels=in_channels, + out_channels=self._fc0_output_channels, + kernel_size=1, + stride=1, + act="relu", + name="fc_0") + self.dropout = nn.Dropout(p=0.2) + stdv = 1.0 / math.sqrt(self._fc0_output_channels * 1.0) + self.fc_1 = Linear( + self._fc0_output_channels, + class_num, + weight_attr=ParamAttr( + name="fc_1_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr(name="fc_1_offset")) + + def forward(self, inputs): + x = self.conv1(inputs) + for ghost_bottleneck in self.ghost_bottleneck_list: + x = ghost_bottleneck(x) + x = self.conv_last(x) + x = self.pool2d_gap(x) + x = self.fc_0(x) + x = self.dropout(x) + x = paddle.reshape(x, shape=[-1, self._fc0_output_channels]) + x = self.fc_1(x) + return x + + def _make_divisible(self, v, divisor, min_value=None): + """ + This function is taken from the original tf repo. + It ensures that all layers have a channel number that is divisible by 8 + It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def GhostNet_x0_5(pretrained=False, use_ssld=False, **kwargs): + model = GhostNet(scale=0.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["GhostNet_x0_5"], use_ssld=use_ssld) + return model + + +def GhostNet_x1_0(pretrained=False, use_ssld=False, **kwargs): + model = GhostNet(scale=1.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["GhostNet_x1_0"], use_ssld=use_ssld) + return model + + +def GhostNet_x1_3(pretrained=False, use_ssld=False, **kwargs): + model = GhostNet(scale=1.3, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["GhostNet_x1_3"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/googlenet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/googlenet.py new file mode 100755 index 000000000..fcd52c923 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/googlenet.py @@ -0,0 +1,365 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1409.4842 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "GoogLeNet": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/GoogLeNet_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +def xavier(channels, filter_size, name): + stdv = (3.0 / (filter_size**2 * channels))**0.5 + param_attr = ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_weights") + return param_attr + + +class ConvLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None, + data_format="NCHW"): + super(ConvLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False, + data_format=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + return y + + +class Inception(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter1, + filter3R, + filter3, + filter5R, + filter5, + proj, + name=None, + data_format="NCHW"): + super(Inception, self).__init__() + self.data_format = data_format + + self._conv1 = ConvLayer( + input_channels, + filter1, + 1, + name="inception_" + name + "_1x1", + data_format=data_format) + self._conv3r = ConvLayer( + input_channels, + filter3R, + 1, + name="inception_" + name + "_3x3_reduce", + data_format=data_format) + self._conv3 = ConvLayer( + filter3R, + filter3, + 3, + name="inception_" + name + "_3x3", + data_format=data_format) + self._conv5r = ConvLayer( + input_channels, + filter5R, + 1, + name="inception_" + name + "_5x5_reduce", + data_format=data_format) + self._conv5 = ConvLayer( + filter5R, + filter5, + 5, + name="inception_" + name + "_5x5", + data_format=data_format) + self._pool = MaxPool2D( + kernel_size=3, stride=1, padding=1, data_format=data_format) + + self._convprj = ConvLayer( + input_channels, + proj, + 1, + name="inception_" + name + "_3x3_proj", + data_format=data_format) + + def forward(self, inputs): + conv1 = self._conv1(inputs) + + conv3r = self._conv3r(inputs) + conv3 = self._conv3(conv3r) + + conv5r = self._conv5r(inputs) + conv5 = self._conv5(conv5r) + + pool = self._pool(inputs) + convprj = self._convprj(pool) + + if self.data_format == "NHWC": + cat = paddle.concat([conv1, conv3, conv5, convprj], axis=3) + else: + cat = paddle.concat([conv1, conv3, conv5, convprj], axis=1) + cat = F.relu(cat) + return cat + + +class GoogLeNetDY(nn.Layer): + def __init__(self, class_num=1000, data_format="NCHW"): + super(GoogLeNetDY, self).__init__() + self.data_format = data_format + self._conv = ConvLayer( + 3, 64, 7, 2, name="conv1", data_format=data_format) + self._pool = MaxPool2D( + kernel_size=3, stride=2, data_format=data_format) + self._conv_1 = ConvLayer( + 64, 64, 1, name="conv2_1x1", data_format=data_format) + self._conv_2 = ConvLayer( + 64, 192, 3, name="conv2_3x3", data_format=data_format) + + self._ince3a = Inception( + 192, + 192, + 64, + 96, + 128, + 16, + 32, + 32, + 
name="ince3a", + data_format=data_format) + self._ince3b = Inception( + 256, + 256, + 128, + 128, + 192, + 32, + 96, + 64, + name="ince3b", + data_format=data_format) + + self._ince4a = Inception( + 480, + 480, + 192, + 96, + 208, + 16, + 48, + 64, + name="ince4a", + data_format=data_format) + self._ince4b = Inception( + 512, + 512, + 160, + 112, + 224, + 24, + 64, + 64, + name="ince4b", + data_format=data_format) + self._ince4c = Inception( + 512, + 512, + 128, + 128, + 256, + 24, + 64, + 64, + name="ince4c", + data_format=data_format) + self._ince4d = Inception( + 512, + 512, + 112, + 144, + 288, + 32, + 64, + 64, + name="ince4d", + data_format=data_format) + self._ince4e = Inception( + 528, + 528, + 256, + 160, + 320, + 32, + 128, + 128, + name="ince4e", + data_format=data_format) + + self._ince5a = Inception( + 832, + 832, + 256, + 160, + 320, + 32, + 128, + 128, + name="ince5a", + data_format=data_format) + self._ince5b = Inception( + 832, + 832, + 384, + 192, + 384, + 48, + 128, + 128, + name="ince5b", + data_format=data_format) + + self._pool_5 = AdaptiveAvgPool2D(1, data_format=data_format) + + self._drop = Dropout(p=0.4, mode="downscale_in_infer") + self.flatten = nn.Flatten() + self._fc_out = Linear( + 1024, + class_num, + weight_attr=xavier(1024, 1, "out"), + bias_attr=ParamAttr(name="out_offset")) + self._pool_o1 = AvgPool2D( + kernel_size=5, stride=3, data_format=data_format) + self._conv_o1 = ConvLayer( + 512, 128, 1, name="conv_o1", data_format=data_format) + self._fc_o1 = Linear( + 1152, + 1024, + weight_attr=xavier(2048, 1, "fc_o1"), + bias_attr=ParamAttr(name="fc_o1_offset")) + self._drop_o1 = Dropout(p=0.7, mode="downscale_in_infer") + self._out1 = Linear( + 1024, + class_num, + weight_attr=xavier(1024, 1, "out1"), + bias_attr=ParamAttr(name="out1_offset")) + self._pool_o2 = AvgPool2D( + kernel_size=5, stride=3, data_format=data_format) + self._conv_o2 = ConvLayer( + 528, 128, 1, name="conv_o2", data_format=data_format) + self._fc_o2 = Linear( + 1152, + 1024, + weight_attr=xavier(2048, 1, "fc_o2"), + bias_attr=ParamAttr(name="fc_o2_offset")) + self._drop_o2 = Dropout(p=0.7, mode="downscale_in_infer") + self._out2 = Linear( + 1024, + class_num, + weight_attr=xavier(1024, 1, "out2"), + bias_attr=ParamAttr(name="out2_offset")) + + def forward(self, inputs): + if self.data_format == "NHWC": + inputs = paddle.transpose(inputs, [0, 2, 3, 1]) + inputs.stop_gradient = True + x = self._conv(inputs) + x = self._pool(x) + x = self._conv_1(x) + x = self._conv_2(x) + x = self._pool(x) + + x = self._ince3a(x) + x = self._ince3b(x) + x = self._pool(x) + + ince4a = self._ince4a(x) + x = self._ince4b(ince4a) + x = self._ince4c(x) + ince4d = self._ince4d(x) + x = self._ince4e(ince4d) + x = self._pool(x) + + x = self._ince5a(x) + ince5b = self._ince5b(x) + + x = self._pool_5(ince5b) + x = self._drop(x) + x = self.flatten(x) + out = self._fc_out(x) + + x = self._pool_o1(ince4a) + x = self._conv_o1(x) + x = self.flatten(x) + x = self._fc_o1(x) + x = F.relu(x) + x = self._drop_o1(x) + out1 = self._out1(x) + + x = self._pool_o2(ince4d) + x = self._conv_o2(x) + x = self.flatten(x) + x = self._fc_o2(x) + x = self._drop_o2(x) + out2 = self._out2(x) + return [out, out1, out2] + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is 
not available. Please use `string` or `boolean` type." + ) + + +def GoogLeNet(pretrained=False, use_ssld=False, **kwargs): + model = GoogLeNetDY(**kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["GoogLeNet"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/hardnet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/hardnet.py new file mode 100755 index 000000000..fa3399e3e --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/hardnet.py @@ -0,0 +1,294 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/PingoLH/Pytorch-HarDNet +# reference: https://arxiv.org/abs/1909.00948 + +import paddle +import paddle.nn as nn + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + 'HarDNet39_ds': + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/HarDNet39_ds_pretrained.pdparams', + 'HarDNet68_ds': + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/HarDNet68_ds_pretrained.pdparams', + 'HarDNet68': + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/HarDNet68_pretrained.pdparams', + 'HarDNet85': + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/HarDNet85_pretrained.pdparams' +} + +__all__ = MODEL_URLS.keys() + + +def ConvLayer(in_channels, + out_channels, + kernel_size=3, + stride=1, + bias_attr=False): + layer = nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + groups=1, + bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels)), + ('relu', nn.ReLU6())) + return layer + + +def DWConvLayer(in_channels, + out_channels, + kernel_size=3, + stride=1, + bias_attr=False): + layer = nn.Sequential( + ('dwconv', nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=1, + groups=out_channels, + bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels))) + return layer + + +def CombConvLayer(in_channels, out_channels, kernel_size=1, stride=1): + layer = nn.Sequential( + ('layer1', ConvLayer( + in_channels, out_channels, kernel_size=kernel_size)), + ('layer2', DWConvLayer( + out_channels, out_channels, stride=stride))) + return layer + + +class HarDBlock(nn.Layer): + def __init__(self, + in_channels, + growth_rate, + grmul, + n_layers, + keepBase=False, + residual_out=False, + dwconv=False): + super().__init__() + self.keepBase = keepBase + self.links = [] + layers_ = [] + self.out_channels = 0 # if upsample else in_channels + for i in range(n_layers): + outch, inch, link = self.get_link(i + 1, in_channels, growth_rate, + grmul) + self.links.append(link) + if dwconv: + layers_.append(CombConvLayer(inch, outch)) + else: + layers_.append(ConvLayer(inch, outch)) + + if (i % 2 == 0) or (i == n_layers - 1): + self.out_channels += outch + # print("Blk out =",self.out_channels) + self.layers = nn.LayerList(layers_) 
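+
+    # get_link() below encodes HarDNet's log-spaced connectivity: layer L
+    # takes its input from every layer L - 2**i for which L % 2**i == 0, and
+    # each additional, more distant link multiplies the layer's width by
+    # grmul (the result is then rounded to an even channel count). For
+    # example, layer 6 links back to layers {5, 4} and layer 8 to
+    # {7, 6, 4, 0}. Only layers with an odd 1-based index, plus the block's
+    # last layer (and the base input when keepBase is set), are concatenated
+    # into the block output, which is why out_channels above accumulates only
+    # when i % 2 == 0 or i == n_layers - 1.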
+ + def get_link(self, layer, base_ch, growth_rate, grmul): + if layer == 0: + return base_ch, 0, [] + out_channels = growth_rate + + link = [] + for i in range(10): + dv = 2**i + if layer % dv == 0: + k = layer - dv + link.append(k) + if i > 0: + out_channels *= grmul + + out_channels = int(int(out_channels + 1) / 2) * 2 + in_channels = 0 + + for i in link: + ch, _, _ = self.get_link(i, base_ch, growth_rate, grmul) + in_channels += ch + + return out_channels, in_channels, link + + def forward(self, x): + layers_ = [x] + + for layer in range(len(self.layers)): + link = self.links[layer] + tin = [] + for i in link: + tin.append(layers_[i]) + if len(tin) > 1: + x = paddle.concat(tin, 1) + else: + x = tin[0] + out = self.layers[layer](x) + layers_.append(out) + + t = len(layers_) + out_ = [] + for i in range(t): + if (i == 0 and self.keepBase) or (i == t - 1) or (i % 2 == 1): + out_.append(layers_[i]) + out = paddle.concat(out_, 1) + + return out + + +class HarDNet(nn.Layer): + def __init__(self, + depth_wise=False, + arch=85, + class_num=1000, + with_pool=True): + super().__init__() + first_ch = [32, 64] + second_kernel = 3 + max_pool = True + grmul = 1.7 + drop_rate = 0.1 + + # HarDNet68 + ch_list = [128, 256, 320, 640, 1024] + gr = [14, 16, 20, 40, 160] + n_layers = [8, 16, 16, 16, 4] + downSamp = [1, 0, 1, 1, 0] + + if arch == 85: + # HarDNet85 + first_ch = [48, 96] + ch_list = [192, 256, 320, 480, 720, 1280] + gr = [24, 24, 28, 36, 48, 256] + n_layers = [8, 16, 16, 16, 16, 4] + downSamp = [1, 0, 1, 0, 1, 0] + drop_rate = 0.2 + + elif arch == 39: + # HarDNet39 + first_ch = [24, 48] + ch_list = [96, 320, 640, 1024] + grmul = 1.6 + gr = [16, 20, 64, 160] + n_layers = [4, 16, 8, 4] + downSamp = [1, 1, 1, 0] + + if depth_wise: + second_kernel = 1 + max_pool = False + drop_rate = 0.05 + + blks = len(n_layers) + self.base = nn.LayerList([]) + + # First Layer: Standard Conv3x3, Stride=2 + self.base.append( + ConvLayer( + in_channels=3, + out_channels=first_ch[0], + kernel_size=3, + stride=2, + bias_attr=False)) + + # Second Layer + self.base.append( + ConvLayer( + first_ch[0], first_ch[1], kernel_size=second_kernel)) + + # Maxpooling or DWConv3x3 downsampling + if max_pool: + self.base.append(nn.MaxPool2D(kernel_size=3, stride=2, padding=1)) + else: + self.base.append(DWConvLayer(first_ch[1], first_ch[1], stride=2)) + + # Build all HarDNet blocks + ch = first_ch[1] + for i in range(blks): + blk = HarDBlock(ch, gr[i], grmul, n_layers[i], dwconv=depth_wise) + ch = blk.out_channels + self.base.append(blk) + + if i == blks - 1 and arch == 85: + self.base.append(nn.Dropout(0.1)) + + self.base.append(ConvLayer(ch, ch_list[i], kernel_size=1)) + ch = ch_list[i] + if downSamp[i] == 1: + if max_pool: + self.base.append(nn.MaxPool2D(kernel_size=2, stride=2)) + else: + self.base.append(DWConvLayer(ch, ch, stride=2)) + + ch = ch_list[blks - 1] + + layers = [] + + if with_pool: + layers.append(nn.AdaptiveAvgPool2D((1, 1))) + + if class_num > 0: + layers.append(nn.Flatten()) + layers.append(nn.Dropout(drop_rate)) + layers.append(nn.Linear(ch, class_num)) + + self.base.append(nn.Sequential(*layers)) + + def forward(self, x): + for layer in self.base: + x = layer(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. 
Please use `string` or `boolean` type." + ) + + +def HarDNet39_ds(pretrained=False, **kwargs): + model = HarDNet(arch=39, depth_wise=True, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HarDNet39_ds"]) + return model + + +def HarDNet68_ds(pretrained=False, **kwargs): + model = HarDNet(arch=68, depth_wise=True, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HarDNet68_ds"]) + return model + + +def HarDNet68(pretrained=False, **kwargs): + model = HarDNet(arch=68, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HarDNet68"]) + return model + + +def HarDNet85(pretrained=False, **kwargs): + model = HarDNet(arch=85, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HarDNet85"]) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/inception_v4.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/inception_v4.py new file mode 100755 index 000000000..476330004 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/inception_v4.py @@ -0,0 +1,479 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1602.07261 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "InceptionV4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/InceptionV4_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + padding=0, + groups=1, + act='relu', + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + bn_name = name + "_bn" + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + "_scale"), + bias_attr=ParamAttr(name=bn_name + "_offset"), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class InceptionStem(nn.Layer): + def __init__(self): + super(InceptionStem, self).__init__() + self._conv_1 = ConvBNLayer( + 3, 32, 3, stride=2, act="relu", name="conv1_3x3_s2") + self._conv_2 = ConvBNLayer(32, 32, 3, act="relu", name="conv2_3x3_s1") + self._conv_3 = ConvBNLayer( + 32, 64, 3, padding=1, act="relu", name="conv3_3x3_s1") + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self._conv2 = ConvBNLayer( + 64, 96, 3, stride=2, act="relu", name="inception_stem1_3x3_s2") + self._conv1_1 
= ConvBNLayer( + 160, 64, 1, act="relu", name="inception_stem2_3x3_reduce") + self._conv1_2 = ConvBNLayer( + 64, 96, 3, act="relu", name="inception_stem2_3x3") + self._conv2_1 = ConvBNLayer( + 160, 64, 1, act="relu", name="inception_stem2_1x7_reduce") + self._conv2_2 = ConvBNLayer( + 64, + 64, (7, 1), + padding=(3, 0), + act="relu", + name="inception_stem2_1x7") + self._conv2_3 = ConvBNLayer( + 64, + 64, (1, 7), + padding=(0, 3), + act="relu", + name="inception_stem2_7x1") + self._conv2_4 = ConvBNLayer( + 64, 96, 3, act="relu", name="inception_stem2_3x3_2") + self._conv3 = ConvBNLayer( + 192, 192, 3, stride=2, act="relu", name="inception_stem3_3x3_s2") + + def forward(self, inputs): + conv = self._conv_1(inputs) + conv = self._conv_2(conv) + conv = self._conv_3(conv) + + pool1 = self._pool(conv) + conv2 = self._conv2(conv) + concat = paddle.concat([pool1, conv2], axis=1) + + conv1 = self._conv1_1(concat) + conv1 = self._conv1_2(conv1) + + conv2 = self._conv2_1(concat) + conv2 = self._conv2_2(conv2) + conv2 = self._conv2_3(conv2) + conv2 = self._conv2_4(conv2) + + concat = paddle.concat([conv1, conv2], axis=1) + + conv1 = self._conv3(concat) + pool1 = self._pool(concat) + + concat = paddle.concat([conv1, pool1], axis=1) + return concat + + +class InceptionA(nn.Layer): + def __init__(self, name): + super(InceptionA, self).__init__() + self._pool = AvgPool2D(kernel_size=3, stride=1, padding=1) + self._conv1 = ConvBNLayer( + 384, 96, 1, act="relu", name="inception_a" + name + "_1x1") + self._conv2 = ConvBNLayer( + 384, 96, 1, act="relu", name="inception_a" + name + "_1x1_2") + self._conv3_1 = ConvBNLayer( + 384, 64, 1, act="relu", name="inception_a" + name + "_3x3_reduce") + self._conv3_2 = ConvBNLayer( + 64, + 96, + 3, + padding=1, + act="relu", + name="inception_a" + name + "_3x3") + self._conv4_1 = ConvBNLayer( + 384, + 64, + 1, + act="relu", + name="inception_a" + name + "_3x3_2_reduce") + self._conv4_2 = ConvBNLayer( + 64, + 96, + 3, + padding=1, + act="relu", + name="inception_a" + name + "_3x3_2") + self._conv4_3 = ConvBNLayer( + 96, + 96, + 3, + padding=1, + act="relu", + name="inception_a" + name + "_3x3_3") + + def forward(self, inputs): + pool1 = self._pool(inputs) + conv1 = self._conv1(pool1) + + conv2 = self._conv2(inputs) + + conv3 = self._conv3_1(inputs) + conv3 = self._conv3_2(conv3) + + conv4 = self._conv4_1(inputs) + conv4 = self._conv4_2(conv4) + conv4 = self._conv4_3(conv4) + + concat = paddle.concat([conv1, conv2, conv3, conv4], axis=1) + return concat + + +class ReductionA(nn.Layer): + def __init__(self): + super(ReductionA, self).__init__() + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self._conv2 = ConvBNLayer( + 384, 384, 3, stride=2, act="relu", name="reduction_a_3x3") + self._conv3_1 = ConvBNLayer( + 384, 192, 1, act="relu", name="reduction_a_3x3_2_reduce") + self._conv3_2 = ConvBNLayer( + 192, 224, 3, padding=1, act="relu", name="reduction_a_3x3_2") + self._conv3_3 = ConvBNLayer( + 224, 256, 3, stride=2, act="relu", name="reduction_a_3x3_3") + + def forward(self, inputs): + pool1 = self._pool(inputs) + conv2 = self._conv2(inputs) + conv3 = self._conv3_1(inputs) + conv3 = self._conv3_2(conv3) + conv3 = self._conv3_3(conv3) + concat = paddle.concat([pool1, conv2, conv3], axis=1) + return concat + + +class InceptionB(nn.Layer): + def __init__(self, name=None): + super(InceptionB, self).__init__() + self._pool = AvgPool2D(kernel_size=3, stride=1, padding=1) + self._conv1 = ConvBNLayer( + 1024, 128, 1, act="relu", name="inception_b" + name + "_1x1") + 
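+        # The _conv3_* and _conv4_* branches below factorize 7x7 convolutions
+        # into a 1x7 followed by a 7x1 (paddings (0, 3) and (3, 0) preserve
+        # the spatial size). The pair covers the same 7x7 receptive field
+        # with 7*C_in*C_mid + 7*C_mid*C_out weights instead of 49*C_in*C_out,
+        # roughly a 3.5x parameter saving at comparable channel widths.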
self._conv2 = ConvBNLayer( + 1024, 384, 1, act="relu", name="inception_b" + name + "_1x1_2") + self._conv3_1 = ConvBNLayer( + 1024, + 192, + 1, + act="relu", + name="inception_b" + name + "_1x7_reduce") + self._conv3_2 = ConvBNLayer( + 192, + 224, (1, 7), + padding=(0, 3), + act="relu", + name="inception_b" + name + "_1x7") + self._conv3_3 = ConvBNLayer( + 224, + 256, (7, 1), + padding=(3, 0), + act="relu", + name="inception_b" + name + "_7x1") + self._conv4_1 = ConvBNLayer( + 1024, + 192, + 1, + act="relu", + name="inception_b" + name + "_7x1_2_reduce") + self._conv4_2 = ConvBNLayer( + 192, + 192, (1, 7), + padding=(0, 3), + act="relu", + name="inception_b" + name + "_1x7_2") + self._conv4_3 = ConvBNLayer( + 192, + 224, (7, 1), + padding=(3, 0), + act="relu", + name="inception_b" + name + "_7x1_2") + self._conv4_4 = ConvBNLayer( + 224, + 224, (1, 7), + padding=(0, 3), + act="relu", + name="inception_b" + name + "_1x7_3") + self._conv4_5 = ConvBNLayer( + 224, + 256, (7, 1), + padding=(3, 0), + act="relu", + name="inception_b" + name + "_7x1_3") + + def forward(self, inputs): + pool1 = self._pool(inputs) + conv1 = self._conv1(pool1) + + conv2 = self._conv2(inputs) + + conv3 = self._conv3_1(inputs) + conv3 = self._conv3_2(conv3) + conv3 = self._conv3_3(conv3) + + conv4 = self._conv4_1(inputs) + conv4 = self._conv4_2(conv4) + conv4 = self._conv4_3(conv4) + conv4 = self._conv4_4(conv4) + conv4 = self._conv4_5(conv4) + + concat = paddle.concat([conv1, conv2, conv3, conv4], axis=1) + return concat + + +class ReductionB(nn.Layer): + def __init__(self): + super(ReductionB, self).__init__() + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self._conv2_1 = ConvBNLayer( + 1024, 192, 1, act="relu", name="reduction_b_3x3_reduce") + self._conv2_2 = ConvBNLayer( + 192, 192, 3, stride=2, act="relu", name="reduction_b_3x3") + self._conv3_1 = ConvBNLayer( + 1024, 256, 1, act="relu", name="reduction_b_1x7_reduce") + self._conv3_2 = ConvBNLayer( + 256, + 256, (1, 7), + padding=(0, 3), + act="relu", + name="reduction_b_1x7") + self._conv3_3 = ConvBNLayer( + 256, + 320, (7, 1), + padding=(3, 0), + act="relu", + name="reduction_b_7x1") + self._conv3_4 = ConvBNLayer( + 320, 320, 3, stride=2, act="relu", name="reduction_b_3x3_2") + + def forward(self, inputs): + pool1 = self._pool(inputs) + + conv2 = self._conv2_1(inputs) + conv2 = self._conv2_2(conv2) + + conv3 = self._conv3_1(inputs) + conv3 = self._conv3_2(conv3) + conv3 = self._conv3_3(conv3) + conv3 = self._conv3_4(conv3) + + concat = paddle.concat([pool1, conv2, conv3], axis=1) + + return concat + + +class InceptionC(nn.Layer): + def __init__(self, name=None): + super(InceptionC, self).__init__() + self._pool = AvgPool2D(kernel_size=3, stride=1, padding=1) + self._conv1 = ConvBNLayer( + 1536, 256, 1, act="relu", name="inception_c" + name + "_1x1") + self._conv2 = ConvBNLayer( + 1536, 256, 1, act="relu", name="inception_c" + name + "_1x1_2") + self._conv3_0 = ConvBNLayer( + 1536, 384, 1, act="relu", name="inception_c" + name + "_1x1_3") + self._conv3_1 = ConvBNLayer( + 384, + 256, (1, 3), + padding=(0, 1), + act="relu", + name="inception_c" + name + "_1x3") + self._conv3_2 = ConvBNLayer( + 384, + 256, (3, 1), + padding=(1, 0), + act="relu", + name="inception_c" + name + "_3x1") + self._conv4_0 = ConvBNLayer( + 1536, 384, 1, act="relu", name="inception_c" + name + "_1x1_4") + self._conv4_00 = ConvBNLayer( + 384, + 448, (1, 3), + padding=(0, 1), + act="relu", + name="inception_c" + name + "_1x3_2") + self._conv4_000 = ConvBNLayer( + 448, + 512, (3, 
1), + padding=(1, 0), + act="relu", + name="inception_c" + name + "_3x1_2") + self._conv4_1 = ConvBNLayer( + 512, + 256, (1, 3), + padding=(0, 1), + act="relu", + name="inception_c" + name + "_1x3_3") + self._conv4_2 = ConvBNLayer( + 512, + 256, (3, 1), + padding=(1, 0), + act="relu", + name="inception_c" + name + "_3x1_3") + + def forward(self, inputs): + pool1 = self._pool(inputs) + conv1 = self._conv1(pool1) + + conv2 = self._conv2(inputs) + + conv3 = self._conv3_0(inputs) + conv3_1 = self._conv3_1(conv3) + conv3_2 = self._conv3_2(conv3) + + conv4 = self._conv4_0(inputs) + conv4 = self._conv4_00(conv4) + conv4 = self._conv4_000(conv4) + conv4_1 = self._conv4_1(conv4) + conv4_2 = self._conv4_2(conv4) + + concat = paddle.concat( + [conv1, conv2, conv3_1, conv3_2, conv4_1, conv4_2], axis=1) + + return concat + + +class InceptionV4DY(nn.Layer): + def __init__(self, class_num=1000): + super(InceptionV4DY, self).__init__() + self._inception_stem = InceptionStem() + + self._inceptionA_1 = InceptionA(name="1") + self._inceptionA_2 = InceptionA(name="2") + self._inceptionA_3 = InceptionA(name="3") + self._inceptionA_4 = InceptionA(name="4") + self._reductionA = ReductionA() + + self._inceptionB_1 = InceptionB(name="1") + self._inceptionB_2 = InceptionB(name="2") + self._inceptionB_3 = InceptionB(name="3") + self._inceptionB_4 = InceptionB(name="4") + self._inceptionB_5 = InceptionB(name="5") + self._inceptionB_6 = InceptionB(name="6") + self._inceptionB_7 = InceptionB(name="7") + self._reductionB = ReductionB() + + self._inceptionC_1 = InceptionC(name="1") + self._inceptionC_2 = InceptionC(name="2") + self._inceptionC_3 = InceptionC(name="3") + + self.avg_pool = AdaptiveAvgPool2D(1) + self._drop = Dropout(p=0.2, mode="downscale_in_infer") + stdv = 1.0 / math.sqrt(1536 * 1.0) + self.out = Linear( + 1536, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="final_fc_weights"), + bias_attr=ParamAttr(name="final_fc_offset")) + + def forward(self, inputs): + x = self._inception_stem(inputs) + + x = self._inceptionA_1(x) + x = self._inceptionA_2(x) + x = self._inceptionA_3(x) + x = self._inceptionA_4(x) + x = self._reductionA(x) + + x = self._inceptionB_1(x) + x = self._inceptionB_2(x) + x = self._inceptionB_3(x) + x = self._inceptionB_4(x) + x = self._inceptionB_5(x) + x = self._inceptionB_6(x) + x = self._inceptionB_7(x) + x = self._reductionB(x) + + x = self._inceptionC_1(x) + x = self._inceptionC_2(x) + x = self._inceptionC_3(x) + + x = self.avg_pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self._drop(x) + x = self.out(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def InceptionV4(pretrained=False, use_ssld=False, **kwargs): + model = InceptionV4DY(**kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["InceptionV4"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/levit.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/levit.py new file mode 100755 index 000000000..47734cc9c --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/levit.py @@ -0,0 +1,590 @@ +# copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/facebookresearch/LeViT +# reference: https://openaccess.thecvf.com/content/ICCV2021/html/Graham_LeViT_A_Vision_Transformer_in_ConvNets_Clothing_for_Faster_Inference_ICCV_2021_paper.html + +import itertools +import math +import warnings + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import TruncatedNormal, Constant +from paddle.regularizer import L2Decay + +from .vision_transformer import trunc_normal_, zeros_, ones_, Identity + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "LeViT_128S": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/LeViT_128S_pretrained.pdparams", + "LeViT_128": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/LeViT_128_pretrained.pdparams", + "LeViT_192": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/LeViT_192_pretrained.pdparams", + "LeViT_256": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/LeViT_256_pretrained.pdparams", + "LeViT_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/LeViT_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +def cal_attention_biases(attention_biases, attention_bias_idxs): + gather_list = [] + attention_bias_t = paddle.transpose(attention_biases, (1, 0)) + nums = attention_bias_idxs.shape[0] + for idx in range(nums): + gather = paddle.gather(attention_bias_t, attention_bias_idxs[idx]) + gather_list.append(gather) + shape0, shape1 = attention_bias_idxs.shape + gather = paddle.concat(gather_list) + return paddle.transpose(gather, (1, 0)).reshape((0, shape0, shape1)) + + +class Conv2d_BN(nn.Sequential): + def __init__(self, + a, + b, + ks=1, + stride=1, + pad=0, + dilation=1, + groups=1, + bn_weight_init=1, + resolution=-10000): + super().__init__() + self.add_sublayer( + 'c', + nn.Conv2D( + a, b, ks, stride, pad, dilation, groups, bias_attr=False)) + bn = nn.BatchNorm2D(b) + ones_(bn.weight) + zeros_(bn.bias) + self.add_sublayer('bn', bn) + + +class Linear_BN(nn.Sequential): + def __init__(self, a, b, bn_weight_init=1): + super().__init__() + self.add_sublayer('c', nn.Linear(a, b, bias_attr=False)) + bn = nn.BatchNorm1D(b) + if bn_weight_init == 0: + zeros_(bn.weight) + else: + ones_(bn.weight) + zeros_(bn.bias) + self.add_sublayer('bn', bn) + + def forward(self, x): + l, bn = self._sub_layers.values() + x = l(x) + return paddle.reshape(bn(x.flatten(0, 1)), x.shape) + + +class BN_Linear(nn.Sequential): + def __init__(self, a, b, bias=True, std=0.02): + super().__init__() + self.add_sublayer('bn', nn.BatchNorm1D(a)) + l = nn.Linear(a, b, bias_attr=bias) + trunc_normal_(l.weight) + if bias: + zeros_(l.bias) + self.add_sublayer('l', l) + + +def b16(n, activation, resolution=224): + return nn.Sequential( + Conv2d_BN( + 3, n // 8, 3, 2, 1, resolution=resolution), + activation(), + Conv2d_BN( + n // 8, n // 4, 3, 2, 1, resolution=resolution // 2), + activation(), + 
            Conv2d_BN(
+                n // 4, n // 2, 3, 2, 1, resolution=resolution // 4),
+            activation(),
+            Conv2d_BN(
+                n // 2, n, 3, 2, 1, resolution=resolution // 8))
+
+
+class Residual(nn.Layer):
+    def __init__(self, m, drop):
+        super().__init__()
+        self.m = m
+        self.drop = drop
+
+    def forward(self, x):
+        if self.training and self.drop > 0:
+            # drop path: keep the residual branch with probability
+            # (1 - drop) and rescale it so the expectation is unchanged
+            y = paddle.rand(
+                shape=[x.shape[0], 1, 1]).__ge__(self.drop).astype("float32")
+            y = y.divide(paddle.full_like(y, 1 - self.drop))
+            return paddle.add(x, self.m(x) * y)
+        else:
+            return paddle.add(x, self.m(x))
+
+
+class Attention(nn.Layer):
+    def __init__(self,
+                 dim,
+                 key_dim,
+                 num_heads=8,
+                 attn_ratio=4,
+                 activation=None,
+                 resolution=14):
+        super().__init__()
+        self.num_heads = num_heads
+        self.scale = key_dim**-0.5
+        self.key_dim = key_dim
+        self.nh_kd = nh_kd = key_dim * num_heads
+        self.d = int(attn_ratio * key_dim)
+        self.dh = int(attn_ratio * key_dim) * num_heads
+        self.attn_ratio = attn_ratio
+        self.h = self.dh + nh_kd * 2
+        self.qkv = Linear_BN(dim, self.h)
+        self.proj = nn.Sequential(
+            activation(), Linear_BN(
+                self.dh, dim, bn_weight_init=0))
+        points = list(itertools.product(range(resolution), range(resolution)))
+        N = len(points)
+        attention_offsets = {}
+        idxs = []
+        for p1 in points:
+            for p2 in points:
+                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
+                if offset not in attention_offsets:
+                    attention_offsets[offset] = len(attention_offsets)
+                idxs.append(attention_offsets[offset])
+        self.attention_biases = self.create_parameter(
+            shape=(num_heads, len(attention_offsets)),
+            default_initializer=zeros_,
+            attr=paddle.ParamAttr(regularizer=L2Decay(0.0)))
+        tensor_idxs = paddle.to_tensor(idxs, dtype='int64')
+        self.register_buffer('attention_bias_idxs',
+                             paddle.reshape(tensor_idxs, [N, N]))
+
+    @paddle.no_grad()
+    def train(self, mode=True):
+        if mode:
+            super().train()
+        else:
+            super().eval()
+        if mode and hasattr(self, 'ab'):
+            del self.ab
+        else:
+            self.ab = cal_attention_biases(self.attention_biases,
+                                           self.attention_bias_idxs)
+
+    def forward(self, x):
+        B, N, C = x.shape
+        qkv = self.qkv(x)
+        qkv = paddle.reshape(qkv,
+                             [B, N, self.num_heads, self.h // self.num_heads])
+        q, k, v = paddle.split(
+            qkv, [self.key_dim, self.key_dim, self.d], axis=3)
+        q = paddle.transpose(q, perm=[0, 2, 1, 3])
+        k = paddle.transpose(k, perm=[0, 2, 1, 3])
+        v = paddle.transpose(v, perm=[0, 2, 1, 3])
+        k_transpose = paddle.transpose(k, perm=[0, 1, 3, 2])
+
+        if self.training:
+            attention_biases = cal_attention_biases(self.attention_biases,
+                                                    self.attention_bias_idxs)
+        else:
+            attention_biases = self.ab
+        attn = (paddle.matmul(q, k_transpose) * self.scale + attention_biases)
+        attn = F.softmax(attn)
+        x = paddle.transpose(paddle.matmul(attn, v), perm=[0, 2, 1, 3])
+        x = paddle.reshape(x, [B, N, self.dh])
+        x = self.proj(x)
+        return x
+
+
+class Subsample(nn.Layer):
+    def __init__(self, stride, resolution):
+        super().__init__()
+        self.stride = stride
+        self.resolution = resolution
+
+    def forward(self, x):
+        B, N, C = x.shape
+        x = paddle.reshape(x, [B, self.resolution, self.resolution, C])
+        end1, end2 = x.shape[1], x.shape[2]
+        x = x[:, 0:end1:self.stride, 0:end2:self.stride]
+        x = paddle.reshape(x, [B, -1, C])
+        return x
+
+
+class AttentionSubsample(nn.Layer):
+    def __init__(self,
+                 in_dim,
+                 out_dim,
+                 key_dim,
+                 num_heads=8,
+                 attn_ratio=2,
+                 activation=None,
+                 stride=2,
+                 resolution=14,
+                 resolution_=7):
+        super().__init__()
+        self.num_heads = num_heads
+        self.scale = key_dim**-0.5
+        self.key_dim = key_dim
+        self.nh_kd = nh_kd = key_dim * num_heads
+        self.d = int(attn_ratio * key_dim)
+        self.dh = int(attn_ratio * key_dim) * self.num_heads
+        self.attn_ratio = attn_ratio
+        self.resolution_ = resolution_
+        self.resolution_2 = resolution_**2
+        h = self.dh + nh_kd
+        self.kv = Linear_BN(in_dim, h)
+
+        self.q = nn.Sequential(
+            Subsample(stride, resolution), Linear_BN(in_dim, nh_kd))
+        self.proj = nn.Sequential(activation(), Linear_BN(self.dh, out_dim))
+
+        self.stride = stride
+        self.resolution = resolution
+        points = list(itertools.product(range(resolution), range(resolution)))
+        points_ = list(
+            itertools.product(range(resolution_), range(resolution_)))
+
+        N = len(points)
+        N_ = len(points_)
+        attention_offsets = {}
+        idxs = []
+        for p1 in points_:
+            for p2 in points:
+                size = 1
+                offset = (abs(p1[0] * stride - p2[0] + (size - 1) / 2),
+                          abs(p1[1] * stride - p2[1] + (size - 1) / 2))
+                if offset not in attention_offsets:
+                    attention_offsets[offset] = len(attention_offsets)
+                idxs.append(attention_offsets[offset])
+        self.attention_biases = self.create_parameter(
+            shape=(num_heads, len(attention_offsets)),
+            default_initializer=zeros_,
+            attr=paddle.ParamAttr(regularizer=L2Decay(0.0)))
+
+        tensor_idxs_ = paddle.to_tensor(idxs, dtype='int64')
+        self.register_buffer('attention_bias_idxs',
+                             paddle.reshape(tensor_idxs_, [N_, N]))
+
+    @paddle.no_grad()
+    def train(self, mode=True):
+        if mode:
+            super().train()
+        else:
+            super().eval()
+        if mode and hasattr(self, 'ab'):
+            del self.ab
+        else:
+            self.ab = cal_attention_biases(self.attention_biases,
+                                           self.attention_bias_idxs)
+
+    def forward(self, x):
+        B, N, C = x.shape
+        kv = self.kv(x)
+        kv = paddle.reshape(kv, [B, N, self.num_heads, -1])
+        k, v = paddle.split(kv, [self.key_dim, self.d], axis=3)
+        k = paddle.transpose(k, perm=[0, 2, 1, 3])  # BHNC
+        v = paddle.transpose(v, perm=[0, 2, 1, 3])
+        q = paddle.reshape(
+            self.q(x), [B, self.resolution_2, self.num_heads, self.key_dim])
+        q = paddle.transpose(q, perm=[0, 2, 1, 3])
+
+        if self.training:
+            attention_biases = cal_attention_biases(self.attention_biases,
+                                                    self.attention_bias_idxs)
+        else:
+            attention_biases = self.ab
+
+        attn = (paddle.matmul(
+            q, paddle.transpose(
+                k, perm=[0, 1, 3, 2]))) * self.scale + attention_biases
+        attn = F.softmax(attn)
+
+        x = paddle.reshape(
+            paddle.transpose(
+                paddle.matmul(attn, v), perm=[0, 2, 1, 3]), [B, -1, self.dh])
+        x = self.proj(x)
+        return x
+
+
+class LeViT(nn.Layer):
+    """ Vision Transformer with support for patch or hybrid CNN input stage
+    """
+
+    def __init__(self,
+                 img_size=224,
+                 patch_size=16,
+                 in_chans=3,
+                 class_num=1000,
+                 embed_dim=[192],
+                 key_dim=[64],
+                 depth=[12],
+                 num_heads=[3],
+                 attn_ratio=[2],
+                 mlp_ratio=[2],
+                 hybrid_backbone=None,
+                 down_ops=None,
+                 attention_activation=nn.Hardswish,
+                 mlp_activation=nn.Hardswish,
+                 distillation=True,
+                 drop_path=0):
+        super().__init__()
+
+        self.class_num = class_num
+        self.num_features = embed_dim[-1]
+        self.embed_dim = embed_dim
+        self.distillation = distillation
+
+        self.patch_embed = hybrid_backbone
+
+        self.blocks = []
+        # copy before appending the sentinel so a caller's list (or a shared
+        # default) is never mutated across instantiations
+        down_ops = [] if down_ops is None else list(down_ops)
+        down_ops.append([''])
+        resolution = img_size // patch_size
+        for i, (ed, kd, dpth, nh, ar, mr, do) in enumerate(
+                zip(embed_dim, key_dim, depth, num_heads, attn_ratio,
+                    mlp_ratio, down_ops)):
+            for _ in range(dpth):
+                self.blocks.append(
+                    Residual(
+                        Attention(
+                            ed,
+                            kd,
+                            nh,
+                            attn_ratio=ar,
+                            activation=attention_activation,
+                            resolution=resolution, ),
+                        drop_path))
+                if mr > 0:
+                    h = int(ed * mr)
+                    self.blocks.append(
+ Residual( + nn.Sequential( + Linear_BN(ed, h), + mlp_activation(), + Linear_BN( + h, ed, bn_weight_init=0), ), + drop_path)) + if do[0] == 'Subsample': + #('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride) + resolution_ = (resolution - 1) // do[5] + 1 + self.blocks.append( + AttentionSubsample( + *embed_dim[i:i + 2], + key_dim=do[1], + num_heads=do[2], + attn_ratio=do[3], + activation=attention_activation, + stride=do[5], + resolution=resolution, + resolution_=resolution_)) + resolution = resolution_ + if do[4] > 0: # mlp_ratio + h = int(embed_dim[i + 1] * do[4]) + self.blocks.append( + Residual( + nn.Sequential( + Linear_BN(embed_dim[i + 1], h), + mlp_activation(), + Linear_BN( + h, embed_dim[i + 1], bn_weight_init=0), ), + drop_path)) + self.blocks = nn.Sequential(*self.blocks) + + # Classifier head + self.head = BN_Linear(embed_dim[-1], + class_num) if class_num > 0 else Identity() + if distillation: + self.head_dist = BN_Linear( + embed_dim[-1], class_num) if class_num > 0 else Identity() + + def forward(self, x): + x = self.patch_embed(x) + x = x.flatten(2) + x = paddle.transpose(x, perm=[0, 2, 1]) + x = self.blocks(x) + x = x.mean(1) + + x = paddle.reshape(x, [-1, self.embed_dim[-1]]) + if self.distillation: + x = self.head(x), self.head_dist(x) + if not self.training: + x = (x[0] + x[1]) / 2 + else: + x = self.head(x) + return x + + +def model_factory(C, D, X, N, drop_path, class_num, distillation): + embed_dim = [int(x) for x in C.split('_')] + num_heads = [int(x) for x in N.split('_')] + depth = [int(x) for x in X.split('_')] + act = nn.Hardswish + model = LeViT( + patch_size=16, + embed_dim=embed_dim, + num_heads=num_heads, + key_dim=[D] * 3, + depth=depth, + attn_ratio=[2, 2, 2], + mlp_ratio=[2, 2, 2], + down_ops=[ + #('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride) + ['Subsample', D, embed_dim[0] // D, 4, 2, 2], + ['Subsample', D, embed_dim[1] // D, 4, 2, 2], + ], + attention_activation=act, + mlp_activation=act, + hybrid_backbone=b16(embed_dim[0], activation=act), + class_num=class_num, + drop_path=drop_path, + distillation=distillation) + + return model + + +specification = { + 'LeViT_128S': { + 'C': '128_256_384', + 'D': 16, + 'N': '4_6_8', + 'X': '2_3_4', + 'drop_path': 0 + }, + 'LeViT_128': { + 'C': '128_256_384', + 'D': 16, + 'N': '4_8_12', + 'X': '4_4_4', + 'drop_path': 0 + }, + 'LeViT_192': { + 'C': '192_288_384', + 'D': 32, + 'N': '3_5_6', + 'X': '4_4_4', + 'drop_path': 0 + }, + 'LeViT_256': { + 'C': '256_384_512', + 'D': 32, + 'N': '4_6_8', + 'X': '4_4_4', + 'drop_path': 0 + }, + 'LeViT_384': { + 'C': '384_512_768', + 'D': 32, + 'N': '6_9_12', + 'X': '4_4_4', + 'drop_path': 0.1 + }, +} + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def LeViT_128S(pretrained=False, + use_ssld=False, + class_num=1000, + distillation=False, + **kwargs): + model = model_factory( + **specification['LeViT_128S'], + class_num=class_num, + distillation=distillation) + _load_pretrained( + pretrained, model, MODEL_URLS["LeViT_128S"], use_ssld=use_ssld) + return model + + +def LeViT_128(pretrained=False, + use_ssld=False, + class_num=1000, + distillation=False, + **kwargs): + model = model_factory( + **specification['LeViT_128'], + class_num=class_num, + distillation=distillation) + _load_pretrained( + pretrained, model, MODEL_URLS["LeViT_128"], use_ssld=use_ssld) + return model + + +def LeViT_192(pretrained=False, + use_ssld=False, + class_num=1000, + distillation=False, + **kwargs): + model = model_factory( + **specification['LeViT_192'], + class_num=class_num, + distillation=distillation) + _load_pretrained( + pretrained, model, MODEL_URLS["LeViT_192"], use_ssld=use_ssld) + return model + + +def LeViT_256(pretrained=False, + use_ssld=False, + class_num=1000, + distillation=False, + **kwargs): + model = model_factory( + **specification['LeViT_256'], + class_num=class_num, + distillation=distillation) + _load_pretrained( + pretrained, model, MODEL_URLS["LeViT_256"], use_ssld=use_ssld) + return model + + +def LeViT_384(pretrained=False, + use_ssld=False, + class_num=1000, + distillation=False, + **kwargs): + model = model_factory( + **specification['LeViT_384'], + class_num=class_num, + distillation=distillation) + _load_pretrained( + pretrained, model, MODEL_URLS["LeViT_384"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/micronet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/micronet.py new file mode 100755 index 000000000..8069bb860 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/micronet.py @@ -0,0 +1,618 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
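+# NET_CONFIG row layout (unpacked in MicroNet.__init__ below as
+#   s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r):
+#   s=stride, n=repeats, c=output channels, ks=depthwise kernel size,
+#   (c1, c2)=channel expansion factors, (g1, g2)=channels per group,
+#   (c3, g3, g4)=1x1 group-conv width and group counts,
+#   (y1, y2, y3)=DYShiftMax switches, r=activation reduction ratio.
+#
+# Minimal usage sketch (assumes the ppcls package shipped with this example
+# is importable; any of the MicroNet_M* factories below work the same way):
+#
+#     import paddle
+#     from ppcls.arch.backbone.model_zoo.micronet import MicroNet_M0
+#     model = MicroNet_M0(pretrained=False)
+#     logits = model(paddle.randn([1, 3, 224, 224]))  # shape [1, 1000]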
+# +# Code was heavily based on https://github.com/liyunsheng13/micronet +# reference: https://arxiv.org/pdf/2108.05894 + +import math + +import paddle +import paddle.nn as nn + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MicroNet_M0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MicroNet_M0_pretrained.pdparams", + "MicroNet_M1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MicroNet_M1_pretrained.pdparams", + "MicroNet_M2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MicroNet_M2_pretrained.pdparams", + "MicroNet_M3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MicroNet_M3_pretrained.pdparams", +} + +__all__ = MODEL_URLS.keys() + +NET_CONFIG = { + "msnx_dy6_exp4_4M_221": [ + #s, n, c, ks, c1, c2, g1, g2, c3, g3, g4,y1,y2,y3,r + [2, 1, 8, 3, 2, 2, 0, 4, 8, 2, 2, 2, 0, 1, + 1], # 6 ->12(0,0) ->24 ->8(4,2) ->8 + [2, 1, 12, 3, 2, 2, 0, 8, 12, 4, 4, 2, 2, 1, + 1], # 8 ->16(0,0) ->32 ->16(4,4) ->12 + [2, 1, 16, 5, 2, 2, 0, 12, 16, 4, 4, 2, 2, 1, + 1], # 16 ->32(0,0) ->64 ->16(8,2) ->16 + [1, 1, 32, 5, 1, 4, 4, 4, 32, 4, 4, 2, 2, 1, + 1], # 16 ->16(2,8) ->96 ->32(8,4) ->32 + [2, 1, 64, 5, 1, 4, 8, 8, 64, 8, 8, 2, 2, 1, + 1], # 32 ->32(2,16) ->192 ->64(12,4) ->64 + [1, 1, 96, 3, 1, 4, 8, 8, 96, 8, 8, 2, 2, 1, + 2], # 64 ->64(3,16) ->384 ->96(16,6) ->96 + [1, 1, 384, 3, 1, 4, 12, 12, 0, 0, 0, 2, 2, 1, + 2], # 96 ->96(4,24) ->384 + ], + "msnx_dy6_exp6_6M_221": [ + #s, n, c, ks, c1, c2, g1, g2, c3, g3, g4 + [2, 1, 8, 3, 2, 2, 0, 6, 8, 2, 2, 2, 0, 1, + 1], # 6 ->12(0,0) ->24 ->8(4,2) ->8 + [2, 1, 16, 3, 2, 2, 0, 8, 16, 4, 4, 2, 2, 1, + 1], # 8 ->16(0,0) ->32 ->16(4,4) ->16 + [2, 1, 16, 5, 2, 2, 0, 16, 16, 4, 4, 2, 2, 1, + 1], # 16 ->32(0,0) ->64 ->16(8,2) ->16 + [1, 1, 32, 5, 1, 6, 4, 4, 32, 4, 4, 2, 2, 1, + 1], # 16 ->16(2,8) ->96 ->32(8,4) ->32 + [2, 1, 64, 5, 1, 6, 8, 8, 64, 8, 8, 2, 2, 1, + 1], # 32 ->32(2,16) ->192 ->64(12,4) ->64 + [1, 1, 96, 3, 1, 6, 8, 8, 96, 8, 8, 2, 2, 1, + 2], # 64 ->64(3,16) ->384 ->96(16,6) ->96 + [1, 1, 576, 3, 1, 6, 12, 12, 0, 0, 0, 2, 2, 1, + 2], # 96 ->96(4,24) ->576 + ], + "msnx_dy9_exp6_12M_221": [ + #s, n, c, ks, c1, c2, g1, g2, c3, g3, g4 + [2, 1, 12, 3, 2, 2, 0, 8, 12, 4, 4, 2, 0, 1, + 1], # 8 ->16(0,0) ->32 ->12(4,3) ->12 + [2, 1, 16, 3, 2, 2, 0, 12, 16, 4, 4, 2, 2, 1, + 1], # 12 ->24(0,0) ->48 ->16(8,2) ->16 + [1, 1, 24, 3, 2, 2, 0, 16, 24, 4, 4, 2, 2, 1, + 1], # 16 ->16(0,0) ->64 ->24(8,3) ->24 + [2, 1, 32, 5, 1, 6, 6, 6, 32, 4, 4, 2, 2, 1, + 1], # 24 ->24(2,12) ->144 ->32(16,2) ->32 + [1, 1, 32, 5, 1, 6, 8, 8, 32, 4, 4, 2, 2, 1, + 2], # 32 ->32(2,16) ->192 ->32(16,2) ->32 + [1, 1, 64, 5, 1, 6, 8, 8, 64, 8, 8, 2, 2, 1, + 2], # 32 ->32(2,16) ->192 ->64(12,4) ->64 + [2, 1, 96, 5, 1, 6, 8, 8, 96, 8, 8, 2, 2, 1, + 2], # 64 ->64(4,12) ->384 ->96(16,5) ->96 + [1, 1, 128, 3, 1, 6, 12, 12, 128, 8, 8, 2, 2, 1, + 2], # 96 ->96(5,16) ->576 ->128(16,8) ->128 + [1, 1, 768, 3, 1, 6, 16, 16, 0, 0, 0, 2, 2, 1, + 2], # 128 ->128(4,32) ->768 + ], + "msnx_dy12_exp6_20M_020": [ + #s, n, c, ks, c1, c2, g1, g2, c3, g3, g4 + [2, 1, 16, 3, 2, 2, 0, 12, 16, 4, 4, 0, 2, 0, + 1], # 12 ->24(0,0) ->48 ->16(8,2) ->16 + [2, 1, 24, 3, 2, 2, 0, 16, 24, 4, 4, 0, 2, 0, + 1], # 16 ->32(0,0) ->64 ->24(8,3) ->24 + [1, 1, 24, 3, 2, 2, 0, 24, 24, 4, 4, 0, 2, 0, + 1], # 24 ->48(0,0) ->96 ->24(8,3) ->24 + [2, 1, 32, 5, 1, 6, 6, 6, 32, 4, 4, 0, 2, 0, + 1], # 24 ->24(2,12) ->144 ->32(16,2) ->32 + [1, 1, 32, 5, 1, 6, 8, 8, 32, 4, 4, 0, 2, 0, + 2], # 32 ->32(2,16) ->192 ->32(16,2) ->32 + [1, 1, 64, 5, 1, 6, 8, 
8, 48, 8, 8, 0, 2, 0, + 2], # 32 ->32(2,16) ->192 ->48(12,4) ->64 + [1, 1, 80, 5, 1, 6, 8, 8, 80, 8, 8, 0, 2, 0, + 2], # 48 ->48(3,16) ->288 ->80(16,5) ->80 + [1, 1, 80, 5, 1, 6, 10, 10, 80, 8, 8, 0, 2, 0, + 2], # 80 ->80(4,20) ->480 ->80(20,4) ->80 + [2, 1, 120, 5, 1, 6, 10, 10, 120, 10, 10, 0, 2, 0, + 2], # 80 ->80(4,20) ->480 ->128(16,8) ->120 + [1, 1, 120, 5, 1, 6, 12, 12, 120, 10, 10, 0, 2, 0, + 2], # 120 ->128(4,32) ->720 ->128(32,4) ->120 + [1, 1, 144, 3, 1, 6, 12, 12, 144, 12, 12, 0, 2, 0, + 2], # 120 ->128(4,32) ->720 ->160(32,5) ->144 + [1, 1, 864, 3, 1, 6, 12, 12, 0, 0, 0, 0, 2, 0, + 2], # 144 ->144(5,32) ->864 + ], +} + +ACTIVATION_CONFIG = { + "msnx_dy6_exp4_4M_221": { + "act_max": 2.0, + "reduction": 8, + "init_ab3": [1.0, 0.0], + "init_a": [1.0, 1.0], + "init_b": [0.0, 0.0], + }, + "msnx_dy6_exp6_6M_221": { + "act_max": 2.0, + "reduction": 8, + "init_ab3": [1.0, 0.0], + "init_a": [1.0, 1.0], + "init_b": [0.0, 0.0], + }, + "msnx_dy9_exp6_12M_221": { + "act_max": 2.0, + "reduction": 8, + "init_ab3": [1.0, 0.0], + "init_a": [1.0, 1.0], + "init_b": [0.0, 0.0], + }, + "msnx_dy12_exp6_20M_020": { + "act_max": 2.0, + "reduction": 8, + "init_ab3": [1.0, 0.0], + "init_a": [1.0, 0.5], + "init_b": [0.0, 0.5], + }, +} + + +def _make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class MaxGroupPooling(nn.Layer): + def __init__(self, channel_per_group=2): + super().__init__() + self.channel_per_group = channel_per_group + + def forward(self, x): + if self.channel_per_group == 1: + return x + + # max op + b, c, h, w = x.shape + + # reshape + y = x.reshape([b, c // self.channel_per_group, -1, h, w]) + out, _ = paddle.max(y, axis=2) + return out + + +class SwishLinear(nn.Layer): + def __init__(self, inp, oup): + super().__init__() + self.linear = nn.Sequential( + nn.Linear(inp, oup), nn.BatchNorm1D(oup), nn.Hardswish()) + + def forward(self, x): + return self.linear(x) + + +class StemLayer(nn.Layer): + def __init__(self, inp, oup, stride, groups=(4, 4)): + super().__init__() + g1, g2 = groups + self.stem = nn.Sequential( + SpatialSepConvSF(inp, groups, 3, stride), + MaxGroupPooling(2) if g1 * g2 == 2 * oup else nn.ReLU6()) + + def forward(self, x): + out = self.stem(x) + return out + + +class GroupConv(nn.Layer): + def __init__(self, inp, oup, groups=2): + super().__init__() + self.inp = inp + self.oup = oup + self.groups = groups + self.conv = nn.Sequential( + nn.Conv2D( + inp, oup, 1, groups=self.groups[0], bias_attr=False), + nn.BatchNorm2D(oup)) + + def forward(self, x): + x = self.conv(x) + return x + + +class ChannelShuffle(nn.Layer): + def __init__(self, groups): + super().__init__() + self.groups = groups + + def forward(self, x): + b, c, h, w = x.shape + + channels_per_group = c // self.groups + + # reshape + x = x.reshape([b, self.groups, channels_per_group, h, w]) + x = x.transpose([0, 2, 1, 3, 4]) + out = x.reshape([b, c, h, w]) + + return out + + +class SpatialSepConvSF(nn.Layer): + def __init__(self, inp, oups, kernel_size, stride): + super().__init__() + + oup1, oup2 = oups + self.conv = nn.Sequential( + nn.Conv2D( + inp, + oup1, (kernel_size, 1), (stride, 1), (kernel_size // 2, 0), + groups=1, + bias_attr=False), + nn.BatchNorm2D(oup1), + nn.Conv2D( + oup1, + oup1 * oup2, (1, kernel_size), (1, stride), + (0, kernel_size // 2), + groups=oup1, + bias_attr=False), + nn.BatchNorm2D(oup1 * oup2), + 
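            # shuffle interleaves channels across the oup1 groups of the (1, k) conv +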
ChannelShuffle(oup1)) + + def forward(self, x): + out = self.conv(x) + return out + + +class DepthSpatialSepConv(nn.Layer): + def __init__(self, inp, expand, kernel_size, stride): + super().__init__() + + exp1, exp2 = expand + hidden_dim = inp * exp1 + oup = inp * exp1 * exp2 + + self.conv = nn.Sequential( + nn.Conv2D( + inp, + inp * exp1, (kernel_size, 1), (stride, 1), + (kernel_size // 2, 0), + groups=inp, + bias_attr=False), + nn.BatchNorm2D(inp * exp1), + nn.Conv2D( + hidden_dim, + oup, (1, kernel_size), (1, stride), (0, kernel_size // 2), + groups=hidden_dim, + bias_attr=False), + nn.BatchNorm2D(oup)) + + def forward(self, x): + out = self.conv(x) + return out + + +class DYShiftMax(nn.Layer): + def __init__(self, + inp, + oup, + reduction=4, + act_max=1.0, + act_relu=True, + init_a=[0.0, 0.0], + init_b=[0.0, 0.0], + relu_before_pool=False, + g=None, + expansion=False): + super().__init__() + self.oup = oup + self.act_max = act_max * 2 + self.act_relu = act_relu + self.avg_pool = nn.Sequential(nn.ReLU() if relu_before_pool == True + else nn.Identity(), + nn.AdaptiveAvgPool2D(1)) + + self.exp = 4 if act_relu else 2 + self.init_a = init_a + self.init_b = init_b + + # determine squeeze + squeeze = _make_divisible(inp // reduction, 4) + if squeeze < 4: + squeeze = 4 + + self.fc = nn.Sequential( + nn.Linear(inp, squeeze), + nn.ReLU(), nn.Linear(squeeze, oup * self.exp), nn.Hardsigmoid()) + if g is None: + g = 1 + self.g = g[1] + if self.g != 1 and expansion: + self.g = inp // self.g + self.gc = inp // self.g + index = paddle.to_tensor(list(range(inp))).reshape([1, inp, 1, 1]) + index = index.reshape([1, self.g, self.gc, 1, 1]) + indexgs = paddle.split(index, [1, self.g - 1], axis=1) + indexgs = paddle.concat((indexgs[1], indexgs[0]), axis=1) + indexs = paddle.split(indexgs, [1, self.gc - 1], axis=2) + indexs = paddle.concat((indexs[1], indexs[0]), axis=2) + self.index = indexs.reshape([inp]).astype(paddle.int64) + self.expansion = expansion + + def forward(self, x): + x_in = x + x_out = x + + b, c, _, _ = x_in.shape + y = self.avg_pool(x_in).reshape([b, c]) + y = self.fc(y).reshape([b, self.oup * self.exp, 1, 1]) + y = (y - 0.5) * self.act_max + + n2, c2, h2, w2 = x_out.shape + x2 = paddle.index_select(x_out, self.index, axis=1) + + if self.exp == 4: + a1, b1, a2, b2 = paddle.split(y, 4, axis=1) + + a1 = a1 + self.init_a[0] + a2 = a2 + self.init_a[1] + + b1 = b1 + self.init_b[0] + b2 = b2 + self.init_b[1] + + z1 = x_out * a1 + x2 * b1 + z2 = x_out * a2 + x2 * b2 + + out = paddle.maximum(z1, z2) + + elif self.exp == 2: + a1, b1 = paddle.split(y, 2, axis=1) + a1 = a1 + self.init_a[0] + b1 = b1 + self.init_b[0] + out = x_out * a1 + x2 * b1 + + return out + + +class DYMicroBlock(nn.Layer): + def __init__(self, + inp, + oup, + kernel_size=3, + stride=1, + ch_exp=(2, 2), + ch_per_group=4, + groups_1x1=(1, 1), + dy=[0, 0, 0], + ratio=1.0, + activation_cfg=None): + super().__init__() + + self.identity = stride == 1 and inp == oup + + y1, y2, y3 = dy + act_max = activation_cfg["act_max"] + act_reduction = activation_cfg["reduction"] * ratio + init_a = activation_cfg["init_a"] + init_b = activation_cfg["init_b"] + init_ab3 = activation_cfg["init_ab3"] + + t1 = ch_exp + gs1 = ch_per_group + hidden_fft, g1, g2 = groups_1x1 + + hidden_dim1 = inp * t1[0] + hidden_dim2 = inp * t1[0] * t1[1] + + if gs1[0] == 0: + self.layers = nn.Sequential( + DepthSpatialSepConv(inp, t1, kernel_size, stride), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=act_max, + act_relu=True if y2 == 2 else False, + 
init_a=init_a, + reduction=act_reduction, + init_b=init_b, + g=gs1, + expansion=False) if y2 > 0 else nn.ReLU6(), + ChannelShuffle(gs1[1]), + ChannelShuffle(hidden_dim2 // 2) if y2 != 0 else nn.Identity(), + GroupConv(hidden_dim2, oup, (g1, g2)), + DYShiftMax( + oup, + oup, + act_max=act_max, + act_relu=False, + init_a=[init_ab3[0], 0.0], + reduction=act_reduction // 2, + init_b=[init_ab3[1], 0.0], + g=(g1, g2), + expansion=False) if y3 > 0 else nn.Identity(), + ChannelShuffle(g2), + ChannelShuffle(oup // 2) + if oup % 2 == 0 and y3 != 0 else nn.Identity()) + elif g2 == 0: + self.layers = nn.Sequential( + GroupConv(inp, hidden_dim2, gs1), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=act_max, + act_relu=False, + init_a=[init_ab3[0], 0.0], + reduction=act_reduction, + init_b=[init_ab3[1], 0.0], + g=gs1, + expansion=False) if y3 > 0 else nn.Identity()) + else: + self.layers = nn.Sequential( + GroupConv(inp, hidden_dim2, gs1), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=act_max, + act_relu=True if y1 == 2 else False, + init_a=init_a, + reduction=act_reduction, + init_b=init_b, + g=gs1, + expansion=False) if y1 > 0 else nn.ReLU6(), + ChannelShuffle(gs1[1]), + DepthSpatialSepConv(hidden_dim2, (1, 1), kernel_size, stride), + nn.Identity(), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=act_max, + act_relu=True if y2 == 2 else False, + init_a=init_a, + reduction=act_reduction, + init_b=init_b, + g=gs1, + expansion=True) if y2 > 0 else nn.ReLU6(), + ChannelShuffle(hidden_dim2 // 4) + if y1 != 0 and y2 != 0 else nn.Identity() + if y1 == 0 and y2 == 0 else ChannelShuffle(hidden_dim2 // 2), + GroupConv(hidden_dim2, oup, (g1, g2)), + DYShiftMax( + oup, + oup, + act_max=act_max, + act_relu=False, + init_a=[init_ab3[0], 0.0], + reduction=act_reduction // 2 + if oup < hidden_dim2 else act_reduction, + init_b=[init_ab3[1], 0.0], + g=(g1, g2), + expansion=False) if y3 > 0 else nn.Identity(), + ChannelShuffle(g2), + ChannelShuffle(oup // 2) if y3 != 0 else nn.Identity()) + + def forward(self, x): + out = self.layers(x) + if self.identity: + out = out + x + return out + + +class MicroNet(nn.Layer): + def __init__(self, + net_cfg, + activation_cfg, + input_size=224, + class_num=1000, + stem_ch=16, + stem_groups=[4, 8], + out_ch=1024, + dropout_rate=0.0): + super().__init__() + + # building first layer + assert input_size % 32 == 0 + input_channel = stem_ch + layers = [StemLayer(3, input_channel, stride=2, groups=stem_groups)] + + for s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r in net_cfg: + for i in range(n): + layers.append( + DYMicroBlock( + input_channel, + c, + kernel_size=ks, + stride=s if i == 0 else 1, + ch_exp=(c1, c2), + ch_per_group=(g1, g2), + groups_1x1=(c3, g3, g4), + dy=[y1, y2, y3], + ratio=r, + activation_cfg=activation_cfg)) + input_channel = c + self.features = nn.Sequential(*layers) + + self.avgpool = nn.Sequential(nn.ReLU6(), + nn.AdaptiveAvgPool2D(1), nn.Hardswish()) + + # building last several layers + self.classifier = nn.Sequential( + SwishLinear(input_channel, out_ch), + nn.Dropout(dropout_rate), SwishLinear(out_ch, class_num)) + + self.apply(self._initialize_weights) + + def _initialize_weights(self, m): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + nn.initializer.Normal(std=math.sqrt(2. 
/ n))(m.weight) + elif isinstance(m, nn.Linear): + nn.initializer.Normal(std=0.01)(m.weight) + + def forward(self, x): + x = self.features(x) + x = self.avgpool(x) + x = self.classifier(x.flatten(1)) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MicroNet_M0(pretrained=False, use_ssld=False, **kwargs): + model = MicroNet( + NET_CONFIG["msnx_dy6_exp4_4M_221"], + ACTIVATION_CONFIG["msnx_dy6_exp4_4M_221"], + stem_ch=4, + stem_groups=[2, 2], + out_ch=640, + dropout_rate=0.05, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MicroNet_M0"], use_ssld) + return model + + +def MicroNet_M1(pretrained=False, use_ssld=False, **kwargs): + model = MicroNet( + NET_CONFIG["msnx_dy6_exp6_6M_221"], + ACTIVATION_CONFIG["msnx_dy6_exp6_6M_221"], + stem_ch=6, + stem_groups=[3, 2], + out_ch=960, + dropout_rate=0.05, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MicroNet_M1"], use_ssld) + return model + + +def MicroNet_M2(pretrained=False, use_ssld=False, **kwargs): + model = MicroNet( + NET_CONFIG["msnx_dy9_exp6_12M_221"], + ACTIVATION_CONFIG["msnx_dy9_exp6_12M_221"], + stem_ch=8, + stem_groups=[4, 2], + out_ch=1024, + dropout_rate=0.1, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MicroNet_M2"], use_ssld) + return model + + +def MicroNet_M3(pretrained=False, use_ssld=False, **kwargs): + model = MicroNet( + NET_CONFIG["msnx_dy12_exp6_20M_020"], + ACTIVATION_CONFIG["msnx_dy12_exp6_20M_020"], + stem_ch=12, + stem_groups=[4, 3], + out_ch=1024, + dropout_rate=0.1, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MicroNet_M3"], use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/mixnet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/mixnet.py new file mode 100755 index 000000000..201630cf9 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/mixnet.py @@ -0,0 +1,812 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1907.09595 + +import os +from inspect import isfunction +from functools import reduce +import paddle +import paddle.nn as nn + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MixNet_S": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MixNet_S_pretrained.pdparams", + "MixNet_M": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MixNet_M_pretrained.pdparams", + "MixNet_L": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MixNet_L_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class Identity(nn.Layer): + """ + Identity block. 
+ """ + + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +def round_channels(channels, divisor=8): + """ + Round weighted channel number (make divisible operation). + + Parameters: + ---------- + channels : int or float + Original number of channels. + divisor : int, default 8 + Alignment value. + + Returns: + ------- + int + Weighted number of channels. + """ + rounded_channels = max( + int(channels + divisor / 2.0) // divisor * divisor, divisor) + if float(rounded_channels) < 0.9 * channels: + rounded_channels += divisor + return rounded_channels + + +def get_activation_layer(activation): + """ + Create activation layer from string/function. + + Parameters: + ---------- + activation : function, or str, or nn.Module + Activation function or name of activation function. + + Returns: + ------- + nn.Module + Activation layer. + """ + assert activation is not None + if isfunction(activation): + return activation() + elif isinstance(activation, str): + if activation == "relu": + return nn.ReLU() + elif activation == "relu6": + return nn.ReLU6() + elif activation == "swish": + return nn.Swish() + elif activation == "hswish": + return nn.Hardswish() + elif activation == "sigmoid": + return nn.Sigmoid() + elif activation == "hsigmoid": + return nn.Hardsigmoid() + elif activation == "identity": + return Identity() + else: + raise NotImplementedError() + else: + assert isinstance(activation, nn.Layer) + return activation + + +class ConvBlock(nn.Layer): + """ + Standard convolution block with Batch normalization and activation. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int or tuple/list of 2 int + Convolution window size. + stride : int or tuple/list of 2 int + Strides of the convolution. + padding : int, or tuple/list of 2 int, or tuple/list of 4 int + Padding value for convolution layer. + dilation : int or tuple/list of 2 int, default 1 + Dilation value for convolution layer. + groups : int, default 1 + Number of groups. + bias : bool, default False + Whether the layer uses a bias vector. + use_bn : bool, default True + Whether to use BatchNorm layer. + bn_eps : float, default 1e-5 + Small float added to variance in Batch norm. + activation : function or str or None, default nn.ReLU() + Activation function or name of activation function. 
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=nn.ReLU()): + super(ConvBlock, self).__init__() + self.activate = (activation is not None) + self.use_bn = use_bn + self.use_pad = (isinstance(padding, (list, tuple)) and + (len(padding) == 4)) + + if self.use_pad: + self.pad = padding + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=bias, + weight_attr=None) + if self.use_bn: + self.bn = nn.BatchNorm2D(num_features=out_channels, epsilon=bn_eps) + if self.activate: + self.activ = get_activation_layer(activation) + + def forward(self, x): + x = self.conv(x) + if self.use_bn: + x = self.bn(x) + if self.activate: + x = self.activ(x) + return x + + +class SEBlock(nn.Layer): + def __init__(self, + channels, + reduction=16, + mid_channels=None, + round_mid=False, + use_conv=True, + mid_activation=nn.ReLU(), + out_activation=nn.Sigmoid()): + super(SEBlock, self).__init__() + self.use_conv = use_conv + if mid_channels is None: + mid_channels = channels // reduction if not round_mid else round_channels( + float(channels) / reduction) + + self.pool = nn.AdaptiveAvgPool2D(output_size=1) + if use_conv: + self.conv1 = nn.Conv2D( + in_channels=channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + groups=1, + bias_attr=True, + weight_attr=None) + + else: + self.fc1 = nn.Linear( + in_features=channels, out_features=mid_channels) + self.activ = get_activation_layer(mid_activation) + if use_conv: + self.conv2 = nn.Conv2D( + in_channels=mid_channels, + out_channels=channels, + kernel_size=1, + stride=1, + groups=1, + bias_attr=True, + weight_attr=None) + else: + self.fc2 = nn.Linear( + in_features=mid_channels, out_features=channels) + self.sigmoid = get_activation_layer(out_activation) + + def forward(self, x): + w = self.pool(x) + if not self.use_conv: + w = w.reshape(shape=[w.shape[0], -1]) + w = self.conv1(w) if self.use_conv else self.fc1(w) + w = self.activ(w) + w = self.conv2(w) if self.use_conv else self.fc2(w) + w = self.sigmoid(w) + if not self.use_conv: + w = w.unsqueeze(2).unsqueeze(3) + x = x * w + return x + + +class MixConv(nn.Layer): + """ + Mixed convolution layer from 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int or tuple/list of int, or tuple/list of tuple/list of 2 int + Convolution window size. + stride : int or tuple/list of 2 int + Strides of the convolution. + padding : int or tuple/list of int, or tuple/list of tuple/list of 2 int + Padding value for convolution layer. + dilation : int or tuple/list of 2 int, default 1 + Dilation value for convolution layer. + groups : int, default 1 + Number of groups. + bias : bool, default False + Whether the layer uses a bias vector. + axis : int, default 1 + The axis on which to concatenate the outputs. 
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + groups=1, + bias=False, + axis=1): + super(MixConv, self).__init__() + kernel_size = kernel_size if isinstance(kernel_size, + list) else [kernel_size] + padding = padding if isinstance(padding, list) else [padding] + kernel_count = len(kernel_size) + self.splitted_in_channels = self.split_channels(in_channels, + kernel_count) + splitted_out_channels = self.split_channels(out_channels, kernel_count) + for i, kernel_size_i in enumerate(kernel_size): + in_channels_i = self.splitted_in_channels[i] + out_channels_i = splitted_out_channels[i] + padding_i = padding[i] + _ = self.add_sublayer( + name=str(i), + sublayer=nn.Conv2D( + in_channels=in_channels_i, + out_channels=out_channels_i, + kernel_size=kernel_size_i, + stride=stride, + padding=padding_i, + dilation=dilation, + groups=(out_channels_i + if out_channels == groups else groups), + bias_attr=bias, + weight_attr=None)) + self.axis = axis + + def forward(self, x): + xx = paddle.split(x, self.splitted_in_channels, axis=self.axis) + xx = paddle.split(x, self.splitted_in_channels, axis=self.axis) + out = [ + conv_i(x_i) for x_i, conv_i in zip(xx, self._sub_layers.values()) + ] + x = paddle.concat(tuple(out), axis=self.axis) + return x + + @staticmethod + def split_channels(channels, kernel_count): + splitted_channels = [channels // kernel_count] * kernel_count + splitted_channels[0] += channels - sum(splitted_channels) + return splitted_channels + + +class MixConvBlock(nn.Layer): + """ + Mixed convolution block with Batch normalization and activation. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int or tuple/list of int, or tuple/list of tuple/list of 2 int + Convolution window size. + stride : int or tuple/list of 2 int + Strides of the convolution. + padding : int or tuple/list of int, or tuple/list of tuple/list of 2 int + Padding value for convolution layer. + dilation : int or tuple/list of 2 int, default 1 + Dilation value for convolution layer. + groups : int, default 1 + Number of groups. + bias : bool, default False + Whether the layer uses a bias vector. + use_bn : bool, default True + Whether to use BatchNorm layer. + bn_eps : float, default 1e-5 + Small float added to variance in Batch norm. + activation : function or str or None, default nn.ReLU() + Activation function or name of activation function. + activate : bool, default True + Whether activate the convolution block. 
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=nn.ReLU()): + super(MixConvBlock, self).__init__() + self.activate = (activation is not None) + self.use_bn = use_bn + + self.conv = MixConv( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias) + if self.use_bn: + self.bn = nn.BatchNorm2D(num_features=out_channels, epsilon=bn_eps) + if self.activate: + self.activ = get_activation_layer(activation) + + def forward(self, x): + x = self.conv(x) + if self.use_bn: + x = self.bn(x) + if self.activate: + x = self.activ(x) + return x + + +def mixconv1x1_block(in_channels, + out_channels, + kernel_count, + stride=1, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=nn.ReLU()): + """ + 1x1 version of the mixed convolution block. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_count : int + Kernel count. + stride : int or tuple/list of 2 int, default 1 + Strides of the convolution. + groups : int, default 1 + Number of groups. + bias : bool, default False + Whether the layer uses a bias vector. + use_bn : bool, default True + Whether to use BatchNorm layer. + bn_eps : float, default 1e-5 + Small float added to variance in Batch norm. + activation : function or str, or None, default nn.ReLU() + Activation function or name of activation function. + """ + return MixConvBlock( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=([1] * kernel_count), + stride=stride, + padding=([0] * kernel_count), + groups=groups, + bias=bias, + use_bn=use_bn, + bn_eps=bn_eps, + activation=activation) + + +class MixUnit(nn.Layer): + """ + MixNet unit. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. exp_channels : int + Number of middle (expanded) channels. + stride : int or tuple/list of 2 int + Strides of the second convolution layer. + exp_kernel_count : int + Expansion convolution kernel count for each unit. + conv1_kernel_count : int + Conv1 kernel count for each unit. + conv2_kernel_count : int + Conv2 kernel count for each unit. + exp_factor : int + Expansion factor for each unit. + se_factor : int + SE reduction factor for each unit. + activation : str + Activation function or name of activation function. 
+ """ + + def __init__(self, in_channels, out_channels, stride, exp_kernel_count, + conv1_kernel_count, conv2_kernel_count, exp_factor, se_factor, + activation): + super(MixUnit, self).__init__() + assert exp_factor >= 1 + assert se_factor >= 0 + self.residual = (in_channels == out_channels) and (stride == 1) + self.use_se = se_factor > 0 + mid_channels = exp_factor * in_channels + self.use_exp_conv = exp_factor > 1 + + if self.use_exp_conv: + if exp_kernel_count == 1: + self.exp_conv = ConvBlock( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=activation) + else: + self.exp_conv = mixconv1x1_block( + in_channels=in_channels, + out_channels=mid_channels, + kernel_count=exp_kernel_count, + activation=activation) + if conv1_kernel_count == 1: + self.conv1 = ConvBlock( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=3, + stride=stride, + padding=1, + dilation=1, + groups=mid_channels, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=activation) + else: + self.conv1 = MixConvBlock( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=[3 + 2 * i for i in range(conv1_kernel_count)], + stride=stride, + padding=[1 + i for i in range(conv1_kernel_count)], + groups=mid_channels, + activation=activation) + if self.use_se: + self.se = SEBlock( + channels=mid_channels, + reduction=(exp_factor * se_factor), + round_mid=False, + mid_activation=activation) + if conv2_kernel_count == 1: + self.conv2 = ConvBlock( + in_channels=mid_channels, + out_channels=out_channels, + activation=None, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5) + else: + self.conv2 = mixconv1x1_block( + in_channels=mid_channels, + out_channels=out_channels, + kernel_count=conv2_kernel_count, + activation=None) + + def forward(self, x): + if self.residual: + identity = x + if self.use_exp_conv: + x = self.exp_conv(x) + x = self.conv1(x) + if self.use_se: + x = self.se(x) + x = self.conv2(x) + if self.residual: + x = x + identity + return x + + +class MixInitBlock(nn.Layer): + """ + MixNet specific initial block. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + """ + + def __init__(self, in_channels, out_channels): + super(MixInitBlock, self).__init__() + self.conv1 = ConvBlock( + in_channels=in_channels, + out_channels=out_channels, + stride=2, + kernel_size=3, + padding=1) + self.conv2 = MixUnit( + in_channels=out_channels, + out_channels=out_channels, + stride=1, + exp_kernel_count=1, + conv1_kernel_count=1, + conv2_kernel_count=1, + exp_factor=1, + se_factor=0, + activation="relu") + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + +class MixNet(nn.Layer): + """ + MixNet model from 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. + + Parameters: + ---------- + channels : list of list of int + Number of output channels for each unit. + init_block_channels : int + Number of output channels for the initial unit. + final_block_channels : int + Number of output channels for the final block of the feature extractor. + exp_kernel_counts : list of list of int + Expansion convolution kernel count for each unit. + conv1_kernel_counts : list of list of int + Conv1 kernel count for each unit. + conv2_kernel_counts : list of list of int + Conv2 kernel count for each unit. 
+ exp_factors : list of list of int + Expansion factor for each unit. + se_factors : list of list of int + SE reduction factor for each unit. + in_channels : int, default 3 + Number of input channels. + in_size : tuple of two ints, default (224, 224) + Spatial size of the expected input image. + class_num : int, default 1000 + Number of classification classes. + """ + + def __init__(self, + channels, + init_block_channels, + final_block_channels, + exp_kernel_counts, + conv1_kernel_counts, + conv2_kernel_counts, + exp_factors, + se_factors, + in_channels=3, + in_size=(224, 224), + class_num=1000): + super(MixNet, self).__init__() + self.in_size = in_size + self.class_num = class_num + + self.features = nn.Sequential() + self.features.add_sublayer( + "init_block", + MixInitBlock( + in_channels=in_channels, out_channels=init_block_channels)) + in_channels = init_block_channels + for i, channels_per_stage in enumerate(channels): + stage = nn.Sequential() + for j, out_channels in enumerate(channels_per_stage): + stride = 2 if ((j == 0) and (i != 3)) or ( + (j == len(channels_per_stage) // 2) and (i == 3)) else 1 + exp_kernel_count = exp_kernel_counts[i][j] + conv1_kernel_count = conv1_kernel_counts[i][j] + conv2_kernel_count = conv2_kernel_counts[i][j] + exp_factor = exp_factors[i][j] + se_factor = se_factors[i][j] + activation = "relu" if i == 0 else "swish" + stage.add_sublayer( + "unit{}".format(j + 1), + MixUnit( + in_channels=in_channels, + out_channels=out_channels, + stride=stride, + exp_kernel_count=exp_kernel_count, + conv1_kernel_count=conv1_kernel_count, + conv2_kernel_count=conv2_kernel_count, + exp_factor=exp_factor, + se_factor=se_factor, + activation=activation)) + in_channels = out_channels + self.features.add_sublayer("stage{}".format(i + 1), stage) + self.features.add_sublayer( + "final_block", + ConvBlock( + in_channels=in_channels, + out_channels=final_block_channels, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=nn.ReLU())) + in_channels = final_block_channels + self.features.add_sublayer( + "final_pool", nn.AvgPool2D( + kernel_size=7, stride=1)) + + self.output = nn.Linear( + in_features=in_channels, out_features=class_num) + + def forward(self, x): + x = self.features(x) + reshape_dim = reduce(lambda x, y: x * y, x.shape[1:]) + x = x.reshape(shape=[x.shape[0], reshape_dim]) + x = self.output(x) + return x + + +def get_mixnet(version, width_scale, model_name=None, **kwargs): + """ + Create MixNet model with specific parameters. + + Parameters: + ---------- + version : str + Version of MobileNetV3 ('s' or 'm'). + width_scale : float + Scale factor for width of layers. + model_name : str or None, default None + Model name. 
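+
+    Returns:
+    -------
+    MixNet
+        The constructed network; remaining keyword arguments are forwarded
+        to the MixNet constructor.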
+ """ + + if version == "s": + init_block_channels = 16 + channels = [[24, 24], [40, 40, 40, 40], [80, 80, 80], + [120, 120, 120, 200, 200, 200]] + exp_kernel_counts = [[2, 2], [1, 2, 2, 2], [1, 1, 1], + [2, 2, 2, 1, 1, 1]] + conv1_kernel_counts = [[1, 1], [3, 2, 2, 2], [3, 2, 2], + [3, 4, 4, 5, 4, 4]] + conv2_kernel_counts = [[2, 2], [1, 2, 2, 2], [2, 2, 2], + [2, 2, 2, 1, 2, 2]] + exp_factors = [[6, 3], [6, 6, 6, 6], [6, 6, 6], [6, 3, 3, 6, 6, 6]] + se_factors = [[0, 0], [2, 2, 2, 2], [4, 4, 4], [2, 2, 2, 2, 2, 2]] + elif version == "m": + init_block_channels = 24 + channels = [[32, 32], [40, 40, 40, 40], [80, 80, 80, 80], + [120, 120, 120, 120, 200, 200, 200, 200]] + exp_kernel_counts = [[2, 2], [1, 2, 2, 2], [1, 2, 2, 2], + [1, 2, 2, 2, 1, 1, 1, 1]] + conv1_kernel_counts = [[3, 1], [4, 2, 2, 2], [3, 4, 4, 4], + [1, 4, 4, 4, 4, 4, 4, 4]] + conv2_kernel_counts = [[2, 2], [1, 2, 2, 2], [1, 2, 2, 2], + [1, 2, 2, 2, 1, 2, 2, 2]] + exp_factors = [[6, 3], [6, 6, 6, 6], [6, 6, 6, 6], + [6, 3, 3, 3, 6, 6, 6, 6]] + se_factors = [[0, 0], [2, 2, 2, 2], [4, 4, 4, 4], + [2, 2, 2, 2, 2, 2, 2, 2]] + else: + raise ValueError("Unsupported MixNet version {}".format(version)) + + final_block_channels = 1536 + + if width_scale != 1.0: + channels = [[round_channels(cij * width_scale) for cij in ci] + for ci in channels] + init_block_channels = round_channels(init_block_channels * width_scale) + + net = MixNet( + channels=channels, + init_block_channels=init_block_channels, + final_block_channels=final_block_channels, + exp_kernel_counts=exp_kernel_counts, + conv1_kernel_counts=conv1_kernel_counts, + conv2_kernel_counts=conv2_kernel_counts, + exp_factors=exp_factors, + se_factors=se_factors, + **kwargs) + + return net + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MixNet_S(pretrained=False, use_ssld=False, **kwargs): + """ + MixNet-S model from 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. + """ + model = get_mixnet( + version="s", width_scale=1.0, model_name="MixNet_S", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MixNet_S"], use_ssld=use_ssld) + return model + + +def MixNet_M(pretrained=False, use_ssld=False, **kwargs): + """ + MixNet-M model from 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. + """ + model = get_mixnet( + version="m", width_scale=1.0, model_name="MixNet_M", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MixNet_M"], use_ssld=use_ssld) + return model + + +def MixNet_L(pretrained=False, use_ssld=False, **kwargs): + """ + MixNet-S model from 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. 
+ """ + model = get_mixnet( + version="m", width_scale=1.3, model_name="MixNet_L", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MixNet_L"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilenet_v2.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilenet_v2.py new file mode 100755 index 000000000..5d606988f --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilenet_v2.py @@ -0,0 +1,316 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1801.04381 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MobileNetV2_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_25_pretrained.pdparams", + "MobileNetV2_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_5_pretrained.pdparams", + "MobileNetV2_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_75_pretrained.pdparams", + "MobileNetV2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_pretrained.pdparams", + "MobileNetV2_x1_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x1_5_pretrained.pdparams", + "MobileNetV2_x2_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x2_0_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + name=None, + use_cudnn=True, + data_format="NCHW"): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False, + data_format=data_format) + + self._batch_norm = BatchNorm( + num_filters, + param_attr=ParamAttr(name=name + "_bn_scale"), + bias_attr=ParamAttr(name=name + "_bn_offset"), + moving_mean_name=name + "_bn_mean", + moving_variance_name=name + "_bn_variance", + data_layout=data_format) + + def forward(self, inputs, if_act=True): + y = self._conv(inputs) + y = self._batch_norm(y) + if if_act: + y = F.relu6(y) + return y + + +class InvertedResidualUnit(nn.Layer): + def __init__(self, + num_channels, + num_in_filter, + num_filters, + stride, + filter_size, + padding, + expansion_factor, + name, + data_format="NCHW"): + super(InvertedResidualUnit, self).__init__() + 
num_expfilter = int(round(num_in_filter * expansion_factor)) + self._expand_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=num_expfilter, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_expand", + data_format=data_format) + + self._bottleneck_conv = ConvBNLayer( + num_channels=num_expfilter, + num_filters=num_expfilter, + filter_size=filter_size, + stride=stride, + padding=padding, + num_groups=num_expfilter, + use_cudnn=False, + name=name + "_dwise", + data_format=data_format) + + self._linear_conv = ConvBNLayer( + num_channels=num_expfilter, + num_filters=num_filters, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_linear", + data_format=data_format) + + def forward(self, inputs, ifshortcut): + y = self._expand_conv(inputs, if_act=True) + y = self._bottleneck_conv(y, if_act=True) + y = self._linear_conv(y, if_act=False) + if ifshortcut: + y = paddle.add(inputs, y) + return y + + +class InvresiBlocks(nn.Layer): + def __init__(self, in_c, t, c, n, s, name, data_format="NCHW"): + super(InvresiBlocks, self).__init__() + + self._first_block = InvertedResidualUnit( + num_channels=in_c, + num_in_filter=in_c, + num_filters=c, + stride=s, + filter_size=3, + padding=1, + expansion_factor=t, + name=name + "_1", + data_format=data_format) + + self._block_list = [] + for i in range(1, n): + block = self.add_sublayer( + name + "_" + str(i + 1), + sublayer=InvertedResidualUnit( + num_channels=c, + num_in_filter=c, + num_filters=c, + stride=1, + filter_size=3, + padding=1, + expansion_factor=t, + name=name + "_" + str(i + 1), + data_format=data_format)) + self._block_list.append(block) + + def forward(self, inputs): + y = self._first_block(inputs, ifshortcut=False) + for block in self._block_list: + y = block(y, ifshortcut=True) + return y + + +class MobileNet(nn.Layer): + def __init__(self, + class_num=1000, + scale=1.0, + prefix_name="", + data_format="NCHW"): + super(MobileNet, self).__init__() + self.scale = scale + self.class_num = class_num + self.data_format = data_format + + bottleneck_params_list = [ + (1, 16, 1, 1), + (6, 24, 2, 2), + (6, 32, 3, 2), + (6, 64, 4, 2), + (6, 96, 3, 1), + (6, 160, 3, 2), + (6, 320, 1, 1), + ] + + self.conv1 = ConvBNLayer( + num_channels=3, + num_filters=int(32 * scale), + filter_size=3, + stride=2, + padding=1, + name=prefix_name + "conv1_1", + data_format=data_format) + + self.block_list = [] + i = 1 + in_c = int(32 * scale) + for layer_setting in bottleneck_params_list: + t, c, n, s = layer_setting + i += 1 + block = self.add_sublayer( + prefix_name + "conv" + str(i), + sublayer=InvresiBlocks( + in_c=in_c, + t=t, + c=int(c * scale), + n=n, + s=s, + name=prefix_name + "conv" + str(i), + data_format=data_format)) + self.block_list.append(block) + in_c = int(c * scale) + + self.out_c = int(1280 * scale) if scale > 1.0 else 1280 + self.conv9 = ConvBNLayer( + num_channels=in_c, + num_filters=self.out_c, + filter_size=1, + stride=1, + padding=0, + name=prefix_name + "conv9", + data_format=data_format) + + self.pool2d_avg = AdaptiveAvgPool2D(1, data_format=data_format) + + self.out = Linear( + self.out_c, + class_num, + weight_attr=ParamAttr(name=prefix_name + "fc10_weights"), + bias_attr=ParamAttr(name=prefix_name + "fc10_offset")) + + def forward(self, inputs): + if self.data_format == "NHWC": + inputs = paddle.transpose(inputs, [0, 2, 3, 1]) + inputs.stop_gradient = True + y = self.conv1(inputs, if_act=True) + for block in self.block_list: + y = block(y) + y = self.conv9(y, if_act=True) + y = 
self.pool2d_avg(y) + y = paddle.flatten(y, start_axis=1, stop_axis=-1) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MobileNetV2_x0_25(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=0.25, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2_x0_25"], use_ssld=use_ssld) + return model + + +def MobileNetV2_x0_5(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=0.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2_x0_5"], use_ssld=use_ssld) + return model + + +def MobileNetV2_x0_75(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=0.75, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2_x0_75"], use_ssld=use_ssld) + return model + + +def MobileNetV2(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=1.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2"], use_ssld=use_ssld) + return model + + +def MobileNetV2_x1_5(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=1.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2_x1_5"], use_ssld=use_ssld) + return model + + +def MobileNetV2_x2_0(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=2.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2_x2_0"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilenext.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilenext.py new file mode 100755 index 000000000..e432d2d7b --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilenext.py @@ -0,0 +1,262 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was heavily based on https://github.com/zhoudaquan/rethinking_bottleneck_design +# reference: https://arxiv.org/abs/2007.02269 + +import math +import paddle.nn as nn + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MobileNeXt_x0_35": "", # TODO + "MobileNeXt_x0_5": "", # TODO + "MobileNeXt_x0_75": "", # TODO + "MobileNeXt_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNeXt_x1_0_pretrained.pdparams", + "MobileNeXt_x1_4": "", # TODO +} + +__all__ = list(MODEL_URLS.keys()) + + +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
+ if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def conv_3x3_bn(inp, oup, stride): + return nn.Sequential( + nn.Conv2D( + inp, oup, 3, stride, 1, bias_attr=False), + nn.BatchNorm2D(oup), + nn.ReLU6()) + + +class SGBlock(nn.Layer): + def __init__(self, inp, oup, stride, expand_ratio, keep_3x3=False): + super(SGBlock, self).__init__() + assert stride in [1, 2] + + hidden_dim = inp // expand_ratio + if hidden_dim < oup / 6.: + hidden_dim = math.ceil(oup / 6.) + hidden_dim = _make_divisible(hidden_dim, 16) # + 16 + + self.identity = False + self.identity_div = 1 + self.expand_ratio = expand_ratio + + if expand_ratio == 2: + self.conv = nn.Sequential( + # dw + nn.Conv2D( + inp, inp, 3, 1, 1, groups=inp, bias_attr=False), + nn.BatchNorm2D(inp), + nn.ReLU6(), + # pw-linear + nn.Conv2D( + inp, hidden_dim, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(hidden_dim), + # pw-linear + nn.Conv2D( + hidden_dim, oup, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(oup), + nn.ReLU6(), + # dw + nn.Conv2D( + oup, oup, 3, stride, 1, groups=oup, bias_attr=False), + nn.BatchNorm2D(oup)) + elif inp != oup and stride == 1 and keep_3x3 == False: + self.conv = nn.Sequential( + # pw-linear + nn.Conv2D( + inp, hidden_dim, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(hidden_dim), + # pw-linear + nn.Conv2D( + hidden_dim, oup, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(oup), + nn.ReLU6()) + elif inp != oup and stride == 2 and keep_3x3 == False: + self.conv = nn.Sequential( + # pw-linear + nn.Conv2D( + inp, hidden_dim, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(hidden_dim), + # pw-linear + nn.Conv2D( + hidden_dim, oup, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(oup), + nn.ReLU6(), + # dw + nn.Conv2D( + oup, oup, 3, stride, 1, groups=oup, bias_attr=False), + nn.BatchNorm2D(oup)) + else: + if keep_3x3 == False: + self.identity = True + self.conv = nn.Sequential( + # dw + nn.Conv2D( + inp, inp, 3, 1, 1, groups=inp, bias_attr=False), + nn.BatchNorm2D(inp), + nn.ReLU6(), + # pw + nn.Conv2D( + inp, hidden_dim, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(hidden_dim), + #nn.ReLU6(), + # pw + nn.Conv2D( + hidden_dim, oup, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(oup), + nn.ReLU6(), + # dw + nn.Conv2D( + oup, oup, 3, 1, 1, groups=oup, bias_attr=False), + nn.BatchNorm2D(oup)) + + def forward(self, x): + out = self.conv(x) + + if self.identity: + if self.identity_div == 1: + out = out + x + else: + shape = x.shape + id_tensor = x[:, :shape[1] // self.identity_div, :, :] + out[:, :shape[1] // self.identity_div, :, :] = \ + out[:, :shape[1] // self.identity_div, :, :] + id_tensor + + return out + + +class MobileNeXt(nn.Layer): + def __init__(self, class_num=1000, width_mult=1.00): + super().__init__() + + # setting of inverted residual blocks + self.cfgs = [ + # t, c, n, s + [2, 96, 1, 2], + [6, 144, 1, 1], + [6, 192, 3, 2], + [6, 288, 3, 2], + [6, 384, 4, 1], + [6, 576, 4, 2], + [6, 960, 3, 1], + [6, 1280, 1, 1], + ] + + # building first layer + input_channel = _make_divisible(32 * width_mult, 4 + if width_mult == 0.1 else 8) + layers = [conv_3x3_bn(3, input_channel, 2)] + # building inverted residual blocks + block = SGBlock + for t, c, n, s in self.cfgs: + output_channel = _make_divisible(c * width_mult, 4 + if width_mult == 0.1 else 8) + if c == 1280 and width_mult < 1: + output_channel = 1280 + layers.append( + block(input_channel, output_channel, s, t, n == 1 and s == 1)) + input_channel = output_channel + for _ in range(n - 1): + layers.append(block(input_channel, output_channel, 1, t)) + input_channel = 
output_channel + self.features = nn.Sequential(*layers) + # building last several layers + input_channel = output_channel + output_channel = _make_divisible(input_channel, 4) + self.avgpool = nn.AdaptiveAvgPool2D((1, 1)) + self.classifier = nn.Sequential( + nn.Dropout(0.2), nn.Linear(output_channel, class_num)) + + self.apply(self._initialize_weights) + + def _initialize_weights(self, m): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + nn.initializer.Normal(std=math.sqrt(2. / n))(m.weight) + if m.bias is not None: + nn.initializer.Constant(0)(m.bias) + elif isinstance(m, nn.BatchNorm2D): + nn.initializer.Constant(1)(m.weight) + nn.initializer.Constant(0)(m.bias) + elif isinstance(m, nn.Linear): + nn.initializer.Normal(std=0.01)(m.weight) + nn.initializer.Constant(0)(m.bias) + + def forward(self, x): + x = self.features(x) + x = self.avgpool(x) + x = x.flatten(1) + x = self.classifier(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MobileNeXt_x0_35(pretrained=False, use_ssld=False, **kwargs): + model = MobileNeXt(width_mult=0.35, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNeXt_x0_35"], use_ssld=use_ssld) + return model + + +def MobileNeXt_x0_5(pretrained=False, use_ssld=False, **kwargs): + model = MobileNeXt(width_mult=0.50, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNeXt_x0_5"], use_ssld=use_ssld) + return model + + +def MobileNeXt_x0_75(pretrained=False, use_ssld=False, **kwargs): + model = MobileNeXt(width_mult=0.75, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNeXt_x0_75"], use_ssld=use_ssld) + return model + + +def MobileNeXt_x1_0(pretrained=False, use_ssld=False, **kwargs): + model = MobileNeXt(width_mult=1.00, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNeXt_x1_0"], use_ssld=use_ssld) + return model + + +def MobileNeXt_x1_4(pretrained=False, use_ssld=False, **kwargs): + model = MobileNeXt(width_mult=1.40, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNeXt_x1_4"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilevit.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilevit.py new file mode 100755 index 000000000..58adaf18e --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilevit.py @@ -0,0 +1,479 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
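+
+
+def _demo_unfold_shapes():
+    """Minimal sketch (not part of the model) of the unfold/fold round trip
+    used by MobileViTBlock below; shapes are illustrative."""
+    import paddle  # local import; the module's own imports appear later
+    x = paddle.rand([1, 96, 32, 32])  # [B, C, H, W] after the local 1x1 conv
+    B, C, H, W = x.shape
+    ph = pw = 2  # patch size, 2x2 as in the MobileViTBlock default
+    n_h, n_w = H // ph, W // pw  # 16 x 16 non-overlapping patches
+    # [B, C, H, W] -> [B*C*n_h, ph, n_w, pw] -> [B, C, N, P] -> [B*P, N, C]
+    p = x.reshape([B * C * n_h, ph, n_w, pw]).transpose([0, 2, 1, 3])
+    p = p.reshape([B, C, n_h * n_w, ph * pw]).transpose([0, 3, 2, 1])
+    p = p.reshape([B * ph * pw, n_h * n_w, C])
+    # the transformer attends over the N = 256 patch positions, once per
+    # pixel offset within a patch; folding inverts these reshapes exactly
+    assert list(p.shape) == [4, 256, 96]
+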
+ +# Code was based on https://github.com/BR-IDL/PaddleViT/blob/develop/image_classification/MobileViT/mobilevit.py +# and https://github.com/apple/ml-cvnets/blob/main/cvnets/models/classification/mobilevit.py +# reference: https://arxiv.org/abs/2110.02178 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import KaimingUniform, TruncatedNormal, Constant +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MobileViT_XXS": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViT_XXS_pretrained.pdparams", + "MobileViT_XS": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViT_XS_pretrained.pdparams", + "MobileViT_S": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViT_S_pretrained.pdparams", +} + + +def _init_weights_linear(): + weight_attr = ParamAttr(initializer=TruncatedNormal(std=.02)) + bias_attr = ParamAttr(initializer=Constant(0.0)) + return weight_attr, bias_attr + + +def _init_weights_layernorm(): + weight_attr = ParamAttr(initializer=Constant(1.0)) + bias_attr = ParamAttr(initializer=Constant(0.0)) + return weight_attr, bias_attr + + +class ConvBnAct(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=0, + bias_attr=False, + groups=1): + super().__init__() + self.in_channels = in_channels + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(initializer=KaimingUniform()), + bias_attr=bias_attr) + self.norm = nn.BatchNorm2D(out_channels) + self.act = nn.Silu() + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + out = self.act(out) + return out + + +class Identity(nn.Layer): + """ Identity layer""" + + def __init__(self): + super().__init__() + + def forward(self, inputs): + return inputs + + +class Mlp(nn.Layer): + def __init__(self, embed_dim, mlp_ratio, dropout=0.1): + super().__init__() + w_attr_1, b_attr_1 = _init_weights_linear() + self.fc1 = nn.Linear( + embed_dim, + int(embed_dim * mlp_ratio), + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + w_attr_2, b_attr_2 = _init_weights_linear() + self.fc2 = nn.Linear( + int(embed_dim * mlp_ratio), + embed_dim, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + + self.act = nn.Silu() + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout1(x) + x = self.fc2(x) + x = self.dropout2(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + embed_dim, + num_heads, + qkv_bias=True, + dropout=0.1, + attention_dropout=0.): + super().__init__() + self.num_heads = num_heads + self.attn_head_dim = int(embed_dim / self.num_heads) + self.all_head_dim = self.attn_head_dim * self.num_heads + + w_attr_1, b_attr_1 = _init_weights_linear() + self.qkv = nn.Linear( + embed_dim, + self.all_head_dim * 3, + weight_attr=w_attr_1, + bias_attr=b_attr_1 if qkv_bias else False) + + self.scales = self.attn_head_dim**-0.5 + + w_attr_2, b_attr_2 = _init_weights_linear() + self.proj = nn.Linear( + embed_dim, embed_dim, weight_attr=w_attr_2, bias_attr=b_attr_2) + + self.attn_dropout = nn.Dropout(attention_dropout) + self.proj_dropout = nn.Dropout(dropout) + self.softmax = nn.Softmax(axis=-1) + + def transpose_multihead(self, x): + B, P, N, d = x.shape + x = x.reshape([B, P, N, 
self.num_heads, d // self.num_heads]) + x = x.transpose([0, 1, 3, 2, 4]) + return x + + def forward(self, x): + b_sz, n_patches, in_channels = x.shape + qkv = self.qkv(x) + qkv = qkv.reshape([ + b_sz, n_patches, 3, self.num_heads, + qkv.shape[-1] // self.num_heads // 3 + ]) + qkv = qkv.transpose([0, 3, 2, 1, 4]) + query, key, value = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2] + query = query * self.scales + key = key.transpose([0, 1, 3, 2]) + # QK^T + attn = paddle.matmul(query, key) + attn = self.softmax(attn) + attn = self.attn_dropout(attn) + # weighted sum + out = paddle.matmul(attn, value) + out = out.transpose([0, 2, 1, 3]).reshape( + [b_sz, n_patches, out.shape[1] * out.shape[3]]) + out = self.proj(out) + out = self.proj_dropout(out) + return out + + +class EncoderLayer(nn.Layer): + def __init__(self, + embed_dim, + num_heads=4, + qkv_bias=True, + mlp_ratio=2.0, + dropout=0.1, + attention_dropout=0., + droppath=0.): + super().__init__() + w_attr_1, b_attr_1 = _init_weights_layernorm() + w_attr_2, b_attr_2 = _init_weights_layernorm() + + self.attn_norm = nn.LayerNorm( + embed_dim, weight_attr=w_attr_1, bias_attr=b_attr_1) + self.attn = Attention(embed_dim, num_heads, qkv_bias, dropout, + attention_dropout) + self.drop_path = DropPath(droppath) if droppath > 0. else Identity() + self.mlp_norm = nn.LayerNorm( + embed_dim, weight_attr=w_attr_2, bias_attr=b_attr_2) + self.mlp = Mlp(embed_dim, mlp_ratio, dropout) + + def forward(self, x): + h = x + x = self.attn_norm(x) + x = self.attn(x) + x = self.drop_path(x) + x = h + x + h = x + x = self.mlp_norm(x) + x = self.mlp(x) + x = self.drop_path(x) + x = x + h + return x + + +class Transformer(nn.Layer): + """Transformer block for MobileViTBlock""" + + def __init__(self, + embed_dim, + num_heads, + depth, + qkv_bias=True, + mlp_ratio=2.0, + dropout=0.1, + attention_dropout=0., + droppath=0.): + super().__init__() + depth_decay = [x.item() for x in paddle.linspace(0, droppath, depth)] + + layer_list = [] + for i in range(depth): + layer_list.append( + EncoderLayer(embed_dim, num_heads, qkv_bias, mlp_ratio, + dropout, attention_dropout, droppath)) + self.layers = nn.LayerList(layer_list) + + w_attr_1, b_attr_1 = _init_weights_layernorm() + self.norm = nn.LayerNorm( + embed_dim, weight_attr=w_attr_1, bias_attr=b_attr_1, epsilon=1e-6) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + out = self.norm(x) + return out + + +class MobileV2Block(nn.Layer): + """Mobilenet v2 InvertedResidual block""" + + def __init__(self, inp, oup, stride=1, expansion=4): + super().__init__() + self.stride = stride + assert stride in [1, 2] + + hidden_dim = int(round(inp * expansion)) + self.use_res_connect = self.stride == 1 and inp == oup + + layers = [] + if expansion != 1: + layers.append(ConvBnAct(inp, hidden_dim, kernel_size=1)) + + layers.extend([ + # dw + ConvBnAct( + hidden_dim, + hidden_dim, + stride=stride, + groups=hidden_dim, + padding=1), + # pw-linear + nn.Conv2D( + hidden_dim, oup, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(oup), + ]) + + self.conv = nn.Sequential(*layers) + self.out_channels = oup + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + return self.conv(x) + + +class MobileViTBlock(nn.Layer): + """ MobileViTBlock for MobileViT""" + + def __init__(self, + dim, + hidden_dim, + depth, + num_heads=4, + qkv_bias=True, + mlp_ratio=2.0, + dropout=0.1, + attention_dropout=0., + droppath=0.0, + patch_size=(2, 2)): + super().__init__() + self.patch_h, self.patch_w = patch_size + + # local 
representations + self.conv1 = ConvBnAct(dim, dim, padding=1) + self.conv2 = nn.Conv2D( + dim, hidden_dim, kernel_size=1, stride=1, bias_attr=False) + # global representations + self.transformer = Transformer( + embed_dim=hidden_dim, + num_heads=num_heads, + depth=depth, + qkv_bias=qkv_bias, + mlp_ratio=mlp_ratio, + dropout=dropout, + attention_dropout=attention_dropout, + droppath=droppath) + + # fusion + self.conv3 = ConvBnAct(hidden_dim, dim, kernel_size=1) + self.conv4 = ConvBnAct(2 * dim, dim, padding=1) + + def forward(self, x): + h = x + x = self.conv1(x) + x = self.conv2(x) + + patch_h = self.patch_h + patch_w = self.patch_w + patch_area = int(patch_w * patch_h) + _, in_channels, orig_h, orig_w = x.shape + new_h = int(math.ceil(orig_h / self.patch_h) * self.patch_h) + new_w = int(math.ceil(orig_w / self.patch_w) * self.patch_w) + interpolate = False + + if new_w != orig_w or new_h != orig_h: + x = F.interpolate(x, size=[new_h, new_w], mode="bilinear") + interpolate = True + + num_patch_w, num_patch_h = new_w // patch_w, new_h // patch_h + num_patches = num_patch_h * num_patch_w + reshaped_x = x.reshape([-1, patch_h, num_patch_w, patch_w]) + transposed_x = reshaped_x.transpose([0, 2, 1, 3]) + reshaped_x = transposed_x.reshape( + [-1, in_channels, num_patches, patch_area]) + transposed_x = reshaped_x.transpose([0, 3, 2, 1]) + + x = transposed_x.reshape([-1, num_patches, in_channels]) + x = self.transformer(x) + x = x.reshape([-1, patch_h * patch_w, num_patches, in_channels]) + + _, pixels, num_patches, channels = x.shape + x = x.transpose([0, 3, 2, 1]) + x = x.reshape([-1, num_patch_w, patch_h, patch_w]) + x = x.transpose([0, 2, 1, 3]) + x = x.reshape( + [-1, channels, num_patch_h * patch_h, num_patch_w * patch_w]) + + if interpolate: + x = F.interpolate(x, size=[orig_h, orig_w]) + x = self.conv3(x) + x = paddle.concat((h, x), axis=1) + x = self.conv4(x) + return x + + +class MobileViT(nn.Layer): + """ MobileViT + A PaddlePaddle impl of : `MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer` - + https://arxiv.org/abs/2110.02178 + """ + + def __init__(self, + in_channels=3, + dims=[16, 32, 48, 48, 48, 64, 80, 96, 384], + hidden_dims=[96, 120, 144], + mv2_expansion=4, + class_num=1000): + super().__init__() + self.conv3x3 = ConvBnAct( + in_channels, dims[0], kernel_size=3, stride=2, padding=1) + self.mv2_block_1 = MobileV2Block( + dims[0], dims[1], expansion=mv2_expansion) + self.mv2_block_2 = MobileV2Block( + dims[1], dims[2], stride=2, expansion=mv2_expansion) + self.mv2_block_3 = MobileV2Block( + dims[2], dims[3], expansion=mv2_expansion) + self.mv2_block_4 = MobileV2Block( + dims[3], dims[4], expansion=mv2_expansion) + + self.mv2_block_5 = MobileV2Block( + dims[4], dims[5], stride=2, expansion=mv2_expansion) + self.mvit_block_1 = MobileViTBlock(dims[5], hidden_dims[0], depth=2) + + self.mv2_block_6 = MobileV2Block( + dims[5], dims[6], stride=2, expansion=mv2_expansion) + self.mvit_block_2 = MobileViTBlock(dims[6], hidden_dims[1], depth=4) + + self.mv2_block_7 = MobileV2Block( + dims[6], dims[7], stride=2, expansion=mv2_expansion) + self.mvit_block_3 = MobileViTBlock(dims[7], hidden_dims[2], depth=3) + self.conv1x1 = ConvBnAct(dims[7], dims[8], kernel_size=1) + + self.pool = nn.AdaptiveAvgPool2D(1) + self.dropout = nn.Dropout(0.1) + self.linear = nn.Linear(dims[8], class_num) + + def forward(self, x): + x = self.conv3x3(x) + x = self.mv2_block_1(x) + x = self.mv2_block_2(x) + x = self.mv2_block_3(x) + x = self.mv2_block_4(x) + + x = self.mv2_block_5(x) + 
x = self.mvit_block_1(x) + + x = self.mv2_block_6(x) + x = self.mvit_block_2(x) + + x = self.mv2_block_7(x) + x = self.mvit_block_3(x) + x = self.conv1x1(x) + + x = self.pool(x) + x = x.reshape(x.shape[:2]) + + x = self.dropout(x) + x = self.linear(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MobileViT_XXS(pretrained=False, use_ssld=False, **kwargs): + model = MobileViT( + in_channels=3, + dims=[16, 16, 24, 24, 24, 48, 64, 80, 320], + hidden_dims=[64, 80, 96], + mv2_expansion=2, + **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViT_XXS"], use_ssld=use_ssld) + return model + + +def MobileViT_XS(pretrained=False, use_ssld=False, **kwargs): + model = MobileViT( + in_channels=3, + dims=[16, 32, 48, 48, 48, 64, 80, 96, 384], + hidden_dims=[96, 120, 144], + mv2_expansion=4, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViT_XS"], use_ssld=use_ssld) + return model + + +def MobileViT_S(pretrained=False, use_ssld=False, **kwargs): + model = MobileViT( + in_channels=3, + dims=[16, 32, 64, 64, 64, 96, 128, 160, 640], + hidden_dims=[144, 192, 240], + mv2_expansion=4, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViT_S"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilevit_v2.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilevit_v2.py new file mode 100755 index 000000000..9a57448e9 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilevit_v2.py @@ -0,0 +1,593 @@ +# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
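+
+# The LinearSelfAttention used below (MobileViTV2's separable attention)
+# avoids materializing the O(N^2) token-by-token attention matrix: a single
+# softmax-normalized score per token weights the keys into one context
+# vector, which is broadcast back over the values, so the cost grows
+# linearly with the token count N.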
+ +# Code was based on https://github.com/apple/ml-cvnets/blob/7be93d3debd45c240a058e3f34a9e88d33c07a7d/cvnets/models/classification/mobilevit_v2.py +# reference: https://arxiv.org/abs/2206.02680 + +from functools import partial +from typing import Dict, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MobileViTV2_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV2_x0_5_pretrained.pdparams", + "MobileViTV2_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV2_x1_0_pretrained.pdparams", + "MobileViTV2_x1_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV2_x1_5_pretrained.pdparams", + "MobileViTV2_x2_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV2_x2_0_pretrained.pdparams", +} + +layer_norm_2d = partial(nn.GroupNorm, num_groups=1) + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class InvertedResidual(nn.Layer): + """ + Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381 + """ + + def __init__(self, + in_channels, + out_channels, + stride, + expand_ratio, + dilation=1, + skip_connection=True): + super().__init__() + assert stride in [1, 2] + self.stride = stride + + hidden_dim = make_divisible(int(round(in_channels * expand_ratio)), 8) + self.use_res_connect = self.stride == 1 and in_channels == out_channels and skip_connection + + block = nn.Sequential() + if expand_ratio != 1: + block.add_sublayer( + name="exp_1x1", + sublayer=nn.Sequential( + ('conv', nn.Conv2D( + in_channels, hidden_dim, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(hidden_dim)), ('act', nn.Silu()))) + + block.add_sublayer( + name="conv_3x3", + sublayer=nn.Sequential( + ('conv', nn.Conv2D( + hidden_dim, + hidden_dim, + 3, + bias_attr=False, + stride=stride, + padding=dilation, + dilation=dilation, + groups=hidden_dim)), ('norm', nn.BatchNorm2D(hidden_dim)), + ('act', nn.Silu()))) + + block.add_sublayer( + name="red_1x1", + sublayer=nn.Sequential( + ('conv', nn.Conv2D( + hidden_dim, out_channels, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(out_channels)))) + + self.block = block + self.in_channels = in_channels + self.out_channels = out_channels + self.exp = expand_ratio + self.dilation = dilation + + def forward(self, x): + if self.use_res_connect: + return x + self.block(x) + else: + return self.block(x) + + +class LinearSelfAttention(nn.Layer): + def __init__(self, embed_dim, attn_dropout=0.0, bias=True): + super().__init__() + self.embed_dim = embed_dim + self.qkv_proj = nn.Conv2D( + embed_dim, 1 + (2 * embed_dim), 1, bias_attr=bias) + self.attn_dropout = nn.Dropout(p=attn_dropout) + self.out_proj = nn.Conv2D(embed_dim, embed_dim, 1, bias_attr=bias) + + def forward(self, x): + # [B, C, P, N] --> [B, h + 2d, P, N] + qkv = self.qkv_proj(x) + + # Project x into query, key and value + # Query --> [B, 1, P, N] + # value, key --> [B, d, P, N] + query, key, value = paddle.split( + qkv, [1, self.embed_dim, self.embed_dim], axis=1) + + # apply softmax along N dimension + context_scores = F.softmax(query, axis=-1) + # Uncomment below line to visualize context scores + # self.visualize_context_scores(context_scores=context_scores) + context_scores = self.attn_dropout(context_scores) + + # 
Compute context vector + # [B, d, P, N] x [B, 1, P, N] -> [B, d, P, N] + context_vector = key * context_scores + # [B, d, P, N] --> [B, d, P, 1] + context_vector = paddle.sum(context_vector, axis=-1, keepdim=True) + + # combine context vector with values + # [B, d, P, N] * [B, d, P, 1] --> [B, d, P, N] + out = F.relu(value) * context_vector + out = self.out_proj(out) + return out + + +class LinearAttnFFN(nn.Layer): + def __init__(self, + embed_dim, + ffn_latent_dim, + attn_dropout=0.0, + dropout=0.1, + ffn_dropout=0.0, + norm_layer=layer_norm_2d) -> None: + super().__init__() + attn_unit = LinearSelfAttention( + embed_dim=embed_dim, attn_dropout=attn_dropout, bias=True) + + self.pre_norm_attn = nn.Sequential( + norm_layer(num_channels=embed_dim), + attn_unit, + nn.Dropout(p=dropout)) + + self.pre_norm_ffn = nn.Sequential( + norm_layer(num_channels=embed_dim), + nn.Conv2D(embed_dim, ffn_latent_dim, 1), + nn.Silu(), + nn.Dropout(p=ffn_dropout), + nn.Conv2D(ffn_latent_dim, embed_dim, 1), + nn.Dropout(p=dropout)) + + def forward(self, x): + # self-attention + x = x + self.pre_norm_attn(x) + # Feed forward network + x = x + self.pre_norm_ffn(x) + return x + + +class MobileViTV2Block(nn.Layer): + """ + This class defines the `MobileViTV2 block` + """ + + def __init__(self, + in_channels, + attn_unit_dim, + ffn_multiplier=2.0, + n_attn_blocks=2, + attn_dropout=0.0, + dropout=0.0, + ffn_dropout=0.0, + patch_h=8, + patch_w=8, + conv_ksize=3, + dilation=1, + attn_norm_layer=layer_norm_2d): + super().__init__() + cnn_out_dim = attn_unit_dim + padding = (conv_ksize - 1) // 2 * dilation + conv_3x3_in = nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + in_channels, + conv_ksize, + bias_attr=False, + padding=padding, + dilation=dilation, + groups=in_channels)), ('norm', nn.BatchNorm2D(in_channels)), + ('act', nn.Silu())) + conv_1x1_in = nn.Sequential(('conv', nn.Conv2D( + in_channels, cnn_out_dim, 1, bias_attr=False))) + + self.local_rep = nn.Sequential(conv_3x3_in, conv_1x1_in) + + self.global_rep, attn_unit_dim = self._build_attn_layer( + d_model=attn_unit_dim, + ffn_mult=ffn_multiplier, + n_layers=n_attn_blocks, + attn_dropout=attn_dropout, + dropout=dropout, + ffn_dropout=ffn_dropout, + attn_norm_layer=attn_norm_layer) + + self.conv_proj = nn.Sequential( + ('conv', nn.Conv2D( + cnn_out_dim, in_channels, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(in_channels))) + + self.patch_h = patch_h + self.patch_w = patch_w + + def _build_attn_layer(self, d_model, ffn_mult, n_layers, attn_dropout, + dropout, ffn_dropout, attn_norm_layer): + # ensure that dims are multiple of 16 + ffn_dims = [ffn_mult * d_model // 16 * 16] * n_layers + + global_rep = [ + LinearAttnFFN( + embed_dim=d_model, + ffn_latent_dim=ffn_dims[block_idx], + attn_dropout=attn_dropout, + dropout=dropout, + ffn_dropout=ffn_dropout, + norm_layer=attn_norm_layer) for block_idx in range(n_layers) + ] + global_rep.append(attn_norm_layer(num_channels=d_model)) + + return nn.Sequential(*global_rep), d_model + + def unfolding(self, feature_map): + batch_size, in_channels, img_h, img_w = feature_map.shape + + # [B, C, H, W] --> [B, C, P, N] + patches = F.unfold( + feature_map, + kernel_sizes=[self.patch_h, self.patch_w], + strides=[self.patch_h, self.patch_w]) + n_patches = img_h * img_w // (self.patch_h * self.patch_w) + patches = patches.reshape( + [batch_size, in_channels, self.patch_h * self.patch_w, n_patches]) + + return patches, (img_h, img_w) + + def folding(self, patches, output_size): + batch_size, in_dim, patch_size, n_patches = 
patches.shape + + # [B, C, P, N] + patches = patches.reshape([batch_size, in_dim * patch_size, n_patches]) + + feature_map = F.fold( + patches, + output_size, + kernel_sizes=[self.patch_h, self.patch_w], + strides=[self.patch_h, self.patch_w]) + + return feature_map + + def forward(self, x): + fm = self.local_rep(x) + + # convert feature map to patches + patches, output_size = self.unfolding(fm) + + # learn global representations on all patches + patches = self.global_rep(patches) + + # [B x Patch x Patches x C] --> [B x C x Patches x Patch] + fm = self.folding(patches=patches, output_size=output_size) + fm = self.conv_proj(fm) + + return fm + + +class MobileViTV2(nn.Layer): + """ + MobileViTV2 + """ + + def __init__(self, mobilevit_config, class_num=1000, output_stride=None): + super().__init__() + self.round_nearest = 8 + self.dilation = 1 + + dilate_l4 = dilate_l5 = False + if output_stride == 8: + dilate_l4 = True + dilate_l5 = True + elif output_stride == 16: + dilate_l5 = True + + # store model configuration in a dictionary + in_channels = mobilevit_config["layer0"]["img_channels"] + out_channels = mobilevit_config["layer0"]["out_channels"] + self.conv_1 = nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + out_channels, + 3, + bias_attr=False, + stride=2, + padding=1)), ('norm', nn.BatchNorm2D(out_channels)), + ('act', nn.Silu())) + + in_channels = out_channels + self.layer_1, out_channels = self._make_layer( + input_channel=in_channels, cfg=mobilevit_config["layer1"]) + + in_channels = out_channels + self.layer_2, out_channels = self._make_layer( + input_channel=in_channels, cfg=mobilevit_config["layer2"]) + + in_channels = out_channels + self.layer_3, out_channels = self._make_layer( + input_channel=in_channels, cfg=mobilevit_config["layer3"]) + + in_channels = out_channels + self.layer_4, out_channels = self._make_layer( + input_channel=in_channels, + cfg=mobilevit_config["layer4"], + dilate=dilate_l4) + + in_channels = out_channels + self.layer_5, out_channels = self._make_layer( + input_channel=in_channels, + cfg=mobilevit_config["layer5"], + dilate=dilate_l5) + + self.conv_1x1_exp = nn.Identity() + self.classifier = nn.Sequential() + self.classifier.add_sublayer( + name="global_pool", + sublayer=nn.Sequential(nn.AdaptiveAvgPool2D(1), nn.Flatten())) + self.classifier.add_sublayer( + name="fc", sublayer=nn.Linear(out_channels, class_num)) + + # weight initialization + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Conv2D): + fan_in = m.weight.shape[1] * m.weight.shape[2] * m.weight.shape[3] + bound = 1.0 / fan_in**0.5 + nn.initializer.Uniform(-bound, bound)(m.weight) + if m.bias is not None: + nn.initializer.Uniform(-bound, bound)(m.bias) + elif isinstance(m, (nn.BatchNorm2D, nn.GroupNorm)): + nn.initializer.Constant(1)(m.weight) + nn.initializer.Constant(0)(m.bias) + elif isinstance(m, nn.Linear): + nn.initializer.XavierUniform()(m.weight) + if m.bias is not None: + nn.initializer.Constant(0)(m.bias) + + def _make_layer(self, input_channel, cfg, dilate=False): + block_type = cfg.get("block_type", "mobilevit") + if block_type.lower() == "mobilevit": + return self._make_mit_layer( + input_channel=input_channel, cfg=cfg, dilate=dilate) + else: + return self._make_mobilenet_layer( + input_channel=input_channel, cfg=cfg) + + def _make_mit_layer(self, input_channel, cfg, dilate=False): + prev_dilation = self.dilation + block = [] + stride = cfg.get("stride", 1) + + if stride == 2: + if dilate: + self.dilation *= 2 + stride = 1 + + layer = 
InvertedResidual( + in_channels=input_channel, + out_channels=cfg.get("out_channels"), + stride=stride, + expand_ratio=cfg.get("mv_expand_ratio", 4), + dilation=prev_dilation) + + block.append(layer) + input_channel = cfg.get("out_channels") + + block.append( + MobileViTV2Block( + in_channels=input_channel, + attn_unit_dim=cfg["attn_unit_dim"], + ffn_multiplier=cfg.get("ffn_multiplier"), + n_attn_blocks=cfg.get("attn_blocks", 1), + ffn_dropout=0., + attn_dropout=0., + dilation=self.dilation, + patch_h=cfg.get("patch_h", 2), + patch_w=cfg.get("patch_w", 2))) + + return nn.Sequential(*block), input_channel + + def _make_mobilenet_layer(self, input_channel, cfg): + output_channels = cfg.get("out_channels") + num_blocks = cfg.get("num_blocks", 2) + expand_ratio = cfg.get("expand_ratio", 4) + block = [] + + for i in range(num_blocks): + stride = cfg.get("stride", 1) if i == 0 else 1 + + layer = InvertedResidual( + in_channels=input_channel, + out_channels=output_channels, + stride=stride, + expand_ratio=expand_ratio) + block.append(layer) + input_channel = output_channels + return nn.Sequential(*block), input_channel + + def extract_features(self, x): + x = self.conv_1(x) + x = self.layer_1(x) + x = self.layer_2(x) + x = self.layer_3(x) + + x = self.layer_4(x) + x = self.layer_5(x) + x = self.conv_1x1_exp(x) + return x + + def forward(self, x): + x = self.extract_features(x) + x = self.classifier(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def get_configuration(width_multiplier): + ffn_multiplier = 2 + mv2_exp_mult = 2 # max(1.0, min(2.0, 2.0 * width_multiplier)) + + layer_0_dim = max(16, min(64, 32 * width_multiplier)) + layer_0_dim = int(make_divisible(layer_0_dim, divisor=8, min_value=16)) + config = { + "layer0": { + "img_channels": 3, + "out_channels": layer_0_dim, + }, + "layer1": { + "out_channels": int(make_divisible(64 * width_multiplier, divisor=16)), + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2", + }, + "layer2": { + "out_channels": int(make_divisible(128 * width_multiplier, divisor=8)), + "expand_ratio": mv2_exp_mult, + "num_blocks": 2, + "stride": 2, + "block_type": "mv2", + }, + "layer3": { # 28x28 + "out_channels": int(make_divisible(256 * width_multiplier, divisor=8)), + "attn_unit_dim": int(make_divisible(128 * width_multiplier, divisor=8)), + "ffn_multiplier": ffn_multiplier, + "attn_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "block_type": "mobilevit", + }, + "layer4": { # 14x14 + "out_channels": int(make_divisible(384 * width_multiplier, divisor=8)), + "attn_unit_dim": int(make_divisible(192 * width_multiplier, divisor=8)), + "ffn_multiplier": ffn_multiplier, + "attn_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "block_type": "mobilevit", + }, + "layer5": { # 7x7 + "out_channels": int(make_divisible(512 * width_multiplier, divisor=8)), + "attn_unit_dim": int(make_divisible(256 * width_multiplier, divisor=8)), + "ffn_multiplier": ffn_multiplier, + "attn_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "block_type": "mobilevit", + }, + "last_layer_exp_factor": 4, + } + + return config + + +def MobileViTV2_x2_0(pretrained=False, use_ssld=False, **kwargs): + width_multiplier = 2.0 + model = MobileViTV2(get_configuration(width_multiplier), **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV2_x2_0"], use_ssld=use_ssld) + return model + + +def MobileViTV2_x1_75(pretrained=False, use_ssld=False, **kwargs): + width_multiplier = 1.75 + model = MobileViTV2(get_configuration(width_multiplier), **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV2_x1_75"], use_ssld=use_ssld) + return model + + +def MobileViTV2_x1_5(pretrained=False, use_ssld=False, **kwargs): + width_multiplier = 1.5 + model = MobileViTV2(get_configuration(width_multiplier), **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV2_x1_5"], use_ssld=use_ssld) + return model + + +def MobileViTV2_x1_25(pretrained=False, use_ssld=False, **kwargs): + width_multiplier = 1.25 + model = MobileViTV2(get_configuration(width_multiplier), **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV2_x1_25"], use_ssld=use_ssld) + return model + + +def MobileViTV2_x1_0(pretrained=False, use_ssld=False, **kwargs): + width_multiplier = 1.0 + model = MobileViTV2(get_configuration(width_multiplier), **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV2_x1_0"], use_ssld=use_ssld) + return model + + +def MobileViTV2_x0_75(pretrained=False, use_ssld=False, **kwargs): + width_multiplier = 0.75 + model = MobileViTV2(get_configuration(width_multiplier), **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV2_x0_75"], use_ssld=use_ssld) + return model + + +def MobileViTV2_x0_5(pretrained=False, use_ssld=False, **kwargs): + width_multiplier = 0.5 + model = 
MobileViTV2(get_configuration(width_multiplier), **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV2_x0_5"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilevit_v3.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilevit_v3.py new file mode 100755 index 000000000..652ac5626 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/mobilevit_v3.py @@ -0,0 +1,1445 @@ +# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/micronDLA/MobileViTv3/blob/main/MobileViTv3-v1/cvnets/models/classification/mobilevit.py +# reference: https://arxiv.org/abs/2209.15159 + +import math +from functools import partial +from typing import Dict, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MobileViTV3_XXS": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_XXS_pretrained.pdparams", + "MobileViTV3_XS": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_XS_pretrained.pdparams", + "MobileViTV3_S": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_S_pretrained.pdparams", + "MobileViTV3_XXS_L2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_XXS_L2_pretrained.pdparams", + "MobileViTV3_XS_L2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_XS_L2_pretrained.pdparams", + "MobileViTV3_S_L2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_S_L2_pretrained.pdparams", + "MobileViTV3_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_x0_5_pretrained.pdparams", + "MobileViTV3_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_x0_75_pretrained.pdparams", + "MobileViTV3_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_x1_0_pretrained.pdparams", +} + +layer_norm_2d = partial(nn.GroupNorm, num_groups=1) + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class InvertedResidual(nn.Layer): + """ + Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381 + """ + + def __init__(self, + in_channels: int, + out_channels: int, + stride: int, + expand_ratio: Union[int, float], + dilation: int=1) -> None: + assert stride in [1, 2] + super(InvertedResidual, self).__init__() + self.stride = stride + + hidden_dim = make_divisible(int(round(in_channels * expand_ratio)), 8) + self.use_res_connect = self.stride == 1 and in_channels == out_channels + + block = nn.Sequential() + if expand_ratio != 1: + block.add_sublayer( + name="exp_1x1", + sublayer=nn.Sequential( + 
('conv', nn.Conv2D( + in_channels, hidden_dim, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(hidden_dim)), ('act', nn.Silu()))) + + block.add_sublayer( + name="conv_3x3", + sublayer=nn.Sequential( + ('conv', nn.Conv2D( + hidden_dim, + hidden_dim, + 3, + bias_attr=False, + stride=stride, + padding=dilation, + dilation=dilation, + groups=hidden_dim)), ('norm', nn.BatchNorm2D(hidden_dim)), + ('act', nn.Silu()))) + + block.add_sublayer( + name="red_1x1", + sublayer=nn.Sequential( + ('conv', nn.Conv2D( + hidden_dim, out_channels, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(out_channels)))) + + self.block = block + self.in_channels = in_channels + self.out_channels = out_channels + self.exp = expand_ratio + self.dilation = dilation + + def forward(self, x, *args, **kwargs): + if self.use_res_connect: + return x + self.block(x) + else: + return self.block(x) + + +class MultiHeadAttention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv_proj = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.out_proj = nn.Linear(dim, dim, bias_attr=qkv_bias) + + def forward(self, x): + # B = x.shape[0] + N, C = x.shape[1:] + qkv = self.qkv_proj(x).reshape((-1, N, 3, self.num_heads, + C // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.out_proj(x) + return x + + +class TransformerEncoder(nn.Layer): + """ + This class defines the Transformer encoder (pre-norm) as described in "Attention is all you need" paper + https://arxiv.org/abs/1706.03762 + """ + + def __init__(self, + embed_dim: int, + ffn_latent_dim: int, + num_heads: Optional[int]=8, + attn_dropout: Optional[float]=0.0, + dropout: Optional[float]=0.1, + ffn_dropout: Optional[float]=0.0, + transformer_norm_layer: nn.Layer=nn.LayerNorm): + super(TransformerEncoder, self).__init__() + + self.pre_norm_mha = nn.Sequential( + transformer_norm_layer(embed_dim), + MultiHeadAttention( + embed_dim, num_heads, attn_drop=attn_dropout, qkv_bias=True), + nn.Dropout(p=dropout)) + + self.pre_norm_ffn = nn.Sequential( + transformer_norm_layer(embed_dim), + nn.Linear(embed_dim, ffn_latent_dim), + nn.Silu(), + nn.Dropout(p=ffn_dropout), + nn.Linear(ffn_latent_dim, embed_dim), + nn.Dropout(p=dropout)) + self.embed_dim = embed_dim + self.ffn_dim = ffn_latent_dim + self.ffn_dropout = ffn_dropout + + def forward(self, x): + # Multi-head attention + x = x + self.pre_norm_mha(x) + + # Feed forward network + x = x + self.pre_norm_ffn(x) + return x + + +class MobileViTV3Block(nn.Layer): + """ + MobileViTV3 block + """ + + def __init__(self, + in_channels: int, + transformer_dim: int, + ffn_dim: int, + n_transformer_blocks: Optional[int]=2, + head_dim: Optional[int]=32, + attn_dropout: Optional[float]=0.1, + dropout: Optional[int]=0.1, + ffn_dropout: Optional[int]=0.1, + patch_h: Optional[int]=8, + patch_w: Optional[int]=8, + transformer_norm_layer: nn.Layer=nn.LayerNorm, + conv_ksize: Optional[int]=3, + dilation: Optional[int]=1, + var_ffn: Optional[bool]=False, + no_fusion: Optional[bool]=False): + + # For MobileViTV3: Normal 3x3 convolution --> Depthwise 3x3 convolution + padding = (conv_ksize - 1) 
// 2 * dilation + conv_3x3_in = nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + in_channels, + conv_ksize, + bias_attr=False, + padding=padding, + dilation=dilation, + groups=in_channels)), ('norm', nn.BatchNorm2D(in_channels)), + ('act', nn.Silu())) + conv_1x1_in = nn.Sequential(('conv', nn.Conv2D( + in_channels, transformer_dim, 1, bias_attr=False))) + + conv_1x1_out = nn.Sequential( + ('conv', nn.Conv2D( + transformer_dim, in_channels, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(in_channels)), ('act', nn.Silu())) + conv_3x3_out = None + + # For MobileViTV3: input+global --> local+global + if not no_fusion: + #input_ch = tr_dim + in_ch + conv_3x3_out = nn.Sequential( + ('conv', nn.Conv2D( + transformer_dim + in_channels, + in_channels, + 1, + bias_attr=False)), ('norm', nn.BatchNorm2D(in_channels)), + ('act', nn.Silu())) + + super().__init__() + self.local_rep = nn.Sequential() + self.local_rep.add_sublayer(name="conv_3x3", sublayer=conv_3x3_in) + self.local_rep.add_sublayer(name="conv_1x1", sublayer=conv_1x1_in) + + assert transformer_dim % head_dim == 0 + num_heads = transformer_dim // head_dim + + ffn_dims = [ffn_dim] * n_transformer_blocks + + global_rep = [ + TransformerEncoder( + embed_dim=transformer_dim, + ffn_latent_dim=ffn_dims[block_idx], + num_heads=num_heads, + attn_dropout=attn_dropout, + dropout=dropout, + ffn_dropout=ffn_dropout, + transformer_norm_layer=transformer_norm_layer) + for block_idx in range(n_transformer_blocks) + ] + global_rep.append(transformer_norm_layer(transformer_dim)) + self.global_rep = nn.Sequential(*global_rep) + + self.conv_proj = conv_1x1_out + + self.fusion = conv_3x3_out + + self.patch_h = patch_h + self.patch_w = patch_w + self.patch_area = self.patch_w * self.patch_h + + self.cnn_in_dim = in_channels + self.cnn_out_dim = transformer_dim + self.n_heads = num_heads + self.ffn_dim = ffn_dim + self.dropout = dropout + self.attn_dropout = attn_dropout + self.ffn_dropout = ffn_dropout + self.dilation = dilation + self.ffn_max_dim = ffn_dims[0] + self.ffn_min_dim = ffn_dims[-1] + self.var_ffn = var_ffn + self.n_blocks = n_transformer_blocks + self.conv_ksize = conv_ksize + + def unfolding(self, feature_map): + patch_w, patch_h = self.patch_w, self.patch_h + patch_area = int(patch_w * patch_h) + batch_size, in_channels, orig_h, orig_w = feature_map.shape + + new_h = int(math.ceil(orig_h / self.patch_h) * self.patch_h) + new_w = int(math.ceil(orig_w / self.patch_w) * self.patch_w) + + interpolate = False + if new_w != orig_w or new_h != orig_h: + # Note: Padding can be done, but then it needs to be handled in attention function. 
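+            # Instead, the feature map is resized up to the nearest multiple
+            # of the patch size here, and resized back at the end of folding().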
+ feature_map = F.interpolate( + feature_map, + size=(new_h, new_w), + mode="bilinear", + align_corners=False) + interpolate = True + + # number of patches along width and height + num_patch_w = new_w // patch_w # n_w + num_patch_h = new_h // patch_h # n_h + num_patches = num_patch_h * num_patch_w # N + + # [B, C, H, W] --> [B * C * n_h, p_h, n_w, p_w] + reshaped_fm = feature_map.reshape([ + batch_size * in_channels * num_patch_h, patch_h, num_patch_w, + patch_w + ]) + # [B * C * n_h, p_h, n_w, p_w] --> [B * C * n_h, n_w, p_h, p_w] + transposed_fm = reshaped_fm.transpose([0, 2, 1, 3]) + # [B * C * n_h, n_w, p_h, p_w] --> [B, C, N, P] where P = p_h * p_w and N = n_h * n_w + reshaped_fm = transposed_fm.reshape( + [batch_size, in_channels, num_patches, patch_area]) + # [B, C, N, P] --> [B, P, N, C] + transposed_fm = reshaped_fm.transpose([0, 3, 2, 1]) + # [B, P, N, C] --> [BP, N, C] + patches = transposed_fm.reshape( + [batch_size * patch_area, num_patches, in_channels]) + + info_dict = { + "orig_size": (orig_h, orig_w), + "batch_size": batch_size, + "interpolate": interpolate, + "total_patches": num_patches, + "num_patches_w": num_patch_w, + "num_patches_h": num_patch_h + } + + return patches, info_dict + + def folding(self, patches, info_dict): + n_dim = patches.dim() + assert n_dim == 3, "Tensor should be of shape BPxNxC. Got: {}".format( + patches.shape) + # [BP, N, C] --> [B, P, N, C] + patches = patches.reshape([ + info_dict["batch_size"], self.patch_area, + info_dict["total_patches"], patches.shape[2] + ]) + + batch_size, pixels, num_patches, channels = patches.shape + num_patch_h = info_dict["num_patches_h"] + num_patch_w = info_dict["num_patches_w"] + + # [B, P, N, C] --> [B, C, N, P] + patches = patches.transpose([0, 3, 2, 1]) + + # [B, C, N, P] --> [B*C*n_h, n_w, p_h, p_w] + feature_map = patches.reshape([ + batch_size * channels * num_patch_h, num_patch_w, self.patch_h, + self.patch_w + ]) + # [B*C*n_h, n_w, p_h, p_w] --> [B*C*n_h, p_h, n_w, p_w] + feature_map = feature_map.transpose([0, 2, 1, 3]) + # [B*C*n_h, p_h, n_w, p_w] --> [B, C, H, W] + feature_map = feature_map.reshape([ + batch_size, channels, num_patch_h * self.patch_h, + num_patch_w * self.patch_w + ]) + if info_dict["interpolate"]: + feature_map = F.interpolate( + feature_map, + size=info_dict["orig_size"], + mode="bilinear", + align_corners=False) + return feature_map + + def forward(self, x): + res = x + + # For MobileViTV3: Normal 3x3 convolution --> Depthwise 3x3 convolution + fm_conv = self.local_rep(x) + + # convert feature map to patches + patches, info_dict = self.unfolding(fm_conv) + + # learn global representations + patches = self.global_rep(patches) + + # [B x Patch x Patches x C] --> [B x C x Patches x Patch] + fm = self.folding(patches=patches, info_dict=info_dict) + + fm = self.conv_proj(fm) + + if self.fusion is not None: + # For MobileViTV3: input+global --> local+global + fm = self.fusion(paddle.concat((fm_conv, fm), axis=1)) + + # For MobileViTV3: Skip connection + fm = fm + res + + return fm + + +class LinearSelfAttention(nn.Layer): + def __init__(self, embed_dim, attn_dropout=0.0, bias=True): + super().__init__() + self.embed_dim = embed_dim + self.qkv_proj = nn.Conv2D( + embed_dim, 1 + (2 * embed_dim), 1, bias_attr=bias) + self.attn_dropout = nn.Dropout(p=attn_dropout) + self.out_proj = nn.Conv2D(embed_dim, embed_dim, 1, bias_attr=bias) + + def forward(self, x): + # [B, C, P, N] --> [B, h + 2d, P, N] + qkv = self.qkv_proj(x) + + # Project x into query, key and value + # Query --> [B, 1, P, N] + # 
value, key --> [B, d, P, N] + query, key, value = paddle.split( + qkv, [1, self.embed_dim, self.embed_dim], axis=1) + + # apply softmax along N dimension + context_scores = F.softmax(query, axis=-1) + # Uncomment below line to visualize context scores + # self.visualize_context_scores(context_scores=context_scores) + context_scores = self.attn_dropout(context_scores) + + # Compute context vector + # [B, d, P, N] x [B, 1, P, N] -> [B, d, P, N] + context_vector = key * context_scores + # [B, d, P, N] --> [B, d, P, 1] + context_vector = paddle.sum(context_vector, axis=-1, keepdim=True) + + # combine context vector with values + # [B, d, P, N] * [B, d, P, 1] --> [B, d, P, N] + out = F.relu(value) * context_vector + out = self.out_proj(out) + return out + + +class LinearAttnFFN(nn.Layer): + def __init__(self, + embed_dim: int, + ffn_latent_dim: int, + attn_dropout: Optional[float]=0.0, + dropout: Optional[float]=0.1, + ffn_dropout: Optional[float]=0.0, + norm_layer: Optional[str]=layer_norm_2d) -> None: + super().__init__() + attn_unit = LinearSelfAttention( + embed_dim=embed_dim, attn_dropout=attn_dropout, bias=True) + + self.pre_norm_attn = nn.Sequential( + norm_layer(num_channels=embed_dim), + attn_unit, + nn.Dropout(p=dropout)) + + self.pre_norm_ffn = nn.Sequential( + norm_layer(num_channels=embed_dim), + nn.Conv2D(embed_dim, ffn_latent_dim, 1), + nn.Silu(), + nn.Dropout(p=ffn_dropout), + nn.Conv2D(ffn_latent_dim, embed_dim, 1), + nn.Dropout(p=dropout)) + + def forward(self, x): + # self-attention + x = x + self.pre_norm_attn(x) + # Feed forward network + x = x + self.pre_norm_ffn(x) + return x + + +class MobileViTV3BlockV2(nn.Layer): + """ + This class defines the `MobileViTV3 block` + """ + + def __init__(self, + in_channels: int, + attn_unit_dim: int, + ffn_multiplier: float=2.0, + n_attn_blocks: Optional[int]=2, + attn_dropout: Optional[float]=0.0, + dropout: Optional[float]=0.0, + ffn_dropout: Optional[float]=0.0, + patch_h: Optional[int]=8, + patch_w: Optional[int]=8, + conv_ksize: Optional[int]=3, + dilation: Optional[int]=1, + attn_norm_layer: Optional[str]=layer_norm_2d): + cnn_out_dim = attn_unit_dim + + padding = (conv_ksize - 1) // 2 * dilation + conv_3x3_in = nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + in_channels, + conv_ksize, + bias_attr=False, + padding=padding, + dilation=dilation, + groups=in_channels)), ('norm', nn.BatchNorm2D(in_channels)), + ('act', nn.Silu())) + conv_1x1_in = nn.Sequential(('conv', nn.Conv2D( + in_channels, cnn_out_dim, 1, bias_attr=False))) + + super().__init__() + self.local_rep = nn.Sequential(conv_3x3_in, conv_1x1_in) + + self.global_rep, attn_unit_dim = self._build_attn_layer( + d_model=attn_unit_dim, + ffn_mult=ffn_multiplier, + n_layers=n_attn_blocks, + attn_dropout=attn_dropout, + dropout=dropout, + ffn_dropout=ffn_dropout, + attn_norm_layer=attn_norm_layer) + + # MobileViTV3: input changed from just global to local+global + self.conv_proj = nn.Sequential( + ('conv', nn.Conv2D( + 2 * cnn_out_dim, in_channels, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(in_channels))) + + self.patch_h = patch_h + self.patch_w = patch_w + + def _build_attn_layer(self, + d_model: int, + ffn_mult: float, + n_layers: int, + attn_dropout: float, + dropout: float, + ffn_dropout: float, + attn_norm_layer: nn.Layer): + + # ensure that dims are multiple of 16 + ffn_dims = [ffn_mult * d_model // 16 * 16] * n_layers + + global_rep = [ + LinearAttnFFN( + embed_dim=d_model, + ffn_latent_dim=ffn_dims[block_idx], + attn_dropout=attn_dropout, + dropout=dropout, + 
ffn_dropout=ffn_dropout, + norm_layer=attn_norm_layer) for block_idx in range(n_layers) + ] + global_rep.append(attn_norm_layer(num_channels=d_model)) + + return nn.Sequential(*global_rep), d_model + + def unfolding(self, feature_map): + batch_size, in_channels, img_h, img_w = feature_map.shape + + # [B, C, H, W] --> [B, C, P, N] + patches = F.unfold( + feature_map, + kernel_sizes=[self.patch_h, self.patch_w], + strides=[self.patch_h, self.patch_w]) + n_patches = img_h * img_w // (self.patch_h * self.patch_w) + patches = patches.reshape( + [batch_size, in_channels, self.patch_h * self.patch_w, n_patches]) + + return patches, (img_h, img_w) + + def folding(self, patches, output_size: Tuple[int, int]): + batch_size, in_dim, patch_size, n_patches = patches.shape + + # [B, C, P, N] + patches = patches.reshape([batch_size, in_dim * patch_size, n_patches]) + + feature_map = F.fold( + patches, + output_size, + kernel_sizes=[self.patch_h, self.patch_w], + strides=[self.patch_h, self.patch_w]) + + return feature_map + + def forward(self, x): + fm_conv = self.local_rep(x) + + # convert feature map to patches + patches, output_size = self.unfolding(fm_conv) + + # learn global representations on all patches + patches = self.global_rep(patches) + + # [B x Patch x Patches x C] --> [B x C x Patches x Patch] + fm = self.folding(patches=patches, output_size=output_size) + + # MobileViTV3: local+global instead of only global + fm = self.conv_proj(paddle.concat((fm, fm_conv), axis=1)) + + # MobileViTV3: skip connection + fm = fm + x + + return fm + + +class MobileViTV3(nn.Layer): + """ + MobileViTV3: + """ + + def __init__(self, + mobilevit_config: Dict, + dropout=0.1, + class_num=1000, + classifier_dropout=0.1, + output_stride=None, + mobilevit_v2_based=False): + super().__init__() + self.round_nearest = 8 + self.dilation = 1 + self.dropout = dropout + self.mobilevit_v2_based = mobilevit_v2_based + + dilate_l4 = dilate_l5 = False + if output_stride == 8: + dilate_l4 = True + dilate_l5 = True + elif output_stride == 16: + dilate_l5 = True + + # store model configuration in a dictionary + in_channels = mobilevit_config["layer0"]["img_channels"] + out_channels = mobilevit_config["layer0"]["out_channels"] + self.conv_1 = nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + out_channels, + 3, + bias_attr=False, + stride=2, + padding=1)), ('norm', nn.BatchNorm2D(out_channels)), + ('act', nn.Silu())) + + in_channels = out_channels + self.layer_1, out_channels = self._make_layer( + input_channel=in_channels, cfg=mobilevit_config["layer1"]) + + in_channels = out_channels + self.layer_2, out_channels = self._make_layer( + input_channel=in_channels, cfg=mobilevit_config["layer2"]) + + in_channels = out_channels + self.layer_3, out_channels = self._make_layer( + input_channel=in_channels, cfg=mobilevit_config["layer3"]) + + in_channels = out_channels + self.layer_4, out_channels = self._make_layer( + input_channel=in_channels, + cfg=mobilevit_config["layer4"], + dilate=dilate_l4) + + in_channels = out_channels + self.layer_5, out_channels = self._make_layer( + input_channel=in_channels, + cfg=mobilevit_config["layer5"], + dilate=dilate_l5) + + if self.mobilevit_v2_based: + self.conv_1x1_exp = nn.Identity() + else: + in_channels = out_channels + out_channels = min(mobilevit_config["last_layer_exp_factor"] * + in_channels, 960) + self.conv_1x1_exp = nn.Sequential( + ('conv', nn.Conv2D( + in_channels, out_channels, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(out_channels)), ('act', nn.Silu())) + + self.classifier = 
nn.Sequential() + self.classifier.add_sublayer( + name="global_pool", + sublayer=nn.Sequential(nn.AdaptiveAvgPool2D(1), nn.Flatten())) + if 0.0 < classifier_dropout < 1.0: + self.classifier.add_sublayer( + name="dropout", sublayer=nn.Dropout(p=classifier_dropout)) + self.classifier.add_sublayer( + name="fc", sublayer=nn.Linear(out_channels, class_num)) + + # weight initialization + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Conv2D): + fan_in = m.weight.shape[1] * m.weight.shape[2] * m.weight.shape[3] + fan_out = m.weight.shape[0] * m.weight.shape[2] * m.weight.shape[3] + if self.mobilevit_v2_based: + bound = 1.0 / fan_in**0.5 + nn.initializer.Uniform(-bound, bound)(m.weight) + if m.bias is not None: + nn.initializer.Uniform(-bound, bound)(m.bias) + else: + nn.initializer.KaimingNormal(fan_in=fan_out)(m.weight) + if m.bias is not None: + nn.initializer.Constant(0)(m.bias) + elif isinstance(m, nn.BatchNorm2D): + nn.initializer.Constant(1)(m.weight) + nn.initializer.Constant(0)(m.bias) + elif isinstance(m, nn.Linear): + if self.mobilevit_v2_based: + nn.initializer.XavierUniform()(m.weight) + else: + nn.initializer.TruncatedNormal(std=.02)(m.weight) + if m.bias is not None: + nn.initializer.Constant(0)(m.bias) + + def _make_layer(self, input_channel, cfg, dilate=False): + block_type = cfg.get("block_type", "mobilevit") + if block_type.lower() == "mobilevit": + return self._make_mit_layer( + input_channel=input_channel, cfg=cfg, dilate=dilate) + else: + return self._make_mobilenet_layer( + input_channel=input_channel, cfg=cfg) + + def _make_mit_layer(self, input_channel, cfg, dilate=False): + prev_dilation = self.dilation + block = [] + stride = cfg.get("stride", 1) + + if stride == 2: + if dilate: + self.dilation *= 2 + stride = 1 + + layer = InvertedResidual( + in_channels=input_channel, + out_channels=cfg.get("out_channels"), + stride=stride, + expand_ratio=cfg.get("mv_expand_ratio", 4), + dilation=prev_dilation) + + block.append(layer) + input_channel = cfg.get("out_channels") + + if self.mobilevit_v2_based: + block.append( + MobileViTV3BlockV2( + in_channels=input_channel, + attn_unit_dim=cfg["attn_unit_dim"], + ffn_multiplier=cfg.get("ffn_multiplier"), + n_attn_blocks=cfg.get("attn_blocks", 1), + ffn_dropout=0., + attn_dropout=0., + dilation=self.dilation, + patch_h=cfg.get("patch_h", 2), + patch_w=cfg.get("patch_w", 2))) + else: + head_dim = cfg.get("head_dim", 32) + transformer_dim = cfg["transformer_channels"] + ffn_dim = cfg.get("ffn_dim") + if head_dim is None: + num_heads = cfg.get("num_heads", 4) + if num_heads is None: + num_heads = 4 + head_dim = transformer_dim // num_heads + + assert transformer_dim % head_dim == 0, ( + "Transformer input dimension should be divisible by head dimension. 
" + "Got {} and {}.".format(transformer_dim, head_dim)) + + block.append( + MobileViTV3Block( + in_channels=input_channel, + transformer_dim=transformer_dim, + ffn_dim=ffn_dim, + n_transformer_blocks=cfg.get("transformer_blocks", 1), + patch_h=cfg.get("patch_h", 2), + patch_w=cfg.get("patch_w", 2), + dropout=self.dropout, + ffn_dropout=0., + attn_dropout=0., + head_dim=head_dim)) + + return nn.Sequential(*block), input_channel + + def _make_mobilenet_layer(self, input_channel, cfg): + output_channels = cfg.get("out_channels") + num_blocks = cfg.get("num_blocks", 2) + expand_ratio = cfg.get("expand_ratio", 4) + block = [] + + for i in range(num_blocks): + stride = cfg.get("stride", 1) if i == 0 else 1 + + layer = InvertedResidual( + in_channels=input_channel, + out_channels=output_channels, + stride=stride, + expand_ratio=expand_ratio) + block.append(layer) + input_channel = output_channels + return nn.Sequential(*block), input_channel + + def extract_features(self, x): + x = self.conv_1(x) + x = self.layer_1(x) + x = self.layer_2(x) + x = self.layer_3(x) + + x = self.layer_4(x) + x = self.layer_5(x) + x = self.conv_1x1_exp(x) + return x + + def forward(self, x): + x = self.extract_features(x) + x = self.classifier(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MobileViTV3_S(pretrained=False, use_ssld=False, **kwargs): + mv2_exp_mult = 4 + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 16, + }, + "layer1": { + "out_channels": 32, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 64, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 128, + "transformer_channels": 144, + "ffn_dim": 288, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 256, + "transformer_channels": 192, + "ffn_dim": 384, + "transformer_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 320, + "transformer_channels": 240, + "ffn_dim": 480, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + + model = MobileViTV3(mobilevit_config, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_S"], use_ssld=use_ssld) + return model + + +def MobileViTV3_XS(pretrained=False, use_ssld=False, **kwargs): + mv2_exp_mult = 4 + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 16, + }, + "layer1": { + "out_channels": 32, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 48, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 96, + 
"transformer_channels": 96, + "ffn_dim": 192, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 160, + "transformer_channels": 120, + "ffn_dim": 240, + "transformer_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 160, + "transformer_channels": 144, + "ffn_dim": 288, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + + model = MobileViTV3(mobilevit_config, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_XS"], use_ssld=use_ssld) + return model + + +def MobileViTV3_XXS(pretrained=False, use_ssld=False, **kwargs): + mv2_exp_mult = 2 + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 16, + }, + "layer1": { + "out_channels": 16, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 24, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 64, + "transformer_channels": 64, + "ffn_dim": 128, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 80, + "transformer_channels": 80, + "ffn_dim": 160, + "transformer_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 128, + "transformer_channels": 96, + "ffn_dim": 192, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + + model = MobileViTV3(mobilevit_config, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_XXS"], use_ssld=use_ssld) + return model + + +def MobileViTV3_S_L2(pretrained=False, use_ssld=False, **kwargs): + mv2_exp_mult = 4 + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 16, + }, + "layer1": { + "out_channels": 32, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 64, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 128, + "transformer_channels": 144, + "ffn_dim": 288, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 256, + "transformer_channels": 192, + "ffn_dim": 384, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 320, + "transformer_channels": 240, + "ffn_dim": 480, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + 
"mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + + model = MobileViTV3(mobilevit_config, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_S_L2"], use_ssld=use_ssld) + return model + + +def MobileViTV3_XS_L2(pretrained=False, use_ssld=False, **kwargs): + mv2_exp_mult = 4 + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 16, + }, + "layer1": { + "out_channels": 32, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 48, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 96, + "transformer_channels": 96, + "ffn_dim": 192, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 160, + "transformer_channels": 120, + "ffn_dim": 240, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 160, + "transformer_channels": 144, + "ffn_dim": 288, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + + model = MobileViTV3(mobilevit_config, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_XS_L2"], use_ssld=use_ssld) + return model + + +def MobileViTV3_XXS_L2(pretrained=False, use_ssld=False, **kwargs): + mv2_exp_mult = 2 + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 16, + }, + "layer1": { + "out_channels": 16, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 24, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 64, + "transformer_channels": 64, + "ffn_dim": 128, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 80, + "transformer_channels": 80, + "ffn_dim": 160, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 128, + "transformer_channels": 96, + "ffn_dim": 192, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + + model = MobileViTV3(mobilevit_config, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_XXS_L2"], use_ssld=use_ssld) + return model + + +def MobileViTV3_x1_0(pretrained=False, use_ssld=False, **kwargs): + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 32, + }, + "layer1": { + "out_channels": 64, + "expand_ratio": 2, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2", + }, + "layer2": { + "out_channels": 128, + "expand_ratio": 2, + "num_blocks": 2, + "stride": 
2, + "block_type": "mv2", + }, + "layer3": { # 28x28 + "out_channels": 256, + "attn_unit_dim": 128, + "ffn_multiplier": 2, + "attn_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "layer4": { # 14x14 + "out_channels": 384, + "attn_unit_dim": 192, + "ffn_multiplier": 2, + "attn_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "layer5": { # 7x7 + "out_channels": 512, + "attn_unit_dim": 256, + "ffn_multiplier": 2, + "attn_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "last_layer_exp_factor": 4, + } + + model = MobileViTV3(mobilevit_config, mobilevit_v2_based=True, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_x1_0"], use_ssld=use_ssld) + return model + + +def MobileViTV3_x0_75(pretrained=False, use_ssld=False, **kwargs): + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 24, + }, + "layer1": { + "out_channels": 48, + "expand_ratio": 2, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2", + }, + "layer2": { + "out_channels": 96, + "expand_ratio": 2, + "num_blocks": 2, + "stride": 2, + "block_type": "mv2", + }, + "layer3": { # 28x28 + "out_channels": 192, + "attn_unit_dim": 96, + "ffn_multiplier": 2, + "attn_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "layer4": { # 14x14 + "out_channels": 288, + "attn_unit_dim": 144, + "ffn_multiplier": 2, + "attn_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "layer5": { # 7x7 + "out_channels": 384, + "attn_unit_dim": 192, + "ffn_multiplier": 2, + "attn_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "last_layer_exp_factor": 4, + } + + model = MobileViTV3(mobilevit_config, mobilevit_v2_based=True, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_x0_75"], use_ssld=use_ssld) + return model + + +def MobileViTV3_x0_5(pretrained=False, use_ssld=False, **kwargs): + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 16, + }, + "layer1": { + "out_channels": 32, + "expand_ratio": 2, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2", + }, + "layer2": { + "out_channels": 64, + "expand_ratio": 2, + "num_blocks": 2, + "stride": 2, + "block_type": "mv2", + }, + "layer3": { # 28x28 + "out_channels": 128, + "attn_unit_dim": 64, + "ffn_multiplier": 2, + "attn_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "layer4": { # 14x14 + "out_channels": 192, + "attn_unit_dim": 96, + "ffn_multiplier": 2, + "attn_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "layer5": { # 7x7 + "out_channels": 256, + "attn_unit_dim": 128, + "ffn_multiplier": 2, + "attn_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "last_layer_exp_factor": 4, + } + + model = MobileViTV3(mobilevit_config, mobilevit_v2_based=True, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_x0_5"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/nextvit.py 
b/example/low_precision_training/ppcls/arch/backbone/model_zoo/nextvit.py new file mode 100755 index 000000000..383d6d1fe --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/nextvit.py @@ -0,0 +1,643 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/bytedance/Next-ViT/blob/main/classification/nextvit.py +# reference: https://arxiv.org/abs/2207.05501 + +from functools import partial + +import paddle +from paddle import nn +from paddle.nn.initializer import TruncatedNormal, Constant, Normal +from .vision_transformer import trunc_normal_, zeros_, ones_, to_2tuple, DropPath, Identity + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "NextViT_small_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/NextViT_small_224_pretrained.pdparams", + "NextViT_base_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/NextViT_base_224_pretrained.pdparams", + "NextViT_large_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/NextViT_large_224_pretrained.pdparams", + "NextViT_small_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/NextViT_small_384_pretrained.pdparams", + "NextViT_base_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/NextViT_base_384_pretrained.pdparams", + "NextViT_large_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/NextViT_large_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +NORM_EPS = 1e-5 + + +def rearrange(x, pattern, **axes_lengths): + if 'b (h w) c -> b c h w' == pattern: + b, n, c = x.shape + h = axes_lengths.pop('h', -1) + w = axes_lengths.pop('w', -1) + h = h if w == -1 else n // w + w = w if h == -1 else n // h + return x.transpose([0, 2, 1]).reshape([b, c, h, w]) + if 'b c h w -> b (h w) c' == pattern: + b, c, h, w = x.shape + return x.reshape([b, c, h * w]).transpose([0, 2, 1]) + if 'b t (h d) -> b h t d' == pattern: + b, t, h_d = x.shape + h = axes_lengths['h'] + return x.reshape([b, t, h, h_d // h]).transpose([0, 2, 1, 3]) + if 'b h t d -> b t (h d)' == pattern: + b, h, t, d = x.shape + return x.transpose([0, 2, 1, 3]).reshape([b, t, h * d]) + + raise NotImplementedError( + "Rearrangement '{}' has not been implemented.".format(pattern)) + + +def merge_pre_bn(layer, pre_bn_1, pre_bn_2=None): + """ Merge pre BN to reduce inference runtime. 
+ """ + weight = layer.weight + if isinstance(layer, nn.Linear): + weight = weight.transpose([1, 0]) + bias = layer.bias + if pre_bn_2 is None: + scale_invstd = (pre_bn_1._variance + pre_bn_1._epsilon).pow(-0.5) + extra_weight = scale_invstd * pre_bn_1.weight + extra_bias = pre_bn_1.bias - pre_bn_1.weight * pre_bn_1._mean * scale_invstd + else: + scale_invstd_1 = (pre_bn_1._variance + pre_bn_1._epsilon).pow(-0.5) + scale_invstd_2 = (pre_bn_2._variance + pre_bn_2._epsilon).pow(-0.5) + + extra_weight = scale_invstd_1 * pre_bn_1.weight * scale_invstd_2 * pre_bn_2.weight + extra_bias = scale_invstd_2 * pre_bn_2.weight * ( + pre_bn_1.bias - pre_bn_1.weight * pre_bn_1._mean * scale_invstd_1 - + pre_bn_2._mean) + pre_bn_2.bias + if isinstance(layer, nn.Linear): + extra_bias = weight @extra_bias + + weight = weight.multiply( + extra_weight.reshape([1, weight.shape[1]]).expand_as(weight)) + weight = weight.transpose([1, 0]) + elif isinstance(layer, nn.Conv2D): + assert weight.shape[2] == 1 and weight.shape[3] == 1 + + weight = weight.reshape([weight.shape[0], weight.shape[1]]) + extra_bias = weight @extra_bias + weight = weight.multiply( + extra_weight.reshape([1, weight.shape[1]]).expand_as(weight)) + weight = weight.reshape([weight.shape[0], weight.shape[1], 1, 1]) + bias = bias.add(extra_bias) + + layer.weight.set_value(weight) + layer.bias.set_value(bias) + + +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNReLU(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1): + super(ConvBNReLU, self).__init__() + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=1, + groups=groups, + bias_attr=False) + self.norm = nn.BatchNorm2D(out_channels, epsilon=NORM_EPS) + self.act = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.norm(x) + x = self.act(x) + return x + + +class PatchEmbed(nn.Layer): + def __init__(self, in_channels, out_channels, stride=1): + super(PatchEmbed, self).__init__() + norm_layer = partial(nn.BatchNorm2D, epsilon=NORM_EPS) + if stride == 2: + self.avgpool = nn.AvgPool2D((2, 2), stride=2, ceil_mode=True) + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=1, + stride=1, + bias_attr=False) + self.norm = norm_layer(out_channels) + elif in_channels != out_channels: + self.avgpool = nn.Identity() + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=1, + stride=1, + bias_attr=False) + self.norm = norm_layer(out_channels) + else: + self.avgpool = nn.Identity() + self.conv = nn.Identity() + self.norm = nn.Identity() + + def forward(self, x): + return self.norm(self.conv(self.avgpool(x))) + + +class MHCA(nn.Layer): + """ + Multi-Head Convolutional Attention + """ + + def __init__(self, out_channels, head_dim): + super(MHCA, self).__init__() + norm_layer = partial(nn.BatchNorm2D, epsilon=NORM_EPS) + self.group_conv3x3 = nn.Conv2D( + out_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + groups=out_channels // head_dim, + bias_attr=False) + self.norm = norm_layer(out_channels) + self.act = nn.ReLU() + self.projection = nn.Conv2D( + out_channels, out_channels, kernel_size=1, bias_attr=False) + + def forward(self, x): + out = self.group_conv3x3(x) + out = self.norm(out) + out = 
self.act(out) + out = self.projection(out) + return out + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + out_features=None, + mlp_ratio=None, + drop=0., + bias=True): + super().__init__() + out_features = out_features or in_features + hidden_dim = _make_divisible(in_features * mlp_ratio, 32) + self.conv1 = nn.Conv2D( + in_features, + hidden_dim, + kernel_size=1, + bias_attr=None if bias == True else False) + self.act = nn.ReLU() + self.conv2 = nn.Conv2D( + hidden_dim, + out_features, + kernel_size=1, + bias_attr=None if bias == True else False) + self.drop = nn.Dropout(drop) + + def merge_bn(self, pre_norm): + merge_pre_bn(self.conv1, pre_norm) + self.is_bn_merged = True + + def forward(self, x): + x = self.conv1(x) + x = self.act(x) + x = self.drop(x) + x = self.conv2(x) + x = self.drop(x) + return x + + +class NCB(nn.Layer): + """ + Next Convolution Block + """ + + def __init__(self, + in_channels, + out_channels, + stride=1, + path_dropout=0.0, + drop=0.0, + head_dim=32, + mlp_ratio=3): + super(NCB, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + norm_layer = partial(nn.BatchNorm2D, epsilon=NORM_EPS) + assert out_channels % head_dim == 0 + + self.patch_embed = PatchEmbed(in_channels, out_channels, stride) + self.mhca = MHCA(out_channels, head_dim) + self.attention_path_dropout = DropPath(path_dropout) + + self.norm = norm_layer(out_channels) + self.mlp = Mlp(out_channels, mlp_ratio=mlp_ratio, drop=drop, bias=True) + self.mlp_path_dropout = DropPath(path_dropout) + self.is_bn_merged = False + + def merge_bn(self): + if not self.is_bn_merged: + self.mlp.merge_bn(self.norm) + self.is_bn_merged = True + + def forward(self, x): + x = self.patch_embed(x) + x = x + self.attention_path_dropout(self.mhca(x)) + + if not self.is_bn_merged: + out = self.norm(x) + else: + out = x + x = x + self.mlp_path_dropout(self.mlp(out)) + return x + + +class E_MHSA(nn.Layer): + """ + Efficient Multi-Head Self Attention + """ + + def __init__(self, + dim, + out_dim=None, + head_dim=32, + qkv_bias=True, + qk_scale=None, + attn_drop=0, + proj_drop=0., + sr_ratio=1): + super().__init__() + self.dim = dim + self.out_dim = out_dim if out_dim is not None else dim + self.num_heads = self.dim // head_dim + self.scale = qk_scale or head_dim**-0.5 + self.q = nn.Linear(dim, self.dim, bias_attr=qkv_bias) + self.k = nn.Linear(dim, self.dim, bias_attr=qkv_bias) + self.v = nn.Linear(dim, self.dim, bias_attr=qkv_bias) + self.proj = nn.Linear(self.dim, self.out_dim) + self.attn_drop = nn.Dropout(attn_drop) + self.proj_drop = nn.Dropout(proj_drop) + + self.sr_ratio = sr_ratio + self.N_ratio = sr_ratio**2 + if sr_ratio > 1: + self.sr = nn.AvgPool1D( + kernel_size=self.N_ratio, stride=self.N_ratio) + self.norm = nn.BatchNorm1D(dim, epsilon=NORM_EPS) + self.is_bn_merged = False + + def merge_bn(self, pre_bn): + merge_pre_bn(self.q, pre_bn) + if self.sr_ratio > 1: + merge_pre_bn(self.k, pre_bn, self.norm) + merge_pre_bn(self.v, pre_bn, self.norm) + else: + merge_pre_bn(self.k, pre_bn) + merge_pre_bn(self.v, pre_bn) + self.is_bn_merged = True + + def forward(self, x): + B, N, C = x.shape + q = self.q(x) + q = q.reshape( + [B, N, self.num_heads, int(C // self.num_heads)]).transpose( + [0, 2, 1, 3]) + if self.sr_ratio > 1: + x_ = x.transpose([0, 2, 1]) + x_ = self.sr(x_) + if not self.is_bn_merged: + x_ = self.norm(x_) + x_ = x_.transpose([0, 2, 1]) + + k = self.k(x_) + k = k.reshape( + [B, k.shape[1], self.num_heads, int(C // self.num_heads) + ]).transpose([0, 2, 3, 1]) + v = 
self.v(x_) + v = v.reshape( + [B, v.shape[1], self.num_heads, int(C // self.num_heads) + ]).transpose([0, 2, 1, 3]) + else: + k = self.k(x) + k = k.reshape( + [B, k.shape[1], self.num_heads, int(C // self.num_heads) + ]).transpose([0, 2, 3, 1]) + v = self.v(x) + v = v.reshape( + [B, v.shape[1], self.num_heads, int(C // self.num_heads) + ]).transpose([0, 2, 1, 3]) + attn = (q @k) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn @v).transpose([0, 2, 1, 3]).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class NTB(nn.Layer): + """ + Next Transformer Block + """ + + def __init__( + self, + in_channels, + out_channels, + path_dropout, + stride=1, + sr_ratio=1, + mlp_ratio=2, + head_dim=32, + mix_block_ratio=0.75, + attn_drop=0.0, + drop=0.0, ): + super(NTB, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.mix_block_ratio = mix_block_ratio + norm_func = partial(nn.BatchNorm2D, epsilon=NORM_EPS) + + self.mhsa_out_channels = _make_divisible( + int(out_channels * mix_block_ratio), 32) + self.mhca_out_channels = out_channels - self.mhsa_out_channels + + self.patch_embed = PatchEmbed(in_channels, self.mhsa_out_channels, + stride) + self.norm1 = norm_func(self.mhsa_out_channels) + self.e_mhsa = E_MHSA( + self.mhsa_out_channels, + head_dim=head_dim, + sr_ratio=sr_ratio, + attn_drop=attn_drop, + proj_drop=drop) + self.mhsa_path_dropout = DropPath(path_dropout * mix_block_ratio) + + self.projection = PatchEmbed( + self.mhsa_out_channels, self.mhca_out_channels, stride=1) + self.mhca = MHCA(self.mhca_out_channels, head_dim=head_dim) + self.mhca_path_dropout = DropPath(path_dropout * (1 - mix_block_ratio)) + + self.norm2 = norm_func(out_channels) + self.mlp = Mlp(out_channels, mlp_ratio=mlp_ratio, drop=drop) + self.mlp_path_dropout = DropPath(path_dropout) + + self.is_bn_merged = False + + def merge_bn(self): + if not self.is_bn_merged: + self.e_mhsa.merge_bn(self.norm1) + self.mlp.merge_bn(self.norm2) + self.is_bn_merged = True + + def forward(self, x): + x = self.patch_embed(x) + + B, C, H, W = x.shape + if not self.is_bn_merged: + out = self.norm1(x) + else: + out = x + out = rearrange(out, "b c h w -> b (h w) c") # b n c + out = self.e_mhsa(out) + out = self.mhsa_path_dropout(out) + x = x + rearrange(out, "b (h w) c -> b c h w", h=H) + + out = self.projection(x) + out = out + self.mhca_path_dropout(self.mhca(out)) + x = paddle.concat([x, out], axis=1) + + if not self.is_bn_merged: + out = self.norm2(x) + else: + out = x + x = x + self.mlp_path_dropout(self.mlp(out)) + return x + + +class NextViT(nn.Layer): + def __init__(self, + stem_chs, + depths, + path_dropout, + attn_drop=0, + drop=0, + class_num=1000, + strides=[1, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + head_dim=32, + mix_block_ratio=0.75): + super(NextViT, self).__init__() + + self.stage_out_channels = [ + [96] * (depths[0]), [192] * (depths[1] - 1) + [256], + [384, 384, 384, 384, 512] * (depths[2] // 5), + [768] * (depths[3] - 1) + [1024] + ] + + # Next Hybrid Strategy + self.stage_block_types = [[NCB] * depths[0], + [NCB] * (depths[1] - 1) + [NTB], + [NCB, NCB, NCB, NCB, NTB] * (depths[2] // 5), + [NCB] * (depths[3] - 1) + [NTB]] + + self.stem = nn.Sequential( + ConvBNReLU( + 3, stem_chs[0], kernel_size=3, stride=2), + ConvBNReLU( + stem_chs[0], stem_chs[1], kernel_size=3, stride=1), + ConvBNReLU( + stem_chs[1], stem_chs[2], kernel_size=3, stride=1), + ConvBNReLU( + stem_chs[2], stem_chs[2], kernel_size=3, stride=2), ) + 
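+        # Assemble the four stages from stage_out_channels/stage_block_types:
+        # convolutional NCBs interleaved with transformer NTBs (the Next
+        # Hybrid Strategy), each block taking its own linearly-scheduled
+        # DropPath rate from dpr below.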
input_channel = stem_chs[-1] + features = [] + idx = 0 + dpr = [ + x.item() for x in paddle.linspace(0, path_dropout, sum(depths)) + ] # stochastic depth decay rule + for stage_id in range(len(depths)): + numrepeat = depths[stage_id] + output_channels = self.stage_out_channels[stage_id] + block_types = self.stage_block_types[stage_id] + for block_id in range(numrepeat): + if strides[stage_id] == 2 and block_id == 0: + stride = 2 + else: + stride = 1 + output_channel = output_channels[block_id] + block_type = block_types[block_id] + if block_type is NCB: + layer = NCB(input_channel, + output_channel, + stride=stride, + path_dropout=dpr[idx + block_id], + drop=drop, + head_dim=head_dim) + features.append(layer) + elif block_type is NTB: + layer = NTB(input_channel, + output_channel, + path_dropout=dpr[idx + block_id], + stride=stride, + sr_ratio=sr_ratios[stage_id], + head_dim=head_dim, + mix_block_ratio=mix_block_ratio, + attn_drop=attn_drop, + drop=drop) + features.append(layer) + input_channel = output_channel + idx += numrepeat + self.features = nn.Sequential(*features) + + self.norm = nn.BatchNorm2D(output_channel, epsilon=NORM_EPS) + + self.avgpool = nn.AdaptiveAvgPool2D((1, 1)) + self.proj_head = nn.Sequential(nn.Linear(output_channel, class_num), ) + + self.stage_out_idx = [ + sum(depths[:idx + 1]) - 1 for idx in range(len(depths)) + ] + self._initialize_weights() + + def merge_bn(self): + self.eval() + for idx, layer in self.named_sublayers(): + if isinstance(layer, NCB) or isinstance(layer, NTB): + layer.merge_bn() + + def _initialize_weights(self): + for n, m in self.named_sublayers(): + if isinstance(m, (nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm, + nn.BatchNorm1D)): + ones_(m.weight) + zeros_(m.bias) + elif isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.Conv2D): + trunc_normal_(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + zeros_(m.bias) + + def forward(self, x): + x = self.stem(x) + for layer in self.features: + x = layer(x) + x = self.norm(x) + x = self.avgpool(x) + x = paddle.flatten(x, 1) + x = self.proj_head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+    )
+
+
+def NextViT_small_224(pretrained=False, use_ssld=False, **kwargs):
+    model = NextViT(
+        stem_chs=[64, 32, 64],
+        depths=[3, 4, 10, 3],
+        path_dropout=0.1,
+        **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["NextViT_small_224"], use_ssld=use_ssld)
+    return model
+
+
+def NextViT_base_224(pretrained=False, use_ssld=False, **kwargs):
+    model = NextViT(
+        stem_chs=[64, 32, 64],
+        depths=[3, 4, 20, 3],
+        path_dropout=0.2,
+        **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["NextViT_base_224"], use_ssld=use_ssld)
+    return model
+
+
+def NextViT_large_224(pretrained=False, use_ssld=False, **kwargs):
+    model = NextViT(
+        stem_chs=[64, 32, 64],
+        depths=[3, 4, 30, 3],
+        path_dropout=0.2,
+        **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["NextViT_large_224"], use_ssld=use_ssld)
+    return model
+
+
+def NextViT_small_384(pretrained=False, use_ssld=False, **kwargs):
+    model = NextViT(
+        stem_chs=[64, 32, 64],
+        depths=[3, 4, 10, 3],
+        path_dropout=0.1,
+        **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["NextViT_small_384"], use_ssld=use_ssld)
+    return model
+
+
+def NextViT_base_384(pretrained=False, use_ssld=False, **kwargs):
+    model = NextViT(
+        stem_chs=[64, 32, 64],
+        depths=[3, 4, 20, 3],
+        path_dropout=0.2,
+        **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["NextViT_base_384"], use_ssld=use_ssld)
+    return model
+
+
+def NextViT_large_384(pretrained=False, use_ssld=False, **kwargs):
+    model = NextViT(
+        stem_chs=[64, 32, 64],
+        depths=[3, 4, 30, 3],
+        path_dropout=0.2,
+        **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["NextViT_large_384"], use_ssld=use_ssld)
+    return model
diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/peleenet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/peleenet.py
new file mode 100755
index 000000000..0584c9794
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/peleenet.py
@@ -0,0 +1,264 @@
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Code was heavily based on https://github.com/Robert-JunWang/PeleeNet
+# reference: https://arxiv.org/pdf/1804.06882.pdf
+
+import math
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn.initializer import Normal, Constant
+
+from ....utils.save_load import load_dygraph_pretrain
+
+MODEL_URLS = {
+    "PeleeNet":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PeleeNet_pretrained.pdparams"
+}
+
+__all__ = list(MODEL_URLS.keys())
+
+normal_ = lambda x, mean=0, std=1: Normal(mean, std)(x)
+constant_ = lambda x, value=0: Constant(value)(x)
+zeros_ = Constant(value=0.)
+ones_ = Constant(value=1.)
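+
+# _DenseLayer below implements PeleeNet's 2-way dense layer: branch1
+# (1x1 -> 3x3) and branch2 (1x1 -> 3x3 -> 3x3) each produce growth_rate // 2
+# channels, and both are concatenated with the input, so every layer adds
+# exactly growth_rate channels. A minimal usage sketch of the full network
+# (illustrative only, not executed by this module):
+#
+#     import paddle
+#     model = PeleeNet(class_num=1000)  # factory defined at the end of file
+#     logits = model(paddle.randn([1, 3, 224, 224]))  # shape [1, 1000]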
+ + +class _DenseLayer(nn.Layer): + def __init__(self, num_input_features, growth_rate, bottleneck_width, + drop_rate): + super(_DenseLayer, self).__init__() + + growth_rate = int(growth_rate / 2) + inter_channel = int(growth_rate * bottleneck_width / 4) * 4 + + if inter_channel > num_input_features / 2: + inter_channel = int(num_input_features / 8) * 4 + print('adjust inter_channel to ', inter_channel) + + self.branch1a = BasicConv2D( + num_input_features, inter_channel, kernel_size=1) + self.branch1b = BasicConv2D( + inter_channel, growth_rate, kernel_size=3, padding=1) + + self.branch2a = BasicConv2D( + num_input_features, inter_channel, kernel_size=1) + self.branch2b = BasicConv2D( + inter_channel, growth_rate, kernel_size=3, padding=1) + self.branch2c = BasicConv2D( + growth_rate, growth_rate, kernel_size=3, padding=1) + + def forward(self, x): + branch1 = self.branch1a(x) + branch1 = self.branch1b(branch1) + + branch2 = self.branch2a(x) + branch2 = self.branch2b(branch2) + branch2 = self.branch2c(branch2) + + return paddle.concat([x, branch1, branch2], 1) + + +class _DenseBlock(nn.Sequential): + def __init__(self, num_layers, num_input_features, bn_size, growth_rate, + drop_rate): + super(_DenseBlock, self).__init__() + for i in range(num_layers): + layer = _DenseLayer(num_input_features + i * growth_rate, + growth_rate, bn_size, drop_rate) + setattr(self, 'denselayer%d' % (i + 1), layer) + + +class _StemBlock(nn.Layer): + def __init__(self, num_input_channels, num_init_features): + super(_StemBlock, self).__init__() + + num_stem_features = int(num_init_features / 2) + + self.stem1 = BasicConv2D( + num_input_channels, + num_init_features, + kernel_size=3, + stride=2, + padding=1) + self.stem2a = BasicConv2D( + num_init_features, + num_stem_features, + kernel_size=1, + stride=1, + padding=0) + self.stem2b = BasicConv2D( + num_stem_features, + num_init_features, + kernel_size=3, + stride=2, + padding=1) + self.stem3 = BasicConv2D( + 2 * num_init_features, + num_init_features, + kernel_size=1, + stride=1, + padding=0) + self.pool = nn.MaxPool2D(kernel_size=2, stride=2) + + def forward(self, x): + out = self.stem1(x) + + branch2 = self.stem2a(out) + branch2 = self.stem2b(branch2) + branch1 = self.pool(out) + + out = paddle.concat([branch1, branch2], 1) + out = self.stem3(out) + + return out + + +class BasicConv2D(nn.Layer): + def __init__(self, in_channels, out_channels, activation=True, **kwargs): + super(BasicConv2D, self).__init__() + self.conv = nn.Conv2D( + in_channels, out_channels, bias_attr=False, **kwargs) + self.norm = nn.BatchNorm2D(out_channels) + self.activation = activation + + def forward(self, x): + x = self.conv(x) + x = self.norm(x) + if self.activation: + return F.relu(x) + else: + return x + + +class PeleeNetDY(nn.Layer): + r"""PeleeNet model class, based on + `"Densely Connected Convolutional Networks" and + "Pelee: A Real-Time Object Detection System on Mobile Devices" ` + + Args: + growth_rate (int or list of 4 ints) - how many filters to add each layer (`k` in paper) + block_config (list of 4 ints) - how many layers in each pooling block + num_init_features (int) - the number of filters to learn in the first convolution layer + bottleneck_width (int or list of 4 ints) - multiplicative factor for number of bottle neck layers + (i.e. 
bn_size * k features in the bottleneck layer) + drop_rate (float) - dropout rate after each dense layer + class_num (int) - number of classification classes + """ + + def __init__(self, + growth_rate=32, + block_config=[3, 4, 8, 6], + num_init_features=32, + bottleneck_width=[1, 2, 4, 4], + drop_rate=0.05, + class_num=1000): + + super(PeleeNetDY, self).__init__() + + self.features = nn.Sequential(* [('stemblock', _StemBlock( + 3, num_init_features)), ]) + + if type(growth_rate) is list: + growth_rates = growth_rate + assert len(growth_rates) == 4, \ + 'The growth rate must be the list and the size must be 4' + else: + growth_rates = [growth_rate] * 4 + + if type(bottleneck_width) is list: + bottleneck_widths = bottleneck_width + assert len(bottleneck_widths) == 4, \ + 'The bottleneck width must be the list and the size must be 4' + else: + bottleneck_widths = [bottleneck_width] * 4 + + # Each denseblock + num_features = num_init_features + for i, num_layers in enumerate(block_config): + block = _DenseBlock( + num_layers=num_layers, + num_input_features=num_features, + bn_size=bottleneck_widths[i], + growth_rate=growth_rates[i], + drop_rate=drop_rate) + setattr(self.features, 'denseblock%d' % (i + 1), block) + num_features = num_features + num_layers * growth_rates[i] + + setattr( + self.features, + 'transition%d' % (i + 1), + BasicConv2D( + num_features, + num_features, + kernel_size=1, + stride=1, + padding=0)) + + if i != len(block_config) - 1: + setattr( + self.features, + 'transition%d_pool' % (i + 1), + nn.AvgPool2D( + kernel_size=2, stride=2)) + num_features = num_features + + # Linear layer + self.classifier = nn.Linear(num_features, class_num) + self.drop_rate = drop_rate + + self.apply(self._initialize_weights) + + def forward(self, x): + features = self.features(x) + out = F.avg_pool2d( + features, kernel_size=features.shape[2:4]).flatten(1) + if self.drop_rate > 0: + out = F.dropout(out, p=self.drop_rate, training=self.training) + out = self.classifier(out) + return out + + def _initialize_weights(self, m): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + normal_(m.weight, std=math.sqrt(2. / n)) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2D): + ones_(m.weight) + zeros_(m.bias) + elif isinstance(m, nn.Linear): + normal_(m.weight, std=0.01) + zeros_(m.bias) + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def PeleeNet(pretrained=False, use_ssld=False, **kwargs): + model = PeleeNetDY(**kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PeleeNet"], use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/pvt_v2.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/pvt_v2.py new file mode 100755 index 000000000..9e9f8d5fd --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/pvt_v2.py @@ -0,0 +1,493 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was heavily based on https://github.com/whai362/PVT +# reference: https://arxiv.org/abs/2106.13797 + +from functools import partial +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import TruncatedNormal, Constant + +from .vision_transformer import trunc_normal_, zeros_, ones_, to_2tuple, DropPath, Identity, drop_path + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "PVT_V2_B0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B0_pretrained.pdparams", + "PVT_V2_B1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B1_pretrained.pdparams", + "PVT_V2_B2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B2_pretrained.pdparams", + "PVT_V2_B2_Linear": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B2_Linear_pretrained.pdparams", + "PVT_V2_B3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B3_pretrained.pdparams", + "PVT_V2_B4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B4_pretrained.pdparams", + "PVT_V2_B5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B5_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +@paddle.jit.not_to_static +def swapdim(x, dim1, dim2): + a = list(range(len(x.shape))) + a[dim1], a[dim2] = a[dim2], a[dim1] + return x.transpose(a) + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0., + linear=False): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + self.linear = linear + if self.linear: + self.relu = nn.ReLU() + + def forward(self, x, H, W): + x = self.fc1(x) + if self.linear: + x = self.relu(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + sr_ratio=1, + linear=False): + super().__init__() + assert dim % num_heads == 0 + + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.q = nn.Linear(dim, dim, bias_attr=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.linear = linear + self.sr_ratio = sr_ratio + if not linear: + if sr_ratio > 1: + self.sr = nn.Conv2D( + dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + else: + self.pool = nn.AdaptiveAvgPool2D(7) + self.sr = nn.Conv2D(dim, dim, kernel_size=1, stride=1) + self.norm = nn.LayerNorm(dim) + self.act = nn.GELU() + + def forward(self, x, H, W): + B, N, C = 
x.shape + q = self.q(x).reshape( + [B, N, self.num_heads, C // self.num_heads]).transpose( + [0, 2, 1, 3]) + + if not self.linear: + if self.sr_ratio > 1: + x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W]) + x_ = self.sr(x_) + h_, w_ = x_.shape[-2:] + x_ = x_.reshape([B, C, h_ * w_]).transpose([0, 2, 1]) + x_ = self.norm(x_) + kv = self.kv(x_) + kv = kv.reshape([ + B, kv.shape[2] * kv.shape[1] // 2 // C, 2, self.num_heads, + C // self.num_heads + ]).transpose([2, 0, 3, 1, 4]) + else: + kv = self.kv(x) + kv = kv.reshape([ + B, kv.shape[2] * kv.shape[1] // 2 // C, 2, self.num_heads, + C // self.num_heads + ]).transpose([2, 0, 3, 1, 4]) + else: + x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W]) + x_ = self.sr(self.pool(x_)) + x_ = x_.reshape([B, C, x_.shape[2] * x_.shape[3]]).transpose( + [0, 2, 1]) + x_ = self.norm(x_) + x_ = self.act(x_) + kv = self.kv(x_) + kv = kv.reshape([ + B, kv.shape[2] * kv.shape[1] // 2 // C, 2, self.num_heads, + C // self.num_heads + ]).transpose([2, 0, 3, 1, 4]) + k, v = kv[0], kv[1] + + attn = (q @swapdim(k, -2, -1)) * self.scale + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = swapdim((attn @v), 1, 2).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1, + linear=False): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + sr_ratio=sr_ratio, + linear=linear) + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + linear=linear) + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + + return x + + +class OverlapPatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=7, + stride=4, + in_chans=3, + embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + self.H, self.W = img_size[0] // patch_size[0], img_size[ + 1] // patch_size[1] + self.num_patches = self.H * self.W + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=stride, + padding=(patch_size[0] // 2, patch_size[1] // 2)) + self.norm = nn.LayerNorm(embed_dim) + + def forward(self, x): + x = self.proj(x) + _, _, H, W = x.shape + x = x.flatten(2) + x = swapdim(x, 1, 2) + x = self.norm(x) + + return x, H, W + + +class PyramidVisionTransformerV2(nn.Layer): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + num_stages=4, + linear=False): + super().__init__() + self.class_num = class_num + self.depths = depths + self.num_stages = num_stages + + dpr = [x for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + cur 
= 0 + + for i in range(num_stages): + patch_embed = OverlapPatchEmbed( + img_size=img_size if i == 0 else img_size // (2**(i + 1)), + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + in_chans=in_chans if i == 0 else embed_dims[i - 1], + embed_dim=embed_dims[i]) + + block = nn.LayerList([ + Block( + dim=embed_dims[i], + num_heads=num_heads[i], + mlp_ratio=mlp_ratios[i], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + j].item(), + norm_layer=norm_layer, + sr_ratio=sr_ratios[i], + linear=linear) for j in range(depths[i]) + ]) + norm = norm_layer(embed_dims[i]) + cur += depths[i] + + setattr(self, f"patch_embed{i + 1}", patch_embed) + setattr(self, f"block{i + 1}", block) + setattr(self, f"norm{i + 1}", norm) + + # classification head + self.head = nn.Linear(embed_dims[3], + class_num) if class_num > 0 else Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + B = x.shape[0] + + for i in range(self.num_stages): + patch_embed = getattr(self, f"patch_embed{i + 1}") + block = getattr(self, f"block{i + 1}") + norm = getattr(self, f"norm{i + 1}") + x, H, W = patch_embed(x) + for blk in block: + x = blk(x, H, W) + x = norm(x) + if i != self.num_stages - 1: + x = x.reshape([B, H, W, x.shape[2]]).transpose([0, 3, 1, 2]) + + return x.mean(axis=1) + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + + return x + + +class DWConv(nn.Layer): + def __init__(self, dim=768): + super().__init__() + self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + x = swapdim(x, 1, 2) + x = x.reshape([B, C, H, W]) + x = self.dwconv(x) + x = x.flatten(2) + x = swapdim(x, 1, 2) + + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def PVT_V2_B0(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[32, 64, 160, 256], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B0"], use_ssld=use_ssld) + return model + + +def PVT_V2_B1(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B1"], use_ssld=use_ssld) + return model + + +def PVT_V2_B2(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B2"], use_ssld=use_ssld) + return model + + +def PVT_V2_B3(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 18, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B3"], use_ssld=use_ssld) + return model + + +def PVT_V2_B4(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 8, 27, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B4"], use_ssld=use_ssld) + return model + + +def PVT_V2_B5(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 6, 40, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B5"], use_ssld=use_ssld) + return model + + +def PVT_V2_B2_Linear(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + linear=True, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B2_Linear"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/rednet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/rednet.py new file mode 100755 index 000000000..70a8a2c67 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/rednet.py @@ -0,0 +1,204 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/d-li14/involution +# reference: https://arxiv.org/abs/2103.06255 + +import paddle +import paddle.nn as nn + +from paddle.vision.models import resnet + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "RedNet26": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RedNet26_pretrained.pdparams", + "RedNet38": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RedNet38_pretrained.pdparams", + "RedNet50": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RedNet50_pretrained.pdparams", + "RedNet101": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RedNet101_pretrained.pdparams", + "RedNet152": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RedNet152_pretrained.pdparams" +} + +__all__ = MODEL_URLS.keys() + + +class Involution(nn.Layer): + def __init__(self, channels, kernel_size, stride): + super(Involution, self).__init__() + self.kernel_size = kernel_size + self.stride = stride + self.channels = channels + reduction_ratio = 4 + self.group_channels = 16 + self.groups = self.channels // self.group_channels + self.conv1 = nn.Sequential( + ('conv', nn.Conv2D( + in_channels=channels, + out_channels=channels // reduction_ratio, + kernel_size=1, + bias_attr=False)), + ('bn', nn.BatchNorm2D(channels // reduction_ratio)), + ('activate', nn.ReLU())) + self.conv2 = nn.Sequential(('conv', nn.Conv2D( + in_channels=channels // reduction_ratio, + out_channels=kernel_size**2 * self.groups, + kernel_size=1, + stride=1))) + if stride > 1: + self.avgpool = nn.AvgPool2D(stride, stride) + + def forward(self, x): + weight = self.conv2( + self.conv1(x if self.stride == 1 else self.avgpool(x))) + b, c, h, w = weight.shape + weight = weight.reshape( + (b, self.groups, self.kernel_size**2, h, w)).unsqueeze(2) + + out = nn.functional.unfold(x, self.kernel_size, self.stride, + (self.kernel_size - 1) // 2, 1) + out = out.reshape( + (b, self.groups, self.group_channels, self.kernel_size**2, h, w)) + out = (weight * out).sum(axis=3).reshape((b, self.channels, h, w)) + return out + + +class BottleneckBlock(resnet.BottleneckBlock): + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): + super(BottleneckBlock, self).__init__(inplanes, planes, stride, + downsample, groups, base_width, + dilation, norm_layer) + width = int(planes * (base_width / 64.)) * groups + self.conv2 = Involution(width, 7, stride) + + +class RedNet(resnet.ResNet): + def __init__(self, block, depth, class_num=1000, with_pool=True): + super(RedNet, self).__init__( + block=block, depth=50, num_classes=class_num, with_pool=with_pool) + layer_cfg = { + 26: [1, 2, 4, 1], + 38: [2, 3, 5, 2], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3] + } + layers = layer_cfg[depth] + + self.conv1 = None + self.bn1 = None + self.relu = None + self.inplanes = 64 + self.class_num = class_num + self.stem = 
nn.Sequential( + nn.Sequential( + ('conv', nn.Conv2D( + in_channels=3, + out_channels=self.inplanes // 2, + kernel_size=3, + stride=2, + padding=1, + bias_attr=False)), + ('bn', nn.BatchNorm2D(self.inplanes // 2)), + ('activate', nn.ReLU())), + Involution(self.inplanes // 2, 3, 1), + nn.BatchNorm2D(self.inplanes // 2), + nn.ReLU(), + nn.Sequential( + ('conv', nn.Conv2D( + in_channels=self.inplanes // 2, + out_channels=self.inplanes, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False)), ('bn', nn.BatchNorm2D(self.inplanes)), + ('activate', nn.ReLU()))) + + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + def forward(self, x): + x = self.stem(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + if self.with_pool: + x = self.avgpool(x) + + if self.class_num > 0: + x = paddle.flatten(x, 1) + x = self.fc(x) + + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def RedNet26(pretrained=False, **kwargs): + model = RedNet(BottleneckBlock, 26, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["RedNet26"]) + return model + + +def RedNet38(pretrained=False, **kwargs): + model = RedNet(BottleneckBlock, 38, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["RedNet38"]) + return model + + +def RedNet50(pretrained=False, **kwargs): + model = RedNet(BottleneckBlock, 50, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["RedNet50"]) + return model + + +def RedNet101(pretrained=False, **kwargs): + model = RedNet(BottleneckBlock, 101, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["RedNet101"]) + return model + + +def RedNet152(pretrained=False, **kwargs): + model = RedNet(BottleneckBlock, 152, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["RedNet152"]) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/regnet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/regnet.py new file mode 100755 index 000000000..12f60012a --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/regnet.py @@ -0,0 +1,531 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
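# --------------------------------------------------------------------------
# [Editor's note] Illustrative sketch, not part of the original patch.
# The Involution layer added in rednet.py above replaces the bottleneck's 3x3
# convolution with a position-specific, channel-shared kernel: a small conv
# stack predicts a (K*K)-element kernel per spatial location and per channel
# group, and nn.functional.unfold gathers each K*K neighborhood so the two
# can be multiplied and summed. A minimal shape walk-through, assuming paddle
# is installed (all names below are local to this sketch):
import paddle
import paddle.nn.functional as F

b, c, h, w = 2, 32, 14, 14           # batch, channels, height, width
k, group_channels = 3, 16            # kernel size, channels sharing a kernel
groups = c // group_channels         # -> 2 kernel groups

x = paddle.randn([b, c, h, w])
# a real Involution predicts this from x (conv1 -> conv2); random values
# keep the sketch short
weight = paddle.randn([b, groups, k * k, h, w]).unsqueeze(2)
# unfold gathers each kxk neighborhood: [b, c*k*k, h*w]
patches = F.unfold(x, k, strides=1, paddings=(k - 1) // 2, dilations=1)
patches = patches.reshape([b, groups, group_channels, k * k, h, w])
# multiply-accumulate over the k*k axis mirrors Involution.forward above
out = (weight * patches).sum(axis=3).reshape([b, c, h, w])
print(out.shape)  # [2, 32, 14, 14]
# --------------------------------------------------------------------------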
+ +# Code was based on https://github.com/facebookresearch/pycls +# reference: https://arxiv.org/abs/1905.13214 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "RegNetX_200MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_200MF_pretrained.pdparams", + "RegNetX_400MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_400MF_pretrained.pdparams", + "RegNetX_600MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_600MF_pretrained.pdparams", + "RegNetX_800MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_800MF_pretrained.pdparams", + "RegNetX_1600MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_1600MF_pretrained.pdparams", + "RegNetX_3200MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_3200MF_pretrained.pdparams", + "RegNetX_4GF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_4GF_pretrained.pdparams", + "RegNetX_6400MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_6400MF_pretrained.pdparams", + "RegNetX_8GF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_8GF_pretrained.pdparams", + "RegNetX_12GF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_12GF_pretrained.pdparams", + "RegNetX_16GF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_16GF_pretrained.pdparams", + "RegNetX_32GF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_32GF_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +def quantize_float(f, q): + """Converts a float to closest non-zero int divisible by q.""" + return int(round(f / q) * q) + + +def adjust_ws_gs_comp(ws, bms, gs): + """Adjusts the compatibility of widths and groups.""" + ws_bot = [int(w * b) for w, b in zip(ws, bms)] + gs = [min(g, w_bot) for g, w_bot in zip(gs, ws_bot)] + ws_bot = [quantize_float(w_bot, g) for w_bot, g in zip(ws_bot, gs)] + ws = [int(w_bot / b) for w_bot, b in zip(ws_bot, bms)] + return ws, gs + + +def get_stages_from_blocks(ws, rs): + """Gets ws/ds of network at each stage from per block values.""" + ts = [ + w != wp or r != rp + for w, wp, r, rp in zip(ws + [0], [0] + ws, rs + [0], [0] + rs) + ] + s_ws = [w for w, t in zip(ws, ts[:-1]) if t] + s_ds = np.diff([d for d, t in zip(range(len(ts)), ts) if t]).tolist() + return s_ws, s_ds + + +def generate_regnet(w_a, w_0, w_m, d, q=8): + """Generates per block ws from RegNet parameters.""" + assert w_a >= 0 and w_0 > 0 and w_m > 1 and w_0 % q == 0 + ws_cont = np.arange(d) * w_a + w_0 + ks = np.round(np.log(ws_cont / w_0) / np.log(w_m)) + ws = w_0 * np.power(w_m, ks) + ws = np.round(np.divide(ws, q)) * q + num_stages, max_stage = len(np.unique(ws)), ks.max() + 1 + ws, ws_cont = ws.astype(int).tolist(), ws_cont.tolist() + return ws, num_stages, max_stage, ws_cont + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + padding=0, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( 
+ in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + ".conv2d.output.1.w_0"), + bias_attr=False) + bn_name = name + "_bn" + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + ".output.1.w_0"), + bias_attr=ParamAttr(bn_name + ".output.1.b_0"), + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + "_variance") + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + bm, + gw, + se_on, + se_r, + shortcut=True, + name=None): + super(BottleneckBlock, self).__init__() + + # Compute the bottleneck width + w_b = int(round(num_filters * bm)) + # Compute the number of groups + num_gs = w_b // gw + self.se_on = se_on + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=w_b, + filter_size=1, + padding=0, + act="relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=w_b, + num_filters=w_b, + filter_size=3, + stride=stride, + padding=1, + groups=num_gs, + act="relu", + name=name + "_branch2b") + if se_on: + w_se = int(round(num_channels * se_r)) + self.se_block = SELayer( + num_channels=w_b, + num_filters=w_b, + reduction_ratio=w_se, + name=name + "_branch2se") + self.conv2 = ConvBNLayer( + num_channels=w_b, + num_filters=num_filters, + filter_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + if self.se_on: + conv1 = self.se_block(conv1) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, num_channels, num_filters, reduction_ratio, name=None): + super(SELayer, self).__init__() + + self.pool2d_gap = AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_sqz_weights"), + bias_attr=ParamAttr(name=name + "_sqz_offset")) + + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_filters, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_exc_weights"), + bias_attr=ParamAttr(name=name + "_exc_offset")) + + def forward(self, input): + pool = self.pool2d_gap(input) + pool = paddle.reshape(pool, shape=[-1, self._num_channels]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = F.sigmoid(excitation) + excitation = paddle.reshape( + excitation, shape=[-1, self._num_channels, 1, 1]) + out = input * excitation + return out + + +class RegNet(nn.Layer): + def __init__(self, + w_a, + w_0, + w_m, + d, + group_w, + bot_mul, + q=8, + se_on=False, + class_num=1000): + super(RegNet, self).__init__() + + # Generate RegNet ws per block + b_ws, num_s, max_s, ws_cont = generate_regnet(w_a, w_0, w_m, d, q) + # Convert to per stage format + ws, ds = get_stages_from_blocks(b_ws, b_ws) + # Generate group widths and bot 
muls + gws = [group_w for _ in range(num_s)] + bms = [bot_mul for _ in range(num_s)] + # Adjust the compatibility of ws and gws + ws, gws = adjust_ws_gs_comp(ws, bms, gws) + # Use the same stride for each stage + ss = [2 for _ in range(num_s)] + # Use SE for RegNetY + se_r = 0.25 + # Construct the model + # Group params by stage + stage_params = list(zip(ds, ws, ss, bms, gws)) + # Construct the stem + stem_type = "simple_stem_in" + stem_w = 32 + block_type = "res_bottleneck_block" + + self.conv = ConvBNLayer( + num_channels=3, + num_filters=stem_w, + filter_size=3, + stride=2, + padding=1, + act="relu", + name="stem_conv") + + self.block_list = [] + for block, (d, w_out, stride, bm, gw) in enumerate(stage_params): + shortcut = False + for i in range(d): + num_channels = stem_w if block == i == 0 else in_channels + # Stride apply to the first block of the stage + b_stride = stride if i == 0 else 1 + conv_name = "s" + str(block + 1) + "_b" + str(i + + 1) # chr(97 + i) + bottleneck_block = self.add_sublayer( + conv_name, + BottleneckBlock( + num_channels=num_channels, + num_filters=w_out, + stride=b_stride, + bm=bm, + gw=gw, + se_on=se_on, + se_r=se_r, + shortcut=shortcut, + name=conv_name)) + in_channels = w_out + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = w_out + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_0.w_0"), + bias_attr=ParamAttr(name="fc_0.b_0")) + + def forward(self, inputs): + y = self.conv(inputs) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def RegNetX_200MF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=36.44, + w_0=24, + w_m=2.49, + d=13, + group_w=8, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_200MF"], use_ssld=use_ssld) + return model + + +def RegNetX_400MF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=24.48, + w_0=24, + w_m=2.54, + d=22, + group_w=16, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_400MF"], use_ssld=use_ssld) + return model + + +def RegNetX_600MF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=36.97, + w_0=48, + w_m=2.24, + d=16, + group_w=24, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_600MF"], use_ssld=use_ssld) + return model + + +def RegNetX_800MF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=35.73, + w_0=56, + w_m=2.28, + d=16, + group_w=16, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_800MF"], use_ssld=use_ssld) + return model + + +def RegNetX_1600MF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=34.01, + w_0=80, + w_m=2.25, + d=18, + group_w=24, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_1600MF"], use_ssld=use_ssld) + return model + + +def RegNetX_3200MF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=26.31, + w_0=88, + w_m=2.25, + d=25, + group_w=48, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_3200MF"], use_ssld=use_ssld) + return model + + +def RegNetX_4GF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=38.65, + w_0=96, + w_m=2.43, + d=23, + group_w=40, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_4GF"], use_ssld=use_ssld) + return model + + +def RegNetX_6400MF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=60.83, + w_0=184, + w_m=2.07, + d=17, + group_w=56, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_6400MF"], use_ssld=use_ssld) + return model + + +def RegNetX_8GF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=49.56, + w_0=80, + w_m=2.88, + d=23, + group_w=120, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_8GF"], use_ssld=use_ssld) + return model + + +def RegNetX_12GF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=73.36, + w_0=168, + w_m=2.37, + d=19, + group_w=112, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_12GF"], use_ssld=use_ssld) + return model + + +def RegNetX_16GF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=55.59, + w_0=216, + w_m=2.1, + d=22, + group_w=128, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_16GF"], use_ssld=use_ssld) + return model + + +def RegNetX_32GF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=69.86, + w_0=320, + w_m=2.0, + d=23, + group_w=168, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_32GF"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/repvgg.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/repvgg.py new file mode 100755 
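# --------------------------------------------------------------------------
# [Editor's note] Illustrative sketch, not part of the original patch.
# The RegNetX_* constructors above differ only in the schedule parameters
# (w_a, w_0, w_m, d, group_w): generate_regnet() turns them into a quantized
# per-block width list, and get_stages_from_blocks()/adjust_ws_gs_comp()
# collapse runs of equal width into stages and round widths to the group
# width. The same arithmetic, restated with numpy for RegNetX_200MF
# (w_a=36.44, w_0=24, w_m=2.49, d=13, group_w=8, bot_mul=1.0):
import numpy as np

w_a, w_0, w_m, d, q, group_w = 36.44, 24, 2.49, 13, 8, 8
ws_cont = np.arange(d) * w_a + w_0                   # linear width ramp
ks = np.round(np.log(ws_cont / w_0) / np.log(w_m))   # snap to powers of w_m
ws = (np.round(w_0 * np.power(w_m, ks) / q) * q).astype(int)  # quantize to q
print(ws.tolist())  # 13 per-block widths, piecewise constant

# collapse runs of equal width into (stage width, stage depth, groups);
# with bot_mul=1.0 and widths already multiples of 8, adjust_ws_gs_comp
# reduces to min(group_w, width) per stage
ws_list = ws.tolist()
stage_ws = sorted(set(ws_list), key=ws_list.index)
stage_ds = [int((ws == w).sum()) for w in stage_ws]
stage_gs = [min(group_w, w) for w in stage_ws]
print(list(zip(stage_ws, stage_ds, stage_gs)))
# -> widths [24, 56, 152, 368] with depths [1, 1, 4, 7]
# --------------------------------------------------------------------------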
index 000000000..94309f578 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/repvgg.py @@ -0,0 +1,451 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/DingXiaoH/RepVGG +# reference: https://arxiv.org/abs/2101.03697 + +import paddle.nn as nn +import paddle +import paddle.nn.functional as F +import numpy as np + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "RepVGG_A0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_A0_pretrained.pdparams", + "RepVGG_A1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_A1_pretrained.pdparams", + "RepVGG_A2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_A2_pretrained.pdparams", + "RepVGG_B0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B0_pretrained.pdparams", + "RepVGG_B1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B1_pretrained.pdparams", + "RepVGG_B2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B2_pretrained.pdparams", + "RepVGG_B1g2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B1g2_pretrained.pdparams", + "RepVGG_B1g4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B1g4_pretrained.pdparams", + "RepVGG_B2g4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B2g4_pretrained.pdparams", + "RepVGG_B3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B3_pretrained.pdparams", + "RepVGG_B3g4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B3g4_pretrained.pdparams", + "RepVGG_D2se": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_D2se_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + +optional_groupwise_layers = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26] +g2_map = {l: 2 for l in optional_groupwise_layers} +g4_map = {l: 4 for l in optional_groupwise_layers} + + +class ConvBN(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1): + super(ConvBN, self).__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False) + self.bn = nn.BatchNorm2D(num_features=out_channels) + + def forward(self, x): + y = self.conv(x) + y = self.bn(y) + return y + + +class SEBlock(nn.Layer): + def __init__(self, input_channels, internal_neurons): + super(SEBlock, self).__init__() + self.down = nn.Conv2D( + in_channels=input_channels, + out_channels=internal_neurons, + kernel_size=1, + stride=1, + bias_attr=True) + self.up = nn.Conv2D( + in_channels=internal_neurons, + out_channels=input_channels, + kernel_size=1, + stride=1, + bias_attr=True) + self.input_channels = input_channels + + def forward(self, inputs): + x = F.avg_pool2d(inputs, kernel_size=inputs.shape[3]) + 
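+        # Global average pooling collapses `inputs` to [N, C, 1, 1]; note
+        # that kernel_size=inputs.shape[3] assumes square feature maps.
+        # The 1x1 down/up convs below form the squeeze-and-excitation
+        # bottleneck, and the sigmoid gate rescales each input channel.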
x = self.down(x) + x = F.relu(x) + x = self.up(x) + x = F.sigmoid(x) + x = x.reshape([-1, self.input_channels, 1, 1]) + return inputs * x + + +class RepVGGBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + use_se=False): + super(RepVGGBlock, self).__init__() + self.is_repped = False + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.padding_mode = padding_mode + + assert kernel_size == 3 + assert padding == 1 + + padding_11 = padding - kernel_size // 2 + + self.nonlinearity = nn.ReLU() + + if use_se: + self.se = SEBlock( + out_channels, internal_neurons=out_channels // 16) + else: + self.se = nn.Identity() + self.rbr_identity = nn.BatchNorm2D( + num_features=in_channels + ) if out_channels == in_channels and stride == 1 else None + self.rbr_dense = ConvBN( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups) + self.rbr_1x1 = ConvBN( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=padding_11, + groups=groups) + + def forward(self, inputs): + if self.is_repped: + return self.nonlinearity(self.rbr_reparam(inputs)) + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(inputs) + return self.nonlinearity( + self.se(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)) + + def re_parameterize(self): + if not hasattr(self, 'rbr_reparam'): + self.rbr_reparam = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + padding_mode=self.padding_mode) + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam.weight.set_value(kernel) + self.rbr_reparam.bias.set_value(bias) + self.is_repped = True + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + return kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + if isinstance(branch, ConvBN): + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + else: + assert isinstance(branch, nn.BatchNorm2D) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = np.zeros( + (self.in_channels, input_dim, 3, 3), dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = paddle.to_tensor(kernel_value) + kernel = self.id_tensor + running_mean = branch._mean + running_var = branch._variance + gamma = branch.weight + beta = branch.bias + eps = branch._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + +class RepVGG(nn.Layer): + def __init__(self, + 
num_blocks, + width_multiplier=None, + override_groups_map=None, + class_num=1000, + use_se=False): + super(RepVGG, self).__init__() + assert len(width_multiplier) == 4 + self.override_groups_map = override_groups_map or dict() + assert 0 not in self.override_groups_map + self.in_planes = min(64, int(64 * width_multiplier[0])) + + self.stage0 = RepVGGBlock( + in_channels=3, + out_channels=self.in_planes, + kernel_size=3, + stride=2, + padding=1, + use_se=use_se) + self.cur_layer_idx = 1 + self.stage1 = self._make_stage( + int(64 * width_multiplier[0]), + num_blocks[0], + stride=2, + use_se=use_se) + self.stage2 = self._make_stage( + int(128 * width_multiplier[1]), + num_blocks[1], + stride=2, + use_se=use_se) + self.stage3 = self._make_stage( + int(256 * width_multiplier[2]), + num_blocks[2], + stride=2, + use_se=use_se) + self.stage4 = self._make_stage( + int(512 * width_multiplier[3]), + num_blocks[3], + stride=2, + use_se=use_se) + self.gap = nn.AdaptiveAvgPool2D(output_size=1) + self.linear = nn.Linear(int(512 * width_multiplier[3]), class_num) + + def _make_stage(self, planes, num_blocks, stride, use_se=False): + strides = [stride] + [1] * (num_blocks - 1) + blocks = [] + for stride in strides: + cur_groups = self.override_groups_map.get(self.cur_layer_idx, 1) + blocks.append( + RepVGGBlock( + in_channels=self.in_planes, + out_channels=planes, + kernel_size=3, + stride=stride, + padding=1, + groups=cur_groups, + use_se=use_se)) + self.in_planes = planes + self.cur_layer_idx += 1 + return nn.Sequential(*blocks) + + def forward(self, x): + out = self.stage0(x) + out = self.stage1(out) + out = self.stage2(out) + out = self.stage3(out) + out = self.stage4(out) + out = self.gap(out) + out = paddle.flatten(out, start_axis=1) + out = self.linear(out) + return out + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def RepVGG_A0(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[2, 4, 14, 1], + width_multiplier=[0.75, 0.75, 0.75, 2.5], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_A0"], use_ssld=use_ssld) + return model + + +def RepVGG_A1(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[2, 4, 14, 1], + width_multiplier=[1, 1, 1, 2.5], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_A1"], use_ssld=use_ssld) + return model + + +def RepVGG_A2(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[2, 4, 14, 1], + width_multiplier=[1.5, 1.5, 1.5, 2.75], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_A2"], use_ssld=use_ssld) + return model + + +def RepVGG_B0(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[1, 1, 1, 2.5], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B0"], use_ssld=use_ssld) + return model + + +def RepVGG_B1(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[2, 2, 2, 4], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B1"], use_ssld=use_ssld) + return model + + +def RepVGG_B1g2(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[2, 2, 2, 4], + override_groups_map=g2_map, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B1g2"], use_ssld=use_ssld) + return model + + +def RepVGG_B1g4(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[2, 2, 2, 4], + override_groups_map=g4_map, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B1g4"], use_ssld=use_ssld) + return model + + +def RepVGG_B2(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[2.5, 2.5, 2.5, 5], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B2"], use_ssld=use_ssld) + return model + + +def RepVGG_B2g4(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[2.5, 2.5, 2.5, 5], + override_groups_map=g4_map, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B2g4"], use_ssld=use_ssld) + return model + + +def RepVGG_B3(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[3, 3, 3, 5], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B3"], use_ssld=use_ssld) + return model + + +def RepVGG_B3g4(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[3, 3, 3, 5], + override_groups_map=g4_map, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B3g4"], use_ssld=use_ssld) + return model + + +def RepVGG_D2se(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[8, 14, 24, 1], + width_multiplier=[2.5, 2.5, 2.5, 5], + override_groups_map=None, + use_se=True, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_D2se"], use_ssld=use_ssld) + return model diff --git 
a/example/low_precision_training/ppcls/arch/backbone/model_zoo/res2net.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/res2net.py new file mode 100755 index 000000000..33261fca7 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/res2net.py @@ -0,0 +1,266 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1904.01169 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "Res2Net50_26w_4s": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Res2Net50_26w_4s_pretrained.pdparams", + "Res2Net50_14w_8s": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Res2Net50_14w_8s_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels1, + num_channels2, + num_filters, + stride, + scales, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + self.stride = stride + self.scales = scales + self.conv0 = ConvBNLayer( + num_channels=num_channels1, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1_list = [] + for s in range(scales - 1): + conv1 = self.add_sublayer( + name + '_branch2b_' + str(s + 1), + ConvBNLayer( + num_channels=num_filters // scales, + num_filters=num_filters // scales, + filter_size=3, + stride=stride, + act='relu', + name=name + '_branch2b_' + str(s + 1))) + self.conv1_list.append(conv1) + self.pool2d_avg = AvgPool2D(kernel_size=3, stride=stride, padding=1) + + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_channels2, + filter_size=1, + act=None, + name=name + "_branch2c") + + 
if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels1, + num_filters=num_channels2, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + xs = paddle.split(y, self.scales, 1) + ys = [] + for s, conv1 in enumerate(self.conv1_list): + if s == 0 or self.stride == 2: + ys.append(conv1(xs[s])) + else: + ys.append(conv1(paddle.add(xs[s], ys[-1]))) + if self.stride == 1: + ys.append(xs[-1]) + else: + ys.append(self.pool2d_avg(xs[-1])) + conv1 = paddle.concat(ys, axis=1) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class Res2Net(nn.Layer): + def __init__(self, layers=50, scales=4, width=26, class_num=1000): + super(Res2Net, self).__init__() + + self.layers = layers + self.scales = scales + self.width = width + basic_width = self.width * self.scales + supported_layers = [50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024] + num_channels2 = [256, 512, 1024, 2048] + num_filters = [basic_width * t for t in [1, 2, 4, 8]] + + self.conv1 = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu', + name="conv1") + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels1=num_channels[block] + if i == 0 else num_channels2[block], + num_channels2=num_channels2[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + scales=scales, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + y = self.conv1(inputs) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def Res2Net50_26w_4s(pretrained=False, use_ssld=False, **kwargs): + model = Res2Net(layers=50, scales=4, width=26, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Res2Net50_26w_4s"], use_ssld=use_ssld) + return model + + +def Res2Net50_14w_8s(pretrained=False, use_ssld=False, **kwargs): + model = Res2Net(layers=50, scales=8, width=14, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Res2Net50_14w_8s"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/res2net_vd.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/res2net_vd.py new file mode 100755 index 000000000..206d06028 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/res2net_vd.py @@ -0,0 +1,308 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1904.01169 & https://arxiv.org/abs/1812.01187 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "Res2Net50_vd_26w_4s": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Res2Net50_vd_26w_4s_pretrained.pdparams", + "Res2Net101_vd_26w_4s": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Res2Net101_vd_26w_4s_pretrained.pdparams", + "Res2Net200_vd_26w_4s": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Res2Net200_vd_26w_4s_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels1, + 
num_channels2, + num_filters, + stride, + scales, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + self.stride = stride + self.scales = scales + self.conv0 = ConvBNLayer( + num_channels=num_channels1, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1_list = [] + for s in range(scales - 1): + conv1 = self.add_sublayer( + name + '_branch2b_' + str(s + 1), + ConvBNLayer( + num_channels=num_filters // scales, + num_filters=num_filters // scales, + filter_size=3, + stride=stride, + act='relu', + name=name + '_branch2b_' + str(s + 1))) + self.conv1_list.append(conv1) + self.pool2d_avg = AvgPool2D(kernel_size=3, stride=stride, padding=1) + + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_channels2, + filter_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels1, + num_filters=num_channels2, + filter_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + xs = paddle.split(y, self.scales, 1) + ys = [] + for s, conv1 in enumerate(self.conv1_list): + if s == 0 or self.stride == 2: + ys.append(conv1(xs[s])) + else: + ys.append(conv1(xs[s] + ys[-1])) + if self.stride == 1: + ys.append(xs[-1]) + else: + ys.append(self.pool2d_avg(xs[-1])) + conv1 = paddle.concat(ys, axis=1) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class Res2Net_vd(nn.Layer): + def __init__(self, layers=50, scales=4, width=26, class_num=1000, + **kwargs): + super(Res2Net_vd, self).__init__() + + self.layers = layers + self.scales = scales + self.width = width + basic_width = self.width * self.scales + supported_layers = [50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024] + num_channels2 = [256, 512, 1024, 2048] + num_filters = [basic_width * t for t in [1, 2, 4, 8]] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152, 200] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels1=num_channels[block] + if i == 0 else num_channels2[block], + num_channels2=num_channels2[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + scales=scales, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + 
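+                # add_sublayer registers the block's parameters under this
+                # layer; keeping the returned module in block_list as well
+                # lets forward() run the blocks in order. Only the first
+                # block of a stage needs a projection shortcut, so shortcut
+                # flips to True once that block has been built.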
self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def Res2Net50_vd_26w_4s(pretrained=False, use_ssld=False, **kwargs): + model = Res2Net_vd(layers=50, scales=4, width=26, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["Res2Net50_vd_26w_4s"], + use_ssld=use_ssld) + return model + + +def Res2Net101_vd_26w_4s(pretrained=False, use_ssld=False, **kwargs): + model = Res2Net_vd(layers=101, scales=4, width=26, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["Res2Net101_vd_26w_4s"], + use_ssld=use_ssld) + return model + + +def Res2Net200_vd_26w_4s(pretrained=False, use_ssld=False, **kwargs): + model = Res2Net_vd(layers=200, scales=4, width=26, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["Res2Net200_vd_26w_4s"], + use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/resnest.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/resnest.py new file mode 100755 index 000000000..171aa1f94 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/resnest.py @@ -0,0 +1,780 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
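# --------------------------------------------------------------------------
# [Editor's note] Illustrative sketch, not part of the original patch.
# The Res2Net bottlenecks added above split the 1x1 output into `scales`
# channel groups: group 0 goes through a 3x3 conv, each later group is added
# to the previous group's output before its own 3x3 conv, and the last group
# passes through untouched (stride 1) or average-pooled (stride 2). The
# stride-1 wiring, reduced to plain paddle ops with identity "convs":
import paddle

scales = 4
y = paddle.randn([2, 104, 14, 14])        # e.g. width=26 -> 26*4 channels
xs = paddle.split(y, scales, axis=1)      # four 26-channel groups
ys = []
for s in range(scales - 1):
    inp = xs[s] if s == 0 else xs[s] + ys[-1]  # hierarchical residual feed
    ys.append(inp)                             # stand-in for conv1_list[s]
ys.append(xs[-1])                              # stride-1 path keeps group 3
out = paddle.concat(ys, axis=1)                # back to 104 channels
print(out.shape)                               # [2, 104, 14, 14]
# --------------------------------------------------------------------------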
+ +# Code was based on https://github.com/zhanghang1989/ResNeSt +# reference: https://arxiv.org/abs/2004.08955 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +import math +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import KaimingNormal +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.regularizer import L2Decay + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ResNeSt50_fast_1s1x64d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeSt50_fast_1s1x64d_pretrained.pdparams", + "ResNeSt50": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeSt50_pretrained.pdparams", + "ResNeSt101": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeSt101_pretrained.pdparams", + "ResNeSt200": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeSt200_pretrained.pdparams", + "ResNeSt269": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeSt269_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + dilation=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + bn_decay = 0.0 + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + dilation=dilation, + groups=groups, + weight_attr=ParamAttr(name=name + "_weight"), + bias_attr=False) + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr( + name=name + "_scale", regularizer=L2Decay(bn_decay)), + bias_attr=ParamAttr( + name + "_offset", regularizer=L2Decay(bn_decay)), + moving_mean_name=name + "_mean", + moving_variance_name=name + "_variance") + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + return x + + +class rSoftmax(nn.Layer): + def __init__(self, radix, cardinality): + super(rSoftmax, self).__init__() + self.radix = radix + self.cardinality = cardinality + + def forward(self, x): + cardinality = self.cardinality + radix = self.radix + + batch, r, h, w = x.shape + if self.radix > 1: + x = paddle.reshape( + x=x, + shape=[ + batch, cardinality, radix, + int(r * h * w / cardinality / radix) + ]) + x = paddle.transpose(x=x, perm=[0, 2, 1, 3]) + x = nn.functional.softmax(x, axis=1) + x = paddle.reshape(x=x, shape=[batch, r * h * w, 1, 1]) + else: + x = nn.functional.sigmoid(x) + return x + + +class SplatConv(nn.Layer): + def __init__(self, + in_channels, + channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + radix=2, + reduction_factor=4, + rectify_avg=False, + name=None): + super(SplatConv, self).__init__() + + self.radix = radix + + self.conv1 = ConvBNLayer( + num_channels=in_channels, + num_filters=channels * radix, + filter_size=kernel_size, + stride=stride, + groups=groups * radix, + act="relu", + name=name + "_1_weights") + + self.avg_pool2d = AdaptiveAvgPool2D(1) + + inter_channels = int(max(in_channels * radix // reduction_factor, 32)) + + # to calc gap + self.conv2 = ConvBNLayer( + num_channels=channels, + num_filters=inter_channels, + filter_size=1, + stride=1, + groups=groups, + act="relu", + name=name + "_2_weights") + + # to calc atten + self.conv3 = Conv2D( 
+ in_channels=inter_channels, + out_channels=channels * radix, + kernel_size=1, + stride=1, + padding=0, + groups=groups, + weight_attr=ParamAttr( + name=name + "_weights", initializer=KaimingNormal())) + + self.rsoftmax = rSoftmax(radix=radix, cardinality=groups) + + def forward(self, x): + x = self.conv1(x) + + if self.radix > 1: + splited = paddle.split(x, num_or_sections=self.radix, axis=1) + gap = paddle.add_n(splited) + else: + gap = x + + gap = self.avg_pool2d(gap) + gap = self.conv2(gap) + + atten = self.conv3(gap) + atten = self.rsoftmax(atten) + + if self.radix > 1: + attens = paddle.split(atten, num_or_sections=self.radix, axis=1) + y = paddle.add_n([ + paddle.multiply(split, att) + for (att, split) in zip(attens, splited) + ]) + else: + y = paddle.multiply(x, atten) + + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + inplanes, + planes, + stride=1, + radix=1, + cardinality=1, + bottleneck_width=64, + avd=False, + avd_first=False, + dilation=1, + is_first=False, + rectify_avg=False, + last_gamma=False, + avg_down=False, + name=None): + super(BottleneckBlock, self).__init__() + self.inplanes = inplanes + self.planes = planes + self.stride = stride + self.radix = radix + self.cardinality = cardinality + self.avd = avd + self.avd_first = avd_first + self.dilation = dilation + self.is_first = is_first + self.rectify_avg = rectify_avg + self.last_gamma = last_gamma + self.avg_down = avg_down + + group_width = int(planes * (bottleneck_width / 64.)) * cardinality + + self.conv1 = ConvBNLayer( + num_channels=self.inplanes, + num_filters=group_width, + filter_size=1, + stride=1, + groups=1, + act="relu", + name=name + "_conv1") + + if avd and avd_first and (stride > 1 or is_first): + self.avg_pool2d_1 = AvgPool2D( + kernel_size=3, stride=stride, padding=1) + + if radix >= 1: + self.conv2 = SplatConv( + in_channels=group_width, + channels=group_width, + kernel_size=3, + stride=1, + padding=dilation, + dilation=dilation, + groups=cardinality, + bias=False, + radix=radix, + rectify_avg=rectify_avg, + name=name + "_splat") + else: + self.conv2 = ConvBNLayer( + num_channels=group_width, + num_filters=group_width, + filter_size=3, + stride=1, + dilation=dilation, + groups=cardinality, + act="relu", + name=name + "_conv2") + + if avd and avd_first == False and (stride > 1 or is_first): + self.avg_pool2d_2 = AvgPool2D( + kernel_size=3, stride=stride, padding=1) + + self.conv3 = ConvBNLayer( + num_channels=group_width, + num_filters=planes * 4, + filter_size=1, + stride=1, + groups=1, + act=None, + name=name + "_conv3") + + if stride != 1 or self.inplanes != self.planes * 4: + if avg_down: + if dilation == 1: + self.avg_pool2d_3 = AvgPool2D( + kernel_size=stride, stride=stride, padding=0) + else: + self.avg_pool2d_3 = AvgPool2D( + kernel_size=1, stride=1, padding=0, ceil_mode=True) + + self.conv4 = Conv2D( + in_channels=self.inplanes, + out_channels=planes * 4, + kernel_size=1, + stride=1, + padding=0, + groups=1, + weight_attr=ParamAttr( + name=name + "_weights", initializer=KaimingNormal()), + bias_attr=False) + else: + self.conv4 = Conv2D( + in_channels=self.inplanes, + out_channels=planes * 4, + kernel_size=1, + stride=stride, + padding=0, + groups=1, + weight_attr=ParamAttr( + name=name + "_shortcut_weights", + initializer=KaimingNormal()), + bias_attr=False) + + bn_decay = 0.0 + self._batch_norm = BatchNorm( + planes * 4, + act=None, + param_attr=ParamAttr( + name=name + "_shortcut_scale", + regularizer=L2Decay(bn_decay)), + bias_attr=ParamAttr( + name + 
"_shortcut_offset", regularizer=L2Decay(bn_decay)), + moving_mean_name=name + "_shortcut_mean", + moving_variance_name=name + "_shortcut_variance") + + def forward(self, x): + short = x + + x = self.conv1(x) + if self.avd and self.avd_first and (self.stride > 1 or self.is_first): + x = self.avg_pool2d_1(x) + + x = self.conv2(x) + + if self.avd and self.avd_first == False and (self.stride > 1 or + self.is_first): + x = self.avg_pool2d_2(x) + + x = self.conv3(x) + + if self.stride != 1 or self.inplanes != self.planes * 4: + if self.avg_down: + short = self.avg_pool2d_3(short) + + short = self.conv4(short) + + short = self._batch_norm(short) + + y = paddle.add(x=short, y=x) + y = F.relu(y) + return y + + +class ResNeStLayer(nn.Layer): + def __init__(self, + inplanes, + planes, + blocks, + radix, + cardinality, + bottleneck_width, + avg_down, + avd, + avd_first, + rectify_avg, + last_gamma, + stride=1, + dilation=1, + is_first=True, + name=None): + super(ResNeStLayer, self).__init__() + self.inplanes = inplanes + self.planes = planes + self.blocks = blocks + self.radix = radix + self.cardinality = cardinality + self.bottleneck_width = bottleneck_width + self.avg_down = avg_down + self.avd = avd + self.avd_first = avd_first + self.rectify_avg = rectify_avg + self.last_gamma = last_gamma + self.is_first = is_first + + if dilation == 1 or dilation == 2: + bottleneck_func = self.add_sublayer( + name + "_bottleneck_0", + BottleneckBlock( + inplanes=self.inplanes, + planes=planes, + stride=stride, + radix=radix, + cardinality=cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + dilation=1, + is_first=is_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + name=name + "_bottleneck_0")) + elif dilation == 4: + bottleneck_func = self.add_sublayer( + name + "_bottleneck_0", + BottleneckBlock( + inplanes=self.inplanes, + planes=planes, + stride=stride, + radix=radix, + cardinality=cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + dilation=2, + is_first=is_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + name=name + "_bottleneck_0")) + else: + raise RuntimeError("=>unknown dilation size") + + self.inplanes = planes * 4 + self.bottleneck_block_list = [bottleneck_func] + for i in range(1, blocks): + curr_name = name + "_bottleneck_" + str(i) + + bottleneck_func = self.add_sublayer( + curr_name, + BottleneckBlock( + inplanes=self.inplanes, + planes=planes, + radix=radix, + cardinality=cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + dilation=dilation, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + name=curr_name)) + self.bottleneck_block_list.append(bottleneck_func) + + def forward(self, x): + for bottleneck_block in self.bottleneck_block_list: + x = bottleneck_block(x) + return x + + +class ResNeSt(nn.Layer): + def __init__(self, + layers, + radix=1, + groups=1, + bottleneck_width=64, + dilated=False, + dilation=1, + deep_stem=False, + stem_width=64, + avg_down=False, + rectify_avg=False, + avd=False, + avd_first=False, + final_drop=0.0, + last_gamma=False, + class_num=1000): + super(ResNeSt, self).__init__() + + self.cardinality = groups + self.bottleneck_width = bottleneck_width + # ResNet-D params + self.inplanes = stem_width * 2 if deep_stem else 64 + self.avg_down = avg_down + self.last_gamma = last_gamma + # ResNeSt params + self.radix = radix + self.avd = avd + self.avd_first = avd_first + + 
self.deep_stem = deep_stem + self.stem_width = stem_width + self.layers = layers + self.final_drop = final_drop + self.dilated = dilated + self.dilation = dilation + + self.rectify_avg = rectify_avg + + if self.deep_stem: + self.stem = nn.Sequential( + ("conv1", ConvBNLayer( + num_channels=3, + num_filters=stem_width, + filter_size=3, + stride=2, + act="relu", + name="conv1")), ("conv2", ConvBNLayer( + num_channels=stem_width, + num_filters=stem_width, + filter_size=3, + stride=1, + act="relu", + name="conv2")), ("conv3", ConvBNLayer( + num_channels=stem_width, + num_filters=stem_width * 2, + filter_size=3, + stride=1, + act="relu", + name="conv3"))) + else: + self.stem = ConvBNLayer( + num_channels=3, + num_filters=stem_width, + filter_size=7, + stride=2, + act="relu", + name="conv1") + + self.max_pool2d = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.layer1 = ResNeStLayer( + inplanes=self.stem_width * 2 + if self.deep_stem else self.stem_width, + planes=64, + blocks=self.layers[0], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=1, + dilation=1, + is_first=False, + name="layer1") + + # return + + self.layer2 = ResNeStLayer( + inplanes=256, + planes=128, + blocks=self.layers[1], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=2, + name="layer2") + + if self.dilated or self.dilation == 4: + self.layer3 = ResNeStLayer( + inplanes=512, + planes=256, + blocks=self.layers[2], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=1, + dilation=2, + name="layer3") + self.layer4 = ResNeStLayer( + inplanes=1024, + planes=512, + blocks=self.layers[3], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=1, + dilation=4, + name="layer4") + elif self.dilation == 2: + self.layer3 = ResNeStLayer( + inplanes=512, + planes=256, + blocks=self.layers[2], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=2, + dilation=1, + name="layer3") + self.layer4 = ResNeStLayer( + inplanes=1024, + planes=512, + blocks=self.layers[3], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=1, + dilation=2, + name="layer4") + else: + self.layer3 = ResNeStLayer( + inplanes=512, + planes=256, + blocks=self.layers[2], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=2, + name="layer3") + self.layer4 = ResNeStLayer( + inplanes=1024, + planes=512, + blocks=self.layers[3], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + 
last_gamma=last_gamma, + stride=2, + name="layer4") + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.out_channels = 2048 + + stdv = 1.0 / math.sqrt(self.out_channels * 1.0) + + self.out = Linear( + self.out_channels, + class_num, + weight_attr=ParamAttr( + initializer=nn.initializer.Uniform(-stdv, stdv), + name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, x): + x = self.stem(x) + x = self.max_pool2d(x) + x = self.layer1(x) + x = self.layer2(x) + + x = self.layer3(x) + + x = self.layer4(x) + x = self.pool2d_avg(x) + x = paddle.reshape(x, shape=[-1, self.out_channels]) + x = self.out(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNeSt50_fast_1s1x64d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeSt( + layers=[3, 4, 6, 3], + radix=1, + groups=1, + bottleneck_width=64, + deep_stem=True, + stem_width=32, + avg_down=True, + avd=True, + avd_first=True, + final_drop=0.0, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeSt50_fast_1s1x64d"], + use_ssld=use_ssld) + return model + + +def ResNeSt50(pretrained=False, use_ssld=False, **kwargs): + model = ResNeSt( + layers=[3, 4, 6, 3], + radix=2, + groups=1, + bottleneck_width=64, + deep_stem=True, + stem_width=32, + avg_down=True, + avd=True, + avd_first=False, + final_drop=0.0, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeSt50"], use_ssld=use_ssld) + return model + + +def ResNeSt101(pretrained=False, use_ssld=False, **kwargs): + model = ResNeSt( + layers=[3, 4, 23, 3], + radix=2, + groups=1, + bottleneck_width=64, + deep_stem=True, + stem_width=64, + avg_down=True, + avd=True, + avd_first=False, + final_drop=0.0, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeSt101"], use_ssld=use_ssld) + return model + + +def ResNeSt200(pretrained=False, use_ssld=False, **kwargs): + model = ResNeSt( + layers=[3, 24, 36, 3], + radix=2, + groups=1, + bottleneck_width=64, + deep_stem=True, + stem_width=64, + avg_down=True, + avd=True, + avd_first=False, + final_drop=0.0, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeSt200"], use_ssld=use_ssld) + return model + + +def ResNeSt269(pretrained=False, use_ssld=False, **kwargs): + model = ResNeSt( + layers=[3, 30, 48, 8], + radix=2, + groups=1, + bottleneck_width=64, + deep_stem=True, + stem_width=64, + avg_down=True, + avd=True, + avd_first=False, + final_drop=0.0, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeSt269"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/resnet_vc.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/resnet_vc.py new file mode 100755 index 000000000..ba44a2ce0 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/resnet_vc.py @@ -0,0 +1,311 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1812.01187 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ResNet50_vc": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vc_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + name=name + "_branch2b") + + if 
not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +class ResNet_vc(nn.Layer): + def __init__(self, layers=50, class_num=1000): + super(ResNet_vc, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_0.w_0"), + bias_attr=ParamAttr(name="fc_0.b_0")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise 
RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNet50_vc(pretrained=False, use_ssld=False, **kwargs): + model = ResNet_vc(layers=50, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNet50_vc"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/resnext.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/resnext.py new file mode 100755 index 000000000..53f65b631 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/resnext.py @@ -0,0 +1,303 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1611.05431 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ResNeXt50_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_32x4d_pretrained.pdparams", + "ResNeXt50_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_64x4d_pretrained.pdparams", + "ResNeXt101_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x4d_pretrained.pdparams", + "ResNeXt101_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_64x4d_pretrained.pdparams", + "ResNeXt152_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_32x4d_pretrained.pdparams", + "ResNeXt152_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_64x4d_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None, + data_format="NCHW"): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False, + data_format=data_format) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance', + data_layout=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + 
num_filters, + stride, + cardinality, + shortcut=True, + name=None, + data_format="NCHW"): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a", + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + groups=cardinality, + stride=stride, + act='relu', + name=name + "_branch2b", + data_format=data_format) + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + filter_size=1, + act=None, + name=name + "_branch2c", + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 2 + if cardinality == 32 else num_filters, + filter_size=1, + stride=stride, + name=name + "_branch1", + data_format=data_format) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class ResNeXt(nn.Layer): + def __init__(self, + layers=50, + class_num=1000, + cardinality=32, + input_image_channel=3, + data_format="NCHW"): + super(ResNeXt, self).__init__() + + self.layers = layers + self.data_format = data_format + self.input_image_channel = input_image_channel + self.cardinality = cardinality + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + supported_cardinality = [32, 64] + assert cardinality in supported_cardinality, \ + "supported cardinality is {} but input cardinality is {}" \ + .format(supported_cardinality, cardinality) + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, 1024] + num_filters = [128, 256, 512, + 1024] if cardinality == 32 else [256, 512, 1024, 2048] + + self.conv = ConvBNLayer( + num_channels=self.input_image_channel, + num_filters=64, + filter_size=7, + stride=2, + act='relu', + name="res_conv1", + data_format=self.data_format) + self.pool2d_max = MaxPool2D( + kernel_size=3, stride=2, padding=1, data_format=self.data_format) + + self.block_list = [] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] if i == 0 else + num_filters[block] * int(64 // self.cardinality), + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=self.cardinality, + shortcut=shortcut, + name=conv_name, + data_format=self.data_format)) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1, data_format=self.data_format) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + 
bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + with paddle.static.amp.fp16_guard(): + return self._forward(inputs) + + def _forward(self, inputs): + if self.data_format == "NHWC": + inputs = paddle.tensor.transpose(inputs, [0, 2, 3, 1]) + inputs.stop_gradient = True + y = self.conv(inputs) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNeXt50_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=32, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt50_32x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt50_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=64, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt50_64x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt101_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=101, cardinality=32, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt101_32x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt101_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=101, cardinality=64, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt101_64x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt152_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=32, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt152_32x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt152_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=64, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt152_64x4d"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/resnext101_wsl.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/resnext101_wsl.py new file mode 100755 index 000000000..a4478e70a --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/resnext101_wsl.py @@ -0,0 +1,506 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# reference: https://arxiv.org/abs/1805.00932 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ResNeXt101_32x8d_wsl": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x8d_wsl_pretrained.pdparams", + "ResNeXt101_32x16d_wsl": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x16_wsl_pretrained.pdparams", + "ResNeXt101_32x32d_wsl": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x32d_wsl_pretrained.pdparams", + "ResNeXt101_32x48d_wsl": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x48d_wsl_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + if "downsample" in name: + conv_name = name + ".0" + else: + conv_name = name + self._conv = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=conv_name + ".weight"), + bias_attr=False) + if "downsample" in name: + bn_name = name[:9] + "downsample.1" + else: + if "conv1" == name: + bn_name = "bn" + name[-1] + else: + bn_name = (name[:10] if name[7:9].isdigit() else name[:9] + ) + "bn" + name[-1] + self._bn = BatchNorm( + num_channels=output_channels, + act=act, + param_attr=ParamAttr(name=bn_name + ".weight"), + bias_attr=ParamAttr(name=bn_name + ".bias"), + moving_mean_name=bn_name + ".running_mean", + moving_variance_name=bn_name + ".running_var") + + def forward(self, inputs): + x = self._conv(inputs) + x = self._bn(x) + return x + + +class ShortCut(nn.Layer): + def __init__(self, input_channels, output_channels, stride, name=None): + super(ShortCut, self).__init__() + + self.input_channels = input_channels + self.output_channels = output_channels + self.stride = stride + if input_channels != output_channels or stride != 1: + self._conv = ConvBNLayer( + input_channels, + output_channels, + filter_size=1, + stride=stride, + name=name) + + def forward(self, inputs): + if self.input_channels != self.output_channels or self.stride != 1: + return self._conv(inputs) + return inputs + + +class BottleneckBlock(nn.Layer): + def __init__(self, input_channels, output_channels, stride, cardinality, + width, name): + super(BottleneckBlock, self).__init__() + + self._conv0 = ConvBNLayer( + input_channels, + output_channels, + filter_size=1, + act="relu", + name=name + ".conv1") + self._conv1 = ConvBNLayer( + output_channels, + output_channels, + filter_size=3, + act="relu", + stride=stride, + groups=cardinality, + name=name + ".conv2") + self._conv2 = ConvBNLayer( + output_channels, + output_channels // (width // 8), + filter_size=1, + act=None, + name=name + ".conv3") + self._short = ShortCut( + input_channels, + output_channels // (width // 8), + stride=stride, + name=name + ".downsample") + + def forward(self, inputs): + x = self._conv0(inputs) + x = self._conv1(x) + x = self._conv2(x) + y = self._short(inputs) + y = paddle.add(x, y) + y = F.relu(y) + return y + + +class ResNeXt101WSL(nn.Layer): + def __init__(self, 
layers=101, cardinality=32, width=48, class_num=1000): + super(ResNeXt101WSL, self).__init__() + + self.class_num = class_num + + self.layers = layers + self.cardinality = cardinality + self.width = width + self.scale = width // 8 + + self.depth = [3, 4, 23, 3] + self.base_width = cardinality * width + num_filters = [self.base_width * i + for i in [1, 2, 4, 8]] # [256, 512, 1024, 2048] + self._conv_stem = ConvBNLayer( + 3, 64, 7, stride=2, act="relu", name="conv1") + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self._conv1_0 = BottleneckBlock( + 64, + num_filters[0], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer1.0") + self._conv1_1 = BottleneckBlock( + num_filters[0] // (width // 8), + num_filters[0], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer1.1") + self._conv1_2 = BottleneckBlock( + num_filters[0] // (width // 8), + num_filters[0], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer1.2") + + self._conv2_0 = BottleneckBlock( + num_filters[0] // (width // 8), + num_filters[1], + stride=2, + cardinality=self.cardinality, + width=self.width, + name="layer2.0") + self._conv2_1 = BottleneckBlock( + num_filters[1] // (width // 8), + num_filters[1], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer2.1") + self._conv2_2 = BottleneckBlock( + num_filters[1] // (width // 8), + num_filters[1], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer2.2") + self._conv2_3 = BottleneckBlock( + num_filters[1] // (width // 8), + num_filters[1], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer2.3") + + self._conv3_0 = BottleneckBlock( + num_filters[1] // (width // 8), + num_filters[2], + stride=2, + cardinality=self.cardinality, + width=self.width, + name="layer3.0") + self._conv3_1 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.1") + self._conv3_2 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.2") + self._conv3_3 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.3") + self._conv3_4 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.4") + self._conv3_5 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.5") + self._conv3_6 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.6") + self._conv3_7 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.7") + self._conv3_8 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.8") + self._conv3_9 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.9") + self._conv3_10 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + 
name="layer3.10") + self._conv3_11 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.11") + self._conv3_12 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.12") + self._conv3_13 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.13") + self._conv3_14 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.14") + self._conv3_15 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.15") + self._conv3_16 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.16") + self._conv3_17 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.17") + self._conv3_18 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.18") + self._conv3_19 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.19") + self._conv3_20 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.20") + self._conv3_21 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.21") + self._conv3_22 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.22") + + self._conv4_0 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[3], + stride=2, + cardinality=self.cardinality, + width=self.width, + name="layer4.0") + self._conv4_1 = BottleneckBlock( + num_filters[3] // (width // 8), + num_filters[3], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer4.1") + self._conv4_2 = BottleneckBlock( + num_filters[3] // (width // 8), + num_filters[3], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer4.2") + + self._avg_pool = AdaptiveAvgPool2D(1) + self._out = Linear( + num_filters[3] // (width // 8), + class_num, + weight_attr=ParamAttr(name="fc.weight"), + bias_attr=ParamAttr(name="fc.bias")) + + def forward(self, inputs): + x = self._conv_stem(inputs) + x = self._pool(x) + + x = self._conv1_0(x) + x = self._conv1_1(x) + x = self._conv1_2(x) + + x = self._conv2_0(x) + x = self._conv2_1(x) + x = self._conv2_2(x) + x = self._conv2_3(x) + + x = self._conv3_0(x) + x = self._conv3_1(x) + x = self._conv3_2(x) + x = self._conv3_3(x) + x = self._conv3_4(x) + x = self._conv3_5(x) + x = self._conv3_6(x) + x = self._conv3_7(x) + x = self._conv3_8(x) + x = self._conv3_9(x) + x = self._conv3_10(x) + x = self._conv3_11(x) + x = self._conv3_12(x) + x = self._conv3_13(x) + x = self._conv3_14(x) + x = self._conv3_15(x) + x = self._conv3_16(x) + x = self._conv3_17(x) + x = self._conv3_18(x) + x = self._conv3_19(x) + x = self._conv3_20(x) + x = self._conv3_21(x) + 
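# stage 3 chains 23 bottleneck blocks (layer3.0-layer3.22), matching
+        # the ResNeXt-101 depth [3, 4, 23, 3]; the blocks are spelled out one
+        # by one, presumably so sublayer names track the parameter names of
+        # the released WSL checkpoints
+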
x = self._conv3_22(x) + + x = self._conv4_0(x) + x = self._conv4_1(x) + x = self._conv4_2(x) + + x = self._avg_pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self._out(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNeXt101_32x8d_wsl(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt101WSL(cardinality=32, width=8, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_32x8d_wsl"], + use_ssld=use_ssld) + return model + + +def ResNeXt101_32x16d_wsl(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt101WSL(cardinality=32, width=16, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_32x16d_wsl"], + use_ssld=use_ssld) + return model + + +def ResNeXt101_32x32d_wsl(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt101WSL(cardinality=32, width=32, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_32x32d_wsl"], + use_ssld=use_ssld) + return model + + +def ResNeXt101_32x48d_wsl(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt101WSL(cardinality=32, width=48, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_32x48d_wsl"], + use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/resnext_vd.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/resnext_vd.py new file mode 100755 index 000000000..b2c7d7777 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/resnext_vd.py @@ -0,0 +1,319 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# reference: https://arxiv.org/abs/1611.05431 & https://arxiv.org/abs/1812.01187 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ResNeXt50_vd_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_vd_32x4d_pretrained.pdparams", + "ResNeXt50_vd_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_vd_64x4d_pretrained.pdparams", + "ResNeXt101_vd_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_vd_32x4d_pretrained.pdparams", + "ResNeXt101_vd_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_vd_64x4d_pretrained.pdparams", + "ResNeXt152_vd_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_vd_32x4d_pretrained.pdparams", + "ResNeXt152_vd_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_vd_64x4d_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + cardinality, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + groups=cardinality, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + filter_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 2 + if cardinality == 32 else num_filters, + filter_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = 
inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class ResNeXt(nn.Layer): + def __init__(self, layers=50, class_num=1000, cardinality=32): + super(ResNeXt, self).__init__() + + self.layers = layers + self.cardinality = cardinality + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + supported_cardinality = [32, 64] + assert cardinality in supported_cardinality, \ + "supported cardinality is {} but input cardinality is {}" \ + .format(supported_cardinality, cardinality) + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, 1024] + num_filters = [128, 256, 512, + 1024] if cardinality == 32 else [256, 512, 1024, 2048] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] if i == 0 else + num_filters[block] * int(64 // self.cardinality), + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=self.cardinality, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def ResNeXt50_vd_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=32, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt50_vd_32x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt50_vd_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=64, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt50_vd_64x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt101_vd_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=101, cardinality=32, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_vd_32x4d"], + use_ssld=use_ssld) + return model + + +def ResNeXt101_vd_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=101, cardinality=64, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_vd_64x4d"], + use_ssld=use_ssld) + return model + + +def ResNeXt152_vd_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=32, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt152_vd_32x4d"], + use_ssld=use_ssld) + return model + + +def ResNeXt152_vd_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=64, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt152_vd_64x4d"], + use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/rexnet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/rexnet.py new file mode 100755 index 000000000..098fb2876 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/rexnet.py @@ -0,0 +1,283 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# reference: https://arxiv.org/abs/2007.00992 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from math import ceil + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ReXNet_1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ReXNet_1_0_pretrained.pdparams", + "ReXNet_1_3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ReXNet_1_3_pretrained.pdparams", + "ReXNet_1_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ReXNet_1_5_pretrained.pdparams", + "ReXNet_2_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ReXNet_2_0_pretrained.pdparams", + "ReXNet_3_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ReXNet_3_0_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +def conv_bn_act(out, + in_channels, + channels, + kernel=1, + stride=1, + pad=0, + num_group=1, + active=True, + relu6=False): + out.append( + nn.Conv2D( + in_channels, + channels, + kernel, + stride, + pad, + groups=num_group, + bias_attr=False)) + out.append(nn.BatchNorm2D(channels)) + if active: + out.append(nn.ReLU6() if relu6 else nn.ReLU()) + + +def conv_bn_swish(out, + in_channels, + channels, + kernel=1, + stride=1, + pad=0, + num_group=1): + out.append( + nn.Conv2D( + in_channels, + channels, + kernel, + stride, + pad, + groups=num_group, + bias_attr=False)) + out.append(nn.BatchNorm2D(channels)) + out.append(nn.Swish()) + + +class SE(nn.Layer): + def __init__(self, in_channels, channels, se_ratio=12): + super(SE, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2D(1) + self.fc = nn.Sequential( + nn.Conv2D( + in_channels, channels // se_ratio, kernel_size=1, padding=0), + nn.BatchNorm2D(channels // se_ratio), + nn.ReLU(), + nn.Conv2D( + channels // se_ratio, channels, kernel_size=1, padding=0), + nn.Sigmoid()) + + def forward(self, x): + y = self.avg_pool(x) + y = self.fc(y) + return x * y + + +class LinearBottleneck(nn.Layer): + def __init__(self, + in_channels, + channels, + t, + stride, + use_se=True, + se_ratio=12, + **kwargs): + super(LinearBottleneck, self).__init__(**kwargs) + self.use_shortcut = stride == 1 and in_channels <= channels + self.in_channels = in_channels + self.out_channels = channels + + out = [] + if t != 1: + dw_channels = in_channels * t + conv_bn_swish(out, in_channels=in_channels, channels=dw_channels) + else: + dw_channels = in_channels + + conv_bn_act( + out, + in_channels=dw_channels, + channels=dw_channels, + kernel=3, + stride=stride, + pad=1, + num_group=dw_channels, + active=False) + + if use_se: + out.append(SE(dw_channels, dw_channels, se_ratio)) + + out.append(nn.ReLU6()) + conv_bn_act( + out, + in_channels=dw_channels, + channels=channels, + active=False, + relu6=True) + self.out = nn.Sequential(*out) + + def forward(self, x): + out = self.out(x) + if self.use_shortcut: + out[:, 0:self.in_channels] += x + + return out + + +class ReXNetV1(nn.Layer): + def __init__(self, + input_ch=16, + final_ch=180, + width_mult=1.0, + depth_mult=1.0, + class_num=1000, + use_se=True, + se_ratio=12, + dropout_ratio=0.2, + bn_momentum=0.9): + super(ReXNetV1, self).__init__() + + layers = [1, 2, 2, 3, 3, 5] + strides = [1, 2, 2, 2, 1, 2] + use_ses = [False, False, True, True, True, True] + + layers = [ceil(element * depth_mult) for element in layers] + strides = sum([[element] + [1] * (layers[idx] - 1) + for idx, element in 
enumerate(strides)], []) + if use_se: + use_ses = sum([[element] * layers[idx] + for idx, element in enumerate(use_ses)], []) + else: + use_ses = [False] * sum(layers[:]) + ts = [1] * layers[0] + [6] * sum(layers[1:]) + + self.depth = sum(layers[:]) * 3 + stem_channel = 32 / width_mult if width_mult < 1.0 else 32 + inplanes = input_ch / width_mult if width_mult < 1.0 else input_ch + + features = [] + in_channels_group = [] + channels_group = [] + + # The following channel configuration is a simple instance to make each layer become an expand layer. + for i in range(self.depth // 3): + if i == 0: + in_channels_group.append(int(round(stem_channel * width_mult))) + channels_group.append(int(round(inplanes * width_mult))) + else: + in_channels_group.append(int(round(inplanes * width_mult))) + inplanes += final_ch / (self.depth // 3 * 1.0) + channels_group.append(int(round(inplanes * width_mult))) + + conv_bn_swish( + features, + 3, + int(round(stem_channel * width_mult)), + kernel=3, + stride=2, + pad=1) + + for block_idx, (in_c, c, t, s, se) in enumerate( + zip(in_channels_group, channels_group, ts, strides, use_ses)): + features.append( + LinearBottleneck( + in_channels=in_c, + channels=c, + t=t, + stride=s, + use_se=se, + se_ratio=se_ratio)) + + pen_channels = int(1280 * width_mult) + conv_bn_swish(features, c, pen_channels) + + features.append(nn.AdaptiveAvgPool2D(1)) + self.features = nn.Sequential(*features) + self.output = nn.Sequential( + nn.Dropout(dropout_ratio), + nn.Conv2D( + pen_channels, class_num, 1, bias_attr=True)) + + def forward(self, x): + x = self.features(x) + x = self.output(x).squeeze(axis=-1).squeeze(axis=-1) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def ReXNet_1_0(pretrained=False, use_ssld=False, **kwargs): + model = ReXNetV1(width_mult=1.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ReXNet_1_0"], use_ssld=use_ssld) + return model + + +def ReXNet_1_3(pretrained=False, use_ssld=False, **kwargs): + model = ReXNetV1(width_mult=1.3, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ReXNet_1_3"], use_ssld=use_ssld) + return model + + +def ReXNet_1_5(pretrained=False, use_ssld=False, **kwargs): + model = ReXNetV1(width_mult=1.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ReXNet_1_5"], use_ssld=use_ssld) + return model + + +def ReXNet_2_0(pretrained=False, use_ssld=False, **kwargs): + model = ReXNetV1(width_mult=2.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ReXNet_2_0"], use_ssld=use_ssld) + return model + + +def ReXNet_3_0(pretrained=False, use_ssld=False, **kwargs): + model = ReXNetV1(width_mult=3.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ReXNet_3_0"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/se_resnet_vd.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/se_resnet_vd.py new file mode 100755 index 000000000..e08399eac --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/se_resnet_vd.py @@ -0,0 +1,392 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# reference: https://arxiv.org/abs/1812.01187 & https://arxiv.org/abs/1709.01507 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "SE_ResNet18_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNet18_vd_pretrained.pdparams", + "SE_ResNet34_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNet34_vd_pretrained.pdparams", + "SE_ResNet50_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNet50_vd_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + reduction_ratio=16, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + name=name + "_branch2c") + self.scale = SELayer( + num_channels=num_filters * 4, + num_filters=num_filters * 4, + reduction_ratio=reduction_ratio, + name='fc_' + name) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + scale = self.scale(conv2) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=scale) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + reduction_ratio=16, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + 
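# two 3x3 conv-BN stages whose output is rescaled channel-wise by an
+        # SELayer before the shortcut add in forward(); the default
+        # reduction_ratio=16 matches the SE-Net paper
+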
self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + name=name + "_branch2b") + + self.scale = SELayer( + num_channels=num_filters, + num_filters=num_filters, + reduction_ratio=reduction_ratio, + name='fc_' + name) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + scale = self.scale(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=scale) + y = F.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, num_channels, num_filters, reduction_ratio, name=None): + super(SELayer, self).__init__() + + self.pool2d_gap = AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_sqz_weights"), + bias_attr=ParamAttr(name=name + '_sqz_offset')) + + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_filters, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_exc_weights"), + bias_attr=ParamAttr(name=name + '_exc_offset')) + + def forward(self, input): + pool = self.pool2d_gap(input) + pool = paddle.squeeze(pool, axis=[2, 3]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = F.sigmoid(excitation) + excitation = paddle.unsqueeze(excitation, axis=[2, 3]) + out = input * excitation + return out + + +class SE_ResNet_vd(nn.Layer): + def __init__(self, layers=50, class_num=1000): + super(SE_ResNet_vd, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + 
num_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc6_weights"), + bias_attr=ParamAttr(name="fc6_offset")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def SE_ResNet18_vd(pretrained=False, use_ssld=False, **kwargs): + model = SE_ResNet_vd(layers=18, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SE_ResNet18_vd"], use_ssld=use_ssld) + return model + + +def SE_ResNet34_vd(pretrained=False, use_ssld=False, **kwargs): + model = SE_ResNet_vd(layers=34, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SE_ResNet34_vd"], use_ssld=use_ssld) + return model + + +def SE_ResNet50_vd(pretrained=False, use_ssld=False, **kwargs): + model = SE_ResNet_vd(layers=50, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SE_ResNet50_vd"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/se_resnext.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/se_resnext.py new file mode 100755 index 000000000..79556356f --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/se_resnext.py @@ -0,0 +1,369 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
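The SELayer above follows the squeeze-and-excitation recipe: global average pooling reduces each channel to a scalar, a two-layer bottleneck (hidden width divided by reduction_ratio) produces a per-channel gate, and a sigmoid of that gate rescales the input. A minimal functional sketch of the same computation, assuming only paddle is installed; the helper name se_gate and all shapes are illustrative, not part of this patch:

import paddle
import paddle.nn.functional as F

def se_gate(x, squeeze_w, squeeze_b, excite_w, excite_b):
    # x: [N, C, H, W]; squeeze_w: [C, C // r]; excite_w: [C // r, C]
    pool = F.adaptive_avg_pool2d(x, 1)              # [N, C, 1, 1]
    pool = paddle.squeeze(pool, axis=[2, 3])        # [N, C]
    hidden = F.relu(paddle.matmul(pool, squeeze_w) + squeeze_b)
    gate = F.sigmoid(paddle.matmul(hidden, excite_w) + excite_b)
    return x * paddle.unsqueeze(gate, axis=[2, 3])  # broadcast over H and W

# illustrative shapes: C = 64, reduction_ratio = 16
x = paddle.randn([2, 64, 8, 8])
w1, b1 = paddle.randn([64, 4]), paddle.zeros([4])
w2, b2 = paddle.randn([4, 64]), paddle.zeros([64])
print(se_gate(x, w1, b1, w2, b2).shape)  # [2, 64, 8, 8]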
+ +# reference: https://arxiv.org/abs/1611.05431 & https://arxiv.org/abs/1709.01507 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "SE_ResNeXt50_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNeXt50_32x4d_pretrained.pdparams", + "SE_ResNeXt101_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNeXt101_32x4d_pretrained.pdparams", + "SE_ResNeXt152_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNeXt152_64x4d_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None, + data_format='NCHW'): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False, + data_format=data_format) + bn_name = name + '_bn' + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance', + data_layout=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + cardinality, + reduction_ratio, + shortcut=True, + if_first=False, + name=None, + data_format="NCHW"): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name='conv' + name + '_x1', + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + groups=cardinality, + stride=stride, + act='relu', + name='conv' + name + '_x2', + data_format=data_format) + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + filter_size=1, + act=None, + name='conv' + name + '_x3', + data_format=data_format) + self.scale = SELayer( + num_channels=num_filters * 2 if cardinality == 32 else num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + reduction_ratio=reduction_ratio, + name='fc' + name, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 2 + if cardinality == 32 else num_filters, + filter_size=1, + stride=stride, + name='conv' + name + '_prj', + data_format=data_format) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + scale = self.scale(conv2) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=scale) + y = F.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, + 
num_channels, + num_filters, + reduction_ratio, + name=None, + data_format="NCHW"): + super(SELayer, self).__init__() + + self.data_format = data_format + self.pool2d_gap = AdaptiveAvgPool2D(1, data_format=self.data_format) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_sqz_weights"), + bias_attr=ParamAttr(name=name + '_sqz_offset')) + self.relu = nn.ReLU() + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_filters, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_exc_weights"), + bias_attr=ParamAttr(name=name + '_exc_offset')) + self.sigmoid = nn.Sigmoid() + + def forward(self, input): + pool = self.pool2d_gap(input) + if self.data_format == "NHWC": + pool = paddle.squeeze(pool, axis=[1, 2]) + else: + pool = paddle.squeeze(pool, axis=[2, 3]) + squeeze = self.squeeze(pool) + squeeze = self.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = self.sigmoid(excitation) + if self.data_format == "NHWC": + excitation = paddle.unsqueeze(excitation, axis=[1, 2]) + else: + excitation = paddle.unsqueeze(excitation, axis=[2, 3]) + out = input * excitation + return out + + +class ResNeXt(nn.Layer): + def __init__(self, + layers=50, + class_num=1000, + cardinality=32, + input_image_channel=3, + data_format="NCHW"): + super(ResNeXt, self).__init__() + + self.layers = layers + self.cardinality = cardinality + self.reduction_ratio = 16 + self.data_format = data_format + self.input_image_channel = input_image_channel + + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + supported_cardinality = [32, 64] + assert cardinality in supported_cardinality, \ + "supported cardinality is {} but input cardinality is {}" \ + .format(supported_cardinality, cardinality) + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, 1024] + num_filters = [128, 256, 512, + 1024] if cardinality == 32 else [256, 512, 1024, 2048] + if layers < 152: + self.conv = ConvBNLayer( + num_channels=self.input_image_channel, + num_filters=64, + filter_size=7, + stride=2, + act='relu', + name="conv1", + data_format=self.data_format) + else: + self.conv1_1 = ConvBNLayer( + num_channels=self.input_image_channel, + num_filters=64, + filter_size=3, + stride=2, + act='relu', + name="conv1", + data_format=self.data_format) + self.conv1_2 = ConvBNLayer( + num_channels=64, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv2", + data_format=self.data_format) + self.conv1_3 = ConvBNLayer( + num_channels=64, + num_filters=128, + filter_size=3, + stride=1, + act='relu', + name="conv3", + data_format=self.data_format) + + self.pool2d_max = MaxPool2D( + kernel_size=3, stride=2, padding=1, data_format=self.data_format) + + self.block_list = [] + n = 1 if layers == 50 or layers == 101 else 3 + for block in range(len(depth)): + n += 1 + shortcut = False + for i in range(depth[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] if i == 0 else + num_filters[block] * int(64 // self.cardinality), + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 
else 1, + cardinality=self.cardinality, + reduction_ratio=self.reduction_ratio, + shortcut=shortcut, + if_first=block == 0, + name=str(n) + '_' + str(i + 1), + data_format=self.data_format)) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1, data_format=self.data_format) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc6_weights"), + bias_attr=ParamAttr(name="fc6_offset")) + + def forward(self, inputs): + with paddle.static.amp.fp16_guard(): + return self._forward(inputs) + + def _forward(self, inputs): + if self.data_format == "NHWC": + inputs = paddle.tensor.transpose(inputs, [0, 2, 3, 1]) + inputs.stop_gradient = True + if self.layers < 152: + y = self.conv(inputs) + else: + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for i, block in enumerate(self.block_list): + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def SE_ResNeXt50_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=32, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SE_ResNeXt50_32x4d"], use_ssld=use_ssld) + return model + + +def SE_ResNeXt101_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=101, cardinality=32, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SE_ResNeXt101_32x4d"], + use_ssld=use_ssld) + return model + + +def SE_ResNeXt152_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=64, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SE_ResNeXt152_64x4d"], + use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/se_resnext_vd.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/se_resnext_vd.py new file mode 100755 index 000000000..659e190c8 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/se_resnext_vd.py @@ -0,0 +1,311 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
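In the SE_ResNeXt blocks above, the residual width is num_filters * 2 when cardinality is 32 and num_filters when it is 64; the two num_filters tables are chosen so that both settings land on the same stage widths. A quick plain-Python check of that bookkeeping (numbers copied from the tables above, not part of the patch):

def stage_widths(cardinality):
    num_filters = ([128, 256, 512, 1024]
                   if cardinality == 32 else [256, 512, 1024, 2048])
    return [f * 2 if cardinality == 32 else f for f in num_filters]

print(stage_widths(32))  # [256, 512, 1024, 2048]
print(stage_widths(64))  # [256, 512, 1024, 2048]

Both parameterizations end at 2048 channels, which is why pool2d_avg_channels is num_channels[-1] * 2 = 2048 in either case.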
+ +# reference: https://arxiv.org/abs/1611.05431 & https://arxiv.org/abs/1812.01187 & https://arxiv.org/abs/1709.01507 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "SE_ResNeXt50_vd_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNeXt50_vd_32x4d_pretrained.pdparams", + "SENet154_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SENet154_vd_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + bn_name = name + '_bn' + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + cardinality, + reduction_ratio, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name='conv' + name + '_x1') + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + groups=cardinality, + stride=stride, + act='relu', + name='conv' + name + '_x2') + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + filter_size=1, + act=None, + name='conv' + name + '_x3') + self.scale = SELayer( + num_channels=num_filters * 2 if cardinality == 32 else num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + reduction_ratio=reduction_ratio, + name='fc' + name) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 2 + if cardinality == 32 else num_filters, + filter_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name='conv' + name + '_prj') + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + scale = self.scale(conv2) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=scale) + y = F.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, num_channels, num_filters, reduction_ratio, name=None): + super(SELayer, self).__init__() + + self.pool2d_gap = 
AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_sqz_weights"), + bias_attr=ParamAttr(name=name + '_sqz_offset')) + self.relu = nn.ReLU() + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_filters, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_exc_weights"), + bias_attr=ParamAttr(name=name + '_exc_offset')) + self.sigmoid = nn.Sigmoid() + + def forward(self, input): + pool = self.pool2d_gap(input) + pool = paddle.squeeze(pool, axis=[2, 3]) + squeeze = self.squeeze(pool) + squeeze = self.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = self.sigmoid(excitation) + excitation = paddle.unsqueeze(excitation, axis=[2, 3]) + out = paddle.multiply(input, excitation) + return out + + +class ResNeXt(nn.Layer): + def __init__(self, layers=50, class_num=1000, cardinality=32): + super(ResNeXt, self).__init__() + + self.layers = layers + self.cardinality = cardinality + self.reduction_ratio = 16 + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + supported_cardinality = [32, 64] + assert cardinality in supported_cardinality, \ + "supported cardinality is {} but input cardinality is {}" \ + .format(supported_cardinality, cardinality) + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [128, 256, 512, 1024] + num_filters = [128, 256, 512, + 1024] if cardinality == 32 else [256, 512, 1024, 2048] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=64, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=64, + num_filters=128, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + n = 1 if layers == 50 or layers == 101 else 3 + for block in range(len(depth)): + n += 1 + shortcut = False + for i in range(depth[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] if i == 0 else + num_filters[block] * int(64 // self.cardinality), + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=self.cardinality, + reduction_ratio=self.reduction_ratio, + shortcut=shortcut, + if_first=block == 0, + name=str(n) + '_' + str(i + 1))) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc6_weights"), + bias_attr=ParamAttr(name="fc6_offset")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + 
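Relative to plain SE_ResNeXt, the "_vd" variant above makes two changes: the 7x7 stem is split into three 3x3 convolutions (conv1_1 through conv1_3), and the downsampling shortcut average-pools before a stride-1 1x1 projection (is_vd_mode) instead of striding the 1x1 conv directly. A small sketch of why the shortcut change matters, assuming paddle; the channel counts are illustrative:

import paddle
import paddle.nn as nn

cin, cout = 64, 128
x = paddle.randn([1, cin, 56, 56])

# plain shortcut: a strided 1x1 conv samples one input position in four
plain = nn.Conv2D(cin, cout, kernel_size=1, stride=2)

# vd shortcut: average-pool first, so every input position contributes
vd = nn.Sequential(
    nn.AvgPool2D(kernel_size=2, stride=2, padding=0, ceil_mode=True),
    nn.Conv2D(cin, cout, kernel_size=1, stride=1))

print(plain(x).shape, vd(x).shape)  # both [1, 128, 28, 28]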
+def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def SE_ResNeXt50_vd_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=32, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SE_ResNeXt50_vd_32x4d"], + use_ssld=use_ssld) + return model + + +def SENet154_vd(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=64, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SENet154_vd"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/shufflenet_v2.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/shufflenet_v2.py new file mode 100755 index 000000000..e1058bb08 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/shufflenet_v2.py @@ -0,0 +1,364 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
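All the model files in this patch expose the same factory contract: pretrained=False builds a randomly initialized network, pretrained=True fetches the MODEL_URLS entry, and a string is treated as a local weights path (anything else raises). A hedged usage sketch; the import path assumes this example package is on PYTHONPATH and is not itself part of the patch:

import paddle
from ppcls.arch.backbone.model_zoo.se_resnext_vd import SE_ResNeXt50_vd_32x4d

model = SE_ResNeXt50_vd_32x4d(pretrained=False, class_num=1000)
model.eval()
with paddle.no_grad():
    logits = model(paddle.randn([1, 3, 224, 224]))
print(logits.shape)  # [1, 1000]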
+ +# reference: https://arxiv.org/abs/1807.11164 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr, reshape, transpose, concat, split +from paddle.nn import Layer, Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm, Linear +from paddle.nn.initializer import KaimingNormal +from paddle.nn.functional import swish + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ShuffleNetV2_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x0_25_pretrained.pdparams", + "ShuffleNetV2_x0_33": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x0_33_pretrained.pdparams", + "ShuffleNetV2_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x0_5_pretrained.pdparams", + "ShuffleNetV2_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x1_0_pretrained.pdparams", + "ShuffleNetV2_x1_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x1_5_pretrained.pdparams", + "ShuffleNetV2_x2_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x2_0_pretrained.pdparams", + "ShuffleNetV2_swish": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_swish_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +def channel_shuffle(x, groups): + batch_size, num_channels, height, width = x.shape[0:4] + channels_per_group = num_channels // groups + + # reshape + x = reshape( + x=x, shape=[batch_size, groups, channels_per_group, height, width]) + + # transpose + x = transpose(x=x, perm=[0, 2, 1, 3, 4]) + + # flatten + x = reshape(x=x, shape=[batch_size, num_channels, height, width]) + return x + + +class ConvBNLayer(Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr( + initializer=KaimingNormal(), name=name + "_weights"), + bias_attr=False) + + self._batch_norm = BatchNorm( + out_channels, + param_attr=ParamAttr(name=name + "_bn_scale"), + bias_attr=ParamAttr(name=name + "_bn_offset"), + act=act, + moving_mean_name=name + "_bn_mean", + moving_variance_name=name + "_bn_variance") + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class InvertedResidual(Layer): + def __init__(self, + in_channels, + out_channels, + stride, + act="relu", + name=None): + super(InvertedResidual, self).__init__() + self._conv_pw = ConvBNLayer( + in_channels=in_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv1') + self._conv_dw = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=3, + stride=stride, + padding=1, + groups=out_channels // 2, + act=None, + name='stage_' + name + '_conv2') + self._conv_linear = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv3') + + def forward(self, inputs): + x1, x2 = split( + inputs, + num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2], + axis=1) + x2 = self._conv_pw(x2) + x2 = 
self._conv_dw(x2) + x2 = self._conv_linear(x2) + out = concat([x1, x2], axis=1) + return channel_shuffle(out, 2) + + +class InvertedResidualDS(Layer): + def __init__(self, + in_channels, + out_channels, + stride, + act="relu", + name=None): + super(InvertedResidualDS, self).__init__() + + # branch1 + self._conv_dw_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=stride, + padding=1, + groups=in_channels, + act=None, + name='stage_' + name + '_conv4') + self._conv_linear_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv5') + # branch2 + self._conv_pw_2 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv1') + self._conv_dw_2 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=3, + stride=stride, + padding=1, + groups=out_channels // 2, + act=None, + name='stage_' + name + '_conv2') + self._conv_linear_2 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv3') + + def forward(self, inputs): + x1 = self._conv_dw_1(inputs) + x1 = self._conv_linear_1(x1) + x2 = self._conv_pw_2(inputs) + x2 = self._conv_dw_2(x2) + x2 = self._conv_linear_2(x2) + out = concat([x1, x2], axis=1) + + return channel_shuffle(out, 2) + + +class ShuffleNet(Layer): + def __init__(self, class_num=1000, scale=1.0, act="relu"): + super(ShuffleNet, self).__init__() + self.scale = scale + self.class_num = class_num + stage_repeats = [4, 8, 4] + + if scale == 0.25: + stage_out_channels = [-1, 24, 24, 48, 96, 512] + elif scale == 0.33: + stage_out_channels = [-1, 24, 32, 64, 128, 512] + elif scale == 0.5: + stage_out_channels = [-1, 24, 48, 96, 192, 1024] + elif scale == 1.0: + stage_out_channels = [-1, 24, 116, 232, 464, 1024] + elif scale == 1.5: + stage_out_channels = [-1, 24, 176, 352, 704, 1024] + elif scale == 2.0: + stage_out_channels = [-1, 24, 244, 488, 976, 2048] + else: + raise NotImplementedError("This scale size:[" + str(scale) + + "] is not implemented!") + # 1. conv1 + self._conv1 = ConvBNLayer( + in_channels=3, + out_channels=stage_out_channels[1], + kernel_size=3, + stride=2, + padding=1, + act=act, + name='stage1_conv') + self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) + + # 2. bottleneck sequences + self._block_list = [] + for stage_id, num_repeat in enumerate(stage_repeats): + for i in range(num_repeat): + if i == 0: + block = self.add_sublayer( + name=str(stage_id + 2) + '_' + str(i + 1), + sublayer=InvertedResidualDS( + in_channels=stage_out_channels[stage_id + 1], + out_channels=stage_out_channels[stage_id + 2], + stride=2, + act=act, + name=str(stage_id + 2) + '_' + str(i + 1))) + else: + block = self.add_sublayer( + name=str(stage_id + 2) + '_' + str(i + 1), + sublayer=InvertedResidual( + in_channels=stage_out_channels[stage_id + 2], + out_channels=stage_out_channels[stage_id + 2], + stride=1, + act=act, + name=str(stage_id + 2) + '_' + str(i + 1))) + self._block_list.append(block) + # 3. last_conv + self._last_conv = ConvBNLayer( + in_channels=stage_out_channels[-2], + out_channels=stage_out_channels[-1], + kernel_size=1, + stride=1, + padding=0, + act=act, + name='conv5') + # 4. 
pool + self._pool2d_avg = AdaptiveAvgPool2D(1) + self._out_c = stage_out_channels[-1] + # 5. fc + self._fc = Linear( + stage_out_channels[-1], + class_num, + weight_attr=ParamAttr(name='fc6_weights'), + bias_attr=ParamAttr(name='fc6_offset')) + + def forward(self, inputs): + y = self._conv1(inputs) + y = self._max_pool(y) + for inv in self._block_list: + y = inv(y) + y = self._last_conv(y) + y = self._pool2d_avg(y) + y = paddle.flatten(y, start_axis=1, stop_axis=-1) + y = self._fc(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ShuffleNetV2_x0_25(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=0.25, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x0_25"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_x0_33(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=0.33, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x0_33"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_x0_5(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=0.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x0_5"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_x1_0(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=1.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x1_0"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_x1_5(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=1.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x1_5"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_x2_0(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=2.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x2_0"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_swish(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=1.0, act="swish", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_swish"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/squeezenet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/squeezenet.py new file mode 100755 index 000000000..5edfaa0ad --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/squeezenet.py @@ -0,0 +1,196 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
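channel_shuffle in shufflenet_v2.py above mixes information between the two halves produced by each ShuffleNetV2 block: reshape to [N, groups, C/groups, H, W], swap the two channel axes, and flatten back. A tiny sketch making the permutation visible, assuming paddle; four channels and two groups are illustrative:

import paddle

# tag the channels 0..3 so the shuffle is easy to read off
x = paddle.arange(4, dtype='float32').reshape([1, 4, 1, 1])

groups = 2
y = (x.reshape([1, groups, 2, 1, 1])
     .transpose([0, 2, 1, 3, 4])
     .reshape([1, 4, 1, 1]))
print(y.flatten().tolist())  # [0.0, 2.0, 1.0, 3.0]: channels interleaved across groups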
+
+# reference: https://arxiv.org/abs/1602.07360
+
+import paddle
+from paddle import ParamAttr
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import Conv2D, BatchNorm, Linear, Dropout
+from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
+
+from ....utils.save_load import load_dygraph_pretrain
+
+MODEL_URLS = {
+    "SqueezeNet1_0":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SqueezeNet1_0_pretrained.pdparams",
+    "SqueezeNet1_1":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SqueezeNet1_1_pretrained.pdparams",
+}
+
+__all__ = list(MODEL_URLS.keys())
+
+
+class MakeFireConv(nn.Layer):
+    def __init__(self,
+                 input_channels,
+                 output_channels,
+                 filter_size,
+                 padding=0,
+                 name=None):
+        super(MakeFireConv, self).__init__()
+        self._conv = Conv2D(
+            input_channels,
+            output_channels,
+            filter_size,
+            padding=padding,
+            weight_attr=ParamAttr(name=name + "_weights"),
+            bias_attr=ParamAttr(name=name + "_offset"))
+
+    def forward(self, x):
+        x = self._conv(x)
+        x = F.relu(x)
+        return x
+
+
+class MakeFire(nn.Layer):
+    def __init__(self,
+                 input_channels,
+                 squeeze_channels,
+                 expand1x1_channels,
+                 expand3x3_channels,
+                 name=None):
+        super(MakeFire, self).__init__()
+        self._conv = MakeFireConv(
+            input_channels, squeeze_channels, 1, name=name + "_squeeze1x1")
+        self._conv_path1 = MakeFireConv(
+            squeeze_channels, expand1x1_channels, 1, name=name + "_expand1x1")
+        self._conv_path2 = MakeFireConv(
+            squeeze_channels,
+            expand3x3_channels,
+            3,
+            padding=1,
+            name=name + "_expand3x3")
+
+    def forward(self, inputs):
+        x = self._conv(inputs)
+        x1 = self._conv_path1(x)
+        x2 = self._conv_path2(x)
+        return paddle.concat([x1, x2], axis=1)
+
+
+class SqueezeNet(nn.Layer):
+    def __init__(self, version, class_num=1000):
+        super(SqueezeNet, self).__init__()
+        self.version = version
+
+        if self.version == "1.0":
+            self._conv = Conv2D(
+                3,
+                96,
+                7,
+                stride=2,
+                weight_attr=ParamAttr(name="conv1_weights"),
+                bias_attr=ParamAttr(name="conv1_offset"))
+            self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0)
+            self._conv1 = MakeFire(96, 16, 64, 64, name="fire2")
+            self._conv2 = MakeFire(128, 16, 64, 64, name="fire3")
+            self._conv3 = MakeFire(128, 32, 128, 128, name="fire4")
+
+            self._conv4 = MakeFire(256, 32, 128, 128, name="fire5")
+            self._conv5 = MakeFire(256, 48, 192, 192, name="fire6")
+            self._conv6 = MakeFire(384, 48, 192, 192, name="fire7")
+            self._conv7 = MakeFire(384, 64, 256, 256, name="fire8")
+
+            self._conv8 = MakeFire(512, 64, 256, 256, name="fire9")
+        else:
+            self._conv = Conv2D(
+                3,
+                64,
+                3,
+                stride=2,
+                padding=1,
+                weight_attr=ParamAttr(name="conv1_weights"),
+                bias_attr=ParamAttr(name="conv1_offset"))
+            self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0)
+            self._conv1 = MakeFire(64, 16, 64, 64, name="fire2")
+            self._conv2 = MakeFire(128, 16, 64, 64, name="fire3")
+
+            self._conv3 = MakeFire(128, 32, 128, 128, name="fire4")
+            self._conv4 = MakeFire(256, 32, 128, 128, name="fire5")
+
+            self._conv5 = MakeFire(256, 48, 192, 192, name="fire6")
+            self._conv6 = MakeFire(384, 48, 192, 192, name="fire7")
+            self._conv7 = MakeFire(384, 64, 256, 256, name="fire8")
+            self._conv8 = MakeFire(512, 64, 256, 256, name="fire9")
+
+        self._drop = Dropout(p=0.5, mode="downscale_in_infer")
+        self._conv9 = Conv2D(
+            512,
+            class_num,
+            1,
+            weight_attr=ParamAttr(name="conv10_weights"),
+            bias_attr=ParamAttr(name="conv10_offset"))
+        self._avg_pool = AdaptiveAvgPool2D(1)
+
+    def forward(self, inputs):
+        x = self._conv(inputs)
+        x = F.relu(x)
+
x = self._pool(x) + if self.version == "1.0": + x = self._conv1(x) + x = self._conv2(x) + x = self._conv3(x) + x = self._pool(x) + x = self._conv4(x) + x = self._conv5(x) + x = self._conv6(x) + x = self._conv7(x) + x = self._pool(x) + x = self._conv8(x) + else: + x = self._conv1(x) + x = self._conv2(x) + x = self._pool(x) + x = self._conv3(x) + x = self._conv4(x) + x = self._pool(x) + x = self._conv5(x) + x = self._conv6(x) + x = self._conv7(x) + x = self._conv8(x) + x = self._drop(x) + x = self._conv9(x) + x = F.relu(x) + x = self._avg_pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def SqueezeNet1_0(pretrained=False, use_ssld=False, **kwargs): + model = SqueezeNet(version="1.0", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SqueezeNet1_0"], use_ssld=use_ssld) + return model + + +def SqueezeNet1_1(pretrained=False, use_ssld=False, **kwargs): + model = SqueezeNet(version="1.1", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SqueezeNet1_1"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/starnet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/starnet.py new file mode 100755 index 000000000..4832be783 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/starnet.py @@ -0,0 +1,197 @@ +# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
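Each MakeFire module above concatenates a 1x1 and a 3x3 expand path, so its output width is expand1x1_channels + expand3x3_channels, and that sum must equal the input_channels of the next fire. A plain-Python check of the chain in the "1.0" branch (numbers copied from the constructor above, not part of the patch):

# (in_channels, squeeze, expand1x1, expand3x3) for fire2..fire9, version "1.0"
fires = [(96, 16, 64, 64), (128, 16, 64, 64), (128, 32, 128, 128),
         (256, 32, 128, 128), (256, 48, 192, 192), (384, 48, 192, 192),
         (384, 64, 256, 256), (512, 64, 256, 256)]

out = 96  # width produced by the stride-2 7x7 stem conv
for cin, _, e1, e3 in fires:
    assert cin == out, "each fire consumes the previous concat width"
    out = e1 + e3  # concat of the two expand paths
print(out)  # 512, the input width of the 1x1 conv10 classifier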
+ +# reference: https://arxiv.org/abs/2403.19967 + +import paddle +import paddle.nn as nn + +from ....utils.save_load import load_dygraph_pretrain +from ..model_zoo.vision_transformer import DropPath + +MODEL_URLS = { + "StarNet_S1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/StarNet_S1_pretrained.pdparams", + "StarNet_S2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/StarNet_S2_pretrained.pdparams", + "StarNet_S3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/StarNet_S3_pretrained.pdparams", + "StarNet_S4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/StarNet_S4_pretrained.pdparams", +} + +__all__ = MODEL_URLS.keys() + +NET_CONFIG = { + "StarNet_S1": [24, [2, 2, 8, 3]], + "StarNet_S2": [32, [1, 2, 6, 2]], + "StarNet_S3": [32, [2, 2, 8, 4]], + "StarNet_S4": [32, [3, 3, 12, 5]], +} + + +class ConvBN(nn.Sequential): + def __init__(self, + in_planes, + out_planes, + kernel_size=1, + stride=1, + padding=0, + dilation=1, + groups=1, + with_bn=True): + super().__init__() + self.add_sublayer( + name='conv', + sublayer=nn.Conv2D( + in_channels=in_planes, + out_channels=out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups)) + if with_bn: + self.add_sublayer( + name='bn', sublayer=nn.BatchNorm2D(num_features=out_planes)) + init_Constant = nn.initializer.Constant(value=1) + init_Constant(self.bn.weight) + init_Constant = nn.initializer.Constant(value=0) + init_Constant(self.bn.bias) + + +class Block(nn.Layer): + def __init__(self, dim, mlp_ratio=3, drop_path=0.0): + super().__init__() + self.dwconv = ConvBN( + dim, dim, 7, 1, (7 - 1) // 2, groups=dim, with_bn=True) + self.f1 = ConvBN(dim, mlp_ratio * dim, 1, with_bn=False) + self.f2 = ConvBN(dim, mlp_ratio * dim, 1, with_bn=False) + self.g = ConvBN(mlp_ratio * dim, dim, 1, with_bn=True) + self.dwconv2 = ConvBN( + dim, dim, 7, 1, (7 - 1) // 2, groups=dim, with_bn=False) + self.act = nn.ReLU6() + self.drop_path = (DropPath(drop_path) + if drop_path > 0. else nn.Identity()) + + def forward(self, x): + input = x + x = self.dwconv(x) + x1, x2 = self.f1(x), self.f2(x) + x = self.act(x1) * x2 + x = self.dwconv2(self.g(x)) + x = input + self.drop_path(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError("pretrained type is not available. ") + + +class StarNet(nn.Layer): + """ + StarNet: StarNet for Image Classification + Args: + base_dim: int, base dimension of the model, default 32. + depths: list, number of blocks in each stage, default [3, 3, 12, 5]. + mlp_ratio: int, ratio of hidden dim to mlp_dim, default 4. + drop_path_rate: float, default 0.0, stochastic depth rate. + class_num: int, default 1000, number of classes. 
+ """ + def __init__(self, + base_dim=32, + depths=[3, 3, 12, 5], + mlp_ratio=4, + drop_path_rate=0.0, + class_num=1000, + **kwargs): + super().__init__() + self.class_num = class_num + self.in_channel = 32 + self.stem = nn.Sequential( + ConvBN( + 3, self.in_channel, kernel_size=3, stride=2, padding=1), + nn.ReLU6()) + dpr = [ + x.item() + for x in paddle.linspace( + start=0, stop=drop_path_rate, num=sum(depths)) + ] + self.stages = nn.LayerList() + cur = 0 + for i_layer in range(len(depths)): + embed_dim = base_dim * 2**i_layer + down_sampler = ConvBN(self.in_channel, embed_dim, 3, 2, 1) + self.in_channel = embed_dim + blocks = [ + Block(self.in_channel, mlp_ratio, dpr[cur + i]) + for i in range(depths[i_layer]) + ] + cur += depths[i_layer] + self.stages.append(nn.Sequential(down_sampler, *blocks)) + self.norm = nn.BatchNorm2D(num_features=self.in_channel) + self.avgpool = nn.AdaptiveAvgPool2D(output_size=1) + self.head = nn.Linear( + in_features=self.in_channel, out_features=class_num) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear or nn.Conv2D): + pass + if isinstance(m, nn.Linear) and m.bias is not None: + init_Constant = nn.initializer.Constant(value=0) + init_Constant(m.bias) + elif isinstance(m, nn.LayerNorm or nn.BatchNorm2D): + init_Constant = nn.initializer.Constant(value=0) + init_Constant(m.bias) + init_Constant = nn.initializer.Constant(value=1.0) + init_Constant(m.weight) + + def forward(self, x): + x = self.stem(x) + for stage in self.stages: + x = stage(x) + x = paddle.flatten(x=self.avgpool(self.norm(x)), start_axis=1) + return self.head(x) + + +def StarNet_S1(pretrained=False, use_ssld=False, **kwargs): + model = StarNet(*NET_CONFIG["StarNet_S1"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["StarNet_S1"], use_ssld) + return model + + +def StarNet_S2(pretrained=False, use_ssld=False, **kwargs): + model = StarNet(*NET_CONFIG["StarNet_S2"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["StarNet_S2"], use_ssld) + return model + + +def StarNet_S3(pretrained=False, use_ssld=False, **kwargs): + model = StarNet(*NET_CONFIG["StarNet_S3"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["StarNet_S3"], use_ssld) + return model + + +def StarNet_S4(pretrained=False, use_ssld=False, **kwargs): + model = StarNet(*NET_CONFIG["StarNet_S4"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["StarNet_S4"], use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/svtrnet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/svtrnet.py new file mode 100755 index 000000000..1ee4ab9fc --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/svtrnet.py @@ -0,0 +1,699 @@ +from paddle import ParamAttr +from paddle.nn.initializer import KaimingNormal +import numpy as np +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant, Normal +from paddle.nn import functional as F + +trunc_normal_ = TruncatedNormal(std=.02) +normal_ = Normal +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + +def resize_pos_embed(pos_embed, + src_shape, + dst_shape, + mode='bicubic', + num_extra_tokens=1): + """Resize pos_embed weights. + + Args: + pos_embed (paddle.Tensor): Position embedding weights with shape + [1, L, C]. + src_shape (tuple): The resolution of downsampled origin training + image, in format (H, W). 
+        dst_shape (tuple): The resolution of downsampled new training
+            image, in format (H, W).
+        mode (str): Algorithm used for upsampling. Choose one from 'nearest',
+            'linear', 'bilinear', 'bicubic' and 'trilinear'.
+            Defaults to 'bicubic'.
+        num_extra_tokens (int): The number of extra tokens, such as cls_token.
+            Defaults to 1.
+
+    Returns:
+        paddle.Tensor: The resized pos_embed of shape [1, L_new, C]
+    """
+    if src_shape[0] == dst_shape[0] and src_shape[1] == dst_shape[1]:
+        return pos_embed
+    assert pos_embed.ndim == 3, 'shape of pos_embed must be [1, L, C]'
+    _, L, C = pos_embed.shape
+    src_h, src_w = src_shape
+    assert L == src_h * src_w + num_extra_tokens, \
+        f"The length of `pos_embed` ({L}) doesn't match the expected " \
+        f'shape ({src_h}*{src_w}+{num_extra_tokens}). Please check the' \
+        '`img_size` argument.'
+    extra_tokens = pos_embed[:, :num_extra_tokens]
+
+    src_weight = pos_embed[:, num_extra_tokens:]
+    src_weight = src_weight.reshape([-1, src_h, src_w, C]).transpose(
+        [0, 3, 1, 2])
+
+    # The cubic interpolate algorithm only accepts float32
+    dst_weight = F.interpolate(
+        paddle.cast(src_weight, paddle.float32),
+        size=dst_shape,
+        align_corners=False,
+        mode=mode)
+    dst_weight = paddle.flatten(dst_weight, 2).transpose([0, 2, 1])
+    dst_weight = paddle.cast(dst_weight, src_weight.dtype)
+
+    return paddle.concat((extra_tokens, dst_weight), axis=1)
+
+def pading_for_not_divisible(pixel_values,
+                             height,
+                             width,
+                             patch_size,
+                             format="NCHW",
+                             function="split"):
+    if isinstance(patch_size, int):
+        patch_size = (patch_size, patch_size)
+    if height % patch_size[0] == 0 and width % patch_size[1] == 0:
+        return pixel_values, (0, 0, 0, 0, 0, 0, 0, 0)
+    if function == "split":
+        pading_width = patch_size[1] - width % patch_size[1]
+        pading_height = patch_size[0] - height % patch_size[0]
+    elif function == "merge":
+        pading_width = width % 2
+        pading_height = height % 2
+    if format == "NCHW":
+        pad_index = [0, 0, 0, 0, 0, pading_height, 0, pading_width]
+    elif format == "NHWC":
+        pad_index = [0, 0, 0, pading_height, 0, pading_width, 0, 0]
+    else:
+        raise ValueError(f"invalid data format: {format}")
+
+    return F.pad(pixel_values, pad_index), pad_index
+
+def drop_path(x, drop_prob=0., training=False):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
+    """
+    if drop_prob == 0.
or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=0, + bias_attr=False, + groups=1, + act=nn.GELU): + super().__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform()), + bias_attr=bias_attr) + self.norm = nn.BatchNorm2D(out_channels) + self.act = act() + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + out = self.act(out) + return out + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class ConvMixer(nn.Layer): + def __init__( + self, + dim, + num_heads=8, + HW=[8, 25], + local_k=[3, 3], ): + super().__init__() + self.HW = HW + self.dim = dim + self.local_mixer = nn.Conv2D( + dim, + dim, + local_k, + 1, [local_k[0] // 2, local_k[1] // 2], + groups=num_heads, + weight_attr=ParamAttr(initializer=KaimingNormal())) + + def forward(self, x, input_dimension): + h, w = input_dimension + x = x.transpose([0, 2, 1]).reshape([0, self.dim, h, w]) + x = self.local_mixer(x) + x = x.flatten(2).transpose([0, 2, 1]) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + mixer='Global', + HW=None, + local_k=[7, 11], + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + self.dim = dim + self.head_dim = dim // num_heads + self.scale = qk_scale or self.head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.HW = HW + self.local_k = local_k + self.mixer = mixer + def get_mask(self,input_dimension): + if self.HW is not None: + H = input_dimension[0] + W = input_dimension[1] + self.N = H * W + self.C = self.dim + if self.mixer == 'Local' and self.HW is not None: + hk = self.local_k[0] + wk = self.local_k[1] + mask = paddle.ones( + [H * W, H + hk - 1, W + wk - 1], dtype='float32') + for h in range(0, H): + for w in range(0, W): + mask[h * W + w, h:h + hk, w:w + wk] = 0. 
+ mask_paddle = mask[:, hk // 2:H + hk // 2, wk // 2:W + wk // + 2].flatten(1) + mask_inf = paddle.full([H * W, H * W], '-inf', dtype='float32') + mask = paddle.where(mask_paddle < 1, mask_paddle, mask_inf) + return mask + return None + def forward(self, x, input_dimension): + qkv = self.qkv(x).reshape( + (0, -1, 3, self.num_heads, self.head_dim)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) + if self.mixer == 'Local': + attn += self.get_mask(input_dimension) + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((0, -1, self.dim)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mixer='Global', + local_mixer=[7, 11], + HW=None, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-6, + prenorm=True): + super().__init__() + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + else: + self.norm1 = norm_layer(dim) + if mixer == 'Global' or mixer == 'Local': + self.mixer = Attention( + dim, + num_heads=num_heads, + mixer=mixer, + HW=HW, + local_k=local_mixer, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + elif mixer == 'Conv': + self.mixer = ConvMixer( + dim, num_heads=num_heads, HW=HW, local_k=local_mixer) + else: + raise TypeError("The mixer must be one of [Global, Local, Conv]") + + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + else: + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp_ratio = mlp_ratio + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + self.prenorm = prenorm + + def forward(self, x, input_dimension): + if self.prenorm: + x = self.norm1(x + self.drop_path(self.mixer(x,input_dimension))) + x = self.norm2(x + self.drop_path(self.mlp(x))) + else: + x = x + self.drop_path(self.mixer(self.norm1(x),input_dimension)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=[32, 100], + in_channels=3, + embed_dim=768, + sub_num=2, + patch_size=[4, 4], + mode='pope'): + super().__init__() + num_patches = (img_size[1] // (2 ** sub_num)) * \ + (img_size[0] // (2 ** sub_num)) + self.img_size = img_size + self.num_patches = num_patches + self.embed_dim = embed_dim + self.patch_size = patch_size + self.window_size = ((img_size[0] // (2 ** sub_num), (img_size[1] // (2 ** sub_num)))) + self.norm = None + if mode == 'pope': + if sub_num == 2: + self.proj = nn.Sequential( + ConvBNLayer( + in_channels=in_channels, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None), + ConvBNLayer( + in_channels=embed_dim // 2, + out_channels=embed_dim, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None)) + if sub_num == 3: + self.proj = nn.Sequential( + ConvBNLayer( + in_channels=in_channels, + out_channels=embed_dim // 4, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None), + ConvBNLayer( + in_channels=embed_dim // 4, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, 
+                    bias_attr=None),
+                ConvBNLayer(
+                    in_channels=embed_dim // 2,
+                    out_channels=embed_dim,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                    act=nn.GELU,
+                    bias_attr=None))
+        elif mode == 'linear':
+            self.proj = nn.Conv2D(
+                1, embed_dim, kernel_size=patch_size, stride=patch_size)
+            self.num_patches = img_size[0] // patch_size[0] * img_size[
+                1] // patch_size[1]
+
+    def forward(self, x):
+        B, C, H, W = x.shape
+
+        x, _ = pading_for_not_divisible(x, H, W, self.patch_size, "NCHW")
+        x = self.proj(x)
+        _, _, height, width = x.shape
+        output_dimensions = (height, width)
+        x = x.flatten(2).transpose((0, 2, 1))
+        return x, output_dimensions
+
+
+class SubSample(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 types='Pool',
+                 stride=[2, 1],
+                 sub_norm='nn.LayerNorm',
+                 act=None):
+        super().__init__()
+        self.types = types
+        if types == 'Pool':
+            self.avgpool = nn.AvgPool2D(
+                kernel_size=[3, 5], stride=stride, padding=[1, 2])
+            self.maxpool = nn.MaxPool2D(
+                kernel_size=[3, 5], stride=stride, padding=[1, 2])
+            self.proj = nn.Linear(in_channels, out_channels)
+        else:
+            self.conv = nn.Conv2D(
+                in_channels,
+                out_channels,
+                kernel_size=3,
+                stride=stride,
+                padding=1,
+                weight_attr=ParamAttr(initializer=KaimingNormal()))
+        self.norm = eval(sub_norm)(out_channels)
+        if act is not None:
+            self.act = act()
+        else:
+            self.act = None
+
+    def forward(self, x):
+
+        if self.types == 'Pool':
+            x1 = self.avgpool(x)
+            x2 = self.maxpool(x)
+            x = (x1 + x2) * 0.5
+            output_dimension = (x.shape[2], x.shape[3])
+            out = self.proj(x.flatten(2).transpose((0, 2, 1)))
+        else:
+            x = self.conv(x)
+            output_dimension = (x.shape[2], x.shape[3])
+            out = x.flatten(2).transpose((0, 2, 1))
+        out = self.norm(out)
+        if self.act is not None:
+            out = self.act(out)
+
+        return out, output_dimension
+
+
+class SVTRNet(nn.Layer):
+    def __init__(
+            self,
+            class_num=1000,
+            img_size=[48, 320],
+            in_channels=3,
+            embed_dim=[192, 256, 512],
+            depth=[6, 6, 9],
+            num_heads=[6, 8, 16],
+            mixer=['Conv'] * 9 + ['Global'] *
+            12,  # Local atten, Global atten, Conv
+            local_mixer=[[5, 5], [5, 5], [5, 5]],
+            patch_merging='Conv',  # Conv, Pool, None
+            mlp_ratio=4,
+            qkv_bias=True,
+            qk_scale=None,
+            drop_rate=0.,
+            last_drop=0.1,
+            attn_drop_rate=0.,
+            drop_path_rate=0.1,
+            norm_layer='nn.LayerNorm',
+            sub_norm='nn.LayerNorm',
+            epsilon=1e-6,
+            out_channels=512,
+            out_char_num=40,
+            block_unit='Block',
+            act='nn.GELU',
+            last_stage=False,
+            sub_num=2,
+            prenorm=True,
+            use_lenhead=False,
+            **kwargs):
+        super().__init__()
+        self.img_size = img_size
+        self.embed_dim = embed_dim
+        self.out_channels = out_channels
+        self.prenorm = prenorm
+        patch_merging = None if patch_merging != 'Conv' and patch_merging != 'Pool' else patch_merging
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            in_channels=in_channels,
+            embed_dim=embed_dim[0],
+            sub_num=sub_num)
+        num_patches = self.patch_embed.num_patches
+        self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)]
+        self.pos_embed = self.create_parameter(
+            shape=[1, num_patches, embed_dim[0]], default_initializer=zeros_)
+        self.add_parameter("pos_embed", self.pos_embed)
+        self.pos_drop = nn.Dropout(p=drop_rate)
+        Block_unit = eval(block_unit)
+
+        dpr = np.linspace(0, drop_path_rate, sum(depth))
+        self.blocks1 = nn.LayerList([
+            Block_unit(
+                dim=embed_dim[0],
+                num_heads=num_heads[0],
+                mixer=mixer[0:depth[0]][i],
+                HW=self.HW,
+                local_mixer=local_mixer[0],
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                act_layer=eval(act),
+                attn_drop=attn_drop_rate,
+ drop_path=dpr[0:depth[0]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[0]) + ]) + if patch_merging is not None: + self.sub_sample1 = SubSample( + embed_dim[0], + embed_dim[1], + sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging) + HW = [self.HW[0] // 2, self.HW[1]] + else: + HW = self.HW + self.patch_merging = patch_merging + self.blocks2 = nn.LayerList([ + Block_unit( + dim=embed_dim[1], + num_heads=num_heads[1], + mixer=mixer[depth[0]:depth[0] + depth[1]][i], + HW=HW, + local_mixer=local_mixer[1], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + drop_path=dpr[depth[0]:depth[0] + depth[1]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[1]) + ]) + if patch_merging is not None: + self.sub_sample2 = SubSample( + embed_dim[1], + embed_dim[2], + sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging) + HW = [self.HW[0] // 4, self.HW[1]] + else: + HW = self.HW + self.blocks3 = nn.LayerList([ + Block_unit( + dim=embed_dim[2], + num_heads=num_heads[2], + mixer=mixer[depth[0] + depth[1]:][i], + HW=HW, + local_mixer=local_mixer[2], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + drop_path=dpr[depth[0] + depth[1]:][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[2]) + ]) + self.flatten = nn.Flatten(start_axis=0, stop_axis=1) + self.fc = nn.Linear(embed_dim[2], class_num) + + self.last_stage = last_stage + if last_stage: + self.avg_pool = nn.AdaptiveAvgPool2D([1, out_char_num]) + self.last_conv = nn.Conv2D( + in_channels=embed_dim[2], + out_channels=self.out_channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.hardswish = nn.Hardswish() + self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer") + if not prenorm: + self.norm = eval(norm_layer)(embed_dim[-1], epsilon=epsilon) + self.use_lenhead = use_lenhead + if use_lenhead: + self.len_conv = nn.Linear(embed_dim[2], self.out_channels) + self.hardswish_len = nn.Hardswish() + self.dropout_len = nn.Dropout( + p=last_drop, mode="downscale_in_infer") + + trunc_normal_(self.pos_embed) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + x,output_dimensions = self.patch_embed(x) + x = x + resize_pos_embed(self.pos_embed,self.patch_embed.window_size,output_dimensions,num_extra_tokens=0) + x = self.pos_drop(x) + for blk in self.blocks1: + x = blk(x, output_dimensions) + if self.patch_merging is not None: + x, output_dimensions = self.sub_sample1( + x.transpose([0, 2, 1]).reshape( + [0, self.embed_dim[0], output_dimensions[0], output_dimensions[1]])) + for blk in self.blocks2: + x = blk(x, output_dimensions) + if self.patch_merging is not None: + x, output_dimensions = self.sub_sample2( + x.transpose([0, 2, 1]).reshape( + [0, self.embed_dim[1], output_dimensions[0], output_dimensions[1]])) + for blk in self.blocks3: + x = blk(x, output_dimensions) + if not self.prenorm: + x = self.norm(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = x.mean(1) + x = self.fc(x) + return x + + +def SVTR_tiny(pretrained=False, use_ssld=False, **kwargs): + model = 
SVTRNet( + img_size=[48, 320], + embed_dim=[64, 128, 256], + depth=[3, 6, 3], + num_heads=[2, 4, 8], + mixer=['Conv'] * 6 + ['Global'] * 6, + local_mixer=[[5, 5], [5, 5], [5, 5]], + mlp_ratio=4, + qkv_bias=True, + out_channels=256, + out_char_num=40, + epsilon=1e-6, + **kwargs) + return model + + +def SVTR_base(pretrained=False, use_ssld=False, **kwargs): + model = SVTRNet( + img_size=[48, 320], + embed_dim=[128, 256, 384], + depth=[6, 6, 6], + num_heads=[4, 8, 12], + mixer=['Conv'] * 9 + ['Global'] * 12, + local_mixer=[[5, 5], [5, 5], [5, 5]], + mlp_ratio=4, + qkv_bias=True, + out_channels=384, + out_char_num=40, + epsilon=1e-6, + **kwargs) + return model + + +def SVTR_large(pretrained=False, use_ssld=False, **kwargs): + model = SVTRNet( + img_size=[48, 320], + embed_dim=[192, 256, 512], + depth=[6, 6, 9], + num_heads=[6, 8, 16], + mixer=['Conv'] * 9 + ['Global'] * 12, + local_mixer=[[5, 5], [5, 5], [5, 5]], + mlp_ratio=4, + qkv_bias=True, + out_channels=512, + out_char_num=40, + epsilon=1e-6, + **kwargs) + return model \ No newline at end of file diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/swin_transformer_v2.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/swin_transformer_v2.py new file mode 100755 index 000000000..d2f89446a --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/swin_transformer_v2.py @@ -0,0 +1,1061 @@ +# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# Code was based on https://github.com/microsoft/Swin-Transformer
+# reference: https://arxiv.org/abs/2111.09883
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn.initializer import TruncatedNormal, Constant, Normal
+import numpy as np
+import math
+
+from .vision_transformer import trunc_normal_, zeros_, ones_, to_2tuple, DropPath, Identity
+from ..base.theseus_layer import TheseusLayer
+from ....utils.save_load import load_dygraph_pretrain
+
+MODEL_URLS = {
+    "SwinTransformerV2_tiny_patch4_window8_256":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_tiny_patch4_window8_256_pretrained.pdparams",
+    "SwinTransformerV2_tiny_patch4_window16_256":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_tiny_patch4_window16_256_pretrained.pdparams",
+    "SwinTransformerV2_small_patch4_window8_256":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_small_patch4_window8_256_pretrained.pdparams",
+    "SwinTransformerV2_small_patch4_window16_256":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_small_patch4_window16_256_pretrained.pdparams",
+    "SwinTransformerV2_base_patch4_window8_256":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_base_patch4_window8_256_pretrained.pdparams",
+    "SwinTransformerV2_base_patch4_window16_256":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_base_patch4_window16_256_pretrained.pdparams",
+    "SwinTransformerV2_base_patch4_window24_384":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_base_patch4_window24_384_pretrained.pdparams",
+    "SwinTransformerV2_large_patch4_window16_256":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_large_patch4_window16_256_pretrained.pdparams",
+    "SwinTransformerV2_large_patch4_window24_384":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_large_patch4_window24_384_pretrained.pdparams"
+}
+
+__all__ = list(MODEL_URLS.keys())
+
+
+def masked_fill(x, mask, value):
+    y = paddle.full(x.shape, value, x.dtype)
+    return paddle.where(mask, y, x)
+
+
+class Mlp(nn.Layer):
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+def pading_for_not_divisible(pixel_values,
+                             height,
+                             width,
+                             patch_size,
+                             format="BCHW",
+                             function="split"):
+    if isinstance(patch_size, int):
+        patch_size = (patch_size, patch_size)
+    if function == "split":
+        # pad only up to the next multiple (no-op when already divisible)
+        pading_width = (patch_size[1] - width % patch_size[1]) % patch_size[1]
+        pading_height = (patch_size[0] - height % patch_size[0]
+                         ) % patch_size[0]
+    elif function == "merge":
+        pading_width = width % 2
+        pading_height = height % 2
+    if format == "BCHW":
+        pad_index = (0, 0, 0, 0, 0, pading_height, 0, pading_width)
+    elif format == "BHWC":
+        pad_index = (0, 0, 0, pading_height, 0, pading_width, 0, 0)
+    else:
+        raise ValueError("invalid format, expected 'BCHW' or 'BHWC'")
+
+    return F.pad(pixel_values,
pad_index), pad_index + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.reshape( + [B, H // window_size, window_size, W // window_size, window_size, C]) + windows = x.transpose(perm=[0, 1, 3, 2, 4, 5]).reshape( + [-1, window_size, window_size, C]) + return windows + + +def pad_patch(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.reshape( + [B, H // window_size, window_size, W // window_size, window_size, C]) + windows = x.transpose(perm=[0, 1, 3, 2, 4, 5]).reshape( + [-1, window_size, window_size, C]) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + C = windows.shape[-1] + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.reshape( + [-1, H // window_size, W // window_size, window_size, window_size, C]) + x = x.transpose(perm=[0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C]) + return x + + +class WindowAttention(nn.Layer): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + pretrained_window_size (tuple[int]): The height and width of the window in pre-training. 
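+
+    Note:
+        Relative to Swin V1, V2 (i) replaces dot-product attention with
+        scaled cosine attention, using a learnable per-head logit scale
+        clipped at log(1/0.01), and (ii) generates the relative position
+        bias with a small MLP (``cpb_mlp``) over log-spaced relative
+        coordinates instead of a learned lookup table, so the bias can
+        transfer across window sizes (see ``pretrained_window_size``).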
+ """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=True, + attn_drop=0., + proj_drop=0., + pretrained_window_size=[0, 0]): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.pretrained_window_size = pretrained_window_size + self.num_heads = num_heads + + self.logit_scale = self.create_parameter( + [num_heads, 1, 1], + dtype='float32', + default_initializer=Constant(math.log(10.))) + + # mlp to generate continuous relative position bias + self.cpb_mlp = nn.Sequential( + nn.Linear( + 2, 512, bias_attr=True), + nn.ReLU(), + nn.Linear( + 512, num_heads, bias_attr=False)) + + # get relative_coords_table + relative_coords_h = paddle.arange( + -(self.window_size[0] - 1), self.window_size[0], dtype='float32') + relative_coords_w = paddle.arange( + -(self.window_size[1] - 1), self.window_size[1], dtype='float32') + relative_coords_table = paddle.stack( + paddle.meshgrid([relative_coords_h, relative_coords_w])).transpose( + perm=[1, 2, 0]).unsqueeze(0) # 1, 2*Wh-1, 2*Ww-1, 2 + if pretrained_window_size[0] > 0: + relative_coords_table[:, :, :, 0] /= ( + pretrained_window_size[0] - 1) + relative_coords_table[:, :, :, 1] /= ( + pretrained_window_size[1] - 1) + else: + relative_coords_table[:, :, :, 0] /= (self.window_size[0] - 1) + relative_coords_table[:, :, :, 1] /= (self.window_size[1] - 1) + relative_coords_table *= 8 # normalize to -8, 8 + relative_coords_table = paddle.sign( + relative_coords_table) * paddle.log2( + paddle.abs(relative_coords_table) + 1.0) / np.log2(8) + + self.register_buffer("relative_coords_table", relative_coords_table) + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(self.window_size[0]) + coords_w = paddle.arange(self.window_size[1]) + coords = paddle.stack(paddle.meshgrid( + [coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose( + perm=[1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[ + 0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", + relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=False) + if qkv_bias: + self.q_bias = self.create_parameter( + [dim], dtype='float32', default_initializer=zeros_) + self.v_bias = self.create_parameter( + [dim], dtype='float32', default_initializer=zeros_) + else: + self.q_bias = None + self.v_bias = None + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = paddle.concat( + x=[self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias]) + qkv = F.linear(x=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(shape=[ + B_, N, 3, self.num_heads, qkv.shape[-1] // (3 * self.num_heads) + ]).transpose(perm=[2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make paddlescript happy (cannot use tensor as tuple) + + # cosine 
attention + attn = (F.normalize( + q, axis=-1) @F.normalize( + k, axis=-1).transpose(perm=[0, 1, 3, 2])) + logit_scale = paddle.clip( + self.logit_scale, max=math.log(1. / 0.01)).exp() + attn = attn * logit_scale + + relative_position_bias_table = self.cpb_mlp( + self.relative_coords_table).reshape([-1, self.num_heads]) + relative_position_bias = relative_position_bias_table[ + self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], -1 + ]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + perm=[2, 0, 1]) # nH, Wh*Ww, Wh*Ww + relative_position_bias = 16 * F.sigmoid(relative_position_bias) + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N + ]) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.reshape([-1, self.num_heads, N, N]) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @v).transpose(perm=[0, 2, 1, 3]).reshape(shape=[B_, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def extra_repr(self): + return f'dim={self.dim}, window_size={self.window_size}, ' \ + f'pretrained_window_size={self.pretrained_window_size}, num_heads={self.num_heads}' + + def flops(self, N): + # calculate flops for 1 window with token length of N + flops = 0 + flops += N * self.dim * 3 * self.dim + flops += self.num_heads * N * (self.dim // self.num_heads) * N + flops += self.num_heads * N * N * (self.dim // self.num_heads) + flops += N * self.dim * self.dim + return flops + + +class SwinTransformerBlock(nn.Layer): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + pretrained_window_size (int): Window size in pre-training. 
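+
+    Note:
+        Blocks are used in pairs: even-indexed blocks run window attention
+        (shift_size=0) and odd-indexed blocks run shifted-window attention
+        (shift_size=window_size//2), so information flows across window
+        boundaries. Following Swin V2, normalization is applied after the
+        attention and MLP branches (res-post-norm) rather than before.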
+    """
+
+    def __init__(self,
+                 dim,
+                 input_resolution,
+                 num_heads,
+                 window_size=8,
+                 shift_size=0,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm,
+                 pretrained_window_size=0):
+        super().__init__()
+        self.dim = dim
+        self.input_resolution = input_resolution
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        if min(self.input_resolution) <= self.window_size:
+            # if window size is larger than input resolution, we don't partition windows
+            self.shift_size = 0
+            self.window_size = min(self.input_resolution)
+        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim,
+            window_size=to_2tuple(self.window_size),
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            pretrained_window_size=to_2tuple(pretrained_window_size))
+
+        self.drop_path = DropPath(
+            drop_path) if drop_path > 0. else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=mlp_hidden_dim,
+                       act_layer=act_layer,
+                       drop=drop)
+
+        if self.shift_size > 0:
+            # calculate attention mask for SW-MSA once, at the fixed
+            # input_resolution of this block
+            H, W = self.input_resolution
+            img_mask = paddle.zeros([1, H, W, 1])  # 1 H W 1
+            h_slices = (slice(0, -self.window_size),
+                        slice(-self.window_size, -self.shift_size),
+                        slice(-self.shift_size, None))
+            w_slices = (slice(0, -self.window_size),
+                        slice(-self.window_size, -self.shift_size),
+                        slice(-self.shift_size, None))
+            cnt = 0
+            for h in h_slices:
+                for w in w_slices:
+                    img_mask[:, h, w, :] = cnt
+                    cnt += 1
+
+            mask_windows = window_partition(
+                img_mask, self.window_size)  # nW, window_size, window_size, 1
+            mask_windows = mask_windows.reshape(
+                shape=[-1, self.window_size * self.window_size])
+            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+            attn_mask = masked_fill(attn_mask, attn_mask != 0, float(-100.0))
+            attn_mask = masked_fill(attn_mask, attn_mask == 0, float(0.0))
+        else:
+            attn_mask = None
+
+        self.register_buffer("attn_mask", attn_mask)
+
+    def forward(self, x, input_dimensions=None):
+        # input_dimensions is accepted for interface compatibility with
+        # BasicLayer; the block itself works at its fixed input_resolution
+        H, W = self.input_resolution
+        B, L, C = x.shape
+        assert L == H * W, "input feature has wrong size"
+
+        shortcut = x
+
+        x = x.reshape([B, H, W, C])
+
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_x = paddle.roll(
+                x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2))
+        else:
+            shifted_x = x
+
+        # partition windows
+        x_windows = window_partition(
+            shifted_x, self.window_size)  # nW*B, window_size, window_size, C
+        x_windows = x_windows.reshape(
+            [-1, self.window_size * self.window_size,
+             C])  # nW*B, window_size*window_size, C
+
+        # W-MSA/SW-MSA with the mask precomputed in __init__
+        # (None when shift_size == 0)
+        attn_windows = self.attn(
+            x_windows,
+            mask=self.attn_mask)  # nW*B, window_size*window_size, C
+
+        # merge windows
+        attn_windows = attn_windows.reshape(
+            [-1, self.window_size, self.window_size, C])
+        shifted_x = window_reverse(attn_windows, self.window_size, H,
+                                   W)  # B H' W' C
+
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = paddle.roll(
+                shifted_x,
+                shifts=(self.shift_size, self.shift_size),
+                axis=(1, 2))
+        else:
+            x = shifted_x
+
+        x = x.reshape([B, H * W, C])
+        x = shortcut + self.drop_path(self.norm1(x))
+
+        # FFN
+        x = x + self.drop_path(self.norm2(self.mlp(x)))
+        return x
+
+    def extra_repr(self):
+        return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
+               f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
+
+    def flops(self):
+        flops = 0
+        H, W = self.input_resolution
+        # norm1
+        flops += self.dim * H * W
+        # W-MSA/SW-MSA
+        nW = H * W / self.window_size / self.window_size
+        flops += nW * self.attn.flops(self.window_size * self.window_size)
+        # mlp
+        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
+        # norm2
+        flops += self.dim * H * W
+        return flops
+
+
+class PatchMerging(nn.Layer):
+    r""" Patch Merging Layer.
+
+    Args:
+        input_resolution (tuple[int]): Resolution of input feature.
+        dim (int): Number of input channels.
+        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.input_resolution = input_resolution
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)
+        self.norm = norm_layer(2 * dim)
+
+    def forward(self, x, input_dimensions):
+        """
+        x: B, H*W, C
+        input_dimensions: (H, W) of the incoming token grid
+        """
+        H, W = input_dimensions
+        B, L, C = x.shape
+
+        x = x.reshape([B, H // 2, 2, W // 2, 2, C])
+        x = x.transpose((0, 1, 3, 4, 2, 5))
+        x = x.reshape([B, H * W // 4, 4 * C])  # B H/2*W/2 4*C
+        x = self.reduction(x)
+        x = self.norm(x)
+        return x
+
+    def extra_repr(self):
+        return f"input_resolution={self.input_resolution}, dim={self.dim}"
+
+    def flops(self):
+        H, W = self.input_resolution
+        flops = (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
+        flops += H * W * self.dim // 2
+        return flops
+
+
+class BasicLayer(nn.Layer):
+    """ A basic Swin Transformer layer for one stage.
+
+    Args:
+        dim (int): Number of input channels.
+        input_resolution (tuple[int]): Input resolution.
+        depth (int): Number of blocks.
+        num_heads (int): Number of attention heads.
+        window_size (int): Local window size.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
+        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+        pretrained_window_size (int): Local window size in pre-training.
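+
+    Note:
+        A stage stacks ``depth`` alternating W-MSA/SW-MSA blocks at a fixed
+        resolution; when ``downsample`` is PatchMerging, the stage output
+        halves each spatial dimension and doubles the channel width, giving
+        the usual pyramid of feature maps.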
+ """ + + def __init__(self, + dim, + input_resolution, + depth, + num_heads, + window_size, + mlp_ratio=4., + qkv_bias=True, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + pretrained_window_size=0): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + + # build blocks + self.blocks = nn.LayerList([ + SwinTransformerBlock( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + pretrained_window_size=pretrained_window_size) + for i in range(depth) + ]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample( + input_resolution, dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, input_dimensions): + H, W = input_dimensions + for blk in self.blocks: + x = blk(x, input_dimensions) + if self.downsample is not None: + H, W = (H + 1) // 2, (W + 1) // 2 + x = self.downsample(x, input_dimensions) + + return x, (H, W) + + def extra_repr(self): + return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" + + def flops(self): + flops = 0 + for blk in self.blocks: + flops += blk.flops() + if self.downsample is not None: + flops += self.downsample.flops() + return flops + + +class PatchEmbed(nn.Layer): + r""" Image to Patch Embedding + + Args: + img_size (int): Image size. Default: 256. + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, + img_size=256, + patch_size=4, + in_chans=3, + embed_dim=96, + norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + img_size[0] // patch_size[0], img_size[1] // patch_size[1] + ] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def maybe_pad(self, pixel_values, height, width): + if width % self.patch_size[1] != 0: + pad_values = (0, 0, 0, 0, 0, 0, 0, + self.patch_size[1] - width % self.patch_size[1]) + pixel_values = nn.functional.pad(pixel_values, pad_values) + if height % self.patch_size[0] != 0: + pad_values = ( + 0, + 0, + 0, + 0, + 0, + self.patch_size[0] - height % self.patch_size[0], + 0, + 0, ) + pixel_values = nn.functional.pad(pixel_values, pad_values) + return pixel_values + + def forward(self, x): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
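+        # (H, W) of the patch-token grid, returned alongside the embeddings;
+        # the assert above guarantees H and W divide evenly by the patch size
+        output_dimensions = (H // self.patch_size[0], W // self.patch_size[1])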
+ x = self.proj(x).flatten(2).transpose([0, 2, 1]) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x, output_dimensions + + def flops(self): + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * ( + self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops + + +class SwinTransformerV2(nn.Layer): + r""" Swin TransformerV2 + A PaddlePaddle impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution` - + https://arxiv.org/abs/2111.09883 + + Args: + img_size (int | tuple(int)): Input image size. Default 256 + patch_size (int | tuple(int)): Patch size. Default: 4 + in_chans (int): Number of input image channels. Default: 3 + class_num (int): Number of classes for classification head. Default: 1000 + embed_dim (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + pretrained_window_sizes (tuple(int)): Pretrained window sizes of each layer. + """ + + def __init__(self, + img_size=256, + patch_size=4, + in_chans=3, + class_num=1000, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4., + qkv_bias=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + pretrained_window_sizes=[0, 0, 0, 0], + **kwargs): + super().__init__() + + self.class_num = class_num + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.img_size = img_size + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2**(self.num_layers - 1)) + self.mlp_ratio = mlp_ratio + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = self.create_parameter( + shape=(1, num_patches, embed_dim), default_initializer=zeros_) + trunc_normal_(self.absolute_pos_embed) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.LayerList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + input_resolution=(patches_resolution[0] // (2**i_layer), + patches_resolution[1] // (2**i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + 
attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                norm_layer=norm_layer,
+                downsample=PatchMerging
+                if (i_layer < self.num_layers - 1) else None,
+                pretrained_window_size=pretrained_window_sizes[i_layer])
+            self.layers.append(layer)
+
+        self.norm = norm_layer(self.num_features)
+        self.avgpool = nn.AdaptiveAvgPool1D(1)
+        self.head = nn.Linear(self.num_features,
+                              class_num) if class_num > 0 else nn.Identity()
+
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+
+    def forward_features(self, x):
+        x, output_dimensions = self.patch_embed(x)
+        if self.ape:
+            x = x + self.absolute_pos_embed
+        x = self.pos_drop(x)
+
+        for layer in self.layers:
+            x, output_dimensions = layer(x, input_dimensions=output_dimensions)
+
+        x = self.norm(x)  # B L C
+        x = self.avgpool(x.transpose([0, 2, 1]))  # B C 1
+        x = paddle.flatten(x, 1)
+        return x
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.head(x)
+        return x
+
+    def flops(self):
+        flops = 0
+        flops += self.patch_embed.flops()
+        for i, layer in enumerate(self.layers):
+            flops += layer.flops()
+        flops += self.num_features * self.patches_resolution[
+            0] * self.patches_resolution[1] // (2**self.num_layers)
+        flops += self.num_features * self.class_num
+        return flops
+
+
+def _load_pretrained(pretrained,
+                     model,
+                     model_url,
+                     use_ssld=False,
+                     use_imagenet22k_pretrained=False,
+                     use_imagenet22kto1k_pretrained=False):
+    if pretrained is False:
+        pass
+    elif pretrained is True:
+        load_dygraph_pretrain(
+            model,
+            model_url,
+            use_ssld=use_ssld,
+            use_imagenet22k_pretrained=use_imagenet22k_pretrained,
+            use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained)
+    elif isinstance(pretrained, str):
+        load_dygraph_pretrain(model, pretrained)
+    else:
+        raise RuntimeError(
+            "pretrained type is not available. Please use `string` or `boolean` type."
+ ) + + +def SwinTransformerV2_tiny_patch4_window8_256(pretrained=False, + use_ssld=False, + **kwargs): + model = SwinTransformerV2( + img_size=256, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=8, + drop_path_rate=0.2, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_tiny_patch4_window8_256"], + use_ssld=use_ssld) + return model + + +def SwinTransformerV2_tiny_patch4_window16_256(pretrained=False, + use_ssld=False, + **kwargs): + model = SwinTransformerV2( + img_size=256, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_tiny_patch4_window16_256"], + use_ssld=use_ssld) + return model + + +def SwinTransformerV2_small_patch4_window8_256(pretrained=False, + use_ssld=False, + **kwargs): + model = SwinTransformerV2( + img_size=256, + embed_dim=96, + depths=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], + window_size=8, + drop_path_rate=0.3, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_small_patch4_window8_256"], + use_ssld=use_ssld) + return model + + +def SwinTransformerV2_small_patch4_window16_256(pretrained=False, + use_ssld=False, + **kwargs): + model = SwinTransformerV2( + img_size=256, + embed_dim=96, + depths=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.3, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_small_patch4_window16_256"], + use_ssld=use_ssld) + return model + + +def SwinTransformerV2_base_patch4_window8_256(pretrained=False, + use_ssld=False, + **kwargs): + model = SwinTransformerV2( + img_size=256, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=8, + drop_path_rate=0.5, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_base_patch4_window8_256"], + use_ssld=use_ssld) + return model + + +def SwinTransformerV2_base_patch4_window16_256( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + model = SwinTransformerV2( + img_size=256, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=16, + drop_path_rate=0.5, # if use imagenet22k or imagenet22kto1k, drop_path_rate=0.2 + **kwargs + ) # if use imagenet22k, set pretrained_window_sizes=[12, 12, 12, 6] + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_base_patch4_window16_256"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformerV2_base_patch4_window24_384( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=True, + **kwargs): + model = SwinTransformerV2( + img_size=384, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=24, + drop_path_rate=0.2, + pretrained_window_sizes=[12, 12, 12, 6], + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_base_patch4_window24_384"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformerV2_large_patch4_window16_256( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + 
use_imagenet22kto1k_pretrained=True, + **kwargs): + model = SwinTransformerV2( + img_size=256, + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=16, + drop_path_rate=0.2, + pretrained_window_sizes=[12, 12, 12, 6], + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_large_patch4_window16_256"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformerV2_large_patch4_window24_384( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=True, + **kwargs): + model = SwinTransformerV2( + img_size=384, + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=24, + drop_path_rate=0.2, + pretrained_window_sizes=[12, 12, 12, 6], + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_large_patch4_window24_384"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/tinynet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/tinynet.py new file mode 100755 index 000000000..484004615 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/tinynet.py @@ -0,0 +1,196 @@ +# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://gitee.com/mindspore/models/tree/master/research/cv/tinynet +# reference: https://arxiv.org/abs/2010.14819 + +import paddle.nn as nn + +from .efficientnet import EfficientNet, efficientnet +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "TinyNet_A": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/TinyNet_A_pretrained.pdparams", + "TinyNet_B": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/TinyNet_B_pretrained.pdparams", + "TinyNet_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/TinyNet_C_pretrained.pdparams", + "TinyNet_D": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/TinyNet_D_pretrained.pdparams", + "TinyNet_E": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/TinyNet_E_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +def tinynet_params(model_name): + """ Map TinyNet model name to parameter coefficients. 
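+
+    For example, ``tinynet_params("tinynet-a")`` returns
+    ``(1.00, 1.200, 192, 0.2)``: the width multiplier, depth multiplier,
+    input resolution and dropout rate that ``get_model_params`` below feeds
+    into the EfficientNet scaling rule.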
""" + params_dict = { + # Coefficients: width,depth,resolution,dropout + "tinynet-a": (1.00, 1.200, 192, 0.2), + "tinynet-b": (0.75, 1.100, 188, 0.2), + "tinynet-c": (0.54, 0.850, 184, 0.2), + "tinynet-d": (0.54, 0.695, 152, 0.2), + "tinynet-e": (0.51, 0.600, 106, 0.2), + } + return params_dict[model_name] + + +def get_model_params(model_name, override_params): + """ Get the block args and global params for a given model """ + if model_name.startswith('tinynet'): + w, d, _, p = tinynet_params(model_name) + blocks_args, global_params = efficientnet( + width_coefficient=w, depth_coefficient=d, dropout_rate=p) + else: + raise NotImplementedError('model name is not pre-defined: %s' % + model_name) + if override_params: + global_params = global_params._replace(**override_params) + return blocks_args, global_params + + +class TinyNet(EfficientNet): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Conv2D): + fin_in = m.weight.shape[1] * m.weight.shape[2] * m.weight.shape[3] + std = (2 / fin_in)**0.5 + nn.initializer.Normal(std=std)(m.weight) + if m.bias is not None: + nn.initializer.Constant(0)(m.bias) + elif isinstance(m, nn.Linear): + fin_in = m.weight.shape[0] + bound = 1 / fin_in**0.5 + nn.initializer.Uniform(-bound, bound)(m.weight) + if m.bias is not None: + nn.initializer.Constant(0)(m.bias) + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def TinyNet_A(padding_type='DYNAMIC', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("tinynet-a", override_params) + model = TinyNet( + block_args, + global_params, + name='a', + padding_type=padding_type, + use_se=use_se, + fix_stem=True, + num_features=1280, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["TinyNet_A"], use_ssld) + return model + + +def TinyNet_B(padding_type='DYNAMIC', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("tinynet-b", override_params) + model = TinyNet( + block_args, + global_params, + name='b', + padding_type=padding_type, + use_se=use_se, + fix_stem=True, + num_features=1280, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["TinyNet_B"], use_ssld) + return model + + +def TinyNet_C(padding_type='DYNAMIC', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("tinynet-c", override_params) + model = TinyNet( + block_args, + global_params, + name='c', + padding_type=padding_type, + use_se=use_se, + fix_stem=True, + num_features=1280, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["TinyNet_C"], use_ssld) + return model + + +def TinyNet_D(padding_type='DYNAMIC', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("tinynet-d", override_params) + model = TinyNet( + block_args, + global_params, + name='d', + padding_type=padding_type, + use_se=use_se, + fix_stem=True, + num_features=1280, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["TinyNet_D"], use_ssld) + return model + + +def TinyNet_E(padding_type='DYNAMIC', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("tinynet-e", override_params) + model = TinyNet( + block_args, + global_params, + name='e', + padding_type=padding_type, + use_se=use_se, + fix_stem=True, + num_features=1280, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["TinyNet_E"], use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/tnt.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/tnt.py new file mode 100755 index 000000000..a1841d367 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/tnt.py @@ -0,0 +1,410 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
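+
+# TNT (Transformer in Transformer) keeps two token streams per image: an
+# inner transformer attends over pixel-level sub-patches inside each patch,
+# while an outer transformer attends over patch-level embeddings; the Block
+# class below fuses the inner stream into the outer one at every layer.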
+
+# Code was based on https://github.com/huawei-noah/CV-Backbones/tree/master/tnt_pytorch
+# reference: https://arxiv.org/abs/2103.00112
+
+import math
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+from paddle.nn.initializer import TruncatedNormal, Constant
+
+from ..base.theseus_layer import Identity
+from ....utils.save_load import load_dygraph_pretrain
+
+MODEL_URLS = {
+    "TNT_small":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/TNT_small_pretrained.pdparams",
+    "TNT_base":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/TNT_base_pretrained.pdparams"
+}
+
+__all__ = list(MODEL_URLS.keys())
+
+trunc_normal_ = TruncatedNormal(std=.02)
+zeros_ = Constant(value=0.)
+ones_ = Constant(value=1.)
+
+
+def drop_path(x, drop_prob=0., training=False):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
+    """
+    if drop_prob == 0. or not training:
+        return x
+    keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
+    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
+    random_tensor = paddle.add(keep_prob, paddle.rand(shape, dtype=x.dtype))
+    random_tensor = paddle.floor(random_tensor)  # binarize
+    output = x.divide(keep_prob) * random_tensor
+    return output
+
+
+class DropPath(nn.Layer):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    """
+
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training)
+
+
+class Mlp(nn.Layer):
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+
+
+class Attention(nn.Layer):
+    def __init__(self,
+                 dim,
+                 hidden_dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 attn_drop=0.,
+                 proj_drop=0.):
+        super().__init__()
+        self.hidden_dim = hidden_dim
+        self.num_heads = num_heads
+        head_dim = hidden_dim // num_heads
+        self.head_dim = head_dim
+        self.scale = head_dim**-0.5
+
+        self.qk = nn.Linear(dim, hidden_dim * 2, bias_attr=qkv_bias)
+        self.v = nn.Linear(dim, dim, bias_attr=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x):
+        B, N, C = x.shape
+        qk = self.qk(x).reshape(
+            (B, N, 2, self.num_heads, self.head_dim)).transpose(
+                (2, 0, 3, 1, 4))
+
+        q, k = qk[0], qk[1]
+        v = self.v(x).reshape(
+            (B, N, self.num_heads, x.shape[-1] // self.num_heads)).transpose(
+                (0, 2, 1, 3))
+
+        attn = paddle.matmul(q, k.transpose((0, 1, 3, 2))) * self.scale
+        attn = nn.functional.softmax(attn, axis=-1)
+        attn = self.attn_drop(attn)
+
+        x = paddle.matmul(attn, v)
+        x = x.transpose((0, 2, 1, 3)).reshape(
+            (B, N, x.shape[-1] * x.shape[-3]))
+        x = self.proj(x)
+        x = 
self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + in_dim, + num_pixel, + num_heads=12, + in_num_head=4, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + # Inner transformer + self.norm_in = norm_layer(in_dim) + self.attn_in = Attention( + in_dim, + in_dim, + num_heads=in_num_head, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop) + + self.norm_mlp_in = norm_layer(in_dim) + self.mlp_in = Mlp(in_features=in_dim, + hidden_features=int(in_dim * 4), + out_features=in_dim, + act_layer=act_layer, + drop=drop) + + self.norm1_proj = norm_layer(in_dim * num_pixel) + self.proj = nn.Linear(in_dim * num_pixel, dim, bias_attr=False) + self.norm2_proj = norm_layer(in_dim * num_pixel) + + # Outer transformer + self.norm_out = norm_layer(dim) + self.attn_out = Attention( + dim, + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + + self.norm_mlp = norm_layer(dim) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio), + out_features=dim, + act_layer=act_layer, + drop=drop) + + def forward(self, pixel_embed, patch_embed): + # inner + pixel_embed = paddle.add( + pixel_embed, + self.drop_path(self.attn_in(self.norm_in(pixel_embed)))) + pixel_embed = paddle.add( + pixel_embed, + self.drop_path(self.mlp_in(self.norm_mlp_in(pixel_embed)))) + # outer + B, N, C = patch_embed.shape + norm1_proj = pixel_embed.reshape(shape=[B, N - 1, C]) + norm1_proj = self.norm1_proj(norm1_proj) + patch_embed[:, 1:] = paddle.add( + patch_embed[:, 1:], self.norm2_proj(self.proj(norm1_proj))) + patch_embed = paddle.add( + patch_embed, + self.drop_path(self.attn_out(self.norm_out(patch_embed)))) + patch_embed = paddle.add( + patch_embed, self.drop_path(self.mlp(self.norm_mlp(patch_embed)))) + return pixel_embed, patch_embed + + +class PixelEmbed(nn.Layer): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + in_dim=48, + stride=4): + super().__init__() + self.patch_size = patch_size + num_patches = (img_size // patch_size)**2 + self.img_size = img_size + self.num_patches = num_patches + self.in_dim = in_dim + new_patch_size = math.ceil(patch_size / stride) + self.new_patch_size = new_patch_size + + self.proj = nn.Conv2D( + in_chans, self.in_dim, kernel_size=7, padding=3, stride=stride) + + def forward(self, x, pixel_pos): + B, C, H, W = x.shape + assert H == self.img_size and W == self.img_size, f"Input image size ({H}*{W}) doesn't match model ({self.img_size}*{self.img_size})." 
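+        # unfold cuts the image into non-overlapping patch_size x patch_size
+        # patches; each patch is then reduced by the strided 7x7 conv below to
+        # a new_patch_size x new_patch_size grid of in_dim-channel pixel tokens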
+ x = nn.functional.unfold(x, self.patch_size, self.patch_size) + x = x.transpose((0, 2, 1)).reshape( + (-1, C, self.patch_size, self.patch_size)) + x = self.proj(x) + x = x.reshape((-1, self.in_dim, self.patch_size)).transpose((0, 2, 1)) + x = x + pixel_pos + return x + + +class TNT(nn.Layer): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + in_dim=48, + depth=12, + num_heads=12, + in_num_head=4, + mlp_ratio=4., + qkv_bias=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + first_stride=4, + class_num=1000): + super().__init__() + self.class_num = class_num + # num_features for consistency with other models + self.num_features = self.embed_dim = embed_dim + + self.pixel_embed = PixelEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + in_dim=in_dim, + stride=first_stride) + num_patches = self.pixel_embed.num_patches + self.num_patches = num_patches + new_patch_size = self.pixel_embed.new_patch_size + num_pixel = new_patch_size**2 + + self.norm1_proj = norm_layer(num_pixel * in_dim) + self.proj = nn.Linear(num_pixel * in_dim, embed_dim) + self.norm2_proj = norm_layer(embed_dim) + + self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), default_initializer=zeros_) + self.add_parameter("cls_token", self.cls_token) + + self.patch_pos = self.create_parameter( + shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_) + self.add_parameter("patch_pos", self.patch_pos) + + self.pixel_pos = self.create_parameter( + shape=(1, patch_size, in_dim), default_initializer=zeros_) + self.add_parameter("pixel_pos", self.pixel_pos) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth decay rule + dpr = np.linspace(0, drop_path_rate, depth) + + blocks = [] + for i in range(depth): + blocks.append( + Block( + dim=embed_dim, + in_dim=in_dim, + num_pixel=num_pixel, + num_heads=num_heads, + in_num_head=in_num_head, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer)) + self.blocks = nn.LayerList(blocks) + self.norm = norm_layer(embed_dim) + + if class_num > 0: + self.head = nn.Linear(embed_dim, class_num) + + trunc_normal_(self.cls_token) + trunc_normal_(self.patch_pos) + trunc_normal_(self.pixel_pos) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + B = x.shape[0] + pixel_embed = self.pixel_embed(x, self.pixel_pos) + + patch_embed = self.norm2_proj( + self.proj( + self.norm1_proj( + pixel_embed.reshape((-1, self.num_patches, pixel_embed. 
+ shape[-1] * pixel_embed.shape[-2]))))) + patch_embed = paddle.concat( + (self.cls_token.expand((B, -1, -1)).astype(patch_embed.dtype), + patch_embed), + axis=1) + patch_embed = patch_embed + self.patch_pos + patch_embed = self.pos_drop(patch_embed) + for blk in self.blocks: + pixel_embed, patch_embed = blk(pixel_embed, patch_embed) + + patch_embed = self.norm(patch_embed) + return patch_embed[:, 0] + + def forward(self, x): + x = self.forward_features(x) + + if self.class_num > 0: + x = self.head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def TNT_small(pretrained=False, use_ssld=False, **kwargs): + model = TNT(patch_size=16, + embed_dim=384, + in_dim=24, + depth=12, + num_heads=6, + in_num_head=4, + qkv_bias=False, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["TNT_small"], use_ssld=use_ssld) + return model + + +def TNT_base(pretrained=False, use_ssld=False, **kwargs): + model = TNT(patch_size=16, + embed_dim=640, + in_dim=40, + depth=12, + num_heads=10, + in_num_head=4, + qkv_bias=False, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["TNT_base"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/twins.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/twins.py new file mode 100755 index 000000000..3f983d962 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/twins.py @@ -0,0 +1,692 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
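+
+# Twins provides two model families: PCPVT, which replaces PVT's global
+# attention with the sub-sampled GSA below, and ALT-GVT, which alternates
+# locally-grouped attention (LSA) with GSA (see GroupBlock).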
+
+# Code was based on https://github.com/Meituan-AutoML/Twins
+# reference: https://arxiv.org/abs/2104.13840
+
+from functools import partial
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.regularizer import L2Decay
+
+from .vision_transformer import trunc_normal_, normal_, zeros_, ones_, to_2tuple, DropPath, Identity, Mlp
+from .vision_transformer import Block as ViTBlock
+
+from ....utils.save_load import load_dygraph_pretrain
+
+MODEL_URLS = {
+    "pcpvt_small":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/pcpvt_small_pretrained.pdparams",
+    "pcpvt_base":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/pcpvt_base_pretrained.pdparams",
+    "pcpvt_large":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/pcpvt_large_pretrained.pdparams",
+    "alt_gvt_small":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/alt_gvt_small_pretrained.pdparams",
+    "alt_gvt_base":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/alt_gvt_base_pretrained.pdparams",
+    "alt_gvt_large":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/alt_gvt_large_pretrained.pdparams"
+}
+
+__all__ = list(MODEL_URLS.keys())
+
+
+class GroupAttention(nn.Layer):
+    """LSA: self attention within a group.
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 ws=1):
+        super().__init__()
+        if ws == 1:
+            raise Exception(f"ws {ws} should not be 1")
+        if dim % num_heads != 0:
+            raise Exception(
+                f"dim {dim} should be divided by num_heads {num_heads}.")
+
+        self.dim = dim
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.ws = ws
+
+    def forward(self, x, H, W):
+        B, N, C = x.shape
+        h_group, w_group = H // self.ws, W // self.ws
+        total_groups = h_group * w_group
+        x = x.reshape([B, h_group, self.ws, w_group, self.ws, C]).transpose(
+            [0, 1, 3, 2, 4, 5])
+        qkv = self.qkv(x).reshape([
+            B, total_groups, self.ws**2, 3, self.num_heads, C // self.num_heads
+        ]).transpose([3, 0, 1, 4, 2, 5])
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        attn = paddle.matmul(q, k.transpose([0, 1, 2, 4, 3])) * self.scale
+
+        attn = nn.Softmax(axis=-1)(attn)
+        attn = self.attn_drop(attn)
+        attn = paddle.matmul(attn, v).transpose([0, 1, 3, 2, 4]).reshape(
+            [B, h_group, w_group, self.ws, self.ws, C])
+
+        x = attn.transpose([0, 1, 3, 2, 4, 5]).reshape([B, N, C])
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class Attention(nn.Layer):
+    """GSA: using a key to summarize the information for a group to be efficient.
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 sr_ratio=1):
+        super().__init__()
+        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
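+        # when sr_ratio > 1, keys and values are computed on a feature map
+        # spatially reduced by an sr_ratio-strided convolution, shrinking the
+        # attended token count by sr_ratio**2 (see forward below)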
+ + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.q = nn.Linear(dim, dim, bias_attr=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + self.sr = nn.Conv2D( + dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + + def forward(self, x, H, W): + B, N, C = x.shape + q = self.q(x).reshape( + [B, N, self.num_heads, C // self.num_heads]).transpose( + [0, 2, 1, 3]) + + if self.sr_ratio > 1: + x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W]) + tmp_n = H * W // self.sr_ratio**2 + x_ = self.sr(x_).reshape([B, C, tmp_n]).transpose([0, 2, 1]) + x_ = self.norm(x_) + kv = self.kv(x_).reshape( + [B, tmp_n, 2, self.num_heads, C // self.num_heads]).transpose( + [2, 0, 3, 1, 4]) + else: + kv = self.kv(x).reshape( + [B, N, 2, self.num_heads, C // self.num_heads]).transpose( + [2, 0, 3, 1, 4]) + k, v = kv[0], kv[1] + + attn = paddle.matmul(q, k.transpose([0, 1, 3, 2])) * self.scale + attn = nn.Softmax(axis=-1)(attn) + attn = self.attn_drop(attn) + + x = paddle.matmul(attn, v).transpose([0, 2, 1, 3]).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + sr_ratio=sr_ratio) + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class SBlock(ViTBlock): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1): + super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop, + attn_drop, drop_path, act_layer, norm_layer) + + def forward(self, x, H, W): + return super().forward(x) + + +class GroupBlock(ViTBlock): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1, + ws=1): + super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop, + attn_drop, drop_path, act_layer, norm_layer) + del self.attn + if ws == 1: + self.attn = Attention(dim, num_heads, qkv_bias, qk_scale, + attn_drop, drop, sr_ratio) + else: + self.attn = GroupAttention(dim, num_heads, qkv_bias, qk_scale, + attn_drop, drop, ws) + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding. 
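+
+    Splits the image into non-overlapping patches with a strided Conv2D
+    projection, applies LayerNorm, and returns the token sequence together
+    with the (H, W) size of the patch grid used by the attention blocks.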
+ """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + if img_size % patch_size != 0: + raise Exception( + f"img_size {img_size} should be divided by patch_size {patch_size}." + ) + + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + self.H, self.W = img_size[0] // patch_size[0], img_size[ + 1] // patch_size[1] + self.num_patches = self.H * self.W + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.norm = nn.LayerNorm(embed_dim) + + def forward(self, x): + B, C, H, W = x.shape + x = self.proj(x).flatten(2).transpose([0, 2, 1]) + x = self.norm(x) + H, W = H // self.patch_size[0], W // self.patch_size[1] + return x, (H, W) + + +# borrow from PVT https://github.com/whai362/PVT.git +class PyramidVisionTransformer(nn.Layer): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + block_cls=Block): + super().__init__() + self.class_num = class_num + self.depths = depths + + # patch_embed + self.patch_embeds = nn.LayerList() + self.pos_embeds = nn.ParameterList() + self.pos_drops = nn.LayerList() + self.blocks = nn.LayerList() + + for i in range(len(depths)): + if i == 0: + self.patch_embeds.append( + PatchEmbed(img_size, patch_size, in_chans, embed_dims[i])) + else: + self.patch_embeds.append( + PatchEmbed(img_size // patch_size // 2**(i - 1), 2, + embed_dims[i - 1], embed_dims[i])) + patch_num = self.patch_embeds[i].num_patches + 1 if i == len( + embed_dims) - 1 else self.patch_embeds[i].num_patches + self.pos_embeds.append( + self.create_parameter( + shape=[1, patch_num, embed_dims[i]], + default_initializer=zeros_)) + self.pos_drops.append(nn.Dropout(p=drop_rate)) + + dpr = [ + float(x) for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + cur = 0 + for k in range(len(depths)): + _block = nn.LayerList([ + block_cls( + dim=embed_dims[k], + num_heads=num_heads[k], + mlp_ratio=mlp_ratios[k], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[k]) for i in range(depths[k]) + ]) + self.blocks.append(_block) + cur += depths[k] + + self.norm = norm_layer(embed_dims[-1]) + + # cls_token + self.cls_token = self.create_parameter( + shape=[1, 1, embed_dims[-1]], + default_initializer=zeros_, + attr=paddle.ParamAttr(regularizer=L2Decay(0.0))) + + # classification head + self.head = nn.Linear(embed_dims[-1], + class_num) if class_num > 0 else Identity() + + # init weights + for pos_emb in self.pos_embeds: + trunc_normal_(pos_emb) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + B = x.shape[0] + for i in range(len(self.depths)): + x, (H, W) = self.patch_embeds[i](x) + if i == len(self.depths) - 1: + cls_tokens = self.cls_token.expand([B, -1, -1]).astype(x.dtype) + x = paddle.concat([cls_tokens, x], dim=1) + x = x + self.pos_embeds[i] + x = 
self.pos_drops[i](x) + for blk in self.blocks[i]: + x = blk(x, H, W) + if i < len(self.depths) - 1: + x = x.reshape([B, H, W, -1]).transpose( + [0, 3, 1, 2]).contiguous() + x = self.norm(x) + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +# PEG from https://arxiv.org/abs/2102.10882 +class PosCNN(nn.Layer): + def __init__(self, in_chans, embed_dim=768, s=1): + super().__init__() + self.proj = nn.Sequential( + nn.Conv2D( + in_chans, + embed_dim, + 3, + s, + 1, + bias_attr=paddle.ParamAttr(regularizer=L2Decay(0.0)), + groups=embed_dim, + weight_attr=paddle.ParamAttr(regularizer=L2Decay(0.0)), )) + self.s = s + + def forward(self, x, H, W): + B, N, C = x.shape + feat_token = x + cnn_feat = feat_token.transpose([0, 2, 1]).reshape([B, C, H, W]) + if self.s == 1: + x = self.proj(cnn_feat) + cnn_feat + else: + x = self.proj(cnn_feat) + x = x.flatten(2).transpose([0, 2, 1]) + return x + + +class CPVTV2(PyramidVisionTransformer): + """ + Use useful results from CPVT. PEG and GAP. + Therefore, cls token is no longer required. + PEG is used to encode the absolute position on the fly, which greatly affects the performance when input resolution + changes during the training (such as segmentation, detection) + """ + + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + block_cls=Block): + super().__init__(img_size, patch_size, in_chans, class_num, embed_dims, + num_heads, mlp_ratios, qkv_bias, qk_scale, drop_rate, + attn_drop_rate, drop_path_rate, norm_layer, depths, + sr_ratios, block_cls) + del self.pos_embeds + del self.cls_token + self.pos_block = nn.LayerList( + [PosCNN(embed_dim, embed_dim) for embed_dim in embed_dims]) + self.apply(self._init_weights) + + def _init_weights(self, m): + import math + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + normal_(0, math.sqrt(2.0 / fan_out))(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2D): + m.weight.data.fill_(1.0) + m.bias.data.zero_() + + def forward_features(self, x): + B = x.shape[0] + + for i in range(len(self.depths)): + x, (H, W) = self.patch_embeds[i](x) + x = self.pos_drops[i](x) + + for j, blk in enumerate(self.blocks[i]): + x = blk(x, H, W) + if j == 0: + x = self.pos_block[i](x, H, W) # PEG here + + if i < len(self.depths) - 1: + x = x.reshape([B, H, W, x.shape[-1]]).transpose([0, 3, 1, 2]) + + x = self.norm(x) + return x.mean(axis=1) # GAP here + + +class PCPVT(CPVTV2): + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256], + num_heads=[1, 2, 4], + mlp_ratios=[4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[4, 4, 4], + sr_ratios=[4, 2, 1], + block_cls=SBlock): + super().__init__(img_size, patch_size, in_chans, class_num, embed_dims, + num_heads, mlp_ratios, qkv_bias, qk_scale, drop_rate, + attn_drop_rate, drop_path_rate, norm_layer, depths, + 
sr_ratios, block_cls) + + +class ALTGVT(PCPVT): + """ + alias Twins-SVT + """ + + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256], + num_heads=[1, 2, 4], + mlp_ratios=[4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[4, 4, 4], + sr_ratios=[4, 2, 1], + block_cls=GroupBlock, + wss=[7, 7, 7]): + super().__init__(img_size, patch_size, in_chans, class_num, embed_dims, + num_heads, mlp_ratios, qkv_bias, qk_scale, drop_rate, + attn_drop_rate, drop_path_rate, norm_layer, depths, + sr_ratios, block_cls) + del self.blocks + self.wss = wss + # transformer encoder + dpr = [ + float(x) for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + cur = 0 + self.blocks = nn.LayerList() + for k in range(len(depths)): + _block = nn.LayerList([ + block_cls( + dim=embed_dims[k], + num_heads=num_heads[k], + mlp_ratio=mlp_ratios[k], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[k], + ws=1 if i % 2 == 1 else wss[k]) for i in range(depths[k]) + ]) + self.blocks.append(_block) + cur += depths[k] + self.apply(self._init_weights) + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def pcpvt_small(pretrained=False, use_ssld=False, **kwargs): + model = CPVTV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["pcpvt_small"], use_ssld=use_ssld) + return model + + +def pcpvt_base(pretrained=False, use_ssld=False, **kwargs): + model = CPVTV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 18, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["pcpvt_base"], use_ssld=use_ssld) + return model + + +def pcpvt_large(pretrained=False, use_ssld=False, **kwargs): + model = CPVTV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 8, 27, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["pcpvt_large"], use_ssld=use_ssld) + return model + + +def alt_gvt_small(pretrained=False, use_ssld=False, **kwargs): + model = ALTGVT( + patch_size=4, + embed_dims=[64, 128, 256, 512], + num_heads=[2, 4, 8, 16], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 10, 4], + wss=[7, 7, 7, 7], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["alt_gvt_small"], use_ssld=use_ssld) + return model + + +def alt_gvt_base(pretrained=False, use_ssld=False, **kwargs): + model = ALTGVT( + patch_size=4, + embed_dims=[96, 192, 384, 768], + 
num_heads=[3, 6, 12, 24], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 18, 2], + wss=[7, 7, 7, 7], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["alt_gvt_base"], use_ssld=use_ssld) + return model + + +def alt_gvt_large(pretrained=False, use_ssld=False, **kwargs): + model = ALTGVT( + patch_size=4, + embed_dims=[128, 256, 512, 1024], + num_heads=[4, 8, 16, 32], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 18, 2], + wss=[7, 7, 7, 7], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["alt_gvt_large"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/uniformer.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/uniformer.py new file mode 100755 index 000000000..13e579c36 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/uniformer.py @@ -0,0 +1,552 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/Sense-X/UniFormer +# reference: https://arxiv.org/abs/2201.09450 + +from collections import OrderedDict +from functools import partial +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import math +from .vision_transformer import trunc_normal_, zeros_, ones_, to_2tuple, DropPath, Identity, Mlp + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "UniFormer_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/UniFormer_small_pretrained.pdparams", + "UniFormer_small_plus": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/UniFormer_small_plus_pretrained.pdparams", + "UniFormer_small_plus_dim64": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/UniFormer_small_plus_dim64_pretrained.pdparams", + "UniFormer_base": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/UniFormer_base_pretrained.pdparams", + "UniFormer_base_ls": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/UniFormer_base_ls_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +layer_scale = False +init_value = 1e-6 + + +class CMlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1_conv = nn.Conv2D(in_features, hidden_features, 1) + self.act = act_layer() + self.fc2_conv = nn.Conv2D(hidden_features, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1_conv(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2_conv(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + 
proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape( + shape=[B, N, 3, self.num_heads, C // self.num_heads]).transpose( + perm=[2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q @k.transpose(perm=[0, 1, 3, 2])) * self.scale + attn = nn.Softmax(axis=-1)(attn) + attn = self.attn_drop(attn) + + x = (attn @v).transpose(perm=[0, 2, 1, 3]).reshape(shape=[B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class CBlock(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + self.pos_embed = nn.Conv2D(dim, dim, 3, padding=1, groups=dim) + self.norm1 = nn.BatchNorm2D(dim) + self.conv1 = nn.Conv2D(dim, dim, 1) + self.conv2 = nn.Conv2D(dim, dim, 1) + self.attn = nn.Conv2D(dim, dim, 5, padding=2, groups=dim) + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = nn.BatchNorm2D(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = CMlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x): + x = x + self.pos_embed(x) + x = x + self.drop_path( + self.conv2(self.attn(self.conv1(self.norm1(x))))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class SABlock(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + self.pos_embed = nn.Conv2D(dim, dim, 3, padding=1, groups=dim) + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + global layer_scale + self.ls = layer_scale + if self.ls: + global init_value + print(f"Use layer_scale: {layer_scale}, init_values: {init_value}") + self.gamma_1 = self.create_parameter( + [dim], + dtype='float32', + default_initializer=nn.initializer.Constant(value=init_value)) + self.gamma_2 = self.create_parameter( + [dim], + dtype='float32', + default_initializer=nn.initializer.Constant(value=init_value)) + + def forward(self, x): + x = x + self.pos_embed(x) + B, N, H, W = x.shape + x = x.flatten(2).transpose(perm=[0, 2, 1]) + if self.ls: + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x))) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + x = x.transpose(perm=[0, 2, 1]).reshape(shape=[B, N, H, W]) + return x + + +class HeadEmbedding(nn.Layer): + def __init__(self, in_channels, out_channels): + super().__init__() + + self.proj = nn.Sequential( + nn.Conv2D( + in_channels, + out_channels // 2, + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1)), + nn.BatchNorm2D(out_channels // 2), + nn.GELU(), + nn.Conv2D( + out_channels // 2, + out_channels, + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1)), + nn.BatchNorm2D(out_channels)) + + def forward(self, x): + x = self.proj(x) + return x + + +class MiddleEmbedding(nn.Layer): + def __init__(self, in_channels, out_channels): + super().__init__() + + self.proj = nn.Sequential( + nn.Conv2D( + in_channels, + out_channels, + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1)), + nn.BatchNorm2D(out_channels)) + + def forward(self, x): + x = self.proj(x) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // + patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.norm = nn.LayerNorm(embed_dim) + self.proj_conv = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
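+        # Project to patch tokens, apply LayerNorm in the (B, N, C) layout,
+        # then restore NCHW so the convolutional stages can consume the output.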
+ x = self.proj_conv(x) + B, C, H, W = x.shape + x = x.flatten(2).transpose(perm=[0, 2, 1]) + x = self.norm(x) + x = x.reshape(shape=[B, H, W, C]).transpose(perm=[0, 3, 1, 2]) + return x + + +class UniFormer(nn.Layer): + """ UniFormer + A PaddlePaddle impl of : `UniFormer: Unifying Convolution and Self-attention for Visual Recognition` - + https://arxiv.org/abs/2201.09450 + """ + + def __init__(self, + depth=[3, 4, 8, 3], + img_size=224, + in_chans=3, + class_num=1000, + embed_dim=[64, 128, 320, 512], + head_dim=64, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + representation_size=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=None, + conv_stem=False): + """ + Args: + depth (list): depth of each stage + img_size (int, tuple): input image size + in_chans (int): number of input channels + class_num (int): number of classes for classification head + embed_dim (list): embedding dimension of each stage + head_dim (int): head dimension + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + qk_scale (float): override default qk scale of head_dim ** -0.5 if set + representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + norm_layer (nn.Module): normalization layer + conv_stem (bool): whether use overlapped patch stem + """ + super().__init__() + self.class_num = class_num + self.num_features = self.embed_dim = embed_dim + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + if conv_stem: + self.patch_embed1 = HeadEmbedding( + in_channels=in_chans, out_channels=embed_dim[0]) + self.patch_embed2 = MiddleEmbedding( + in_channels=embed_dim[0], out_channels=embed_dim[1]) + self.patch_embed3 = MiddleEmbedding( + in_channels=embed_dim[1], out_channels=embed_dim[2]) + self.patch_embed4 = MiddleEmbedding( + in_channels=embed_dim[2], out_channels=embed_dim[3]) + else: + self.patch_embed1 = PatchEmbed( + img_size=img_size, + patch_size=4, + in_chans=in_chans, + embed_dim=embed_dim[0]) + self.patch_embed2 = PatchEmbed( + img_size=img_size // 4, + patch_size=2, + in_chans=embed_dim[0], + embed_dim=embed_dim[1]) + self.patch_embed3 = PatchEmbed( + img_size=img_size // 8, + patch_size=2, + in_chans=embed_dim[1], + embed_dim=embed_dim[2]) + self.patch_embed4 = PatchEmbed( + img_size=img_size // 16, + patch_size=2, + in_chans=embed_dim[2], + embed_dim=embed_dim[3]) + + self.pos_drop = nn.Dropout(p=drop_rate) + dpr = [ + x.item() for x in paddle.linspace(0, drop_path_rate, sum(depth)) + ] # stochastic depth decay rule + num_heads = [dim // head_dim for dim in embed_dim] + self.blocks1 = nn.LayerList([ + CBlock( + dim=embed_dim[0], + num_heads=num_heads[0], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer) for i in range(depth[0]) + ]) + self.blocks2 = nn.LayerList([ + CBlock( + dim=embed_dim[1], + num_heads=num_heads[1], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i + depth[0]], + norm_layer=norm_layer) for i in range(depth[1]) + ]) + self.blocks3 = nn.LayerList([ + SABlock( + dim=embed_dim[2], + num_heads=num_heads[2], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i + 
depth[0] + depth[1]], + norm_layer=norm_layer) for i in range(depth[2]) + ]) + self.blocks4 = nn.LayerList([ + SABlock( + dim=embed_dim[3], + num_heads=num_heads[3], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i + depth[0] + depth[1] + depth[2]], + norm_layer=norm_layer) for i in range(depth[3]) + ]) + self.norm = nn.BatchNorm2D(embed_dim[-1]) + + # Representation layer + if representation_size: + self.num_features = representation_size + self.pre_logits = nn.Sequential( + OrderedDict([('fc', nn.Linear(embed_dim, representation_size)), + ('act', nn.Tanh())])) + else: + self.pre_logits = nn.Identity() + + # Classifier head + self.head = nn.Linear(embed_dim[-1], + class_num) if class_num > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + x = self.patch_embed1(x) + x = self.pos_drop(x) + for blk in self.blocks1: + x = blk(x) + x = self.patch_embed2(x) + for blk in self.blocks2: + x = blk(x) + x = self.patch_embed3(x) + for blk in self.blocks3: + x = blk(x) + x = self.patch_embed4(x) + for blk in self.blocks4: + x = blk(x) + x = self.norm(x) + x = self.pre_logits(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = x.flatten(2).mean(-1) + x = self.head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def UniFormer_small(pretrained=True, use_ssld=False, **kwargs): + model = UniFormer( + depth=[3, 4, 8, 3], + embed_dim=[64, 128, 320, 512], + head_dim=64, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + drop_path_rate=0.1, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["UniFormer_small"], use_ssld=use_ssld) + return model + + +def UniFormer_small_plus(pretrained=True, use_ssld=False, **kwargs): + model = UniFormer( + depth=[3, 5, 9, 3], + conv_stem=True, + embed_dim=[64, 128, 320, 512], + head_dim=32, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + drop_path_rate=0.1, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["UniFormer_small_plus"], + use_ssld=use_ssld) + return model + + +def UniFormer_small_plus_dim64(pretrained=True, use_ssld=False, **kwargs): + model = UniFormer( + depth=[3, 5, 9, 3], + conv_stem=True, + embed_dim=[64, 128, 320, 512], + head_dim=64, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + drop_path_rate=0.1, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["UniFormer_small_plus_dim64"], + use_ssld=use_ssld) + return model + + +def UniFormer_base(pretrained=True, use_ssld=False, **kwargs): + model = UniFormer( + depth=[5, 8, 20, 7], + embed_dim=[64, 128, 320, 512], + head_dim=64, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + drop_path_rate=0.3, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["UniFormer_base"], use_ssld=use_ssld) + return model + + +def UniFormer_base_ls(pretrained=True, use_ssld=False, **kwargs): + global layer_scale + layer_scale = True + model = UniFormer( + depth=[5, 8, 20, 7], + embed_dim=[64, 128, 320, 512], + head_dim=64, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + drop_path_rate=0.3, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["UniFormer_base_ls"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/van.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/van.py new file mode 100755 index 000000000..18ac7f7a1 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/van.py @@ -0,0 +1,362 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
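+
+# A minimal usage sketch for the VAN models defined below (an illustration
+# only; the import path assumes this file is used as part of ppcls):
+#
+#     import paddle
+#     from ppcls.arch.backbone.model_zoo.van import VAN_B0
+#
+#     model = VAN_B0(pretrained=False)
+#     logits = model(paddle.randn([1, 3, 224, 224]))  # shape: [1, 1000]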
+ +# Code was heavily based on https://github.com/Visual-Attention-Network/VAN-Classification +# reference: https://arxiv.org/abs/2202.09741 + +from functools import partial +import math +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "VAN_B0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/VAN_B0_pretrained.pdparams", + "VAN_B1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/VAN_B1_pretrained.pdparams", + "VAN_B2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/VAN_B2_pretrained.pdparams", + "VAN_B3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/VAN_B3_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + +trunc_normal_ = TruncatedNormal(std=.02) +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +@paddle.jit.not_to_static +def swapdim(x, dim1, dim2): + a = list(range(len(x.shape))) + a[dim1], a[dim2] = a[dim2], a[dim1] + return x.transpose(a) + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2D(in_features, hidden_features, 1) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Conv2D(hidden_features, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.dwconv(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class LKA(nn.Layer): + def __init__(self, dim): + super().__init__() + self.conv0 = nn.Conv2D(dim, dim, 5, padding=2, groups=dim) + self.conv_spatial = nn.Conv2D( + dim, dim, 7, stride=1, padding=9, groups=dim, dilation=3) + self.conv1 = nn.Conv2D(dim, dim, 1) + + def forward(self, x): + attn = self.conv0(x) + attn = self.conv_spatial(attn) + attn = self.conv1(attn) + return x * attn + + +class Attention(nn.Layer): + def __init__(self, d_model): + super().__init__() + self.proj_1 = nn.Conv2D(d_model, d_model, 1) + self.activation = nn.GELU() + self.spatial_gating_unit = LKA(d_model) + self.proj_2 = nn.Conv2D(d_model, d_model, 1) + + def forward(self, x): + shorcut = x + x = self.proj_1(x) + x = self.activation(x) + x = self.spatial_gating_unit(x) + x = self.proj_2(x) + x = x + shorcut + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + 
mlp_ratio=4., + drop=0., + drop_path=0., + act_layer=nn.GELU): + super().__init__() + self.norm1 = nn.BatchNorm2D(dim) + self.attn = Attention(dim) + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = nn.BatchNorm2D(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + layer_scale_init_value = 1e-2 + self.layer_scale_1 = self.create_parameter( + shape=[dim, 1, 1], + default_initializer=Constant(value=layer_scale_init_value)) + self.layer_scale_2 = self.create_parameter( + shape=[dim, 1, 1], + default_initializer=Constant(value=layer_scale_init_value)) + + def forward(self, x): + x = x + self.drop_path(self.layer_scale_1 * self.attn(self.norm1(x))) + x = x + self.drop_path(self.layer_scale_2 * self.mlp(self.norm2(x))) + return x + + +class OverlapPatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=7, + stride=4, + in_chans=3, + embed_dim=768): + super().__init__() + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=stride, + padding=patch_size // 2) + self.norm = nn.BatchNorm2D(embed_dim) + + def forward(self, x): + x = self.proj(x) + _, _, H, W = x.shape + x = self.norm(x) + return x, H, W + + +class VAN(nn.Layer): + r""" VAN + A PaddlePaddle impl of : `Visual Attention Network` - + https://arxiv.org/pdf/2202.09741.pdf + """ + + def __init__(self, + img_size=224, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256, 512], + mlp_ratios=[4, 4, 4, 4], + drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[3, 4, 6, 3], + num_stages=4, + flag=False): + super().__init__() + if flag == False: + self.class_num = class_num + self.depths = depths + self.num_stages = num_stages + + dpr = [x for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + cur = 0 + + for i in range(num_stages): + patch_embed = OverlapPatchEmbed( + img_size=img_size if i == 0 else img_size // (2**(i + 1)), + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + in_chans=in_chans if i == 0 else embed_dims[i - 1], + embed_dim=embed_dims[i]) + + block = nn.LayerList([ + Block( + dim=embed_dims[i], + mlp_ratio=mlp_ratios[i], + drop=drop_rate, + drop_path=dpr[cur + j]) for j in range(depths[i]) + ]) + norm = norm_layer(embed_dims[i]) + cur += depths[i] + + setattr(self, f"patch_embed{i + 1}", patch_embed) + setattr(self, f"block{i + 1}", block) + setattr(self, f"norm{i + 1}", norm) + + # classification head + self.head = nn.Linear(embed_dims[3], + class_num) if class_num > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + m.weight.set_value( + paddle.normal( + std=math.sqrt(2.0 / fan_out), shape=m.weight.shape)) + if m.bias is not None: + zeros_(m.bias) + + def forward_features(self, x): + B = x.shape[0] + + for i in range(self.num_stages): + patch_embed = getattr(self, f"patch_embed{i + 1}") + block = getattr(self, f"block{i + 1}") + norm = getattr(self, f"norm{i + 1}") + x, H, W = patch_embed(x) + for blk in block: + x = blk(x) + + x = x.flatten(2) + x = swapdim(x, 1, 2) + x = 
norm(x) + if i != self.num_stages - 1: + x = x.reshape([B, H, W, x.shape[2]]).transpose([0, 3, 1, 2]) + + return x.mean(axis=1) + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + + return x + + +class DWConv(nn.Layer): + def __init__(self, dim=768): + super().__init__() + self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim) + + def forward(self, x): + x = self.dwconv(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def VAN_B0(pretrained=False, use_ssld=False, **kwargs): + model = VAN(embed_dims=[32, 64, 160, 256], + mlp_ratios=[8, 8, 4, 4], + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 3, 5, 2], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["VAN_B0"], use_ssld=use_ssld) + return model + + +def VAN_B1(pretrained=False, use_ssld=False, **kwargs): + model = VAN(embed_dims=[64, 128, 320, 512], + mlp_ratios=[8, 8, 4, 4], + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 4, 2], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["VAN_B1"], use_ssld=use_ssld) + return model + + +def VAN_B2(pretrained=False, use_ssld=False, **kwargs): + model = VAN(embed_dims=[64, 128, 320, 512], + mlp_ratios=[8, 8, 4, 4], + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 3, 12, 3], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["VAN_B2"], use_ssld=use_ssld) + return model + + +def VAN_B3(pretrained=False, use_ssld=False, **kwargs): + model = VAN(embed_dims=[64, 128, 320, 512], + mlp_ratios=[8, 8, 4, 4], + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 5, 27, 3], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["VAN_B3"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/vision_transformer.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/vision_transformer.py new file mode 100755 index 000000000..5a015702c --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/vision_transformer.py @@ -0,0 +1,459 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
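+
+# A minimal usage sketch for the ViT variants defined below (an illustration
+# only; the import path assumes this file is used as part of ppcls):
+#
+#     import paddle
+#     from ppcls.arch.backbone.model_zoo.vision_transformer import (
+#         ViT_base_patch16_224)
+#
+#     model = ViT_base_patch16_224(pretrained=False)
+#     logits = model(paddle.randn([1, 3, 224, 224]))  # shape: [1, 1000]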
+ +# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +# reference: https://arxiv.org/abs/2010.11929 + +from collections.abc import Callable + +import numpy as np +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant, Normal + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ViT_small_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_small_patch16_224_pretrained.pdparams", + "ViT_base_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams", + "ViT_base_patch16_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_384_pretrained.pdparams", + "ViT_base_patch32_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch32_384_pretrained.pdparams", + "ViT_large_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_large_patch16_224_pretrained.pdparams", + "ViT_large_patch16_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_large_patch16_384_pretrained.pdparams", + "ViT_large_patch32_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_large_patch32_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +trunc_normal_ = TruncatedNormal(std=.02) +normal_ = Normal +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def to_2tuple(x): + return tuple([x] * 2) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.full(shape=[], fill_value=1 - drop_prob, dtype=x.dtype) + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
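+
+    During training, a sample's residual branch is zeroed with probability
+    drop_prob and the surviving samples are rescaled by 1 / (1 - drop_prob),
+    so the expected value of the output is unchanged.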
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + # B= x.shape[0] + N, C = x.shape[1:] + qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // + self.num_heads)).transpose((2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5): + super().__init__() + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * \ + (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + + x = self.proj(x).flatten(2).transpose((0, 2, 1)) + return x + + +class VisionTransformer(nn.Layer): + """ Vision Transformer with support for patch input + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer='nn.LayerNorm', + epsilon=1e-5, + **kwargs): + super().__init__() + self.class_num = class_num + + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.pos_embed = self.create_parameter( + shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_) + self.add_parameter("pos_embed", self.pos_embed) + self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), default_initializer=zeros_) + self.add_parameter("cls_token", self.cls_token) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon) for i in range(depth) + ]) + + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + + # Classifier head + self.head = nn.Linear(embed_dim, + class_num) if class_num > 0 else Identity() + + trunc_normal_(self.pos_embed) + trunc_normal_(self.cls_token) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + # B = x.shape[0] + B = x.shape[0] + x = self.patch_embed(x) + cls_tokens = self.cls_token.expand((B, -1, -1)).astype(x.dtype) + x = paddle.concat((cls_tokens, x), axis=1) + x = x + self.pos_embed + x = self.pos_drop(x) + for blk in self.blocks: + x = blk(x) + x = self.norm(x) + return x[:, 0] + + def forward(self, x): + x = 
self.forward_features(x) + x = self.head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ViT_small_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=768, + depth=8, + num_heads=8, + mlp_ratio=3, + qk_scale=768**-0.5, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_small_patch16_224"], + use_ssld=use_ssld) + return model + + +def ViT_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_base_patch16_224"], + use_ssld=use_ssld) + return model + + +def ViT_base_patch16_384(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + img_size=384, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_base_patch16_384"], + use_ssld=use_ssld) + return model + + +def ViT_base_patch32_384(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + img_size=384, + patch_size=32, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_base_patch32_384"], + use_ssld=use_ssld) + return model + + +def ViT_large_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_large_patch16_224"], + use_ssld=use_ssld) + return model + + +def ViT_large_patch16_384(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + img_size=384, + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_large_patch16_384"], + use_ssld=use_ssld) + return model + + +def ViT_large_patch32_384(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + img_size=384, + patch_size=32, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_large_patch32_384"], + use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/wideresnet.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/wideresnet.py new file mode 100755 index 000000000..8efd3220b --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/wideresnet.py @@ -0,0 +1,236 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from ..base.theseus_layer import TheseusLayer +""" +backbone option "WideResNet" +code in this file is adpated from +https://github.com/kekmodel/FixMatch-pytorch/blob/master/models/wideresnet.py +thanks! 
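+
+a minimal construction sketch (illustration only; WRN-28-2 below is the
+configuration commonly used with FixMatch on CIFAR-10):
+    model = WideResNet(depth=28, widen_factor=2, dropout=0.0, num_classes=10)
+note that depth must satisfy (depth - 4) % 6 == 0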
+""" + + +def mish(x): + """Mish: A Self Regularized Non-Monotonic Neural Activation Function (https://arxiv.org/abs/1908.08681)""" + return x * paddle.tanh(F.softplus(x)) + + +class PSBatchNorm2D(nn.BatchNorm2D): + """How Does BN Increase Collapsed Neural Network Filters? (https://arxiv.org/abs/2001.11216)""" + + def __init__(self, + num_features, + alpha=0.1, + eps=1e-05, + momentum=0.999, + weight_attr=None, + bias_attr=None): + super().__init__(num_features, momentum, eps, weight_attr, bias_attr) + self.alpha = alpha + + def forward(self, x): + return super().forward(x) + self.alpha + + +class BasicBlock(nn.Layer): + def __init__(self, + in_planes, + out_planes, + stride, + drop_rate=0.0, + activate_before_residual=False): + super(BasicBlock, self).__init__() + self.bn1 = nn.BatchNorm2D(in_planes, momentum=0.999) + self.relu1 = nn.LeakyReLU(negative_slope=0.1) + self.conv1 = nn.Conv2D( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias_attr=False) + self.bn2 = nn.BatchNorm2D(out_planes, momentum=0.999) + self.relu2 = nn.LeakyReLU(negative_slope=0.1) + self.conv2 = nn.Conv2D( + out_planes, + out_planes, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.drop_rate = drop_rate + self.equalInOut = (in_planes == out_planes) + self.convShortcut = (not self.equalInOut) and nn.Conv2D( + in_planes, + out_planes, + kernel_size=1, + stride=stride, + padding=0, + bias_attr=False) or None + self.activate_before_residual = activate_before_residual + + def forward(self, x): + if not self.equalInOut and self.activate_before_residual == True: + x = self.relu1(self.bn1(x)) + else: + out = self.relu1(self.bn1(x)) + out = self.relu2(self.bn2(self.conv1(out if self.equalInOut else x))) + if self.drop_rate > 0: + out = F.dropout(out, p=self.drop_rate, training=self.training) + out = self.conv2(out) + return paddle.add(x if self.equalInOut else self.convShortcut(x), out) + + +class NetworkBlock(nn.Layer): + def __init__(self, + nb_layers, + in_planes, + out_planes, + block, + stride, + drop_rate=0.0, + activate_before_residual=False): + super(NetworkBlock, self).__init__() + self.layer = self._make_layer(block, in_planes, out_planes, nb_layers, + stride, drop_rate, + activate_before_residual) + + def _make_layer(self, block, in_planes, out_planes, nb_layers, stride, + drop_rate, activate_before_residual): + layers = [] + for i in range(int(nb_layers)): + layers.append( + block(i == 0 and in_planes or out_planes, out_planes, i == 0 + and stride or 1, drop_rate, activate_before_residual)) + return nn.Sequential(*layers) + + def forward(self, x): + return self.layer(x) + + +class Normalize(nn.Layer): + """ Ln normalization copied from + https://github.com/salesforce/CoMatch + """ + + def __init__(self, power=2): + super(Normalize, self).__init__() + self.power = power + + def forward(self, x): + norm = x.pow(self.power).sum(1, keepdim=True).pow(1. 
/ self.power) + out = x.divide(norm) + return out + + +class Wide_ResNet(TheseusLayer): + def __init__(self, + num_classes, + depth=28, + widen_factor=2, + drop_rate=0.0, + proj=False, + proj_after=False, + low_dim=64): + super(Wide_ResNet, self).__init__() + # prepare self values + self.widen_factor = widen_factor + self.depth = depth + self.drop_rate = drop_rate + # if use projection head + self.proj = proj + # if use the output of projection head for classification + self.proj_after = proj_after + self.low_dim = low_dim + channels = [ + 16, 16 * widen_factor, 32 * widen_factor, 64 * widen_factor + ] + assert ((depth - 4) % 6 == 0) + n = (depth - 4) / 6 + block = BasicBlock + # 1st conv before any network block + self.conv1 = nn.Conv2D( + 3, + channels[0], + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + # 1st block + self.block1 = NetworkBlock( + n, + channels[0], + channels[1], + block, + 1, + drop_rate, + activate_before_residual=True) + # 2nd block + self.block2 = NetworkBlock(n, channels[1], channels[2], block, 2, + drop_rate) + # 3rd block + self.block3 = NetworkBlock(n, channels[2], channels[3], block, 2, + drop_rate) + # global average pooling and classifier + self.bn1 = nn.BatchNorm2D(channels[3], momentum=0.999) + self.relu = nn.LeakyReLU(negative_slope=0.1) + + # if proj after means we classify after projection head + # so we must change the in channel to low_dim of laster fc + if self.proj_after: + self.fc = nn.Linear(self.low_dim, num_classes) + else: + self.fc = nn.Linear(channels[3], num_classes) + self.channels = channels[3] + # projection head + if self.proj: + self.l2norm = Normalize(2) + + self.fc1 = nn.Linear(64 * self.widen_factor, + 64 * self.widen_factor) + self.relu_mlp = nn.LeakyReLU(negative_slope=0.1) + self.fc2 = nn.Linear(64 * self.widen_factor, self.low_dim) + + def forward(self, x): + feat = self.conv1(x) + feat = self.block1(feat) + feat = self.block2(feat) + feat = self.block3(feat) + feat = self.relu(self.bn1(feat)) + feat = F.adaptive_avg_pool2d(feat, 1) + feat = paddle.reshape(feat, [-1, self.channels]) + if self.proj: + pfeat = self.fc1(feat) + pfeat = self.relu_mlp(pfeat) + pfeat = self.fc2(pfeat) + pfeat = self.l2norm(pfeat) + + # if projection after classifiy, we classify last + if self.proj_after: + out = self.fc(pfeat) + else: + out = self.fc(feat) + + return out, pfeat + + # output + out = self.fc(feat) + return out + + +def WideResNet(depth, + widen_factor, + dropout, + num_classes, + proj=False, + low_dim=64, + **kwargs): + return Wide_ResNet( + depth=depth, + widen_factor=widen_factor, + drop_rate=dropout, + num_classes=num_classes, + proj=proj, + low_dim=low_dim, + **kwargs) \ No newline at end of file diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/xception.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/xception.py new file mode 100755 index 000000000..7615cd706 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/xception.py @@ -0,0 +1,393 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1610.02357 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math +import sys + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "Xception41": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Xception41_pretrained.pdparams", + "Xception65": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Xception65_pretrained.pdparams", + "Xception71": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Xception71_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + bn_name = "bn_" + name + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + "_scale"), + bias_attr=ParamAttr(name=bn_name + "_offset"), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class SeparableConv(nn.Layer): + def __init__(self, input_channels, output_channels, stride=1, name=None): + super(SeparableConv, self).__init__() + + self._pointwise_conv = ConvBNLayer( + input_channels, output_channels, 1, name=name + "_sep") + self._depthwise_conv = ConvBNLayer( + output_channels, + output_channels, + 3, + stride=stride, + groups=output_channels, + name=name + "_dw") + + def forward(self, inputs): + x = self._pointwise_conv(inputs) + x = self._depthwise_conv(x) + return x + + +class EntryFlowBottleneckBlock(nn.Layer): + def __init__(self, + input_channels, + output_channels, + stride=2, + name=None, + relu_first=False): + super(EntryFlowBottleneckBlock, self).__init__() + self.relu_first = relu_first + + self._short = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=1, + stride=stride, + padding=0, + weight_attr=ParamAttr(name + "_branch1_weights"), + bias_attr=False) + self._conv1 = SeparableConv( + input_channels, + output_channels, + stride=1, + name=name + "_branch2a_weights") + self._conv2 = SeparableConv( + output_channels, + output_channels, + stride=1, + name=name + "_branch2b_weights") + self._pool = MaxPool2D(kernel_size=3, stride=stride, padding=1) + + def forward(self, inputs): + conv0 = inputs + short = self._short(inputs) + if self.relu_first: + conv0 = F.relu(conv0) + conv1 = self._conv1(conv0) + conv2 = F.relu(conv1) + conv2 = self._conv2(conv2) + pool = self._pool(conv2) + return paddle.add(x=short, y=pool) + + +class EntryFlow(nn.Layer): + def __init__(self, block_num=3): + super(EntryFlow, self).__init__() + + name = "entry_flow" + self.block_num = block_num + self._conv1 = ConvBNLayer( + 3, 32, 3, stride=2, act="relu", name=name + "_conv1") + self._conv2 = ConvBNLayer(32, 64, 3, act="relu", name=name + "_conv2") + if block_num == 3: + self._conv_0 = 
EntryFlowBottleneckBlock( + 64, 128, stride=2, name=name + "_0", relu_first=False) + self._conv_1 = EntryFlowBottleneckBlock( + 128, 256, stride=2, name=name + "_1", relu_first=True) + self._conv_2 = EntryFlowBottleneckBlock( + 256, 728, stride=2, name=name + "_2", relu_first=True) + elif block_num == 5: + self._conv_0 = EntryFlowBottleneckBlock( + 64, 128, stride=2, name=name + "_0", relu_first=False) + self._conv_1 = EntryFlowBottleneckBlock( + 128, 256, stride=1, name=name + "_1", relu_first=True) + self._conv_2 = EntryFlowBottleneckBlock( + 256, 256, stride=2, name=name + "_2", relu_first=True) + self._conv_3 = EntryFlowBottleneckBlock( + 256, 728, stride=1, name=name + "_3", relu_first=True) + self._conv_4 = EntryFlowBottleneckBlock( + 728, 728, stride=2, name=name + "_4", relu_first=True) + else: + sys.exit(-1) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + + if self.block_num == 3: + x = self._conv_0(x) + x = self._conv_1(x) + x = self._conv_2(x) + elif self.block_num == 5: + x = self._conv_0(x) + x = self._conv_1(x) + x = self._conv_2(x) + x = self._conv_3(x) + x = self._conv_4(x) + return x + + +class MiddleFlowBottleneckBlock(nn.Layer): + def __init__(self, input_channels, output_channels, name): + super(MiddleFlowBottleneckBlock, self).__init__() + + self._conv_0 = SeparableConv( + input_channels, + output_channels, + stride=1, + name=name + "_branch2a_weights") + self._conv_1 = SeparableConv( + output_channels, + output_channels, + stride=1, + name=name + "_branch2b_weights") + self._conv_2 = SeparableConv( + output_channels, + output_channels, + stride=1, + name=name + "_branch2c_weights") + + def forward(self, inputs): + conv0 = F.relu(inputs) + conv0 = self._conv_0(conv0) + conv1 = F.relu(conv0) + conv1 = self._conv_1(conv1) + conv2 = F.relu(conv1) + conv2 = self._conv_2(conv2) + return paddle.add(x=inputs, y=conv2) + + +class MiddleFlow(nn.Layer): + def __init__(self, block_num=8): + super(MiddleFlow, self).__init__() + + self.block_num = block_num + self._conv_0 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_0") + self._conv_1 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_1") + self._conv_2 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_2") + self._conv_3 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_3") + self._conv_4 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_4") + self._conv_5 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_5") + self._conv_6 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_6") + self._conv_7 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_7") + if block_num == 16: + self._conv_8 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_8") + self._conv_9 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_9") + self._conv_10 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_10") + self._conv_11 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_11") + self._conv_12 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_12") + self._conv_13 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_13") + self._conv_14 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_14") + self._conv_15 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_15") + + def forward(self, inputs): + x = self._conv_0(inputs) + x = self._conv_1(x) + x = self._conv_2(x) + x = self._conv_3(x) + x = self._conv_4(x) + x = self._conv_5(x) + x = self._conv_6(x) + x = self._conv_7(x) + if self.block_num == 16: + 
x = self._conv_8(x) + x = self._conv_9(x) + x = self._conv_10(x) + x = self._conv_11(x) + x = self._conv_12(x) + x = self._conv_13(x) + x = self._conv_14(x) + x = self._conv_15(x) + return x + + +class ExitFlowBottleneckBlock(nn.Layer): + def __init__(self, input_channels, output_channels1, output_channels2, + name): + super(ExitFlowBottleneckBlock, self).__init__() + + self._short = Conv2D( + in_channels=input_channels, + out_channels=output_channels2, + kernel_size=1, + stride=2, + padding=0, + weight_attr=ParamAttr(name + "_branch1_weights"), + bias_attr=False) + self._conv_1 = SeparableConv( + input_channels, + output_channels1, + stride=1, + name=name + "_branch2a_weights") + self._conv_2 = SeparableConv( + output_channels1, + output_channels2, + stride=1, + name=name + "_branch2b_weights") + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=1) + + def forward(self, inputs): + short = self._short(inputs) + conv0 = F.relu(inputs) + conv1 = self._conv_1(conv0) + conv2 = F.relu(conv1) + conv2 = self._conv_2(conv2) + pool = self._pool(conv2) + return paddle.add(x=short, y=pool) + + +class ExitFlow(nn.Layer): + def __init__(self, class_num): + super(ExitFlow, self).__init__() + + name = "exit_flow" + + self._conv_0 = ExitFlowBottleneckBlock( + 728, 728, 1024, name=name + "_1") + self._conv_1 = SeparableConv(1024, 1536, stride=1, name=name + "_2") + self._conv_2 = SeparableConv(1536, 2048, stride=1, name=name + "_3") + self._pool = AdaptiveAvgPool2D(1) + stdv = 1.0 / math.sqrt(2048 * 1.0) + self._out = Linear( + 2048, + class_num, + weight_attr=ParamAttr( + name="fc_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + conv0 = self._conv_0(inputs) + conv1 = self._conv_1(conv0) + conv1 = F.relu(conv1) + conv2 = self._conv_2(conv1) + conv2 = F.relu(conv2) + pool = self._pool(conv2) + pool = paddle.flatten(pool, start_axis=1, stop_axis=-1) + out = self._out(pool) + return out + + +class Xception(nn.Layer): + def __init__(self, + entry_flow_block_num=3, + middle_flow_block_num=8, + class_num=1000): + super(Xception, self).__init__() + self.entry_flow_block_num = entry_flow_block_num + self.middle_flow_block_num = middle_flow_block_num + self._entry_flow = EntryFlow(entry_flow_block_num) + self._middle_flow = MiddleFlow(middle_flow_block_num) + self._exit_flow = ExitFlow(class_num) + + def forward(self, inputs): + x = self._entry_flow(inputs) + x = self._middle_flow(x) + x = self._exit_flow(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+        )
+
+
+def Xception41(pretrained=False, use_ssld=False, **kwargs):
+    model = Xception(entry_flow_block_num=3, middle_flow_block_num=8, **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["Xception41"], use_ssld=use_ssld)
+    return model
+
+
+def Xception65(pretrained=False, use_ssld=False, **kwargs):
+    model = Xception(
+        entry_flow_block_num=3, middle_flow_block_num=16, **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["Xception65"], use_ssld=use_ssld)
+    return model
+
+
+def Xception71(pretrained=False, use_ssld=False, **kwargs):
+    model = Xception(
+        entry_flow_block_num=5, middle_flow_block_num=16, **kwargs)
+    _load_pretrained(
+        pretrained, model, MODEL_URLS["Xception71"], use_ssld=use_ssld)
+    return model
diff --git a/example/low_precision_training/ppcls/arch/backbone/model_zoo/xception_deeplab.py b/example/low_precision_training/ppcls/arch/backbone/model_zoo/xception_deeplab.py
new file mode 100755
index 000000000..f5a7fa529
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/backbone/model_zoo/xception_deeplab.py
@@ -0,0 +1,423 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# reference: https://arxiv.org/abs/1706.05587
+
+import paddle
+from paddle import ParamAttr
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import Conv2D, BatchNorm, Linear, Dropout
+from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
+
+from ....utils.save_load import load_dygraph_pretrain
+
+MODEL_URLS = {
+    "Xception41_deeplab":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Xception41_deeplab_pretrained.pdparams",
+    "Xception65_deeplab":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Xception65_deeplab_pretrained.pdparams"
+}
+
+__all__ = list(MODEL_URLS.keys())
+
+
+def check_data(data, number):
+    # broadcast a scalar setting to a per-block list
+    if isinstance(data, int):
+        return [data] * number
+    assert len(data) == number
+    return data
+
+
+def check_stride(s, os):
+    # the accumulated stride may not exceed the target output stride
+    return s <= os
+
+
+def check_points(count, points):
+    if points is None:
+        return False
+    if isinstance(points, list):
+        return count in points
+    return count == points
+
+
+def gen_bottleneck_params(backbone='xception_65'):
+    if backbone == 'xception_65':
+        bottleneck_params = {
+            "entry_flow": (3, [2, 2, 2], [128, 256, 728]),
+            "middle_flow": (16, 1, 728),
+            "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
+        }
+    elif backbone == 'xception_41':
+        bottleneck_params = {
+            "entry_flow": (3, [2, 2, 2], [128, 256, 728]),
+            "middle_flow": (8, 1, 728),
+            "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
+        }
+    elif backbone == 'xception_71':
+        bottleneck_params = {
+            "entry_flow": (5, [2, 1, 2, 1, 2], [128, 256, 256, 728, 728]),
+            "middle_flow": (16, 1, 728),
+            "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]])
+        }
+    else:
+        raise ValueError(
+            "xception backbone only supports
xception_41/xception_65/xception_71" + ) + return bottleneck_params + + +class ConvBNLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride=1, + padding=0, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=padding, + weight_attr=ParamAttr(name=name + "/weights"), + bias_attr=False) + self._bn = BatchNorm( + num_channels=output_channels, + act=act, + epsilon=1e-3, + momentum=0.99, + param_attr=ParamAttr(name=name + "/BatchNorm/gamma"), + bias_attr=ParamAttr(name=name + "/BatchNorm/beta"), + moving_mean_name=name + "/BatchNorm/moving_mean", + moving_variance_name=name + "/BatchNorm/moving_variance") + + def forward(self, inputs): + return self._bn(self._conv(inputs)) + + +class Seperate_Conv(nn.Layer): + def __init__(self, + input_channels, + output_channels, + stride, + filter, + dilation=1, + act=None, + name=None): + super(Seperate_Conv, self).__init__() + + self._conv1 = Conv2D( + in_channels=input_channels, + out_channels=input_channels, + kernel_size=filter, + stride=stride, + groups=input_channels, + padding=(filter) // 2 * dilation, + dilation=dilation, + weight_attr=ParamAttr(name=name + "/depthwise/weights"), + bias_attr=False) + self._bn1 = BatchNorm( + input_channels, + act=act, + epsilon=1e-3, + momentum=0.99, + param_attr=ParamAttr(name=name + "/depthwise/BatchNorm/gamma"), + bias_attr=ParamAttr(name=name + "/depthwise/BatchNorm/beta"), + moving_mean_name=name + "/depthwise/BatchNorm/moving_mean", + moving_variance_name=name + "/depthwise/BatchNorm/moving_variance") + self._conv2 = Conv2D( + input_channels, + output_channels, + 1, + stride=1, + groups=1, + padding=0, + weight_attr=ParamAttr(name=name + "/pointwise/weights"), + bias_attr=False) + self._bn2 = BatchNorm( + output_channels, + act=act, + epsilon=1e-3, + momentum=0.99, + param_attr=ParamAttr(name=name + "/pointwise/BatchNorm/gamma"), + bias_attr=ParamAttr(name=name + "/pointwise/BatchNorm/beta"), + moving_mean_name=name + "/pointwise/BatchNorm/moving_mean", + moving_variance_name=name + "/pointwise/BatchNorm/moving_variance") + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._bn1(x) + x = self._conv2(x) + x = self._bn2(x) + return x + + +class Xception_Block(nn.Layer): + def __init__(self, + input_channels, + output_channels, + strides=1, + filter_size=3, + dilation=1, + skip_conv=True, + has_skip=True, + activation_fn_in_separable_conv=False, + name=None): + super(Xception_Block, self).__init__() + + repeat_number = 3 + output_channels = check_data(output_channels, repeat_number) + filter_size = check_data(filter_size, repeat_number) + strides = check_data(strides, repeat_number) + + self.has_skip = has_skip + self.skip_conv = skip_conv + self.activation_fn_in_separable_conv = activation_fn_in_separable_conv + if not activation_fn_in_separable_conv: + self._conv1 = Seperate_Conv( + input_channels, + output_channels[0], + stride=strides[0], + filter=filter_size[0], + dilation=dilation, + name=name + "/separable_conv1") + self._conv2 = Seperate_Conv( + output_channels[0], + output_channels[1], + stride=strides[1], + filter=filter_size[1], + dilation=dilation, + name=name + "/separable_conv2") + self._conv3 = Seperate_Conv( + output_channels[1], + output_channels[2], + stride=strides[2], + filter=filter_size[2], + dilation=dilation, + name=name + "/separable_conv3") + else: + self._conv1 = Seperate_Conv( + 
input_channels, + output_channels[0], + stride=strides[0], + filter=filter_size[0], + act="relu", + dilation=dilation, + name=name + "/separable_conv1") + self._conv2 = Seperate_Conv( + output_channels[0], + output_channels[1], + stride=strides[1], + filter=filter_size[1], + act="relu", + dilation=dilation, + name=name + "/separable_conv2") + self._conv3 = Seperate_Conv( + output_channels[1], + output_channels[2], + stride=strides[2], + filter=filter_size[2], + act="relu", + dilation=dilation, + name=name + "/separable_conv3") + + if has_skip and skip_conv: + self._short = ConvBNLayer( + input_channels, + output_channels[-1], + 1, + stride=strides[-1], + padding=0, + name=name + "/shortcut") + + def forward(self, inputs): + if not self.activation_fn_in_separable_conv: + x = F.relu(inputs) + x = self._conv1(x) + x = F.relu(x) + x = self._conv2(x) + x = F.relu(x) + x = self._conv3(x) + else: + x = self._conv1(inputs) + x = self._conv2(x) + x = self._conv3(x) + if self.has_skip: + if self.skip_conv: + skip = self._short(inputs) + else: + skip = inputs + return paddle.add(x, skip) + else: + return x + + +class XceptionDeeplab(nn.Layer): + def __init__(self, backbone, class_num=1000): + super(XceptionDeeplab, self).__init__() + + bottleneck_params = gen_bottleneck_params(backbone) + self.backbone = backbone + + self._conv1 = ConvBNLayer( + 3, + 32, + 3, + stride=2, + padding=1, + act="relu", + name=self.backbone + "/entry_flow/conv1") + self._conv2 = ConvBNLayer( + 32, + 64, + 3, + stride=1, + padding=1, + act="relu", + name=self.backbone + "/entry_flow/conv2") + + self.block_num = bottleneck_params["entry_flow"][0] + self.strides = bottleneck_params["entry_flow"][1] + self.chns = bottleneck_params["entry_flow"][2] + self.strides = check_data(self.strides, self.block_num) + self.chns = check_data(self.chns, self.block_num) + + self.entry_flow = [] + self.middle_flow = [] + + self.stride = 2 + self.output_stride = 32 + s = self.stride + + for i in range(self.block_num): + stride = self.strides[i] if check_stride(s * self.strides[i], + self.output_stride) else 1 + xception_block = self.add_sublayer( + self.backbone + "/entry_flow/block" + str(i + 1), + Xception_Block( + input_channels=64 if i == 0 else self.chns[i - 1], + output_channels=self.chns[i], + strides=[1, 1, self.stride], + name=self.backbone + "/entry_flow/block" + str(i + 1))) + self.entry_flow.append(xception_block) + s = s * stride + self.stride = s + + self.block_num = bottleneck_params["middle_flow"][0] + self.strides = bottleneck_params["middle_flow"][1] + self.chns = bottleneck_params["middle_flow"][2] + self.strides = check_data(self.strides, self.block_num) + self.chns = check_data(self.chns, self.block_num) + s = self.stride + + for i in range(self.block_num): + stride = self.strides[i] if check_stride(s * self.strides[i], + self.output_stride) else 1 + xception_block = self.add_sublayer( + self.backbone + "/middle_flow/block" + str(i + 1), + Xception_Block( + input_channels=728, + output_channels=728, + strides=[1, 1, self.strides[i]], + skip_conv=False, + name=self.backbone + "/middle_flow/block" + str(i + 1))) + self.middle_flow.append(xception_block) + s = s * stride + self.stride = s + + self.block_num = bottleneck_params["exit_flow"][0] + self.strides = bottleneck_params["exit_flow"][1] + self.chns = bottleneck_params["exit_flow"][2] + self.strides = check_data(self.strides, self.block_num) + self.chns = check_data(self.chns, self.block_num) + s = self.stride + stride = self.strides[0] if check_stride(s * 
self.strides[0], + self.output_stride) else 1 + self._exit_flow_1 = Xception_Block( + 728, + self.chns[0], [1, 1, stride], + name=self.backbone + "/exit_flow/block1") + s = s * stride + stride = self.strides[1] if check_stride(s * self.strides[1], + self.output_stride) else 1 + self._exit_flow_2 = Xception_Block( + self.chns[0][-1], + self.chns[1], [1, 1, stride], + dilation=2, + has_skip=False, + activation_fn_in_separable_conv=True, + name=self.backbone + "/exit_flow/block2") + s = s * stride + + self.stride = s + + self._drop = Dropout(p=0.5, mode="downscale_in_infer") + self._pool = AdaptiveAvgPool2D(1) + self._fc = Linear( + self.chns[1][-1], + class_num, + weight_attr=ParamAttr(name="fc_weights"), + bias_attr=ParamAttr(name="fc_bias")) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + for ef in self.entry_flow: + x = ef(x) + for mf in self.middle_flow: + x = mf(x) + x = self._exit_flow_1(x) + x = self._exit_flow_2(x) + x = self._drop(x) + x = self._pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self._fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def Xception41_deeplab(pretrained=False, use_ssld=False, **kwargs): + model = XceptionDeeplab('xception_41', **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Xception41_deeplab"], use_ssld=use_ssld) + return model + + +def Xception65_deeplab(pretrained=False, use_ssld=False, **kwargs): + model = XceptionDeeplab("xception_65", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Xception65_deeplab"], use_ssld=use_ssld) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/variant_models/__init__.py b/example/low_precision_training/ppcls/arch/backbone/variant_models/__init__.py new file mode 100755 index 000000000..4a2716208 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/variant_models/__init__.py @@ -0,0 +1,5 @@ +from .resnet_variant import ResNet50_last_stage_stride1, ResNet50_metabin +from .vgg_variant import VGG19Sigmoid +from .pp_lcnet_variant import PPLCNet_x2_5_Tanh +from .pp_lcnetv2_variant import PPLCNetV2_base_ShiTu +from .swin_transformer_variant import SwinTransformer_base_patch4_window7_224_SOLIDER,SwinTransformer_small_patch4_window7_224_SOLIDER,SwinTransformer_tiny_patch4_window7_224_SOLIDER diff --git a/example/low_precision_training/ppcls/arch/backbone/variant_models/efficientnet_variant.py b/example/low_precision_training/ppcls/arch/backbone/variant_models/efficientnet_variant.py new file mode 100755 index 000000000..86701d4b4 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/variant_models/efficientnet_variant.py @@ -0,0 +1,44 @@ +import paddle +import paddle.nn as nn +from ..model_zoo.efficientnet import EfficientNetB3, _load_pretrained + +MODEL_URLS = { + "EfficientNetB3_watermark": + "https://paddleclas.bj.bcebos.com/models/practical/pretrained/EfficientNetB3_watermark_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +def EfficientNetB3_watermark(padding_type='DYNAMIC', + override_params={"batch_norm_epsilon": 0.00001}, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + def replace_function(_fc, pattern): + 
classifier = nn.Sequential(
+            # 1536 is the original in_features of EfficientNetB3's `_fc`;
+            # the replacement head classifies into 2 classes
+            nn.Linear(
+                in_features=1536, out_features=625),
+            nn.ReLU(),
+            nn.Dropout(p=0.3),
+            nn.Linear(
+                in_features=625, out_features=256),
+            nn.ReLU(),
+            nn.Linear(
+                in_features=256, out_features=2), )
+        return classifier
+
+    pattern = "_fc"
+    model = EfficientNetB3(
+        padding_type=padding_type,
+        override_params=override_params,
+        use_se=use_se,
+        pretrained=False,
+        use_ssld=False,
+        **kwargs)
+    model.upgrade_sublayer(pattern, replace_function)
+    _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB3_watermark"],
+                     use_ssld)
+    return model
diff --git a/example/low_precision_training/ppcls/arch/backbone/variant_models/foundation_vit_variant.py b/example/low_precision_training/ppcls/arch/backbone/variant_models/foundation_vit_variant.py
new file mode 100755
index 000000000..7f79dcaf3
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/backbone/variant_models/foundation_vit_variant.py
@@ -0,0 +1,52 @@
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ..model_zoo.foundation_vit import CLIP_vit_large_patch14_224, _load_pretrained
+
+MODEL_URLS = {
+    "CLIP_large_patch14_224_aesthetic":
+    "https://paddleclas.bj.bcebos.com/models/practical/pretrained/CLIP_large_patch14_224_aesthetic_pretrained.pdparams"
+}
+
+__all__ = list(MODEL_URLS.keys())
+
+
+class MLP(nn.Layer):
+    def __init__(self, input_size):
+        super().__init__()
+        self.input_size = input_size
+        self.layers = nn.Sequential(
+            nn.Linear(self.input_size, 1024),
+            nn.Dropout(0.2),
+            nn.Linear(1024, 128),
+            nn.Dropout(0.2),
+            nn.Linear(128, 64),
+            nn.Dropout(0.1), nn.Linear(64, 16), nn.Linear(16, 1))
+
+    def forward(self, x):
+        return self.layers(x)
+
+
+class Aesthetic_Score_Predictor(nn.Layer):
+    def __init__(self):
+        super().__init__()
+        self.model = CLIP_vit_large_patch14_224()
+        self.fc_head = nn.Linear(1024, 768, bias_attr=False)
+        self.mlp = MLP(768)
+
+    def forward(self, x):
+        x = self.model(x)
+        x = x[:, 0, :]  # take the [CLS] token feature
+        x = self.fc_head(x)
+        x = F.normalize(x, p=2, axis=-1)
+        x = self.mlp(x)
+        return x
+
+
+def CLIP_large_patch14_224_aesthetic(pretrained=False,
+                                     use_ssld=False,
+                                     **kwargs):
+    model = Aesthetic_Score_Predictor()
+    _load_pretrained(pretrained, model,
+                     MODEL_URLS["CLIP_large_patch14_224_aesthetic"], use_ssld)
+    return model
diff --git a/example/low_precision_training/ppcls/arch/backbone/variant_models/pp_lcnet_variant.py b/example/low_precision_training/ppcls/arch/backbone/variant_models/pp_lcnet_variant.py
new file mode 100755
index 000000000..e4c25c4c6
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/backbone/variant_models/pp_lcnet_variant.py
@@ -0,0 +1,29 @@
+import paddle
+from paddle.nn import Tanh
+from ..legendary_models.pp_lcnet import PPLCNet_x2_5
+
+__all__ = ["PPLCNet_x2_5_Tanh"]
+
+
+class TanhSuffix(paddle.nn.Layer):
+    def __init__(self, origin_layer):
+        super(TanhSuffix, self).__init__()
+        self.origin_layer = origin_layer
+        self.tanh = Tanh()
+
+    def forward(self, input, res_dict=None, **kwargs):
+        x = self.origin_layer(input)
+        x = self.tanh(x)
+        return x
+
+
+def PPLCNet_x2_5_Tanh(pretrained=False, use_ssld=False, **kwargs):
+    def replace_function(origin_layer, pattern):
+        new_layer = TanhSuffix(origin_layer)
+        return new_layer
+
+    pattern = "fc"
+    model = PPLCNet_x2_5(pretrained=pretrained, use_ssld=use_ssld, **kwargs)
+    model.upgrade_sublayer(pattern, replace_function)
+    return model
diff --git
a/example/low_precision_training/ppcls/arch/backbone/variant_models/pp_lcnetv2_variant.py b/example/low_precision_training/ppcls/arch/backbone/variant_models/pp_lcnetv2_variant.py
new file mode 100755
index 000000000..6acccdc8e
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/backbone/variant_models/pp_lcnetv2_variant.py
@@ -0,0 +1,56 @@
+from paddle.nn import Conv2D, Identity
+
+from ..legendary_models.pp_lcnet_v2 import MODEL_URLS, PPLCNetV2_base, RepDepthwiseSeparable, _load_pretrained
+
+__all__ = ["PPLCNetV2_base_ShiTu"]
+
+
+def PPLCNetV2_base_ShiTu(pretrained=False, use_ssld=False, **kwargs):
+    """
+    A variant of PPLCNetV2_base that
+    1. removes the ReLU layer after `last_conv`,
+    2. adds a bias to `last_conv`,
+    3. changes the stride to 1 in the last two RepDepthwiseSeparable blocks.
+    """
+    model = PPLCNetV2_base(pretrained=False, use_ssld=use_ssld, **kwargs)
+
+    def remove_ReLU_function(conv, pattern):
+        new_conv = Identity()
+        return new_conv
+
+    def add_bias_last_conv(conv, pattern):
+        new_conv = Conv2D(
+            in_channels=conv._in_channels,
+            out_channels=conv._out_channels,
+            kernel_size=conv._kernel_size,
+            stride=conv._stride,
+            padding=conv._padding,
+            groups=conv._groups,
+            bias_attr=True)
+        return new_conv
+
+    def last_stride_function(rep_block, pattern):
+        new_conv = RepDepthwiseSeparable(
+            in_channels=rep_block.in_channels,
+            out_channels=rep_block.out_channels,
+            stride=1,
+            dw_size=rep_block.dw_size,
+            split_pw=rep_block.split_pw,
+            use_rep=rep_block.use_rep,
+            use_se=rep_block.use_se,
+            use_shortcut=rep_block.use_shortcut)
+        return new_conv
+
+    pattern_act = ["act"]
+    pattern_lastconv = ["last_conv"]
+    pattern_last_stride = [
+        "stages[3][0]",
+        "stages[3][1]",
+    ]
+    model.upgrade_sublayer(pattern_act, remove_ReLU_function)
+    model.upgrade_sublayer(pattern_lastconv, add_bias_last_conv)
+    model.upgrade_sublayer(pattern_last_stride, last_stride_function)
+
+    # reload the pretrained params after upgrading these layers
+    _load_pretrained(pretrained, model, MODEL_URLS["PPLCNetV2_base"], use_ssld)
+    return model
diff --git a/example/low_precision_training/ppcls/arch/backbone/variant_models/resnet_variant.py b/example/low_precision_training/ppcls/arch/backbone/variant_models/resnet_variant.py
new file mode 100755
index 000000000..3569cd206
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/backbone/variant_models/resnet_variant.py
@@ -0,0 +1,203 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
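+
+# All of the variant models in this package are built by post-hoc "model
+# surgery": TheseusLayer.upgrade_sublayer(pattern, replace_function) walks
+# the backbone by layer name, calls replace_function on every match, and
+# swaps the matched sublayer for the returned one. A minimal, commented
+# usage sketch of that mechanism (the layer name "fc" and the ReLU wrapper
+# are illustrative assumptions, not part of this file):
+#
+#     import paddle.nn as nn
+#     from ppcls.arch.backbone.legendary_models.resnet import ResNet50
+#
+#     def append_relu(origin_layer, pattern):
+#         # wrap the matched layer; returning a new layer replaces it
+#         return nn.Sequential(origin_layer, nn.ReLU())
+#
+#     model = ResNet50(pretrained=False)
+#     model.upgrade_sublayer("fc", append_relu)
+#
+# Pretrained weights are reloaded after the surgery whenever the replaced
+# layers keep their original shapes, as the variants below do.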
+ +from collections import defaultdict +import copy +import paddle +from paddle import nn +from paddle.nn import functional as F +from ..legendary_models.resnet import ResNet50, MODEL_URLS, _load_pretrained + +__all__ = [ + "ResNet50_last_stage_stride1", "ResNet50_adaptive_max_pool2d", + 'ResNet50_metabin' +] + + +def ResNet50_last_stage_stride1(pretrained=False, use_ssld=False, **kwargs): + def replace_function(conv, pattern): + new_conv = nn.Conv2D( + in_channels=conv._in_channels, + out_channels=conv._out_channels, + kernel_size=conv._kernel_size, + stride=1, + padding=conv._padding, + groups=conv._groups, + bias_attr=conv._bias_attr) + return new_conv + + pattern = ["blocks[13].conv1.conv", "blocks[13].short.conv"] + model = ResNet50(pretrained=False, use_ssld=use_ssld, **kwargs) + model.upgrade_sublayer(pattern, replace_function) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model + + +def ResNet50_adaptive_max_pool2d(pretrained=False, use_ssld=False, **kwargs): + def replace_function(pool, pattern): + new_pool = nn.AdaptiveMaxPool2D(output_size=1) + return new_pool + + pattern = ["avg_pool"] + model = ResNet50(pretrained=False, use_ssld=use_ssld, **kwargs) + model.upgrade_sublayer(pattern, replace_function) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model + + +def ResNet50_metabin(pretrained=False, + use_ssld=False, + bias_lr_factor=1.0, + **kwargs): + """ + ResNet50 which replaces all `bn` layers with MetaBIN + reference: https://arxiv.org/abs/2011.14670 + """ + + class BINGate(nn.Layer): + def __init__(self, num_features): + super().__init__() + self.gate = self.create_parameter( + shape=[num_features], + default_initializer=nn.initializer.Constant(1.0)) + self.add_parameter("gate", self.gate) + + def forward(self, opt={}): + flag_update = 'lr_gate' in opt and \ + opt.get('enable_inside_update', False) + if flag_update and self.gate.grad is not None: # update gate + lr = opt['lr_gate'] * self.gate.optimize_attr.get( + 'learning_rate', 1.0) + gate = self.gate - lr * self.gate.grad + gate.clip_(min=0, max=1) + else: + gate = self.gate + return gate + + def clip_gate(self): + self.gate.set_value(self.gate.clip(0, 1)) + + class MetaBN(nn.BatchNorm2D): + def forward(self, inputs, opt={}): + mode = opt.get("bn_mode", "general") if self.training else "eval" + if mode == "general": # update, but not apply running_mean/var + result = F.batch_norm(inputs, self._mean, self._variance, + self.weight, self.bias, self.training, + self._momentum, self._epsilon) + elif mode == "hold": # not update, not apply running_mean/var + result = F.batch_norm( + inputs, + paddle.mean( + inputs, axis=(0, 2, 3)), + paddle.var(inputs, axis=(0, 2, 3)), + self.weight, + self.bias, + self.training, + self._momentum, + self._epsilon) + elif mode == "eval": # fix and apply running_mean/var, + if self._mean is None: + result = F.batch_norm( + inputs, + paddle.mean( + inputs, axis=(0, 2, 3)), + paddle.var(inputs, axis=(0, 2, 3)), + self.weight, + self.bias, + True, + self._momentum, + self._epsilon) + else: + result = F.batch_norm(inputs, self._mean, self._variance, + self.weight, self.bias, False, + self._momentum, self._epsilon) + return result + + class MetaBIN(nn.Layer): + """ + MetaBIN (Meta Batch-Instance Normalization) + reference: https://arxiv.org/abs/2011.14670 + """ + + def __init__(self, num_features): + super().__init__() + self.batch_norm = MetaBN( + num_features=num_features, use_global_stats=True) + self.instance_norm = 
nn.InstanceNorm2D(num_features=num_features) + self.gate = BINGate(num_features=num_features) + self.opt = defaultdict() + + def forward(self, inputs): + out_bn = self.batch_norm(inputs, self.opt) + out_in = self.instance_norm(inputs) + gate = self.gate(self.opt) + gate = gate.unsqueeze([0, -1, -1]) + out = out_bn * gate + out_in * (1 - gate) + return out + + def reset_opt(self): + self.opt = defaultdict() + + def setup_opt(self, opt): + """ + Arg: + opt (dict): Optional setting to change the behavior of MetaBIN during training. + It includes three settings which are `enable_inside_update`, `lr_gate` and `bn_mode`. + """ + self.check_opt(opt) + self.opt = copy.deepcopy(opt) + + @classmethod + def check_opt(cls, opt): + assert isinstance(opt, dict), \ + TypeError('Got the wrong type of `opt`. Please use `dict` type.') + + if opt.get('enable_inside_update', False) and 'lr_gate' not in opt: + raise RuntimeError('Missing `lr_gate` in opt.') + + assert isinstance(opt.get('lr_gate', 1.0), float), \ + TypeError('Got the wrong type of `lr_gate`. Please use `float` type.') + assert isinstance(opt.get('enable_inside_update', True), bool), \ + TypeError('Got the wrong type of `enable_inside_update`. Please use `bool` type.') + assert opt.get('bn_mode', "general") in ["general", "hold", "eval"], \ + TypeError('Got the wrong value of `bn_mode`.') + + def bn2metabin(bn, pattern): + metabin = MetaBIN(bn.weight.shape[0]) + metabin.batch_norm.weight.set_value(bn.weight) + metabin.batch_norm.bias.set_value(bn.bias) + metabin.batch_norm._variance.set_value(bn._variance) + metabin.batch_norm._mean.set_value(bn._mean) + return metabin + + def setup_optimize_attr(model, bias_lr_factor): + for name, params in model.named_parameters(): + if params.stop_gradient: + continue + if "bias" in name: + params.optimize_attr['learning_rate'] = bias_lr_factor + + pattern = [] + pattern.extend(["blocks[{}].conv{}.bn".format(i, j) \ + for i in range(16) for j in range(3)]) + pattern.extend(["blocks[{}].short.bn".format(i) for i in [0, 3, 7, 13]]) + pattern.append("stem[0].bn") + + model = ResNet50_last_stage_stride1( + pretrained=pretrained, use_ssld=use_ssld, **kwargs) + + model.upgrade_sublayer(pattern, bn2metabin) + setup_optimize_attr(model=model, bias_lr_factor=bias_lr_factor) + + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/variant_models/swin_transformer_variant.py b/example/low_precision_training/ppcls/arch/backbone/variant_models/swin_transformer_variant.py new file mode 100755 index 000000000..1e6632f6e --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/variant_models/swin_transformer_variant.py @@ -0,0 +1,355 @@ +import numpy as np +import paddle +import paddle.nn as nn +from ..legendary_models.swin_transformer import SwinTransformer, _load_pretrained, \ + PatchEmbed, BasicLayer, SwinTransformerBlock + +MODEL_URLS_SOLIDER = { + "SwinTransformer_tiny_patch4_window7_224_SOLIDER": + 'https://paddleclas.bj.bcebos.com/models/SOLIDER/SwinTransformer_tiny_patch4_window7_224_pretrained.pdparams', + "SwinTransformer_small_patch4_window7_224_SOLIDER": + 'https://paddleclas.bj.bcebos.com/models/SOLIDER/SwinTransformer_small_patch4_window7_224_pretrained.pdparams', + "SwinTransformer_base_patch4_window7_224_SOLIDER": + 'https://paddleclas.bj.bcebos.com/models/SOLIDER/SwinTransformer_base_patch4_window7_224_pretrained.pdparams' +} + +__all__ = list(MODEL_URLS_SOLIDER.keys()) + + +class PatchEmbed_SOLIDER(PatchEmbed): + def forward(self, x): + x = self.proj(x) + out_size = 
(x.shape[2], x.shape[3]) + x = x.flatten(2).transpose([0, 2, 1]) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x, out_size + + +class SwinTransformerBlock_SOLIDER(SwinTransformerBlock): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, + dim, + input_resolution, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super(SwinTransformerBlock_SOLIDER, self).__init__( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + window_size=window_size, + shift_size=shift_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path, + act_layer=act_layer, + norm_layer=norm_layer, + ) + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + self.check_condition() + + def check_condition(self): + if min(self.input_resolution) < self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + +class BasicLayer_SOLIDER(BasicLayer): + def __init__(self, + dim, + input_resolution, + depth, + num_heads, + window_size, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False): + + super(BasicLayer_SOLIDER, self).__init__( + dim=dim, + input_resolution=input_resolution, + depth=depth, + num_heads=num_heads, + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path, + norm_layer=norm_layer, + downsample=downsample, + use_checkpoint=use_checkpoint + ) + # build blocks + self.blocks = nn.LayerList([ + SwinTransformerBlock_SOLIDER( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer) for i in range(depth) + ]) + + def forward(self, x): + for blk in self.blocks: + x = blk(x) + + if self.downsample is not None: + x_down = self.downsample(x) + return x_down, x + else: + return x, x + + +class PatchMerging_SOLIDER(nn.Layer): + r""" Patch Merging Layer. 
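+
+    Unlike the base SwinTransformer's PatchMerging, which gathers each
+    2x2 patch neighborhood by strided slicing, this SOLIDER variant
+    merges the neighborhood with nn.Unfold (kernel 2, stride 2) before
+    the LayerNorm and the 4 * dim -> 2 * dim Linear reduction.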
+ + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.sampler = nn.Unfold(kernel_sizes=2, strides=2) + self.norm = norm_layer(4 * dim) + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + + def forward(self, x): + """ + x: B, H*W, C + """ + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + assert H % 2 == 0 and W % 2 == 0, "x size ({}*{}) are not even.".format( + H, W) + + x = x.reshape([B, H, W, C]).transpose([0, 3, 1, 2]) + + x = self.sampler(x) + x = x.transpose([0, 2, 1]) + x = self.norm(x) + x = self.reduction(x) + return x + + +class SwinTransformer_SOLIDER(SwinTransformer): + def __init__(self, + embed_dim=96, + img_size=224, + patch_size=4, + in_chans=3, + class_num=1000, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer=nn.LayerNorm, + out_indices=(0, 1, 2, 3), + semantic_weight=1.0, + use_checkpoint=False, + **kwargs): + super(SwinTransformer_SOLIDER, self).__init__() + patches_resolution = self.patch_embed.patches_resolution + self.num_classes = num_classes = class_num + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + # stochastic depth + dpr = np.linspace(0, drop_path_rate, + sum(depths)).tolist() # stochastic depth decay rule + self.patch_embed = PatchEmbed_SOLIDER( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + self.out_indices = out_indices + # build layers + self.layers = nn.LayerList() + for i_layer in range(self.num_layers): + layer = BasicLayer_SOLIDER( + dim=int(embed_dim * 2 ** i_layer), + input_resolution=(patches_resolution[0] // (2 ** i_layer), + patches_resolution[1] // (2 ** i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging_SOLIDER + if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + self.num_features_s = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] + for i in out_indices: + layer = norm_layer(self.num_features_s[i]) + layer_name = f'norm{i}' + self.add_sublayer(layer_name, layer) + self.avgpool = nn.AdaptiveAvgPool2D(1) + + # semantic embedding + self.semantic_weight = semantic_weight + if self.semantic_weight >= 0: + self.semantic_embed_w = nn.LayerList() + self.semantic_embed_b = nn.LayerList() + for i in range(len(depths)): + if i >= len(depths) - 1: + i = len(depths) - 2 + semantic_embed_w = nn.Linear(2, self.num_features_s[i + 1]) + semantic_embed_b = nn.Linear(2, self.num_features_s[i + 1]) + self._init_weights(semantic_embed_w) + self._init_weights(semantic_embed_b) + self.semantic_embed_w.append(semantic_embed_w) + self.semantic_embed_b.append(semantic_embed_b) + self.softplus = nn.Softplus() + self.head = nn.Linear( + self.num_features, + num_classes) if self.num_classes > 0 else nn.Identity() + + def 
forward_features(self, x, semantic_weight=None): + if self.semantic_weight >= 0 and semantic_weight is None: + w = paddle.ones((x.shape[0], 1)) * self.semantic_weight + w = paddle.concat([w, 1 - w], axis=-1) + semantic_weight = w.cuda() + x, hw_shape = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + outs = [] + + for i, layer in enumerate(self.layers): + x, out = layer(x) + if self.semantic_weight >= 0: + sw = self.semantic_embed_w[i](semantic_weight).unsqueeze(1) + sb = self.semantic_embed_b[i](semantic_weight).unsqueeze(1) + x = x * self.softplus(sw) + sb + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + out = norm_layer(out) + out = out.reshape([-1, *hw_shape, + self.num_features_s[i]]).transpose([0, 3, 1, 2]) + hw_shape = [item // 2 for item in hw_shape] + outs.append(out) + + x = self.avgpool(outs[-1]) # B C 1 + x = paddle.flatten(x, 1) + + return x + + +def SwinTransformer_tiny_patch4_window7_224_SOLIDER( + pretrained=False, + **kwargs): + model = SwinTransformer_SOLIDER( + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + drop_path_rate=0.2, # if imagenet22k or imagenet22kto1k, set drop_path_rate=0.1 + **kwargs) + _load_pretrained( + pretrained, + model=model, + model_url=MODEL_URLS_SOLIDER["SwinTransformer_tiny_patch4_window7_224_SOLIDER"], + **kwargs) + return model + + +def SwinTransformer_small_patch4_window7_224_SOLIDER( + pretrained=False, + **kwargs): + model = SwinTransformer_SOLIDER( + embed_dim=96, + depths=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + drop_path_rate=0.3, # if imagenet22k or imagenet22kto1k, set drop_path_rate=0.2 + **kwargs) + _load_pretrained( + pretrained, + model=model, + model_url=MODEL_URLS_SOLIDER["SwinTransformer_small_patch4_window7_224_SOLIDER"], + **kwargs) + return model + + +def SwinTransformer_base_patch4_window7_224_SOLIDER( + pretrained=False, + **kwargs): + model = SwinTransformer_SOLIDER( + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + drop_path_rate=0.5, # if imagenet22k or imagenet22kto1k, set drop_path_rate=0.2 + **kwargs) + _load_pretrained( + pretrained, + model=model, + model_url=MODEL_URLS_SOLIDER["SwinTransformer_base_patch4_window7_224_SOLIDER"], + **kwargs) + return model diff --git a/example/low_precision_training/ppcls/arch/backbone/variant_models/vgg_variant.py b/example/low_precision_training/ppcls/arch/backbone/variant_models/vgg_variant.py new file mode 100755 index 000000000..36ce47768 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/backbone/variant_models/vgg_variant.py @@ -0,0 +1,28 @@ +import paddle +from paddle.nn import Sigmoid +from ..legendary_models.vgg import VGG19 + +__all__ = ["VGG19Sigmoid"] + + +class SigmoidSuffix(paddle.nn.Layer): + def __init__(self, origin_layer): + super().__init__() + self.origin_layer = origin_layer + self.sigmoid = Sigmoid() + + def forward(self, input, res_dict=None, **kwargs): + x = self.origin_layer(input) + x = self.sigmoid(x) + return x + + +def VGG19Sigmoid(pretrained=False, use_ssld=False, **kwargs): + def replace_function(origin_layer, pattern): + new_layer = SigmoidSuffix(origin_layer) + return new_layer + + pattern = "fc2" + model = VGG19(pretrained=pretrained, use_ssld=use_ssld, **kwargs) + model.upgrade_sublayer(pattern, replace_function) + return model diff --git a/example/low_precision_training/ppcls/arch/distill/afd_attention.py b/example/low_precision_training/ppcls/arch/distill/afd_attention.py new 
file mode 100755 index 000000000..63b094f31 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/distill/afd_attention.py @@ -0,0 +1,123 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import paddle.nn as nn +import paddle.nn.functional as F +import paddle +import numpy as np + + +class LinearBNReLU(nn.Layer): + def __init__(self, nin, nout): + super().__init__() + self.linear = nn.Linear(nin, nout) + self.bn = nn.BatchNorm1D(nout) + self.relu = nn.ReLU() + + def forward(self, x, relu=True): + if relu: + return self.relu(self.bn(self.linear(x))) + return self.bn(self.linear(x)) + + +def unique_shape(s_shapes): + n_s = [] + unique_shapes = [] + n = -1 + for s_shape in s_shapes: + if s_shape not in unique_shapes: + unique_shapes.append(s_shape) + n += 1 + n_s.append(n) + return n_s, unique_shapes + + +class LinearTransformTeacher(nn.Layer): + def __init__(self, qk_dim, t_shapes, keys): + super().__init__() + self.teacher_keys = keys + self.t_shapes = [[1] + t_i for t_i in t_shapes] + self.query_layer = nn.LayerList( + [LinearBNReLU(t_shape[1], qk_dim) for t_shape in self.t_shapes]) + + def forward(self, t_features_dict): + g_t = [t_features_dict[key] for key in self.teacher_keys] + bs = g_t[0].shape[0] + channel_mean = [f_t.mean(3).mean(2) for f_t in g_t] + spatial_mean = [] + for i in range(len(g_t)): + c, h, w = g_t[i].shape[1:] + spatial_mean.append(g_t[i].pow(2).mean(1).reshape([bs, h * w])) + query = paddle.stack( + [ + query_layer( + f_t, relu=False) + for f_t, query_layer in zip(channel_mean, self.query_layer) + ], + axis=1) + value = [F.normalize(f_s, axis=1) for f_s in spatial_mean] + return {"query": query, "value": value} + + +class LinearTransformStudent(nn.Layer): + def __init__(self, qk_dim, t_shapes, s_shapes, keys): + super().__init__() + self.student_keys = keys + self.t_shapes = [[1] + t_i for t_i in t_shapes] + self.s_shapes = [[1] + s_i for s_i in s_shapes] + self.t = len(self.t_shapes) + self.s = len(self.s_shapes) + self.qk_dim = qk_dim + self.n_t, self.unique_t_shapes = unique_shape(self.t_shapes) + self.relu = nn.ReLU() + self.samplers = nn.LayerList( + [Sample(t_shape) for t_shape in self.unique_t_shapes]) + self.key_layer = nn.LayerList([ + LinearBNReLU(s_shape[1], self.qk_dim) for s_shape in self.s_shapes + ]) + self.bilinear = LinearBNReLU(qk_dim, qk_dim * len(self.t_shapes)) + + def forward(self, s_features_dict): + g_s = [s_features_dict[key] for key in self.student_keys] + bs = g_s[0].shape[0] + channel_mean = [f_s.mean(3).mean(2) for f_s in g_s] + spatial_mean = [sampler(g_s, bs) for sampler in self.samplers] + + key = paddle.stack( + [ + key_layer(f_s) + for key_layer, f_s in zip(self.key_layer, channel_mean) + ], + axis=1).reshape([-1, self.qk_dim]) # Bs x h + bilinear_key = self.bilinear( + key, relu=False).reshape([bs, self.s, self.t, self.qk_dim]) + value = [F.normalize(s_m, axis=2) for s_m in spatial_mean] + return {"bilinear_key": bilinear_key, "value": value} + + +class Sample(nn.Layer): 
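+    """Pool a student feature map onto a teacher stage's spatial size.
+
+    Each student map is first reduced to a spatial attention map (the
+    squared channel-wise mean), then adaptively average-pooled to the
+    teacher's (t_H, t_W) grid, so attention can be compared across
+    stages with mismatched resolutions.
+    """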
+    def __init__(self, t_shape):
+        super().__init__()
+        self.t_N, self.t_C, self.t_H, self.t_W = t_shape
+        self.sample = nn.AdaptiveAvgPool2D((self.t_H, self.t_W))
+
+    def forward(self, g_s, bs):
+        g_s = paddle.stack(
+            [
+                self.sample(f_s.pow(2).mean(
+                    1, keepdim=True)).reshape([bs, self.t_H * self.t_W])
+                for f_s in g_s
+            ],
+            axis=1)
+        return g_s
diff --git a/example/low_precision_training/ppcls/arch/gears/__init__.py b/example/low_precision_training/ppcls/arch/gears/__init__.py
new file mode 100755
index 000000000..b2a6f46a0
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/gears/__init__.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .arcmargin import ArcMargin
+from .cosmargin import CosMargin
+from .circlemargin import CircleMargin
+from .fc import FC
+from .vehicle_neck import VehicleNeck
+from paddle.nn import Tanh, Identity
+from .bnneck import BNNeck
+from .adamargin import AdaMargin
+from .frfn_neck import FRFNNeck
+from .metabnneck import MetaBNNeck
+from .ml_decoder import MLDecoder
+
+__all__ = ['build_gear', 'add_ml_decoder_head']
+
+
+def build_gear(config):
+    support_dict = [
+        'ArcMargin', 'CosMargin', 'CircleMargin', 'FC', 'VehicleNeck', 'Tanh',
+        'BNNeck', 'AdaMargin', 'FRFNNeck', 'MetaBNNeck'
+    ]
+    module_name = config.pop('name')
+    assert module_name in support_dict, Exception(
+        'head only supports {}'.format(support_dict))
+    module_class = eval(module_name)(**config)
+    return module_class
+
+
+def add_ml_decoder_head(model, config):
+    if 'class_num' not in config:
+        if hasattr(model, 'class_num'):
+            config['class_num'] = model.class_num
+        else:
+            raise AttributeError(
+                'Please manually add parameter `class_num` '
+                'for MLDecoder in the config file.')
+
+    # remove_layers: list of layer names that need to be deleted from backbone
+    if 'remove_layers' in config:
+        remove_layers = config.pop('remove_layers')
+    else:
+        remove_layers = ['avg_pool', 'flatten']
+    for remove_layer in remove_layers:
+        if hasattr(model, remove_layer):
+            delattr(model, remove_layer)
+            setattr(model, remove_layer, Identity())
+        else:
+            raise AttributeError(
+                f"the model does not have attribute `{remove_layer}`.")
+
+    # replace_layer: layer name that needs to be replaced in backbone
+    if 'replace_layer' in config:
+        replace_layer = config.pop('replace_layer')
+    else:
+        replace_layer = 'fc'
+    if hasattr(model, replace_layer):
+        delattr(model, replace_layer)
+        setattr(model, replace_layer, MLDecoder(**config))
+    else:
+        raise AttributeError(
+            f"the model does not have attribute `{replace_layer}`.")
diff --git a/example/low_precision_training/ppcls/arch/gears/adamargin.py b/example/low_precision_training/ppcls/arch/gears/adamargin.py
new file mode 100755
index 000000000..1b0f5f245
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/gears/adamargin.py
@@ -0,0 +1,111 @@
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is based on AdaFace(https://github.com/mk-minchul/AdaFace) +# Paper: AdaFace: Quality Adaptive Margin for Face Recognition +from paddle.nn import Layer +import math +import paddle + + +def l2_norm(input, axis=1): + norm = paddle.norm(input, 2, axis, True) + output = paddle.divide(input, norm) + return output + + +class AdaMargin(Layer): + def __init__( + self, + embedding_size=512, + class_num=70722, + m=0.4, + h=0.333, + s=64., + t_alpha=1.0, ): + super(AdaMargin, self).__init__() + self.classnum = class_num + kernel_weight = paddle.uniform( + [embedding_size, class_num], min=-1, max=1) + kernel_weight_norm = paddle.norm( + kernel_weight, p=2, axis=0, keepdim=True) + kernel_weight_norm = paddle.where(kernel_weight_norm > 1e-5, + kernel_weight_norm, + paddle.ones_like(kernel_weight_norm)) + kernel_weight = kernel_weight / kernel_weight_norm + self.kernel = self.create_parameter( + [embedding_size, class_num], + attr=paddle.nn.initializer.Assign(kernel_weight)) + + # initial kernel + # self.kernel.data.uniform_(-1, 1).renorm_(2,1,1e-5).mul_(1e5) + self.m = m + self.eps = 1e-3 + self.h = h + self.s = s + + # ema prep + self.t_alpha = t_alpha + self.register_buffer('t', paddle.zeros([1]), persistable=True) + self.register_buffer( + 'batch_mean', paddle.ones([1]) * 20, persistable=True) + self.register_buffer( + 'batch_std', paddle.ones([1]) * 100, persistable=True) + + def forward(self, embbedings, label): + + norms = paddle.norm(embbedings, 2, 1, True) + embbedings = paddle.divide(embbedings, norms) + kernel_norm = l2_norm(self.kernel, axis=0) + cosine = paddle.mm(embbedings, kernel_norm) + cosine = paddle.clip(cosine, -1 + self.eps, + 1 - self.eps) # for stability + + safe_norms = paddle.clip(norms, min=0.001, max=100) # for stability + safe_norms = safe_norms.clone().detach() + + # update batchmean batchstd + with paddle.no_grad(): + mean = safe_norms.mean().detach() + std = safe_norms.std().detach() + self.batch_mean = mean * self.t_alpha + (1 - self.t_alpha + ) * self.batch_mean + self.batch_std = std * self.t_alpha + (1 - self.t_alpha + ) * self.batch_std + + margin_scaler = (safe_norms - self.batch_mean) / ( + self.batch_std + self.eps) # 66% between -1, 1 + margin_scaler = margin_scaler * self.h # 68% between -0.333 ,0.333 when h:0.333 + margin_scaler = paddle.clip(margin_scaler, -1, 1) + + # g_angular + m_arc = paddle.nn.functional.one_hot( + label.reshape([-1]), self.classnum) + g_angular = self.m * margin_scaler * -1 + m_arc = m_arc * g_angular + theta = paddle.acos(cosine) + theta_m = paddle.clip( + theta + m_arc, min=self.eps, max=math.pi - self.eps) + cosine = paddle.cos(theta_m) + + # g_additive + m_cos = paddle.nn.functional.one_hot( + label.reshape([-1]), self.classnum) + g_add = self.m + (self.m * margin_scaler) + m_cos = m_cos * g_add + cosine = cosine - m_cos + + # scale + scaled_cosine_m = cosine * self.s + return scaled_cosine_m diff --git a/example/low_precision_training/ppcls/arch/gears/arcmargin.py 
b/example/low_precision_training/ppcls/arch/gears/arcmargin.py new file mode 100755 index 000000000..6c72a71a2 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/gears/arcmargin.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1801.07698 + +import paddle +import paddle.nn as nn +import math + + +class ArcMargin(nn.Layer): + def __init__(self, + embedding_size, + class_num, + margin=0.5, + scale=80.0, + easy_margin=False): + super().__init__() + self.embedding_size = embedding_size + self.class_num = class_num + self.margin = margin + self.scale = scale + self.easy_margin = easy_margin + self.weight = self.create_parameter( + shape=[self.embedding_size, self.class_num], + is_bias=False, + default_initializer=paddle.nn.initializer.XavierNormal()) + + def forward(self, input, label=None): + input_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + input = paddle.divide(input, input_norm) + + weight_norm = paddle.sqrt( + paddle.sum(paddle.square(self.weight), axis=0, keepdim=True)) + weight = paddle.divide(self.weight, weight_norm) + + cos = paddle.matmul(input, weight) + if not self.training or label is None: + return cos + sin = paddle.sqrt(1.0 - paddle.square(cos) + 1e-6) + cos_m = math.cos(self.margin) + sin_m = math.sin(self.margin) + phi = cos * cos_m - sin * sin_m + + th = math.cos(self.margin) * (-1) + mm = math.sin(self.margin) * self.margin + if self.easy_margin: + phi = self._paddle_where_more_than(cos, 0, phi, cos) + else: + phi = self._paddle_where_more_than(cos, th, phi, cos - mm) + + one_hot = paddle.nn.functional.one_hot(label, self.class_num) + one_hot = paddle.squeeze(one_hot, axis=[1]) + output = paddle.multiply(one_hot, phi) + paddle.multiply( + (1.0 - one_hot), cos) + output = output * self.scale + return output + + def _paddle_where_more_than(self, target, limit, x, y): + mask = paddle.cast(x=(target > limit), dtype='float32') + output = paddle.multiply(mask, x) + paddle.multiply((1.0 - mask), y) + return output diff --git a/example/low_precision_training/ppcls/arch/gears/bnneck.py b/example/low_precision_training/ppcls/arch/gears/bnneck.py new file mode 100755 index 000000000..e7abb8921 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/gears/bnneck.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
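
A note on the ArcMargin head above: during training it applies the ArcFace additive angular margin, replacing the target-class logit cos(θ) with cos(θ + m) before scaling by s, while eval mode (or `label=None`) returns the plain cosine similarities. Below is a minimal, self-contained sketch of that margin arithmetic in plain Paddle; the shapes and the margin/scale values are illustrative assumptions, not values taken from this patch:

```python
import math
import paddle
import paddle.nn.functional as F

margin, scale = 0.5, 64.0  # illustrative; arcmargin.py defaults to margin=0.5, scale=80.0
feat = F.normalize(paddle.randn([4, 128]), axis=1)     # L2-normalized embeddings
weight = F.normalize(paddle.randn([128, 10]), axis=0)  # L2-normalized class weights
cos = paddle.matmul(feat, weight)                      # cos(theta), shape [4, 10]

sin = paddle.sqrt(paddle.clip(1.0 - cos * cos, min=0.0))
phi = cos * math.cos(margin) - sin * math.sin(margin)  # cos(theta + m)

label = paddle.to_tensor([0, 3, 7, 9])
one_hot = F.one_hot(label, num_classes=10)
logits = scale * (one_hot * phi + (1.0 - one_hot) * cos)  # margin on target class only
```
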
+ +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn + +from ..utils import get_param_attr_dict + + +class BNNeck(nn.Layer): + def __init__(self, num_features, **kwargs): + super().__init__() + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=1.0)) + bias_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + trainable=False) + + if 'weight_attr' in kwargs: + weight_attr = get_param_attr_dict(kwargs['weight_attr']) + + bias_attr = None + if 'bias_attr' in kwargs: + bias_attr = get_param_attr_dict(kwargs['bias_attr']) + + use_global_stats = None + if 'use_global_stats' in kwargs: + use_global_stats = get_param_attr_dict(kwargs['use_global_stats']) + + self.feat_bn = nn.BatchNorm1D( + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=weight_attr, + bias_attr=bias_attr, + use_global_stats=use_global_stats) + + self.flatten = nn.Flatten() + + def forward(self, x): + x = self.flatten(x) + x = self.feat_bn(x) + return x diff --git a/example/low_precision_training/ppcls/arch/gears/circlemargin.py b/example/low_precision_training/ppcls/arch/gears/circlemargin.py new file mode 100755 index 000000000..c04d6618b --- /dev/null +++ b/example/low_precision_training/ppcls/arch/gears/circlemargin.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/2002.10857 + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class CircleMargin(nn.Layer): + def __init__(self, embedding_size, class_num, margin, scale): + super(CircleMargin, self).__init__() + self.scale = scale + self.margin = margin + self.embedding_size = embedding_size + self.class_num = class_num + + self.weight = self.create_parameter( + shape=[self.embedding_size, self.class_num], + is_bias=False, + default_initializer=paddle.nn.initializer.XavierNormal()) + + def forward(self, input, label): + feat_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + input = paddle.divide(input, feat_norm) + + weight_norm = paddle.sqrt( + paddle.sum(paddle.square(self.weight), axis=0, keepdim=True)) + weight = paddle.divide(self.weight, weight_norm) + + logits = paddle.matmul(input, weight) + if not self.training or label is None: + return logits + + alpha_p = paddle.clip(-logits.detach() + 1 + self.margin, min=0.) + alpha_n = paddle.clip(logits.detach() + self.margin, min=0.) 
+ delta_p = 1 - self.margin + delta_n = self.margin + + m_hot = F.one_hot(label.reshape([-1]), num_classes=logits.shape[1]) + + logits_p = alpha_p * (logits - delta_p) + logits_n = alpha_n * (logits - delta_n) + pre_logits = logits_p * m_hot + logits_n * (1 - m_hot) + pre_logits = self.scale * pre_logits + + return pre_logits diff --git a/example/low_precision_training/ppcls/arch/gears/cosmargin.py b/example/low_precision_training/ppcls/arch/gears/cosmargin.py new file mode 100755 index 000000000..d420c0ace --- /dev/null +++ b/example/low_precision_training/ppcls/arch/gears/cosmargin.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1801.09414 + +import paddle +import math +import paddle.nn as nn + + +class CosMargin(paddle.nn.Layer): + def __init__(self, embedding_size, class_num, margin=0.35, scale=64.0): + super(CosMargin, self).__init__() + self.scale = scale + self.margin = margin + self.embedding_size = embedding_size + self.class_num = class_num + + self.weight = self.create_parameter( + shape=[self.embedding_size, self.class_num], + is_bias=False, + default_initializer=paddle.nn.initializer.XavierNormal()) + + def forward(self, input, label): + label.stop_gradient = True + + input_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + input = paddle.divide(input, input_norm) + + weight_norm = paddle.sqrt( + paddle.sum(paddle.square(self.weight), axis=0, keepdim=True)) + weight = paddle.divide(self.weight, weight_norm) + + cos = paddle.matmul(input, weight) + if not self.training or label is None: + return cos + + cos_m = cos - self.margin + + one_hot = paddle.nn.functional.one_hot(label, self.class_num) + one_hot = paddle.squeeze(one_hot, axis=[1]) + output = paddle.multiply(one_hot, cos_m) + paddle.multiply( + (1.0 - one_hot), cos) + output = output * self.scale + return output diff --git a/example/low_precision_training/ppcls/arch/gears/fc.py b/example/low_precision_training/ppcls/arch/gears/fc.py new file mode 100755 index 000000000..622b0d37d --- /dev/null +++ b/example/low_precision_training/ppcls/arch/gears/fc.py @@ -0,0 +1,48 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
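
CosMargin above is the CosFace variant of these heads: rather than shifting the angle, it subtracts the margin directly from the target-class cosine, so the target logit becomes s·(cosθ − m) while every other logit stays s·cosθ; with `label=None` or in eval mode the head returns the raw cosine matrix. A quick numeric check of that rule, as a standalone sketch with made-up values:

```python
import paddle
import paddle.nn.functional as F

margin, scale = 0.35, 64.0                  # the defaults used by CosMargin above
cos = paddle.to_tensor([[0.9, 0.2, -0.1]])  # cosine similarities for one sample, 3 classes
label = paddle.to_tensor([0])

one_hot = F.one_hot(label, num_classes=3)
out = scale * (one_hot * (cos - margin) + (1.0 - one_hot) * cos)
print(out.numpy())  # [[35.2  12.8  -6.4]]: only the class-0 logit is penalized
```
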
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + +from ..utils import get_param_attr_dict + + +class FC(nn.Layer): + def __init__(self, embedding_size, class_num, **kwargs): + super(FC, self).__init__() + self.embedding_size = embedding_size + self.class_num = class_num + + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.XavierNormal()) + if 'weight_attr' in kwargs: + weight_attr = get_param_attr_dict(kwargs['weight_attr']) + + bias_attr = None + if 'bias_attr' in kwargs: + bias_attr = get_param_attr_dict(kwargs['bias_attr']) + + self.fc = nn.Linear( + self.embedding_size, + self.class_num, + weight_attr=weight_attr, + bias_attr=bias_attr) + + def forward(self, input, label=None): + out = self.fc(input) + return out diff --git a/example/low_precision_training/ppcls/arch/gears/frfn_neck.py b/example/low_precision_training/ppcls/arch/gears/frfn_neck.py new file mode 100755 index 000000000..43eadbab4 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/gears/frfn_neck.py @@ -0,0 +1,32 @@ +import paddle.nn as nn + + +class Normalize(nn.Layer): + """ Ln normalization copied from + https://github.com/salesforce/CoMatch + """ + + def __init__(self, power=2): + super(Normalize, self).__init__() + self.power = power + + def forward(self, x): + norm = x.pow(self.power).sum(1, keepdim=True).pow(1. / self.power) + out = x.divide(norm) + return out + + +class FRFNNeck(nn.Layer): + def __init__(self, num_features, low_dim, **kwargs): + super(FRFNNeck, self).__init__() + self.l2norm = Normalize(2) + self.fc1 = nn.Linear(num_features, num_features) + self.relu_mlp = nn.LeakyReLU(negative_slope=0.1) + self.fc2 = nn.Linear(num_features, low_dim) + + def forward(self, x): + x = self.fc1(x) + x = self.relu_mlp(x) + x = self.fc2(x) + x = self.l2norm(x) + return x \ No newline at end of file diff --git a/example/low_precision_training/ppcls/arch/gears/identity_head.py b/example/low_precision_training/ppcls/arch/gears/identity_head.py new file mode 100755 index 000000000..7d11e5742 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/gears/identity_head.py @@ -0,0 +1,9 @@ +from paddle import nn + + +class IdentityHead(nn.Layer): + def __init__(self): + super(IdentityHead, self).__init__() + + def forward(self, x, label=None): + return {"features": x, "logits": None} diff --git a/example/low_precision_training/ppcls/arch/gears/metabnneck.py b/example/low_precision_training/ppcls/arch/gears/metabnneck.py new file mode 100755 index 000000000..a27a7a852 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/gears/metabnneck.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
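
All of the gears in this directory are constructed through `build_gear` from `gears/__init__.py`: the `name` key of the config dict selects the class and the remaining keys are forwarded to its constructor. A minimal sketch of building and calling the FC head defined above (assuming the `ppcls` package from this patch is on the import path; the sizes are illustrative):

```python
import paddle
from ppcls.arch.gears import build_gear

# build_gear pops 'name' and passes the remaining keys to FC.__init__
config = {'name': 'FC', 'embedding_size': 512, 'class_num': 1000}
head = build_gear(config)

feat = paddle.randn([8, 512])  # embeddings from a backbone
logits = head(feat)            # FC accepts an optional label argument and ignores it
print(logits.shape)            # [8, 1000]
```
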
+ +from __future__ import absolute_import, division, print_function + +from collections import defaultdict +import copy +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ..utils import get_param_attr_dict + + +class MetaBN1D(nn.BatchNorm1D): + def forward(self, inputs, opt={}): + mode = opt.get("bn_mode", "general") if self.training else "eval" + if mode == "general": # update, but not apply running_mean/var + result = F.batch_norm(inputs, self._mean, self._variance, + self.weight, self.bias, self.training, + self._momentum, self._epsilon) + elif mode == "hold": # not update, not apply running_mean/var + result = F.batch_norm( + inputs, + paddle.mean( + inputs, axis=0), + paddle.var(inputs, axis=0), + self.weight, + self.bias, + self.training, + self._momentum, + self._epsilon) + elif mode == "eval": # fix and apply running_mean/var, + if self._mean is None: + result = F.batch_norm( + inputs, + paddle.mean( + inputs, axis=0), + paddle.var(inputs, axis=0), + self.weight, + self.bias, + True, + self._momentum, + self._epsilon) + else: + result = F.batch_norm(inputs, self._mean, self._variance, + self.weight, self.bias, False, + self._momentum, self._epsilon) + return result + + +class MetaBNNeck(nn.Layer): + def __init__(self, num_features, **kwargs): + super(MetaBNNeck, self).__init__() + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=1.0)) + bias_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + trainable=False) + + if 'weight_attr' in kwargs: + weight_attr = get_param_attr_dict(kwargs['weight_attr']) + + bias_attr = None + if 'bias_attr' in kwargs: + bias_attr = get_param_attr_dict(kwargs['bias_attr']) + + use_global_stats = None + if 'use_global_stats' in kwargs: + use_global_stats = get_param_attr_dict(kwargs['use_global_stats']) + + self.feat_bn = MetaBN1D( + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=weight_attr, + bias_attr=bias_attr, + use_global_stats=use_global_stats) + self.flatten = nn.Flatten() + self.opt = {} + + def forward(self, x): + x = self.flatten(x) + x = self.feat_bn(x, self.opt) + return x + + def reset_opt(self): + self.opt = defaultdict() + + def setup_opt(self, opt): + """ + Arg: + opt (dict): Optional setting to change the behavior of MetaBIN during training. + It includes three settings which are `enable_inside_update`, `lr_gate` and `bn_mode`. + """ + self.check_opt(opt) + self.opt = copy.deepcopy(opt) + + @classmethod + def check_opt(cls, opt): + assert isinstance(opt, dict), \ + TypeError('Got the wrong type of `opt`. Please use `dict` type.') + + if opt.get('enable_inside_update', False) and 'lr_gate' not in opt: + raise RuntimeError('Missing `lr_gate` in opt.') + + assert isinstance(opt.get('lr_gate', 1.0), float), \ + TypeError('Got the wrong type of `lr_gate`. Please use `float` type.') + assert isinstance(opt.get('enable_inside_update', True), bool), \ + TypeError('Got the wrong type of `enable_inside_update`. Please use `bool` type.') + assert opt.get('bn_mode', "general") in ["general", "hold", "eval"], \ + TypeError('Got the wrong value of `bn_mode`.') diff --git a/example/low_precision_training/ppcls/arch/gears/ml_decoder.py b/example/low_precision_training/ppcls/arch/gears/ml_decoder.py new file mode 100755 index 000000000..7f24bac2f --- /dev/null +++ b/example/low_precision_training/ppcls/arch/gears/ml_decoder.py @@ -0,0 +1,124 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import XavierNormal, Constant, Normal + +xavier_normal_ = XavierNormal() +normal_ = Normal +zero_ = Constant(value=0.0) + + +class MLDecoder(nn.Layer): + """ + ML-Decoder is an attention-based classification head, + which introduced by Tal Ridnik et al. in https://arxiv.org/pdf/2111.12933.pdf. + """ + + def __init__(self, + class_num=80, + in_channels=2048, + query_num=80, + embed_dim=768, + depth=1, + num_heads=8, + mlp_hidden_dim=2048, + dropout=0.1, + activation="relu", + freeze_query_embed=True, + remove_self_attn=True): + super().__init__() + self.class_num = class_num + self.in_channels = in_channels + + # 1 <= query_num <= class_num + query_num = min(max(query_num, 1), class_num) + + self.input_proj = nn.Conv2D( + in_channels=in_channels, + out_channels=embed_dim, + kernel_size=1, + stride=1) + + self.query_pos_embed = nn.Embedding( + num_embeddings=query_num, + embedding_dim=embed_dim) + if freeze_query_embed: + self.query_pos_embed.weight.stop_gradient = True + + decoder_layer = nn.TransformerDecoderLayer( + d_model=embed_dim, + nhead=num_heads, + dim_feedforward=mlp_hidden_dim, + dropout=dropout, + activation=activation, + attn_dropout=dropout, + act_dropout=dropout) + if remove_self_attn: + del decoder_layer.self_attn + decoder_layer.self_attn = self.self_attn_identity + self.decoder = nn.TransformerDecoder( + decoder_layer=decoder_layer, + num_layers=depth) + + group_factor = math.ceil(class_num / query_num) + self.group_conv = nn.Conv2D( + in_channels=query_num * embed_dim, + out_channels=query_num * group_factor, + kernel_size=1, + stride=1, + groups=query_num) + + self._init_weights() + + def _init_weights(self): + normal_(self.query_pos_embed.weight) + xavier_normal_(self.group_conv.weight) + zero_(self.group_conv.bias) + + @staticmethod + def self_attn_identity(*args): + return args[0] + + def group_fc_pool(self, x): + x = x.flatten(1)[..., None, None] + x = self.group_conv(x) + x = x.flatten(1)[:, :self.class_num] + return x + + def forward(self, x): + if x.ndim == 2: + assert x.shape[1] % self.in_channels == 0, "Wrong `in_channels` value!!!" + x = x.reshape([x.shape[0], self.in_channels, -1, 1]) + elif x.ndim == 3: + assert x.shape[1] == self.in_channels, "Wrong input shape!!!" + x = x.unsqueeze(-1) + else: + assert x.ndim == 4 and x.shape[1] == self.in_channels, "Wrong input shape!!!" 
+
+        feat_proj = F.relu(self.input_proj(x))
+        feat_flatten = feat_proj.flatten(2).transpose([0, 2, 1])
+
+        query_pos_embed = self.query_pos_embed.weight[None].tile([x.shape[0], 1, 1])
+        out_embed = self.decoder(query_pos_embed, feat_flatten)
+
+        logit = self.group_fc_pool(out_embed)
+        return logit
diff --git a/example/low_precision_training/ppcls/arch/gears/vehicle_neck.py b/example/low_precision_training/ppcls/arch/gears/vehicle_neck.py
new file mode 100755
index 000000000..05f4e333f
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/gears/vehicle_neck.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import, division, print_function
+
+import paddle
+import paddle.nn as nn
+
+
+class VehicleNeck(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=1,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 padding_mode='zeros',
+                 weight_attr=None,
+                 bias_attr=None,
+                 data_format='NCHW'):
+        super().__init__()
+        self.conv = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            padding_mode=padding_mode,
+            weight_attr=weight_attr,
+            bias_attr=bias_attr,
+            data_format=data_format)
+        self.flatten = nn.Flatten()
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.flatten(x)
+        return x
diff --git a/example/low_precision_training/ppcls/arch/slim/__init__.py b/example/low_precision_training/ppcls/arch/slim/__init__.py
new file mode 100755
index 000000000..734bfefad
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/slim/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .prune import prune_model
+from .quant import quantize_model
diff --git a/example/low_precision_training/ppcls/arch/slim/prune.py b/example/low_precision_training/ppcls/arch/slim/prune.py
new file mode 100755
index 000000000..59cd411a1
--- /dev/null
+++ b/example/low_precision_training/ppcls/arch/slim/prune.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function +import paddle +from ...utils import logger + + +def prune_model(config, model): + if config.get("Slim", False) and config["Slim"].get("prune", False): + import paddleslim + prune_method_name = config["Slim"]["prune"]["name"].lower() + assert prune_method_name in [ + "fpgm", "l1_norm" + ], "The prune methods only support 'fpgm' and 'l1_norm'" + if prune_method_name == "fpgm": + model.pruner = paddleslim.dygraph.FPGMFilterPruner( + model, [1] + config["Global"]["image_shape"]) + else: + model.pruner = paddleslim.dygraph.L1NormFilterPruner( + model, [1] + config["Global"]["image_shape"]) + + # prune model + _prune_model(config, model) + else: + model.pruner = None + + +def _prune_model(config, model): + from paddleslim.analysis import dygraph_flops as flops + logger.info("FLOPs before pruning: {}GFLOPs".format( + flops(model, [1] + config["Global"]["image_shape"]) / 1e9)) + model.eval() + + params = [] + for sublayer in model.sublayers(): + for param in sublayer.parameters(include_sublayers=False): + if isinstance(sublayer, paddle.nn.Conv2D): + params.append(param.name) + ratios = {} + for param in params: + ratios[param] = config["Slim"]["prune"]["pruned_ratio"] + plan = model.pruner.prune_vars(ratios, [0]) + + logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format( + flops(model, [1] + config["Global"]["image_shape"]) / 1e9, + plan.pruned_flops)) + + for param in model.parameters(): + if "conv2d" in param.name: + logger.info("{}\t{}".format(param.name, param.shape)) + + model.train() diff --git a/example/low_precision_training/ppcls/arch/slim/quant.py b/example/low_precision_training/ppcls/arch/slim/quant.py new file mode 100755 index 000000000..3e31d9d53 --- /dev/null +++ b/example/low_precision_training/ppcls/arch/slim/quant.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function +import paddle +from ...utils import logger + +QUANT_CONFIG = { + # weight preprocess type, default is None and no preprocessing is performed. + 'weight_preprocess_type': None, + # activation preprocess type, default is None and no preprocessing is performed. 
+ 'activation_preprocess_type': None, + # weight quantize type, default is 'channel_wise_abs_max' + 'weight_quantize_type': 'channel_wise_abs_max', + # activation quantize type, default is 'moving_average_abs_max' + 'activation_quantize_type': 'moving_average_abs_max', + # weight quantize bit num, default is 8 + 'weight_bits': 8, + # activation quantize bit num, default is 8 + 'activation_bits': 8, + # data type after quantization, such as 'uint8', 'int8', etc. default is 'int8' + 'dtype': 'int8', + # window size for 'range_abs_max' quantization. default is 10000 + 'window_size': 10000, + # The decay coefficient of moving average, default is 0.9 + 'moving_rate': 0.9, + # for dygraph quantization, layers of type in quantizable_layer_type will be quantized + 'quantizable_layer_type': ['Conv2D', 'Linear'], +} + + +def quantize_model(config, model, mode="train"): + if config.get("Slim", False) and config["Slim"].get("quant", False): + from paddleslim.dygraph.quant import QAT + assert config["Slim"]["quant"]["name"].lower( + ) == 'pact', 'Only PACT quantization method is supported now' + QUANT_CONFIG["activation_preprocess_type"] = "PACT" + if mode in ["infer", "export"]: + QUANT_CONFIG['activation_preprocess_type'] = None + + # for re-parameterization nets, convert to reparameterized model first + for layer in model.sublayers(): + if hasattr(layer, "re_parameterize"): + layer.re_parameterize() + + model.quanter = QAT(config=QUANT_CONFIG) + model.quanter.quantize(model) + logger.info("QAT model summary:") + paddle.summary(model, (1, 3, 224, 224)) + else: + model.quanter = None + return diff --git a/example/low_precision_training/ppcls/arch/utils.py b/example/low_precision_training/ppcls/arch/utils.py new file mode 100755 index 000000000..785b7fbbe --- /dev/null +++ b/example/low_precision_training/ppcls/arch/utils.py @@ -0,0 +1,99 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import six +import types +import paddle +from difflib import SequenceMatcher + +from . 
import backbone
+from typing import Any, Dict, Union
+
+
+def get_architectures():
+    """
+    get all model architectures
+    """
+    names = []
+    for k, v in backbone.__dict__.items():
+        if isinstance(v, (types.FunctionType, six.class_types)):
+            names.append(k)
+    return names
+
+
+def get_blacklist_model_in_static_mode():
+    from ppcls.arch.backbone import distilled_vision_transformer
+    from ppcls.arch.backbone import vision_transformer
+    blacklist = distilled_vision_transformer.__all__ + vision_transformer.__all__
+    return blacklist
+
+
+def similar_architectures(name='', names=[], thresh=0.1, topk=10):
+    """
+    find architectures whose names are similar to the given name
+    """
+    scores = []
+    for idx, n in enumerate(names):
+        if n.startswith('__'):
+            continue
+        score = SequenceMatcher(None, n.lower(), name.lower()).quick_ratio()
+        if score > thresh:
+            scores.append((idx, score))
+    scores.sort(key=lambda x: x[1], reverse=True)
+    similar_names = [names[s[0]] for s in scores[:min(topk, len(scores))]]
+    return similar_names
+
+
+def get_param_attr_dict(ParamAttr_config: Union[None, bool, Dict[str, Dict]]
+                        ) -> Union[None, bool, paddle.ParamAttr]:
+    """parse a ParamAttr from a dict
+
+    Args:
+        ParamAttr_config (Union[None, bool, Dict[str, Dict]]): ParamAttr configuration
+
+    Returns:
+        Union[None, bool, paddle.ParamAttr]: Generated ParamAttr
+    """
+    if ParamAttr_config is None:
+        return None
+    if isinstance(ParamAttr_config, bool):
+        return ParamAttr_config
+    ParamAttr_dict = {}
+    if 'initializer' in ParamAttr_config:
+        initializer_cfg = ParamAttr_config.get('initializer')
+        if 'name' in initializer_cfg:
+            initializer_name = initializer_cfg.pop('name')
+            ParamAttr_dict['initializer'] = getattr(
+                paddle.nn.initializer, initializer_name)(**initializer_cfg)
+        else:
+            raise ValueError("'name' must be specified in initializer_cfg")
+    if 'learning_rate' in ParamAttr_config:
+        # NOTE: only a single value is supported now
+        learning_rate_value = ParamAttr_config.get('learning_rate')
+        if isinstance(learning_rate_value, (int, float)):
+            ParamAttr_dict['learning_rate'] = learning_rate_value
+        else:
+            raise ValueError(
+                f"learning_rate_value must be float or int, but got {type(learning_rate_value)}"
+            )
+    if 'regularizer' in ParamAttr_config:
+        regularizer_cfg = ParamAttr_config.get('regularizer')
+        if 'name' in regularizer_cfg:
+            # L1Decay or L2Decay
+            regularizer_name = regularizer_cfg.pop('name')
+            ParamAttr_dict['regularizer'] = getattr(
+                paddle.regularizer, regularizer_name)(**regularizer_cfg)
+        else:
+            raise ValueError("'name' must be specified in regularizer_cfg")
+    return paddle.ParamAttr(**ParamAttr_dict)
diff --git a/example/low_precision_training/ppcls/configs/.DS_Store b/example/low_precision_training/ppcls/configs/.DS_Store
new file mode 100755
index 0000000000000000000000000000000000000000..0def6937f61c959081779c875cdd29cd4942de67
GIT binary patch
literal 6148
zcmeHK%}N6?5T3Nvrijpkf_MseZLux*hnJ<+gBK63=s~6K(nS~7jkH^PD206heIXyj
z=W!-U3#AI4M05rwUot-%@@2^+03h1KUK5}S01}n3P{HOCAwTJY6r6=nm~TXI3K3-C
zXz;WU&4%B|0PUTG6L^FG?r`sB`xhoUMBiKZl*EIq(RdeyVrgl)EX#65-uU-=VY1ed~}FS?N|c
z$K!gd(b}o0dUJnLQ{#hHqo($DcPEpIT;JM0JnKFrkEwpKbPD|QjI27G!Yg(bR`%@o
z(?q9t=$qzE;~AL&W`G&^KMc4NP_F!+CG$#|0cPNv4AA+YPzgPUxkYnyU_;kO@|Orn
zuupFZLgmnNm|H{-im<7OHdVMMhOp^qS1!(Tm|L{zAk5789XGRZFBD;BN4qlNAUuoQ
zG6T%OJOf2LtWy0y{`&ntpTr|(fEoBx42V+4@3e7AuC^{Lj%uw!y+tLVxZL6_1v{n`
gV=R^84OAm&SJXlD9Of3$g2ER8LjyO=z>hNU0V`olv;Y7A

literal 0
HcmV?d00001

diff --git a/example/low_precision_training/ppcls/configs/Attr/PPLCNet_x1_0_pedestrian_attribute.yaml
b/example/low_precision_training/ppcls/configs/Attr/PPLCNet_x1_0_pedestrian_attribute.yaml new file mode 100755 index 000000000..0aa50f88d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/Attr/PPLCNet_x1_0_pedestrian_attribute.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 192] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "PPLCNet_x1_0" + pretrained: True + use_ssld: True + class_num: 26 + + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/pedestrian_attribute/data" + cls_label_path: "dataset/pedestrian_attribute/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - TimmAutoAugment: + prob: 0.8 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [192, 256] + - Padv2: + size: [212, 276] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [192, 256] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.4 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/pedestrian_attribute/data" + cls_label_path: "dataset/pedestrian_attribute/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_attribute/090004.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/pedestrian_attribute_label_list.txt + +Metric: + Eval: + - ATTRMetric: + + diff --git a/example/low_precision_training/ppcls/configs/Attr/PPLCNet_x1_0_vehicle_attribute.yaml b/example/low_precision_training/ppcls/configs/Attr/PPLCNet_x1_0_vehicle_attribute.yaml new file mode 100755 index 000000000..c9a902562 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/Attr/PPLCNet_x1_0_vehicle_attribute.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: 
"./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 256] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "PPLCNet_x1_0" + pretrained: True + class_num: 19 + use_ssld: True + lr_mult_list: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0125 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [256, 192] + - Padv2: + size: [276, 212] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [256, 192] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/test_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: ./deploy/images/PULC/vehicle_attribute/0002_c002_00030670_0.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/vehicle_attribute_label_list.txt + +Metric: + Eval: + - ATTRMetric: + + diff --git a/example/low_precision_training/ppcls/configs/Attr/StrongBaselineAttr.yaml b/example/low_precision_training/ppcls/configs/Attr/StrongBaselineAttr.yaml new file mode 100755 index 000000000..2324015d6 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/Attr/StrongBaselineAttr.yaml @@ -0,0 +1,113 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 192] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "ResNet50" + pretrained: True + class_num: 26 + 
infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Adam + lr: + name: Piecewise + decay_epochs: [12, 18, 24, 28] + values: [0.0001, 0.00001, 0.000001, 0.0000001] + regularizer: + name: 'L2' + coeff: 0.0005 + clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/attribute/data/" + cls_label_path: "dataset/attribute/trainval.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - Padv2: + size: [212, 276] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [192, 256] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/attribute/data/" + cls_label_path: "dataset/attribute/test.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + +Metric: + Eval: + - ATTRMetric: diff --git a/example/low_precision_training/ppcls/configs/CAE/cae_base_patch16_224_finetune.yaml b/example/low_precision_training/ppcls/configs/CAE/cae_base_patch16_224_finetune.yaml new file mode 100755 index 000000000..899deab20 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/CAE/cae_base_patch16_224_finetune.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 20 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: cae_base_patch16_224 + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.1 + attn_drop_rate: 0.0 + + use_mean_pooling: True + init_scale: 0.001 + use_rel_pos_bias: True + use_abs_pos_emb: False + init_values: 0.1 + lin_probe: False + + sin_pos_emb: True + + enable_linear_eval: False + model_key: model|module|state_dict + model_ema: + enable_model_ema: False + model_ema_decay: 0.9999 + model_ema_force_cpu: False + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - SoftTargetCrossEntropy: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamWDL + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + layerwise_decay: 0.65 + lr: + name: Cosine + learning_rate: 0.001 + eta_min: 1e-6 + warmup_epoch: 10 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + batch_transform_ops: + - MixupCutmixHybrid: + mixup_alpha: 0.8 + cutmix_alpha: 1.0 + switch_prob: 0.5 + num_classes: 1000 + 
transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandomResizedCrop: + size: 224 + - RandomHorizontalFlip: + prob: 0.5 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [ 0.5, 0.5, 0.5 ] + std: [ 0.5, 0.5, 0.5 ] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [ 0.5, 0.5, 0.5 ] + std: [ 0.5, 0.5, 0.5 ] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/CAE/cae_large_patch16_224_finetune.yaml b/example/low_precision_training/ppcls/configs/CAE/cae_large_patch16_224_finetune.yaml new file mode 100755 index 000000000..a7fbe9002 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/CAE/cae_large_patch16_224_finetune.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 20 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: cae_large_patch16_224 + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.2 + attn_drop_rate: 0.0 + + use_mean_pooling: True + init_scale: 0.001 + use_rel_pos_bias: True + use_abs_pos_emb: False + init_values: 0.1 + lin_probe: False + + sin_pos_emb: True + + enable_linear_eval: False + model_key: model|module|state_dict + model_ema: + enable_model_ema: False + model_ema_decay: 0.9999 + model_ema_force_cpu: False + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - SoftTargetCrossEntropy: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamWDL + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + layerwise_decay: 0.75 + lr: + name: Cosine + learning_rate: 0.001 + eta_min: 1e-6 + warmup_epoch: 10 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + batch_transform_ops: + - MixupCutmixHybrid: + mixup_alpha: 0.8 + cutmix_alpha: 1.0 + switch_prob: 0.5 + num_classes: 1000 + transform_ops: + - DecodeImage: + to_rgb: True + 
channel_first: False + - RandomResizedCrop: + size: 224 + - RandomHorizontalFlip: + prob: 0.5 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [ 0.5, 0.5, 0.5 ] + std: [ 0.5, 0.5, 0.5 ] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [ 0.5, 0.5, 0.5 ] + std: [ 0.5, 0.5, 0.5 ] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml b/example/low_precision_training/ppcls/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml new file mode 100755 index 000000000..aaddd1b15 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml @@ -0,0 +1,162 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O2 + use_fp16_test: True + +# model architecture +Arch: + name: CLIP_vit_base_patch16_224 + class_num: 1000 + return_embed: False + pretrained: True + # fused op can be used in AMP O2 mode only + use_fused_attn: False + use_fused_linear: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamWDL + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + layerwise_decay: 0.6 + filter_bias_and_bn: True + lr: + name: Cosine + learning_rate: 0.0003 + eta_min: 1e-6 + warmup_epoch: 10 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + 
mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/CLIP/CLIP_vit_large_patch14_224_finetune.yaml b/example/low_precision_training/ppcls/configs/CLIP/CLIP_vit_large_patch14_224_finetune.yaml new file mode 100755 index 000000000..c9c236799 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/CLIP/CLIP_vit_large_patch14_224_finetune.yaml @@ -0,0 +1,162 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O2 + use_fp16_test: True + +# model architecture +Arch: + name: CLIP_vit_large_patch14_224 + class_num: 1000 + return_embed: False + pretrained: True + # fused op can be used in AMP O2 mode only + use_fused_attn: False + use_fused_linear: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamWDL + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + layerwise_decay: 0.6 + filter_bias_and_bn: True + lr: + name: Cosine + learning_rate: 0.0003 + eta_min: 1e-6 + warmup_epoch: 10 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + 
sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/Cartoonface/ResNet50_icartoon.yaml b/example/low_precision_training/ppcls/configs/Cartoonface/ResNet50_icartoon.yaml new file mode 100755 index 000000000..3d1b99378 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/Cartoonface/ResNet50_icartoon.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_mode: "retrieval" + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + image_shape: [3, 224, 224] + infer_imgs: + save_inference_dir: "./inference" + feature_normalize: True + +Arch: + name: "RecModel" + Backbone: + name: "ResNet50" + pretrained: True + BackboneStopLayer: + name: "flatten" + output_dim: 2048 + Head: + name: "FC" + class_num: 5013 + embedding_size: 2048 + # margin: 0.5 + # scale: 80 + infer_output_key: "features" + infer_add_softmax: "false" + +Loss: + Train: + - CELoss: + weight: 1.0 + # - TripletLoss: + # margin: 0.1 + # weight: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + +DataLoader: + Train: + dataset: + name: ICartoonDataset + image_root: "./dataset/iCartoonFace" + cls_label_path: "./dataset/iCartoonFace/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + #num_instances: 2 + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + + Eval: + Query: + dataset: + name: ICartoonDataset + image_root: "./dataset/iCartoonFace" + cls_label_path: "./dataset/iCartoonFace/query.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - 
NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + + Gallery: + dataset: + name: ICartoonDataset + image_root: "./dataset/iCartoonFace" + cls_label_path: "./dataset/iCartoonFace/gallery.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - Recallk: + topk: [1] diff --git a/example/low_precision_training/ppcls/configs/DeepHash/DCH.yaml b/example/low_precision_training/ppcls/configs/DeepHash/DCH.yaml new file mode 100755 index 000000000..363d7b406 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/DeepHash/DCH.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 15 + eval_during_train: True + eval_interval: 15 + epochs: 150 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + + #feature postprocess + feature_normalize: False + feature_binarize: "sign" + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + is_rec: True + + Backbone: + name: AlexNet + pretrained: True + class_num: 48 + +# loss function config for train/eval process +Loss: + Train: + - DCHLoss: + weight: 1.0 + gamma: 20.0 + _lambda: 0.1 + n_class: 10 + Eval: + - DCHLoss: + weight: 1.0 + gamma: 20.0 + _lambda: 0.1 + n_class: 10 + +Optimizer: + name: SGD + lr: + name: Piecewise + learning_rate: 0.005 + decay_epochs: [200] + values: [0.005, 0.0005] + regularizer: + name: 'L2' + coeff: 0.00001 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + 
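# NOTE: these DeepHash configs set no sampler `name`; presumably the framework falls back to its default batch sampler rather than DistributedBatchSampler (an assumption) +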
drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - mAP: {} + - Recallk: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/DeepHash/DSHSD.yaml b/example/low_precision_training/ppcls/configs/DeepHash/DSHSD.yaml new file mode 100755 index 000000000..e5b45d64a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/DeepHash/DSHSD.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 15 + eval_during_train: True + eval_interval: 15 + epochs: 150 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + + #feature postprocess + feature_normalize: False + feature_binarize: "sign" + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + is_rec: True + + Backbone: + name: AlexNet + pretrained: True + class_num: 48 + Neck: + name: Tanh + Head: + name: FC + class_num: 10 + embedding_size: 48 + +# loss function config for train/eval process +Loss: + Train: + - DSHSDLoss: + weight: 1.0 + alpha: 0.05 + Eval: + - DSHSDLoss: + weight: 1.0 + alpha: 0.05 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Piecewise + learning_rate: 0.00001 + decay_epochs: [200] + values: [0.00001, 0.000001] + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - mAP: {} + - Recallk: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/DeepHash/LCDSH.yaml b/example/low_precision_training/ppcls/configs/DeepHash/LCDSH.yaml new file mode 100755 index 000000000..5a3349d9d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/DeepHash/LCDSH.yaml @@ -0,0 +1,138 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 15 + eval_during_train: True + eval_interval: 15 + epochs: 150 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model 
export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + + #feature postprocess + feature_normalize: False + feature_binarize: "sign" + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + is_rec: True + + Backbone: + name: AlexNet + pretrained: True + class_num: 48 + +# loss function config for train/eval process +Loss: + Train: + - LCDSHLoss: + weight: 1.0 + _lambda: 3 + n_class: 10 + Eval: + - LCDSHLoss: + weight: 1.0 + _lambda: 3 + n_class: 10 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Piecewise + learning_rate: 0.00001 + decay_epochs: [200] + values: [0.00001, 0.000001] + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - mAP: {} + - Recallk: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/GeneralRecognition/Gallery2FC_PPLCNet_x2_5.yaml b/example/low_precision_training/ppcls/configs/GeneralRecognition/Gallery2FC_PPLCNet_x2_5.yaml new file mode 100755 index 000000000..fbaefdcf5 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/GeneralRecognition/Gallery2FC_PPLCNet_x2_5.yaml @@ -0,0 +1,51 @@ +# global configs +Global: + pretrained_model: ./pretrained/general_PPLCNet_x2_5_pretrained_v1.0_quant + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference/general_PPLCNet_x2_5_quant/inference + +# for quantization or pruning the model +Slim: + ## for quant + quant: + name: pact + +# model architecture +Arch: + name: RecModel + + Backbone: + name: PPLCNet_x2_5 + pretrained: False + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# indexing engine config +IndexProcess: + image_root: "./drink_dataset_v1.0/gallery/" + data_file: "./drink_dataset_v1.0/gallery/drink_label.txt" + delimiter: "\t" + batch_size: 2 + transform_ops: + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [
0.485, 0.456, 0.406 ] + std: [ 0.229, 0.224, 0.225 ] + order: '' + - ToCHWImage: diff --git a/example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml b/example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml new file mode 100755 index 000000000..70daa639b --- /dev/null +++ b/example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + warmup_epoch: 5 + regularizer: + name: "L2" + coeff: 0.00001 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_binary.yaml b/example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_binary.yaml new file mode 100755 index 000000000..728942fe3 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_binary.yaml @@ 
-0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + + #feature postprocess + feature_normalize: False + feature_binarize: "sign" + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: PPLCNet_x2_5_Tanh + pretrained: True + use_ssld: True + class_num: 512 + Head: + name: FC + embedding_size: 512 + class_num: 185341 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/all_data + cls_label_path: ./dataset/all_data/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml b/example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml new file mode 100755 index 000000000..b6c45363b --- /dev/null +++ b/example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml @@ -0,0 +1,188 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: true + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# model architecture +Arch: + name: "DistillationModel" + infer_output_key: features + 
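# DistillationModel wraps the Teacher and Student defined under `models` below; with both `freeze_params_list` entries False and a DistillationDMLLoss configured, the two networks are trained jointly (deep mutual learning) +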
infer_add_softmax: False + is_rec: True + infer_model_name: "Student" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - False + - False + models: + - Teacher: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + - Student: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + key: "logits" + model_names: ["Student", "Teacher"] + - DistillationDMLLoss: + weight: 1.0 + key: "logits" + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_udml.yaml b/example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_udml.yaml new file mode 100755 index 000000000..bcaea03b8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_udml.yaml @@ -0,0 +1,193 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + 
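# UDML recipe: same Teacher/Student DML setup as the config above, but Loss.Train additionally applies a DistillationDistanceLoss to the `backbone` features of the pair +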
device: gpu + save_interval: 1 + eval_during_train: true + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# model architecture +Arch: + name: "DistillationModel" + infer_output_key: features + infer_add_softmax: False + is_rec: True + infer_model_name: "Student" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - False + - False + models: + - Teacher: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + - Student: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + key: "logits" + model_names: ["Student", "Teacher"] + - DistillationDMLLoss: + weight: 1.0 + key: "logits" + model_name_pairs: + - ["Student", "Teacher"] + - DistillationDistanceLoss: + weight: 1.0 + key: "backbone" + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] diff --git 
a/example/low_precision_training/ppcls/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_base.yaml b/example/low_precision_training/ppcls/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_base.yaml new file mode 100755 index 000000000..912c3cd39 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_base.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: CLIP_vit_base_patch16_224 + pretrained: True + return_embed: True + return_mean_embed: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 512 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 192613 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 1 + regularizer: + name: "L2" + coeff: 0.00002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data_v2.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - RandomRotation: + prob: 0.3 + degrees: 90 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/ppshitu_traindata/Aliproduct + cls_label_path: ./dataset/ppshitu_traindata/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 12 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/ppshitu_traindata/Aliproduct/ + cls_label_path: ./dataset/ppshitu_traindata/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 12 + use_shared_memory: True + +Metric: + Eval: + - 
Recallk: + topk: [1, 5] + - mAP: {} diff --git a/example/low_precision_training/ppcls/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_large.yaml b/example/low_precision_training/ppcls/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_large.yaml new file mode 100755 index 000000000..f7e33b3f8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_large.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 10 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: CLIP_vit_large_patch14_224 + pretrained: True + return_embed: True + return_mean_embed: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 512 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 192613 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0025 + warmup_epoch: 1 + regularizer: + name: "L2" + coeff: 0.00002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data_v2.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - RandomRotation: + prob: 0.3 + degrees: 90 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/ppshitu_traindata/Aliproduct + cls_label_path: ./dataset/ppshitu_traindata/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 12 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/ppshitu_traindata/Aliproduct/ + cls_label_path: ./dataset/ppshitu_traindata/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 12 + 
use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/example/low_precision_training/ppcls/configs/GeneralRecognitionV2/GeneralRecognitionV2_PPLCNetV2_base.yaml b/example/low_precision_training/ppcls/configs/GeneralRecognitionV2/GeneralRecognitionV2_PPLCNetV2_base.yaml new file mode 100755 index 000000000..61eadb48a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/GeneralRecognitionV2/GeneralRecognitionV2_PPLCNetV2_base.yaml @@ -0,0 +1,209 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 20 + use_visualdl: False + eval_mode: retrieval + retrieval_feature_from: features # 'backbone' or 'features' + re_ranking: False + use_dali: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: PPLCNetV2_base_ShiTu + pretrained: True + use_ssld: True + class_expand: &feat_dim 512 + BackboneStopLayer: + name: flatten + Neck: + name: BNNeck + num_features: *feat_dim + weight_attr: + initializer: + name: Constant + value: 1.0 + bias_attr: + initializer: + name: Constant + value: 0.0 + learning_rate: 1.0e-20 # NOTE: Temporarily set lr small enough to freeze the bias to zero + Head: + name: FC + embedding_size: *feat_dim + class_num: 192612 + weight_attr: + initializer: + name: Normal + std: 0.001 + bias_attr: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + - TripletAngularMarginLoss: + weight: 1.0 + feature_from: features + margin: 0.5 + reduction: mean + add_absolute: True + absolute_loss_weight: 0.1 + normalize_feature: True + ap_value: 0.8 + an_value: 0.4 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.06 # for 8gpu x 256bs + warmup_epoch: 5 + regularizer: + name: L2 + coeff: 0.00001 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data_v2.txt + relabel: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + return_numpy: False + interpolation: bilinear + backend: cv2 + - RandFlipImage: + flip_code: 1 + - Pad: + padding: 10 + backend: cv2 + - RandCropImageV2: + size: [224, 224] + - RandomRotation: + prob: 0.5 + degrees: 90 + interpolation: bilinear + - ResizeImage: + size: [224, 224] + return_numpy: False + interpolation: bilinear + backend: cv2 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: hwc + sampler: + name: PKSampler + batch_size: 256 + sample_per_id: 4 + drop_last: False + shuffle: True + sample_method: "id_avg_prob" + id_list: [50030, 80700, 92019, 96015] # be careful when set relabel=True + ratio: [4, 4] + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: 
False + - ResizeImage: + size: [224, 224] + return_numpy: False + interpolation: bilinear + backend: cv2 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + return_numpy: False + interpolation: bilinear + backend: cv2 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/example/low_precision_training/ppcls/configs/ImageNet/AlexNet/AlexNet.yaml b/example/low_precision_training/ppcls/configs/ImageNet/AlexNet/AlexNet.yaml new file mode 100755 index 000000000..25c19eedc --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/AlexNet/AlexNet.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: AlexNet + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + decay_epochs: [30, 60, 90] + values: [0.01, 0.001, 0.0001, 0.00001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/CSPNet/CSPDarkNet53.yaml b/example/low_precision_training/ppcls/configs/ImageNet/CSPNet/CSPDarkNet53.yaml new file mode 100755 index 000000000..29ca02e25 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/CSPNet/CSPDarkNet53.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CSPDarkNet53 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_224.yaml new file mode 100755 index 000000000..5f116f11a --- /dev/null +++ 
b/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_224.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CSWinTransformer_base_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2.5e-4 + eta_min: 2.5e-6 + warmup_epoch: 20 + warmup_start_lr: 2.5e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_384.yaml new file mode 100755 index 
000000000..d845d8f4d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_384.yaml @@ -0,0 +1,173 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CSWinTransformer_base_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1.25e-4 + eta_min: 1.25e-6 + warmup_epoch: 20 + warmup_start_lr: 1.25e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_224.yaml 
b/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_224.yaml new file mode 100755 index 000000000..9cadcc901 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_224.yaml @@ -0,0 +1,173 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CSWinTransformer_large_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2.5e-4 + eta_min: 2.5e-6 + warmup_epoch: 20 + warmup_start_lr: 2.5e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_384.yaml new file mode 100755 index 000000000..1e01bb0bb --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_384.yaml @@ -0,0 +1,173 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CSWinTransformer_large_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 6.25e-5 + eta_min: 6.25e-7 + warmup_epoch: 20 + warmup_start_lr: 6.25e-8 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 8 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + 
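# NOTE: the CSWinTransformer recipes define only Eval metrics; train-time TopkAcc is presumably omitted because Mixup/CutMix soft labels would make it uninformative (an assumption) +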
Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_small_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_small_224.yaml new file mode 100755 index 000000000..2c182bb53 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_small_224.yaml @@ -0,0 +1,173 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CSWinTransformer_small_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 5e-4 + eta_min: 5e-6 + warmup_epoch: 20 + warmup_start_lr: 5e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: 
ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_tiny_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_tiny_224.yaml new file mode 100755 index 000000000..fa8986f2c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/CSWinTransformer/CSWinTransformer_tiny_224.yaml @@ -0,0 +1,173 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CSWinTransformer_tiny_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk 
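
Note: across the three CSWinTransformer configs the peak learning rate tracks the total batch size linearly (per-card batch_size 8, 64, and 128 on 8 cards, with learning rates 6.25e-5, 5e-4, and 1e-3). A quick check of that relationship, assuming the usual linear-scaling rule with 1e-3 per 1024 samples as the reference:

    # Per-sample learning rate is identical for all three CSWin configs.
    cards = 8
    configs = {
        "tiny_224": (128, 1e-3),
        "small_224": (64, 5e-4),
        "large_384": (8, 6.25e-5),
    }
    for name, (batch_size, lr) in configs.items():
        total = batch_size * cards
        print(name, total, lr / total)  # lr/total == 9.765625e-07 in all three

This is the invariant to preserve when retuning: if the per-card batch changes, rescaling learning_rate, eta_min, and warmup_start_lr by the same factor keeps the variants consistent.
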
+ topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_base_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_base_224.yaml new file mode 100755 index 000000000..591afe390 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_base_224.yaml @@ -0,0 +1,182 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 4 # for 8 cards + +# model ema +EMA: + decay: 0.9999 + + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ConvNeXt_base_224 + class_num: 1000 + drop_path_rate: 0.1 + layer_scale_init_value: 1e-6 + head_init_scale: 1.0 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 4e-3 # lr 4e-3 for total_batch_size 4096 + eta_min: 1e-6 + warmup_epoch: 20 + warmup_start_lr: 0 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 
0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_base_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_base_384.yaml new file mode 100755 index 000000000..0adec4be5 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_base_384.yaml @@ -0,0 +1,182 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 4 # for 8 cards + +# model ema +EMA: + decay: 0.9999 + + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ConvNeXt_base_384 + class_num: 1000 + drop_path_rate: 0.1 + layer_scale_init_value: 1e-6 + head_init_scale: 1.0 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 4e-3 # lr 4e-3 for total_batch_size 4096 + eta_min: 1e-6 + warmup_epoch: 20 + warmup_start_lr: 0 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: 
bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_large_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_large_224.yaml new file mode 100755 index 000000000..6f5b23e10 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_large_224.yaml @@ -0,0 +1,182 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 4 # for 8 cards + +# model ema +EMA: + decay: 0.9999 + + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ConvNeXt_large_224 + class_num: 1000 + drop_path_rate: 0.1 + layer_scale_init_value: 1e-6 + head_init_scale: 1.0 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 4e-3 # lr 4e-3 for total_batch_size 4096 + eta_min: 1e-6 + warmup_epoch: 20 + warmup_start_lr: 0 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + 
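
Note: the ConvNeXt configs reach the total_batch_size of 4096 cited in the lr comment through gradient accumulation, not through the sampler alone. A quick sanity check, assuming update_freq acts as an accumulation factor, as its "for 8 cards" comment suggests:

    # Effective batch size behind "lr 4e-3 for total_batch_size 4096":
    per_card_batch = 128  # sampler.batch_size
    num_cards = 8
    update_freq = 4       # gradients accumulated over 4 steps per update
    assert per_card_batch * num_cards * update_freq == 4096
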
batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_large_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_large_384.yaml new file mode 100755 index 000000000..63a4aa1a0 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_large_384.yaml @@ -0,0 +1,182 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 4 # for 8 cards + +# model ema +EMA: + decay: 0.9999 + + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ConvNeXt_large_384 + class_num: 1000 + drop_path_rate: 0.1 + layer_scale_init_value: 1e-6 + head_init_scale: 1.0 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 4e-3 # lr 4e-3 for total_batch_size 4096 + eta_min: 1e-6 + warmup_epoch: 20 + warmup_start_lr: 0 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: 
False + loader: + num_workers: 4 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_small.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_small.yaml new file mode 100755 index 000000000..d6c0551df --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_small.yaml @@ -0,0 +1,182 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 4 # for 8 cards + +# model ema +EMA: + decay: 0.9999 + + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ConvNeXt_small + class_num: 1000 + drop_path_rate: 0.1 + layer_scale_init_value: 1e-6 + head_init_scale: 1.0 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 4e-3 # lr 4e-3 for total_batch_size 4096 + eta_min: 1e-6 + warmup_epoch: 20 + warmup_start_lr: 0 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 
0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_tiny.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_tiny.yaml new file mode 100755 index 000000000..4d705857b --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ConvNeXt/ConvNeXt_tiny.yaml @@ -0,0 +1,182 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 4 # for 8 cards + +# model ema +EMA: + decay: 0.9999 + + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ConvNeXt_tiny + class_num: 1000 + drop_path_rate: 0.1 + layer_scale_init_value: 1e-6 + head_init_scale: 1.0 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 4e-3 # lr 4e-3 for total_batch_size 4096 + eta_min: 1e-6 + warmup_epoch: 20 + warmup_start_lr: 0 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 
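
Note: every ConvNeXt config above enables a weight EMA with decay 0.9999. A minimal sketch of the standard exponential-moving-average update this setting describes (the ppcls EMA implementation may differ in detail, e.g. in how it initializes the shadow weights):

    def ema_update(ema_params, model_params, decay=0.9999):
        # Shadow weights trail the live model: ema <- decay*ema + (1-decay)*w.
        # With decay 0.9999 the shadow averages over roughly the last
        # 1/(1-decay) = 10000 optimizer steps.
        for name, w in model_params.items():
            ema_params[name] = decay * ema_params[name] + (1.0 - decay) * w
        return ema_params
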
+ - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_13_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_13_224.yaml new file mode 100755 index 000000000..b211c0cd1 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_13_224.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 50 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 2 # for 8 cards + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CvT_13_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 # lr 2e-3 for total_batch_size 2048 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + by_epoch: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bicubic + 
backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_13_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_13_384.yaml new file mode 100755 index 000000000..14e2b9d9e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_13_384.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 50 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 2 # for 8 cards + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CvT_13_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 # lr 2e-3 for total_batch_size 2048 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + by_epoch: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: 
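
Note: unlike the 224 configs, which resize the short side to 256 and then center-crop to 224, the 384 CvT configs below evaluate with a direct resize (size: 384, no CropImage step), which, assuming ResizeImage with size resizes both dimensions, does not preserve the aspect ratio. The difference, sketched with the PIL backend these transforms request:

    from PIL import Image

    img = Image.open("docs/images/inference_deployment/whl_demo.jpg")

    # 224-style eval: short-side resize preserves aspect ratio, then center crop.
    w, h = img.size
    scale = 256 / min(w, h)
    resized = img.resize((round(w * scale), round(h * scale)), Image.BICUBIC)
    left, top = (resized.width - 224) // 2, (resized.height - 224) // 2
    crop_224 = resized.crop((left, top, left + 224, top + 224))

    # 384-style eval: direct resize to a square; aspect ratio not preserved.
    square_384 = img.resize((384, 384), Image.BICUBIC)
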
+ size: 384 + interpolation: bicubic + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + size: 384 + interpolation: bicubic + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_21_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_21_224.yaml new file mode 100755 index 000000000..8274a582e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_21_224.yaml @@ -0,0 +1,154 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 50 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 2 # for 8 cards + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CvT_21_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + no_weight_decay_name: pos_embed cls_token .bias + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 # lr 1e-3 for total_batch_size 1024 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + by_epoch: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: RASampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: 
docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_21_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_21_384.yaml new file mode 100755 index 000000000..4aa2e27ca --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_21_384.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 50 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 2 # for 8 cards + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CvT_21_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + no_weight_decay_name: pos_embed cls_token .bias + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 # lr 1e-3 for total_batch_size 1024 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + by_epoch: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + size: 384 + interpolation: bicubic + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + 
infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + size: 384 + interpolation: bicubic + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_W24_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_W24_384.yaml new file mode 100755 index 000000000..18b6d7ff6 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/CvT/CvT_W24_384.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 50 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 2 # for 8 cards + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CvT_W24_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + no_weight_decay_name: pos_embed cls_token .bias + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 # lr 1e-3 for total_batch_size 1024 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + by_epoch: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + size: 384 + interpolation: bicubic + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: 
docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + size: 384 + interpolation: bicubic + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA102.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA102.yaml new file mode 100755 index 000000000..c87635763 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA102.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA102 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA102x.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA102x.yaml new file mode 100755 index 
000000000..580c8ce4c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA102x.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA102x + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA102x2.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA102x2.yaml new file mode 100755 index 000000000..0691a2afc --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA102x2.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model 
architecture +Arch: + name: DLA102x2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA169.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA169.yaml new file mode 100755 index 000000000..7731d361b --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA169.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA169 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - 
RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA34.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA34.yaml new file mode 100755 index 000000000..555716033 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA34.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA34 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: 
DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA46_c.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA46_c.yaml new file mode 100755 index 000000000..1fef5b861 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA46_c.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA46_c + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA46x_c.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA46x_c.yaml new file mode 100755 index 000000000..a88a940d2 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA46x_c.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA46x_c + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA60.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA60.yaml new file mode 100755 index 000000000..0a82f7d2c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA60.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: 
False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA60 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA60x.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA60x.yaml new file mode 100755 index 000000000..19dc8ef32 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA60x.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA60x + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + 
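+      # each line of the list file is expected to follow the PaddleClas
+      # convention "<image path relative to image_root> <integer class id>"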
cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA60x_c.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA60x_c.yaml new file mode 100755 index 000000000..ebf247840 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DLA/DLA60x_c.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA60x_c + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + 
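+            # rescales the shorter edge to 256 px while preserving aspect ratio;
+            # CropImage below then takes the 224x224 center crop (the standard
+            # 256 -> 224 ImageNet evaluation protocol)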
resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN107.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN107.yaml new file mode 100755 index 000000000..a18c341f6 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN107.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DPN107 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + 
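+    # keep the 5 highest-scoring classes and map their ids to readable names
+    # via class_id_map_file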
topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN131.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN131.yaml new file mode 100755 index 000000000..68e9479ff --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN131.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DPN131 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN68.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN68.yaml new file mode 100755 index 000000000..33a0e416e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN68.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + 
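+  # these two fields only take effect when exporting the inference model;
+  # they do not affect dynamic-graph training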
image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DPN68 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN92.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN92.yaml new file mode 100755 index 000000000..583079845 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN92.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DPN92 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: 
./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN98.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN98.yaml new file mode 100755 index 000000000..f3bb99423 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DPN/DPN98.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DPN98 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + 
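+            # decoded images stay in HWC layout at this point; the data
+            # pipeline is expected to transpose to CHW after the ops run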
channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DSNet/DSNet_base.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DSNet/DSNet_base.yaml new file mode 100755 index 000000000..7d4ffcb10 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DSNet/DSNet_base.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DSNet_base + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + 
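+            # same ImageNet statistics as training: scale maps uint8 pixels
+            # into [0, 1], then mean/std standardize each channel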
scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DSNet/DSNet_small.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DSNet/DSNet_small.yaml new file mode 100755 index 000000000..e006104c9 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DSNet/DSNet_small.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DSNet_small + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 
0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DSNet/DSNet_tiny.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DSNet/DSNet_tiny.yaml new file mode 100755 index 000000000..884620582 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DSNet/DSNet_tiny.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DSNet_tiny + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 
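+      # batch_size is per device; with DistributedBatchSampler the effective
+      # global batch is 128 x the number of cards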
+ drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DarkNet/DarkNet53.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DarkNet/DarkNet53.yaml new file mode 100755 index 000000000..ccf05adaa --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DarkNet/DarkNet53.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DarkNet53 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml new file mode 100755 index 000000000..127cf91e8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml new file mode 100755 index 000000000..542ce15f8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + 
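+  # baseline recipe: the other ResNet50 DataAugment configs in this directory
+  # differ from this file chiefly in the Train transform_ops / batch_transform_ops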
save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml new file mode 100755 index 000000000..21ec5f88b --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - CutmixOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml new file mode 100755 index 000000000..5f9286b82 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - Cutout: + n_holes: 1 + length: 112 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - 
ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml new file mode 100755 index 000000000..8e14546a3 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - GridMask: + d1: 96 + d2: 224 + rotate: 1 + ratio: 0.5 + mode: 0 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 
0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml new file mode 100755 index 000000000..b8bdeba2c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - HideAndSeek: + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml new file mode 100755 index 000000000..176acaf3e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: 
./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml new file mode 100755 index 000000000..c1f8c14f6 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: 
Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - RandAugment: + num_layers: 2 + magnitude: 5 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml new file mode 100755 index 000000000..1788e529d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
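+          # Editorial note (comment not in the upstream config): RandomErasing
+          # is listed after NormalizeImage in this pipeline, so its `mean` fill
+          # value is applied to the already-normalized tensor; with
+          # mean [0., 0., 0.] the erased rectangles are presumably zeroed out.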
+ + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_224.yaml new file mode 100755 index 000000000..39087d340 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_224.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_base_distilled_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: 
DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_384.yaml new file mode 100755 index 000000000..bf0ac2b7a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_384.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_base_distilled_patch16_384 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + 
name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_224.yaml new file mode 100755 index 000000000..cf3f1dc5e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_224.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_base_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + 
batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_384.yaml new file mode 100755 index 000000000..8f4a9a9d2 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_base_patch16_384.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_base_patch16_384 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + 
shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_small_distilled_patch16_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_small_distilled_patch16_224.yaml new file mode 100755 index 000000000..0db9532e3 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_small_distilled_patch16_224.yaml @@ -0,0 +1,168 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_small_distilled_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: 
True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_small_patch16_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_small_patch16_224.yaml new file mode 100755 index 000000000..5e91973b1 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_small_patch16_224.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_small_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + 
use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_tiny_distilled_patch16_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_tiny_distilled_patch16_224.yaml new file mode 100755 index 000000000..3068ada56 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_tiny_distilled_patch16_224.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_tiny_distilled_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 
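+        # Editorial note (comment not in the upstream config): use_shared_memory
+        # asks paddle.io.DataLoader to hand worker batches over via shared
+        # memory, which is usually faster; it may need to be set to False when
+        # /dev/shm is small (e.g. in some containers).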
+ use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_tiny_patch16_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_tiny_patch16_224.yaml new file mode 100755 index 000000000..3cd8cd06c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DeiT/DeiT_tiny_patch16_224.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_tiny_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + 
dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet121.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet121.yaml new file mode 100755 index 000000000..13e57d675 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet121.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DenseNet121 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + 
transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet161.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet161.yaml new file mode 100755 index 000000000..a3fa6b1cf --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet161.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DenseNet161 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet169.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet169.yaml new file mode 100755 index 
000000000..5eb27d4dd --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet169.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DenseNet169 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet201.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet201.yaml new file mode 100755 index 000000000..6b7aad5c1 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet201.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure 
fp16 + level: O1 + + +# model architecture +Arch: + name: DenseNet201 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet264.yaml b/example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet264.yaml new file mode 100755 index 000000000..046e3a83c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/DenseNet/DenseNet264.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DenseNet264 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: 
True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x1_0_ssld.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x1_0_ssld.yaml new file mode 100755 index 000000000..a9f28674e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x1_0_ssld.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output_r50_vd_distill + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + to_static: True + +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + infer_model_name: "Student" + models: + - Teacher: + name: ResNet50_vd + class_num: *class_num + pretrained: True + use_ssld: True + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: False + dropout_prob: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.4 # for bs 1024 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: 
[0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x2_5_dml.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x2_5_dml.yaml new file mode 100755 index 000000000..4fb768098 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x2_5_dml.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output_lcnet_x2_5_dml + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - False + - False + infer_model_name: "Student" + models: + - Teacher: + name: PPLCNet_x2_5 + class_num: *class_num + pretrained: False + - Student: + name: PPLCNet_x2_5 + class_num: *class_num + pretrained: False + +# loss function config for training/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student", "Teacher"] + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.4 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler +
batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x2_5_ssld.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x2_5_ssld.yaml new file mode 100755 index 000000000..e778cb8d8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x2_5_ssld.yaml @@ -0,0 +1,160 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output_r50_vd_distill + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + to_static: True + +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + infer_model_name: "Student" + models: + - Teacher: + name: ResNet50_vd + class_num: *class_num + pretrained: True + use_ssld: True + - Student: + name: PPLCNet_x2_5 + class_num: *class_num + pretrained: False + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.2 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + 
name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x2_5_udml.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x2_5_udml.yaml new file mode 100755 index 000000000..17e020e9a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/PPLCNet_x2_5_udml.yaml @@ -0,0 +1,171 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output_lcnet_x2_5_udml + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - False + - False + infer_model_name: "Student" + models: + - Teacher: + name: PPLCNet_x2_5 + class_num: *class_num + pretrained: False + return_patterns: ["blocks3", "blocks4", "blocks5", "blocks6"] + - Student: + name: PPLCNet_x2_5 + class_num: *class_num + pretrained: False + return_patterns: ["blocks3", "blocks4", "blocks5", "blocks6"] + +# loss function config for training/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + key: logits + model_names: ["Student", "Teacher"] + - DistillationDMLLoss: + weight: 1.0 + key: logits + model_name_pairs: + - ["Student", "Teacher"] + - DistillationDistanceLoss: + weight: 1.0 + key: "blocks5" + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.4 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] +
order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml new file mode 100755 index 000000000..7cc99b64a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml @@ -0,0 +1,167 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + use_dali: false + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: MobileNetV3_large_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + dropout_prob: null + - Student: + name: MobileNetV3_small_x1_0 + class_num: *class_num + pretrained: False + dropout_prob: null + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationCELoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.65 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: 
[0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Distillation/res2net200_vd_distill_pphgnet_base.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/res2net200_vd_distill_pphgnet_base.yaml new file mode 100755 index 000000000..acc0aa380 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/res2net200_vd_distill_pphgnet_base.yaml @@ -0,0 +1,171 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + use_dali: false + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: Res2Net200_vd_26w_4s + class_num: *class_num + pretrained: True + use_ssld: True + - Student: + name: PPHGNet_base + class_num: *class_num + pretrained: False + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationCELoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + 
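+        # Following timm's RandAugment naming, rand-m7-mstd0.5-inc1 selects
+        # random augmentation with magnitude 7, magnitude noise std 0.5, and
+        # the "increasing" transform set (severity grows with magnitude).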
interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_afd.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_afd.yaml new file mode 100755 index 000000000..6816cd257 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_afd.yaml @@ -0,0 +1,211 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + models: + - Teacher: + name: AttentionModel + pretrained_list: + freeze_params_list: + - True + - False + models: + - ResNet34: + name: ResNet34 + pretrained: True + return_patterns: &t_keys ["blocks[0]", "blocks[1]", "blocks[2]", "blocks[3]", + "blocks[4]", "blocks[5]", "blocks[6]", "blocks[7]", + "blocks[8]", "blocks[9]", "blocks[10]", "blocks[11]", + "blocks[12]", "blocks[13]", "blocks[14]", "blocks[15]"] + - LinearTransformTeacher: + name: LinearTransformTeacher + qk_dim: 128 + keys: *t_keys + t_shapes: &t_shapes [[64, 56, 56], [64, 56, 56], [64, 56, 56], [128, 28, 28], + [128, 28, 28], [128, 28, 28], [128, 28, 28], [256, 14, 14], + [256, 14, 14], [256, 14, 14], [256, 14, 14], [256, 14, 14], + [256, 14, 14], [512, 7, 7], [512, 7, 7], [512, 7, 7]] + + - Student: + name: AttentionModel + pretrained_list: + freeze_params_list: + - False + - False + models: + - ResNet18: + name: ResNet18 + pretrained: False + 
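+          # As with the teacher above, return_patterns taps intermediate
+          # features: ResNet18 exposes 8 residual blocks against ResNet34's
+          # 16. The &s_keys / &s_shapes YAML anchors defined here are reused
+          # below (via *s_keys / *s_shapes) so LinearTransformStudent and the
+          # AFDLoss section stay in sync with this list.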
return_patterns: &s_keys ["blocks[0]", "blocks[1]", "blocks[2]", "blocks[3]", + "blocks[4]", "blocks[5]", "blocks[6]", "blocks[7]"] + - LinearTransformStudent: + name: LinearTransformStudent + qk_dim: 128 + keys: *s_keys + s_shapes: &s_shapes [[64, 56, 56], [64, 56, 56], [128, 28, 28], [128, 28, 28], + [256, 14, 14], [256, 14, 14], [512, 7, 7], [512, 7, 7]] + t_shapes: *t_shapes + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + key: logits + - DistillationKLDivLoss: + weight: 0.9 + model_name_pairs: [["Student", "Teacher"]] + temperature: 4 + key: logits + - AFDLoss: + weight: 50.0 + model_name_pair: ["Student", "Teacher"] + student_keys: ["bilinear_key", "value"] + teacher_keys: ["query", "value"] + s_shapes: *s_shapes + t_shapes: *t_shapes + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-4 + lr: + name: MultiStepDecay + learning_rate: 0.1 + milestones: [30, 60, 90] + step_each_epoch: 1 + gamma: 0.1 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_dist.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_dist.yaml new file mode 100755 index 000000000..a689f6172 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_dist.yaml @@ -0,0 +1,164 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/r34_r18_dist + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 
10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + infer_model_name: "Student" + models: + - Teacher: + name: ResNet34 + class_num: *class_num + pretrained: True + - Student: + name: ResNet18 + class_num: *class_num + pretrained: False + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + - DistillationDISTLoss: + weight: 2.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-4 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_dkd.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_dkd.yaml new file mode 100755 index 000000000..19c684535 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_dkd.yaml @@ -0,0 +1,166 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + 
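+# This file pairs ground-truth cross-entropy with Decoupled Knowledge
+# Distillation (DistillationDKDLoss below): alpha weights the target-class
+# part of the KD term and beta the non-target part, so the two can be tuned
+# independently, while temperature 1 leaves the logits unsoftened. With both
+# weights at 1.0, the training objective is CE(student, label) +
+# DKD(student, teacher).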
use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: ResNet34 + pretrained: True + + - Student: + name: ResNet18 + pretrained: False + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + - DistillationDKDLoss: + weight: 1.0 + model_name_pairs: [["Student", "Teacher"]] + temperature: 1 + alpha: 1.0 + beta: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-4 + lr: + name: MultiStepDecay + learning_rate: 0.2 + milestones: [30, 60, 90] + step_each_epoch: 1 + gamma: 0.1 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_mgd.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_mgd.yaml new file mode 100755 index 000000000..d501f3dd5 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_mgd.yaml @@ -0,0 +1,171 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/r34_r18_mgd + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for 
static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + infer_model_name: "Student" + models: + - Teacher: + name: ResNet34 + class_num: *class_num + pretrained: True + return_patterns: &t_stages ["blocks[2]", "blocks[6]", "blocks[12]", "blocks[15]"] + - Student: + name: ResNet18 + class_num: *class_num + pretrained: False + return_patterns: &s_stages ["blocks[1]", "blocks[3]", "blocks[5]", "blocks[7]"] + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + - DistillationPairLoss: + weight: 1.0 + base_loss_name: MGDLoss + model_name_pairs: [["Student", "Teacher"]] + s_key: "blocks[7]" + t_key: "blocks[15]" + name: "loss_mgd" + student_channels: 512 + teacher_channels: 512 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-4 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_pefd.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_pefd.yaml new file mode 100755 index 000000000..1c87f03ec --- /dev/null +++ 
b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_pefd.yaml @@ -0,0 +1,171 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/r34_r18_pefd + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + infer_model_name: "Student" + models: + - Teacher: + name: ResNet34 + class_num: *class_num + pretrained: True + return_patterns: &t_stages ["avg_pool"] + - Student: + name: ResNet18 + class_num: *class_num + pretrained: False + return_patterns: &s_stages ["avg_pool"] + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + - DistillationPairLoss: + weight: 25.0 + base_loss_name: PEFDLoss + model_name_pairs: [["Student", "Teacher"]] + s_key: "avg_pool" + t_key: "avg_pool" + name: "loss_pefd" + student_channel: 512 + teacher_channel: 512 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-4 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + diff --git 
a/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_skd.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_skd.yaml new file mode 100755 index 000000000..d1100a1b7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_skd.yaml @@ -0,0 +1,163 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: ResNet34 + pretrained: True + + - Student: + name: ResNet18 + pretrained: False + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationSKDLoss: + weight: 1.0 + model_name_pairs: [["Student", "Teacher"]] + temperature: 1.0 + multiplier: 2.0 + alpha: 0.9 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-4 + lr: + name: MultiStepDecay + learning_rate: 0.1 + milestones: [30, 60, 90] + step_each_epoch: 1 + gamma: 0.1 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_wsl.yaml 
b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_wsl.yaml new file mode 100755 index 000000000..adabc5ae0 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Distillation/resnet34_distill_resnet18_wsl.yaml @@ -0,0 +1,164 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/r34_r18_wsl + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: ResNet34 + pretrained: True + + - Student: + name: ResNet18 + pretrained: False + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + - DistillationWSLLoss: + weight: 2.5 + model_name_pairs: [["Student", "Teacher"]] + temperature: 2 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-4 + lr: + name: MultiStepDecay + learning_rate: 0.1 + milestones: [30, 60, 90] + step_each_epoch: 1 + gamma: 0.1 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x0_25.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x0_25.yaml new 
file mode 100755 index 000000000..a1ae01f2e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x0_25.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ESNet_x0_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x0_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x0_5.yaml new file mode 100755 index 000000000..bf806d9cd --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x0_5.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model 
architecture +Arch: + name: ESNet_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x0_75.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x0_75.yaml new file mode 100755 index 000000000..566560795 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x0_75.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ESNet_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: 
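+        # YAML parses 1.0/255.0 below as a plain string; the NormalizeImage
+        # op is expected to evaluate it to a float (~0.00392157), which is
+        # why other configs in this patch can write either spelling.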
+ scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x1_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x1_0.yaml new file mode 100755 index 000000000..e0b32d5c5 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ESNet/ESNet_x1_0.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ESNet_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: 
False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB0.yaml new file mode 100755 index 000000000..8f8ce1825 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB0.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff 
--git a/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB1.yaml b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB1.yaml new file mode 100755 index 000000000..33cdcff94 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB1.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 240, 240] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB1 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 240 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 272 + - CropImage: + size: 240 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 272 + - CropImage: + size: 240 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB2.yaml b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB2.yaml new file mode 100755 index 000000000..3d2e15f85 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB2.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static 
mode and model export + image_shape: [3, 260, 260] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 260 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 260 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 260 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB3.yaml b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB3.yaml new file mode 100755 index 000000000..4dd71da17 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB3.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 300, 300] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + 
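+    # With warmup_epoch below, the schedule (assuming PaddleClas' usual
+    # linear warmup) first ramps the rate from 0 to 0.032 over 5 epochs,
+    # then decays it with a half cosine over the remaining 355 of the 360
+    # epochs, roughly:
+    #   lr(e) = 0.032 * e / 5                          for e < 5
+    #   lr(e) = 0.016 * (1 + cos(pi * (e - 5) / 355))  otherwise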
warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 300 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 332 + - CropImage: + size: 300 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 332 + - CropImage: + size: 300 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB4.yaml b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB4.yaml new file mode 100755 index 000000000..a123c1267 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB4.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 380, 380] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB4 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 380 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + 
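+      # batch_size here is per card for DistributedBatchSampler, so the
+      # effective global batch is 128 * number of GPUs (1024 on 8 cards);
+      # drop_last: False keeps the final short batch of each epoch.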
loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 412 + - CropImage: + size: 380 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 412 + - CropImage: + size: 380 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB5.yaml new file mode 100755 index 000000000..8c163d64f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB5.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 456, 456] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 456 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 488 + - CropImage: + size: 456 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: 
docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 488 + - CropImage: + size: 456 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB6.yaml b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB6.yaml new file mode 100755 index 000000000..9897673a3 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB6.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 528, 528] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB6 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 528 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 560 + - CropImage: + size: 528 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 560 + - CropImage: + size: 528 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB7.yaml 
b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB7.yaml new file mode 100755 index 000000000..8f02cd278 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNet/EfficientNetB7.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 600, 600] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB7 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 600 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 632 + - CropImage: + size: 600 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 632 + - CropImage: + size: 600 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/EfficientNetV2/EfficientNetV2_S.yaml b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNetV2/EfficientNetV2_S.yaml new file mode 100755 index 000000000..48257bc13 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/EfficientNetV2/EfficientNetV2_S.yaml @@ -0,0 +1,147 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 350 + print_batch_step: 20 + use_visualdl: False + train_mode: progressive # progressive training + # used for static mode and model export + image_shape: 
[3, 384, 384] + save_inference_dir: ./inference + +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +EMA: + decay: 0.9999 + +# model architecture +Arch: + name: EfficientNetV2_S + class_num: 1000 + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.65 # 8gpux128bs + warmup_epoch: 5 + regularizer: + name: L2 + coeff: 0.00001 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 171 + progress_size: [171, 214, 257, 300] + scale: [0.05, 1.0] + - RandFlipImage: + flip_code: 1 + - RandAugmentV2: + num_layers: 2 + magnitude: 5.0 + progress_magnitude: [5.0, 8.3333333333, 11.66666666667, 15.0] + - NormalizeImage: + scale: 1.0 + mean: [128.0, 128.0, 128.0] + std: [128.0, 128.0, 128.0] + order: "" + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - CropImageAtRatio: + size: 384 + pad: 32 + interpolation: bilinear + - NormalizeImage: + scale: 1.0 + mean: [128.0, 128.0, 128.0] + std: [128.0, 128.0, 128.0] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - CropImageAtRatio: + size: 384 + pad: 32 + interpolation: bilinear + - NormalizeImage: + scale: 1.0 + mean: [128.0, 128.0, 128.0] + std: [128.0, 128.0, 128.0] + order: "" + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_L.yaml b/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_L.yaml new file mode 100755 index 000000000..b250021f5 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_L.yaml @@ -0,0 +1,163 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: False + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: FasterNet_L + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 
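Note: unlike the EfficientNet configs, EfficientNetV2_S and the FasterNet configs enable AMP (`use_amp: True`, `level: O1`) with dynamic loss scaling. Roughly, that corresponds to the following PaddlePaddle training step; `model`, `loss_fn`, `optimizer` and `loader` are assumed to exist, and this is a sketch rather than the trainer's actual code:

import paddle

def train_amp(model, loss_fn, optimizer, loader, init_loss_scaling=65536):
    # O1 runs whitelisted ops in fp16; dynamic loss scaling grows/shrinks
    # the scale factor to keep fp16 gradients from under/overflowing.
    scaler = paddle.amp.GradScaler(init_loss_scaling=init_loss_scaling,
                                   use_dynamic_loss_scaling=True)
    for images, labels in loader:
        with paddle.amp.auto_cast(level='O1'):
            loss = loss_fn(model(images), labels)
        scaled = scaler.scale(loss)          # loss * current scale
        scaled.backward()
        scaler.minimize(optimizer, scaled)   # unscales; skips step on inf/nan
        optimizer.clear_grad()

`scale_loss` is the initial scale (128 in the fp32 configs where AMP is off, 65536 here); O2 would additionally cast the model itself to fp16.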
+ beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: 0.01 + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.002 + warmup_start_lr: 0.000001 + warmup_epoch: 20 + eta_min: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.7 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 50 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + to_np: False + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 248 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 1 + drop_last: False + shuffle: False + loader: + num_workers: 0 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_M.yaml b/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_M.yaml new file mode 100755 index 000000000..98aafa660 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_M.yaml @@ -0,0 +1,163 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: False + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: FasterNet_M + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: 1 + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.002 + warmup_start_lr: 0.000001 + warmup_epoch: 20 + eta_min: 0.00001 + + +# data loader 
for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.5 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 50 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + to_np: False + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 248 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 1 + drop_last: False + shuffle: False + loader: + num_workers: 0 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_S.yaml b/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_S.yaml new file mode 100755 index 000000000..626a57295 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_S.yaml @@ -0,0 +1,163 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: False + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: FasterNet_S + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.03 + clip_grad: null + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.004 + warmup_start_lr: 0.000001 + warmup_epoch: 20 + eta_min: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - RandCropImage: + size: 
224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.3 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 50 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + to_np: False + channel_first: False + - ResizeImage: ##RandomResized + interpolation: bicubic + backend: pil + resize_short: 248 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 1 + drop_last: False + shuffle: False + loader: + num_workers: 0 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_T0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_T0.yaml new file mode 100755 index 000000000..050d8e738 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_T0.yaml @@ -0,0 +1,163 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: False + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: FasterNet_T0 + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.005 + clip_grad: null + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.004 + warmup_start_lr: 0.000001 + warmup_epoch: 20 + eta_min: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: null + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + 
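Note: the FasterNet configs add an `OpSampler` under `batch_transform_ops`, which picks one batch-level augmentation per batch: Mixup with `prob: 0.5` or CutMix with `prob: 0.5` (the probabilities sum to 1, so one of them always fires). A NumPy sketch of the sampling, assuming NCHW float arrays; the helper name is ours:

import numpy as np

def op_sampler_batch(images, labels, mixup_alpha=0.3, cutmix_alpha=1.0,
                     mixup_prob=0.5):
    # Blend each sample with a shuffled partner from the same batch.
    perm = np.random.permutation(len(images))
    if np.random.rand() < mixup_prob:
        lam = np.random.beta(mixup_alpha, mixup_alpha)
        mixed = lam * images + (1.0 - lam) * images[perm]
    else:
        lam = np.random.beta(cutmix_alpha, cutmix_alpha)
        h, w = images.shape[2], images.shape[3]
        ch, cw = int(h * np.sqrt(1.0 - lam)), int(w * np.sqrt(1.0 - lam))
        y = np.random.randint(0, h - ch + 1)
        x = np.random.randint(0, w - cw + 1)
        mixed = images.copy()
        mixed[:, :, y:y + ch, x:x + cw] = images[perm][:, :, y:y + ch, x:x + cw]
        lam = 1.0 - (ch * cw) / float(h * w)  # effective mixing ratio
    return mixed, labels, labels[perm], lam   # loss = lam*CE(y1)+(1-lam)*CE(y2)

Note how `alpha` for Mixup scales with model size across the FasterNet family (0.05 for T0/T1 up to 0.7 for L), while CutMix keeps `alpha: 1.0` throughout.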
batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.05 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 50 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + to_np: False + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 248 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 1 + drop_last: False + shuffle: False + loader: + num_workers: 0 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_T1.yaml b/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_T1.yaml new file mode 100755 index 000000000..bd78165d3 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_T1.yaml @@ -0,0 +1,163 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: False + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: FasterNet_T1 + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + clip_grad: null + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.004 + warmup_start_lr: 0.000001 + warmup_epoch: 20 + eta_min: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m3-mstd0.5-inc1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.05 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 50 + drop_last: False + shuffle: True + loader: + num_workers: 12 + 
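Note: all the Eval and Infer pipelines resize the short side first, then center-crop. The FasterNet configs use `resize_short: 248` with a 224 crop (ratio ≈ 0.9, PIL backend, bicubic), while the EfficientNet configs above use crop + 32 (292→260, 332→300, and so on). A sketch of the two-step transform, assuming a PIL image as input:

from PIL import Image

def eval_preprocess(img, crop=224, resize_short=248):
    # ResizeImage: scale so the short side equals `resize_short` (bicubic),
    # then CropImage: take a centered `crop` x `crop` window.
    w, h = img.size
    scale = resize_short / min(w, h)
    img = img.resize((round(w * scale), round(h * scale)), Image.BICUBIC)
    w, h = img.size
    left, top = (w - crop) // 2, (h - crop) // 2
    return img.crop((left, top, left + crop, top + crop))

Keeping a resize/crop ratio below 1 preserves some context around the center crop; the exact ratio is a per-model evaluation choice.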
use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + to_np: False + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 248 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 1 + drop_last: False + shuffle: False + loader: + num_workers: 0 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_T2.yaml b/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_T2.yaml new file mode 100755 index 000000000..abe11fcf7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/FasterNet/FasterNet_T2.yaml @@ -0,0 +1,162 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: False + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: FasterNet_T2 + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.02 + clip_grad: null + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.004 + warmup_start_lr: 0.000001 + warmup_epoch: 20 + eta_min: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m5-mstd0.5-inc1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.1 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 50 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + to_np: False + channel_first: False + - ResizeImage: 
+ interpolation: bicubic + backend: pil + resize_short: 248 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 1 + drop_last: False + shuffle: False + loader: + num_workers: 0 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/GhostNet/GhostNet_x0_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/GhostNet/GhostNet_x0_5.yaml new file mode 100755 index 000000000..fe56eb53d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/GhostNet/GhostNet_x0_5.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: GhostNet_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 
'' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_0.yaml new file mode 100755 index 000000000..e063ec2b7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_0.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: GhostNet_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_3.yaml b/example/low_precision_training/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_3.yaml new file mode 100755 index 000000000..40572d4ab --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/GhostNet/GhostNet_x1_3.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 
1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: GhostNet_x1_3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W18_C.yaml b/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W18_C.yaml new file mode 100755 index 000000000..76469db68 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W18_C.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: HRNet_W18_C + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 
0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W30_C.yaml b/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W30_C.yaml new file mode 100755 index 000000000..78fdce1b3 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W30_C.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: HRNet_W30_C + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + 
num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W32_C.yaml b/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W32_C.yaml new file mode 100755 index 000000000..43a06712e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W32_C.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: HRNet_W32_C + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + 
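Note: the HRNet (and HarDNet) configs switch from a cosine schedule to `Piecewise`: `values` carries one more entry than `decay_epochs`, and the learning rate steps down at each boundary. A pure-Python sketch of the rule (`paddle.optimizer.lr.PiecewiseDecay` implements the same lookup):

def piecewise_lr(epoch, decay_epochs=(30, 60, 90),
                 values=(0.1, 0.01, 0.001, 0.0001)):
    # Epochs 0-29 -> 0.1, 30-59 -> 0.01, 60-89 -> 0.001, 90+ -> 0.0001.
    for boundary, value in zip(decay_epochs, values):
        if epoch < boundary:
            return value
    return values[len(decay_epochs)]

This step schedule over 120 epochs is the classic ResNet-style recipe, in contrast to the 360-epoch cosine runs used for the mobile-oriented models above.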
transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W40_C.yaml b/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W40_C.yaml new file mode 100755 index 000000000..a32ba1e60 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W40_C.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: HRNet_W40_C + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W44_C.yaml b/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W44_C.yaml new file mode 100755 index 000000000..f8b715fd9 
--- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W44_C.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: HRNet_W44_C + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml b/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml new file mode 100755 index 000000000..a91baa577 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W48_C.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model 
architecture +Arch: + name: HRNet_W48_C + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W64_C.yaml b/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W64_C.yaml new file mode 100755 index 000000000..f482fc6d5 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/HRNet/HRNet_W64_C.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: HRNet_W64_C + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - 
RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet39_ds.yaml b/example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet39_ds.yaml new file mode 100755 index 000000000..d15ba0c75 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet39_ds.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: HarDNet39_ds + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 
0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet68.yaml b/example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet68.yaml new file mode 100755 index 000000000..f0d8d8e26 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet68.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: HarDNet68 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + 
Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet68_ds.yaml b/example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet68_ds.yaml new file mode 100755 index 000000000..dc003a88f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet68_ds.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: HarDNet68_ds + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet85.yaml b/example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet85.yaml new file mode 100755 index 000000000..f69bc650c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/HarDNet/HarDNet85.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used 
for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: HarDNet85 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Inception/GoogLeNet.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Inception/GoogLeNet.yaml new file mode 100755 index 000000000..6709433f9 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Inception/GoogLeNet.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: GoogLeNet + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - GoogLeNetLoss: + weight: 1.0 + Eval: + - GoogLeNetLoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + 
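+# Schedule note (editor's addition, not part of the original config): with
+# `Cosine` decay and `warmup_epoch: 5`, PaddleClas first ramps the LR
+# linearly from 0 to 0.01 over the first 5 epochs, then (assuming the
+# default eta_min of 0) anneals it as
+#   lr(t) = 0.5 * 0.01 * (1 + cos(pi * (t - 5) / (200 - 5)))
+# for epoch t in [5, 200], so lr(5) = 0.01, lr(102.5) = 0.005,
+# and lr(200) ~= 0.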
+# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - GoogLeNetTopkAcc: + topk: [1, 5] + Eval: + - GoogLeNetTopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Inception/InceptionV3.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Inception/InceptionV3.yaml new file mode 100755 index 000000000..fe6c66a34 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Inception/InceptionV3.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: InceptionV3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + 
image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Inception/InceptionV4.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Inception/InceptionV4.yaml new file mode 100755 index 000000000..996049490 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Inception/InceptionV4.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: InceptionV4 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 
320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_128.yaml b/example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_128.yaml new file mode 100755 index 000000000..496155274 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_128.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: LeViT_128 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_128S.yaml b/example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_128S.yaml new file mode 100755 index 000000000..27798a673 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_128S.yaml @@ -0,0 +1,142 @@ +# global configs +Global: 
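+  # Field notes (editor's addition): `checkpoints` resumes a full training
+  # state (weights plus optimizer state), while `pretrained_model` loads
+  # weights only; both stay null when training from scratch.
+  # `save_interval` and `eval_interval` are counted in epochs;
+  # `print_batch_step` is counted in iterations.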
+ checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: LeViT_128S + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_192.yaml b/example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_192.yaml new file mode 100755 index 000000000..7e045c65d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_192.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: LeViT_192 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + 
Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_256.yaml b/example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_256.yaml new file mode 100755 index 000000000..2b764daed --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_256.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: LeViT_256 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 
'' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_384.yaml new file mode 100755 index 000000000..6751e066c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/LeViT/LeViT_384.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: LeViT_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + 
use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M0.yaml new file mode 100755 index 000000000..fe64441f8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M0.yaml @@ -0,0 +1,147 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 600 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MicroNet_M0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.2 # for total batch size 512 + regularizer: + name: 'L2' + coeff: 3e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bilinear + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 # for 4 gpus + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + 
topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M1.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M1.yaml new file mode 100755 index 000000000..694793e2a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M1.yaml @@ -0,0 +1,147 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 600 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MicroNet_M1 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.2 # for total batch size 512 + regularizer: + name: 'L2' + coeff: 3e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bilinear + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 # for 4 gpus + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M2.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M2.yaml new file mode 100755 index 000000000..379445787 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M2.yaml @@ -0,0 +1,147 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + 
save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 600 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MicroNet_M2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.2 # for total batch size 512 + regularizer: + name: 'L2' + coeff: 3e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bilinear + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 # for 4 gpus + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M3.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M3.yaml new file mode 100755 index 000000000..7a41c05ba --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MicroNet/MicroNet_M3.yaml @@ -0,0 +1,152 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 600 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture 
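+# Editor's note: `Arch.name` must match a callable exported by
+# ppcls/arch/backbone (here MicroNet_M3), and `class_num` sets the width of
+# the classifier head; it should equal the number of labels listed in
+# cls_label_path, i.e. 1000 for ImageNet-1k.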
+Arch: + name: MicroNet_M3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.2 # for total batch size 512 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 4e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 224 + interpolation: bilinear + use_log_aspect: True + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 # for 4 gpus + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MixNet/MixNet_L.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MixNet/MixNet_L.yaml new file mode 100755 index 000000000..fe75f2cc8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MixNet/MixNet_L.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MixNet_L + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + 
Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MixNet/MixNet_M.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MixNet/MixNet_M.yaml new file mode 100755 index 000000000..62c039bda --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MixNet/MixNet_M.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MixNet_M + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + 
cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MixNet/MixNet_S.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MixNet/MixNet_S.yaml new file mode 100755 index 000000000..f56f40eb1 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MixNet/MixNet_S.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MixNet_S + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - 
ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNeXt/MobileNeXt_x1_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNeXt/MobileNeXt_x1_0.yaml new file mode 100755 index 000000000..04772014a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNeXt/MobileNeXt_x1_0.yaml @@ -0,0 +1,160 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 50 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNeXt_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + use_nesterov: True + no_weight_decay_name: .bias + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.1 # for total batch size 512 + eta_min: 1e-5 + warmup_epoch: 3 + warmup_start_lr: 1e-4 + by_epoch: True + regularizer: + name: 'L2' + coeff: 1e-4 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 224 + interpolation: random + backend: pil + - RandFlipImage: + flip_code: 1 + - ColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 # for 4 gpus + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + 
PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml new file mode 100755 index 000000000..66fa53ec7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV1 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_25.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_25.yaml new file mode 100755 index 000000000..364ed851a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_25.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + 
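+  # Note (assumed PaddleClas semantics): `checkpoints` resumes a full training
+  # state (weights + optimizer), while `pretrained_model` only loads weights;
+  # leaving both null trains from scratch.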
checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV1_x0_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_5.yaml new file mode 100755 index 000000000..46fc98859 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_5.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV1_x0_5 + class_num: 1000 + +# loss function config for traing/eval 
process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_75.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_75.yaml new file mode 100755 index 000000000..180e97c8c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV1/MobileNetV1_x0_75.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV1_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml new file mode 100755 index 000000000..12ba90772 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + 
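+      # batch_size is per card; the effective batch is batch_size x number of GPUs.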
name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_25.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_25.yaml new file mode 100755 index 000000000..c8897d8fc --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_25.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV2_x0_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_5.yaml new file mode 100755 index 000000000..d6c761ba4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_5.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV2_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_75.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_75.yaml new file mode 100755 index 000000000..ac33c49d4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x0_75.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: 
./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV2_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x1_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x1_5.yaml new file mode 100755 index 000000000..98fc61db4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x1_5.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV2_x1_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + 
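+      # Expected list format (editor note): one "<path relative to image_root> <label id>" per line.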
cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml new file mode 100755 index 000000000..1fd273469 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV2_x2_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + 
resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_35.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_35.yaml new file mode 100755 index 000000000..e8c400d1d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_35.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_large_x0_35 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + 
order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_5.yaml new file mode 100755 index 000000000..b357cc1ae --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_5.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_large_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_75.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_75.yaml new file mode 100755 index 000000000..b9bb092fa --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_75.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + 
checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_large_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml new file mode 100755 index 000000000..e3b54a66b --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_large_x1_0 + class_num: 1000 + +# loss function config for traing/eval 
process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.65 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_25.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_25.yaml new file mode 100755 index 000000000..7eab8f904 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_25.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_large_x1_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 
1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_35.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_35.yaml new file mode 100755 index 000000000..8957dc785 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + 
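+      # shuffle/drop_last stay False for eval so every validation image is scored exactly once.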
name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_5.yaml new file mode 100755 index 000000000..8d59196e0 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_5.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_small_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + 
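+  # Editor note: Train accuracy is measured on the current batch at each log
+  # step; Eval accuracy is aggregated over the full validation set.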
Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_75.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_75.yaml new file mode 100755 index 000000000..2ccb6faee --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_75.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_small_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0.yaml new file mode 100755 index 000000000..08fb96996 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 
10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_small_x1_0 + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_ampo2_ultra.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_ampo2_ultra.yaml new file mode 100755 index 000000000..2880583b9 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_ampo2_ultra.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + use_dali: True + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O2 + +# model architecture +Arch: + name: MobileNetV3_small_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + 
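+# Editor note: learning_rate 5.2 below assumes this "ultra" recipe's enlarged
+# global batch (1024 per card, DALI pipeline); scale it linearly if batch_size changes.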
+Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 5.2 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 1024 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_fp32_ultra.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_fp32_ultra.yaml new file mode 100755 index 000000000..f2df21425 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_fp32_ultra.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + use_dali: True + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_small_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 5.2 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + 
order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 1024 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_25.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_25.yaml new file mode 100755 index 000000000..b401bd168 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_25.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_small_x1_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + 
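For orientation, the Eval `transform_ops` above form the standard resize-shorter-side / center-crop / normalize pipeline. An approximate `paddle.vision` equivalent is below; PaddleClas implements its own operators, so this is only illustrative:

```python
# Approximate paddle.vision counterpart of the Eval transform_ops above.
from paddle.vision import transforms as T

eval_tf = T.Compose([
    T.Resize(256),      # ResizeImage: resize_short: 256 (short side to 256)
    T.CenterCrop(224),  # CropImage: size: 224
    # scale: 1.0/255.0 folded into mean/std, i.e. (x/255 - m) / s
    T.Normalize(mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
                std=[0.229 * 255, 0.224 * 255, 0.225 * 255],
                data_format='HWC'),
    T.Transpose(),      # ToCHWImage: HWC -> CHW
])
```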
shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_conv_large.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_conv_large.yaml new file mode 100755 index 000000000..6243f96c8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_conv_large.yaml @@ -0,0 +1,181 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 600 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV4_conv_large + drop_rate: 0.2 + drop_path_rate: 0.35 + class_num: 1000 + + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.2 + clip_grad: 5.0 + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.002 + eta_min: 1.0e-06 + warmup_epoch: 20 + warmup_start_lr: 0 + + +EMA: + decay: 0.9998 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m8-inc1-mstd1.0 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: # TODO: confirm whether this operator is needed + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + backend: pil + interpolation: bicubic + resize_short: 448 + - CropImage: + size: 448 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 
64 + drop_last: False + shuffle: False + loader: + num_workers: 2 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_conv_medium.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_conv_medium.yaml new file mode 100755 index 000000000..fcfe07dcb --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_conv_medium.yaml @@ -0,0 +1,181 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 500 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 320, 320] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV4_conv_medium + drop_rate: 0.2 + drop_path_rate: 0.1 + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + clip_grad: 5.0 + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.002 + eta_min: 0 + warmup_epoch: 20 + warmup_start_lr: 0 + + +EMA: + decay: 0.9998 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m8-inc1-mstd1.0 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + backend: pil + interpolation: bicubic + resize_short: 320 + - CropImage: + size: 320 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: 
DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 12 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_conv_small.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_conv_small.yaml new file mode 100755 index 000000000..e1e81d9d2 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_conv_small.yaml @@ -0,0 +1,181 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 2400 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV4_conv_small + drop_rate: 0.25 + drop_path_rate: 0.03 + class_num: 1000 + + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.06 + clip_grad: 5.0 + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.002 + eta_min: 0 + warmup_epoch: 5 + warmup_start_lr: 0 + + +EMA: + decay: 0.9998 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m8-inc1-mstd1.0 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 1024 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + backend: pil + interpolation: bicubic + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229,
0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 12 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_large.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_large.yaml new file mode 100755 index 000000000..fa473a723 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_large.yaml @@ -0,0 +1,182 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 600 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: True + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV4_hybrid_large + drop_rate: 0.2 + drop_path_rate: 0.1 + use_fused_attn: False + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + clip_grad: 5.0 + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.001 + eta_min: 1.0e-06 + warmup_epoch: 20 + warmup_start_lr: 0 + + +EMA: + decay: 0.9998 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-inc1-mstd1.0 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + backend: pil + interpolation: bicubic + resize_short: 448 + - CropImage: + size: 448 + - NormalizeImage: 
+ scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 12 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 448 + - CropImage: + size: 448 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_medium.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_medium.yaml new file mode 100755 index 000000000..30df82dc9 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_medium.yaml @@ -0,0 +1,176 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 500 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: True + use_fp16_test: True + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV4_hybrid_medium + drop_rate: 0.2 + drop_path_rate: 0.1 + use_fused_attn: False + class_num: 1000 + + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + clip_grad: 5.0 + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.001 + eta_min: 0 + warmup_epoch: 20 + warmup_start_lr: 0 + +EMA: + decay: 0.9998 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-inc1-mstd1.0 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: # TODO: confirm whether this operator is needed + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + backend: pil + interpolation: 
bicubic + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 0 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileViT/MobileViT_S.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViT/MobileViT_S.yaml new file mode 100755 index 000000000..bf39753e7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViT/MobileViT_S.yaml @@ -0,0 +1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileViT_S + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + no_weight_decay_name: .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.002 + eta_min: 0.0002 + warmup_epoch: 5 + warmup_start_lr: 0.0002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: False + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bilinear + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + 
to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileViT/MobileViT_XS.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViT/MobileViT_XS.yaml new file mode 100755 index 000000000..c4b6804e5 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViT/MobileViT_XS.yaml @@ -0,0 +1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileViT_XS + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + no_weight_decay_name: .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.002 + eta_min: 0.0002 + warmup_epoch: 5 + warmup_start_lr: 0.0002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: False + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bilinear + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/example/low_precision_training/ppcls/configs/ImageNet/MobileViT/MobileViT_XXS.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViT/MobileViT_XXS.yaml new file mode 100755 index 000000000..a3611f4e3 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViT/MobileViT_XXS.yaml @@ -0,0 +1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileViT_XXS + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + no_weight_decay_name: .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.002 + eta_min: 0.0002 + warmup_epoch: 5 + warmup_start_lr: 0.0002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: False + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bilinear + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x0_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x0_5.yaml new file mode 100755 index 000000000..12891491a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x0_5.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + 
save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV2_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.004 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.009 # for total batch size 1024 + eta_min: 0.0009 + warmup_epoch: 16 # 20000 iterations + warmup_start_lr: 1e-6 + clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - RandAugmentV3: + num_layers: 2 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: const + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.25 + CutmixOperator: + alpha: 1.0 + prob: 0.25 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x1_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x1_0.yaml new file mode 100755 index 000000000..aeee054da --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x1_0.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: 
gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV2_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.013 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.0075 # for total batch size 1024 + eta_min: 0.00075 + warmup_epoch: 16 # 20000 iterations + warmup_start_lr: 1e-6 + clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - RandAugmentV3: + num_layers: 2 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: const + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.25 + CutmixOperator: + alpha: 1.0 + prob: 0.25 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x1_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x1_5.yaml new file mode 100755 index 000000000..198e4333f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x1_5.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + 
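These MobileViTV2 recipes also keep an exponential moving average of the weights (`EMA: decay: 0.9995`) and evaluate with the averaged copy. A minimal sketch of the update rule, illustrative rather than the PaddleClas implementation:

```python
# Minimal weight-EMA sketch: shadow = decay * shadow + (1 - decay) * param.
import paddle

class ModelEMA:
    def __init__(self, model, decay=0.9995):
        self.decay = decay
        # keep a detached copy of every parameter
        self.shadow = {n: p.detach().clone()
                       for n, p in model.named_parameters()}

    @paddle.no_grad()
    def update(self, model):
        for n, p in model.named_parameters():
            self.shadow[n] = (self.decay * self.shadow[n]
                              + (1.0 - self.decay) * p)
```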
device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV2_x1_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.029 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.0035 # for total batch size 1024 + eta_min: 0.00035 + warmup_epoch: 16 # 20000 iterations + warmup_start_lr: 1e-6 + clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - RandAugmentV3: + num_layers: 2 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: const + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.25 + CutmixOperator: + alpha: 1.0 + prob: 0.25 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x2_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x2_0.yaml new file mode 100755 index 000000000..2739ea9d1 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV2/MobileViTV2_x2_0.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: 
./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV2_x2_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 1024 + eta_min: 0.0002 + warmup_epoch: 16 # 20000 iterations + warmup_start_lr: 1e-6 + clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - RandAugmentV3: + num_layers: 2 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: const + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.25 + CutmixOperator: + alpha: 1.0 + prob: 0.25 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_S.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_S.yaml new file mode 100755 index 000000000..ba49dbbc2 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_S.yaml @@ -0,0 +1,153 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: 
./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV3_S + class_num: 1000 + dropout: 0.1 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 384 + eta_min: 0.0002 + warmup_epoch: 1 # 3000 iterations + warmup_start_lr: 0.0002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + # width and height can also be specified separately, e.g.: + # scales: [(256,256), (160,160), (192,192), (224,224), (288,288), (320,320)] + sampler: + name: MultiScaleSampler + scales: [256, 160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divided_factor: ensures width and height can be divided by the downsampling multiple + first_bs: 48 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 48 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_S_L2.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_S_L2.yaml new file mode 100755 index 000000000..4a9ba8b01 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_S_L2.yaml @@ -0,0 +1,153 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1
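Unlike the fixed-resolution recipes above, the MobileViTV3 configs train multi-scale: `MultiScaleSampler` draws a resolution per batch from `scales` and rescales the batch size so the per-batch pixel budget stays near `first_bs` at the base resolution, while `divided_factor` keeps sizes divisible by the network stride. A sketch of that bookkeeping under those assumptions (not the PaddleClas class itself):

```python
# Illustrative multi-scale bookkeeping: one resolution per batch, with the
# batch size rescaled to hold the pixel budget roughly constant.
import random

def multi_scale_plan(scales, first_bs, divided_factor=32):
    base = scales[0]  # first_bs applies to the first resolution in scales
    assert all(s % divided_factor == 0 for s in scales), \
        "sizes must be divisible by the downsampling multiple"
    while True:
        s = random.choice(scales)
        bs = max(1, int(first_bs * (base / s) ** 2))  # keep s*s*bs ~ constant
        yield s, bs

plan = multi_scale_plan([256, 160, 192, 224, 288, 320], first_bs=48)
size, batch_size = next(plan)  # e.g. (160, 122) or (320, 30)
```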
+ epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV3_S_L2 + class_num: 1000 + dropout: 0.1 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 384 + eta_min: 0.0002 + warmup_epoch: 1 # 3000 iterations + warmup_start_lr: 0.0002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + # support to specify width and height respectively: + # scales: [(256,256) (160,160), (192,192), (224,224) (288,288) (320,320)] + sampler: + name: MultiScaleSampler + scales: [256, 160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple + first_bs: 48 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 48 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XS.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XS.yaml new file mode 100755 index 000000000..3ac7264ac --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XS.yaml @@ -0,0 +1,153 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and 
model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV3_XS + class_num: 1000 + dropout: 0.1 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 384 + eta_min: 0.0002 + warmup_epoch: 1 # 3000 iterations + warmup_start_lr: 0.0002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + # support to specify width and height respectively: + # scales: [(256,256) (160,160), (192,192), (224,224) (288,288) (320,320)] + sampler: + name: MultiScaleSampler + scales: [256, 160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple + first_bs: 48 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 48 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XS_L2.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XS_L2.yaml new file mode 100755 index 000000000..10b3c7f9e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XS_L2.yaml @@ -0,0 +1,153 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + 
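Throughout this patch, `CELoss` with `epsilon: 0.1` denotes label-smoothed cross-entropy for training, while evaluation uses the plain loss. A minimal sketch of the smoothing with stock Paddle ops (PaddleClas ships its own CELoss wrapper):

```python
# Label smoothing as configured by CELoss(epsilon=0.1): blend the one-hot
# target with a uniform prior, then use soft-label cross-entropy.
import paddle.nn.functional as F

def smoothed_ce(logits, labels, epsilon=0.1):
    num_classes = logits.shape[-1]
    one_hot = F.one_hot(labels, num_classes)
    soft = F.label_smooth(one_hot, epsilon=epsilon)  # (1-eps)*y + eps/K
    return F.cross_entropy(logits, soft, soft_label=True)
```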
use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV3_XS_L2 + class_num: 1000 + dropout: 0.1 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 384 + eta_min: 0.0002 + warmup_epoch: 1 # 3000 iterations + warmup_start_lr: 0.0002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + # support to specify width and height respectively: + # scales: [(256,256) (160,160), (192,192), (224,224) (288,288) (320,320)] + sampler: + name: MultiScaleSampler + scales: [256, 160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple + first_bs: 48 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 48 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XXS.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XXS.yaml new file mode 100755 index 000000000..719a20068 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XXS.yaml @@ -0,0 +1,153 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: 
False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV3_XXS + class_num: 1000 + dropout: 0.05 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 384 + eta_min: 0.0002 + warmup_epoch: 1 # 3000 iterations + warmup_start_lr: 0.0002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + # support to specify width and height respectively: + # scales: [(256,256) (160,160), (192,192), (224,224) (288,288) (320,320)] + sampler: + name: MultiScaleSampler + scales: [256, 160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple + first_bs: 48 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 48 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XXS_L2.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XXS_L2.yaml new file mode 100755 index 000000000..ff84fadd7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_XXS_L2.yaml @@ -0,0 +1,153 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: 
False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV3_XXS_L2 + class_num: 1000 + dropout: 0.1 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 384 + eta_min: 0.0002 + warmup_epoch: 1 # 3000 iterations + warmup_start_lr: 0.0002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + # support to specify width and height respectively: + # scales: [(256,256) (160,160), (192,192), (224,224) (288,288) (320,320)] + sampler: + name: MultiScaleSampler + scales: [256, 160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple + first_bs: 48 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 48 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_x0_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_x0_5.yaml new file mode 100755 index 000000000..31eef0c35 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_x0_5.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 
0.9995 + +# model architecture +Arch: + name: MobileViTV3_x0_5 + class_num: 1000 + classifier_dropout: 0. + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 1020 by referring to official + eta_min: 0.0002 + warmup_epoch: 16 # 20000 iterations + warmup_start_lr: 1e-6 + clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - RandAugmentV3: + num_layers: 2 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: const + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.25 + CutmixOperator: + alpha: 1.0 + prob: 0.25 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_x0_75.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_x0_75.yaml new file mode 100755 index 000000000..eb4e3a3bd --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_x0_75.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, 
O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV3_x0_75 + class_num: 1000 + classifier_dropout: 0. + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 1020 by referring to official + eta_min: 0.0002 + warmup_epoch: 16 # 20000 iterations + warmup_start_lr: 1e-6 + clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - RandAugmentV3: + num_layers: 2 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: const + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.25 + CutmixOperator: + alpha: 1.0 + prob: 0.25 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_x1_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_x1_0.yaml new file mode 100755 index 000000000..e6b437bb4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/MobileViTV3/MobileViTV3_x1_0.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + 
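+  # Note (editorial): with dynamic loss scaling enabled, scale_loss (2^16 here)
+  # is presumably only the initial multiplier applied to the loss so that small
+  # fp16 gradients do not underflow; the scaler then shrinks on overflow and
+  # regrows once training is stable.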
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+# model ema
+EMA:
+  decay: 0.9995
+
+# model architecture
+Arch:
+  name: MobileViTV3_x1_0
+  class_num: 1000
+  classifier_dropout: 0.
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.05
+  one_dim_param_no_weight_decay: True
+  lr:
+    name: Cosine
+    learning_rate: 0.002 # for total batch size 1020, following the official repo
+    eta_min: 0.0002
+    warmup_epoch: 16 # 20000 iterations
+    warmup_start_lr: 1e-6
+  clip_norm: 10
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+            backend: pil
+        - RandCropImage:
+            size: 256
+            interpolation: bicubic
+            backend: pil
+            use_log_aspect: True
+        - RandFlipImage:
+            flip_code: 1
+        - RandAugmentV3:
+            num_layers: 2
+            interpolation: bicubic
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.0, 0.0, 0.0]
+            std: [1.0, 1.0, 1.0]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.25
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: const
+      batch_transform_ops:
+        - OpSampler:
+            MixupOperator:
+              alpha: 0.2
+              prob: 0.25
+            CutmixOperator:
+              alpha: 1.0
+              prob: 0.25
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_np: False
+            channel_first: False
+            backend: pil
+        - ResizeImage:
+            resize_short: 288
+            interpolation: bicubic
+            backend: pil
+        - CropImage:
+            size: 256
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.0, 0.0, 0.0]
+            std: [1.0, 1.0, 1.0]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_np: False
+        channel_first: False
+        backend: pil
+    - ResizeImage:
+        resize_short: 288
+        interpolation: bicubic
+        backend: pil
+    - CropImage:
+        size: 256
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.0, 0.0, 0.0]
+        std: [1.0, 1.0, 1.0]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_base_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_base_224.yaml
new file mode 100755
index 000000000..66317732e
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_base_224.yaml
@@ -0,0 +1,172 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 300
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# mixed precision training
+AMP:
+  use_amp: True
+  use_fp16_test: False
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+# model architecture
+Arch:
+  name: NextViT_base_224
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.1
+  no_weight_decay_name: .bias norm
+  one_dim_param_no_weight_decay: True
+  lr:
+    # for 8 cards
+    name: Cosine
+    learning_rate: 1e-3
+    eta_min: 1e-5
+    warmup_epoch: 20
+    warmup_start_lr: 1e-6
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+            interpolation: bicubic
+            backend: pil
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            config_str: rand-m9-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.25
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+      batch_transform_ops:
+        - OpSampler:
+            MixupOperator:
+              alpha: 0.8
+              prob: 0.5
+            CutmixOperator:
+              alpha: 1.0
+              prob: 0.5
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+            interpolation: bicubic
+            backend: pil
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: ./deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+        interpolation: bicubic
+        backend: pil
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_base_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_base_384.yaml
new file mode 100755
index 000000000..d013e6470
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_base_384.yaml
@@ -0,0 +1,172 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 300
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 384, 384]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# mixed precision training
+AMP:
+  use_amp: True
+  use_fp16_test: False
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+# model architecture
+Arch:
+  name: NextViT_base_384
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.1
+  no_weight_decay_name: .bias norm
+  one_dim_param_no_weight_decay: True
+  lr:
+    # for 8 cards
+    name: Cosine
+    learning_rate: 1e-3
+    eta_min: 1e-5
+    warmup_epoch: 20
+    warmup_start_lr: 1e-6
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 384
+            interpolation: bicubic
+            backend: pil
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            config_str: rand-m9-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: 384
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.25
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+      batch_transform_ops:
+        - OpSampler:
+            MixupOperator:
+              alpha: 0.8
+              prob: 0.5
+            CutmixOperator:
+              alpha: 1.0
+              prob: 0.5
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 384
+            interpolation: bicubic
+            backend: pil
+        - CropImage:
+            size: 384
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: ./deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 384
+        interpolation: bicubic
+        backend: pil
+    - CropImage:
+        size: 384
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_large_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_large_224.yaml
new file mode 100755
index 000000000..33b8dabfa
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_large_224.yaml
@@ -0,0 +1,172 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 300
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# mixed precision training
+AMP:
+  use_amp: True
+  use_fp16_test: False
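+  # use_fp16_test is kept False so evaluation runs in fp32; as the flag name
+  # suggests, setting it to True would presumably run the eval pass under fp16
+  # as well, trading a little metric fidelity for speed.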
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+# model architecture
+Arch:
+  name: NextViT_large_224
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.1
+  no_weight_decay_name: .bias norm
+  one_dim_param_no_weight_decay: True
+  lr:
+    # for 8 cards
+    name: Cosine
+    learning_rate: 1e-3
+    eta_min: 1e-5
+    warmup_epoch: 20
+    warmup_start_lr: 1e-6
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+            interpolation: bicubic
+            backend: pil
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            config_str: rand-m9-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.25
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+      batch_transform_ops:
+        - OpSampler:
+            MixupOperator:
+              alpha: 0.8
+              prob: 0.5
+            CutmixOperator:
+              alpha: 1.0
+              prob: 0.5
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+            interpolation: bicubic
+            backend: pil
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: ./deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+        interpolation: bicubic
+        backend: pil
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_large_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_large_384.yaml
new file mode 100755
index 000000000..a3eaf4c58
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_large_384.yaml
@@ -0,0 +1,172 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 300
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 384, 384]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# mixed precision training
+AMP:
+  use_amp: True
+  use_fp16_test: False
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+# model architecture
+Arch:
+  name: NextViT_large_384
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.1
+  no_weight_decay_name: .bias norm
+  one_dim_param_no_weight_decay: True
+  lr:
+    # for 8 cards
+    name: Cosine
+    learning_rate: 1e-3
+    eta_min: 1e-5
+    warmup_epoch: 20
+    warmup_start_lr: 1e-6
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 384
+            interpolation: bicubic
+            backend: pil
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            config_str: rand-m9-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: 384
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.25
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+      batch_transform_ops:
+        - OpSampler:
+            MixupOperator:
+              alpha: 0.8
+              prob: 0.5
+            CutmixOperator:
+              alpha: 1.0
+              prob: 0.5
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 384
+            interpolation: bicubic
+            backend: pil
+        - CropImage:
+            size: 384
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: ./deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 384
+        interpolation: bicubic
+        backend: pil
+    - CropImage:
+        size: 384
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_small_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_small_224.yaml
new file mode 100755
index 000000000..d16151df2
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_small_224.yaml
@@ -0,0 +1,172 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 300
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# mixed precision training
+AMP:
+  use_amp: True
+  use_fp16_test: False
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
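+  # use_promote appears to map to the promote behaviour of Paddle's AMP
+  # (whether ops mixing fp16 and fp32 inputs are computed in fp32); it is left
+  # False here to match the other NextViT recipes.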
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+# model architecture
+Arch:
+  name: NextViT_small_224
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.1
+  no_weight_decay_name: .bias norm
+  one_dim_param_no_weight_decay: True
+  lr:
+    # for 8 cards
+    name: Cosine
+    learning_rate: 1e-3
+    eta_min: 1e-5
+    warmup_epoch: 20
+    warmup_start_lr: 1e-6
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+            interpolation: bicubic
+            backend: pil
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            config_str: rand-m9-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.25
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+      batch_transform_ops:
+        - OpSampler:
+            MixupOperator:
+              alpha: 0.8
+              prob: 0.5
+            CutmixOperator:
+              alpha: 1.0
+              prob: 0.5
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+            interpolation: bicubic
+            backend: pil
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: ./deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+        interpolation: bicubic
+        backend: pil
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_small_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_small_384.yaml
new file mode 100755
index 000000000..daf727621
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/NextViT/NextViT_small_384.yaml
@@ -0,0 +1,172 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 300
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 384, 384]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# mixed precision training
+AMP:
+  use_amp: True
+  use_fp16_test: False
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+# model architecture
+Arch:
+  name: NextViT_small_384
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.1
+  no_weight_decay_name: .bias norm
+  one_dim_param_no_weight_decay: True
+  lr:
+    # for 8 cards
+    name: Cosine
+    learning_rate: 1e-3
+    eta_min: 1e-5
+    warmup_epoch: 20
+    warmup_start_lr: 1e-6
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 384
+            interpolation: bicubic
+            backend: pil
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            config_str: rand-m9-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: 384
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.25
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+      batch_transform_ops:
+        - OpSampler:
+            MixupOperator:
+              alpha: 0.8
+              prob: 0.5
+            CutmixOperator:
+              alpha: 1.0
+              prob: 0.5
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 384
+            interpolation: bicubic
+            backend: pil
+        - CropImage:
+            size: 384
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: ./deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 384
+        interpolation: bicubic
+        backend: pil
+    - CropImage:
+        size: 384
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPHGNet/PPHGNet_base.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNet/PPHGNet_base.yaml
new file mode 100755
index 000000000..873b027a9
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNet/PPHGNet_base.yaml
@@ -0,0 +1,169 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 600
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+  use_dali: False
+
+# mixed precision training
+AMP:
+  use_amp: True
+  use_fp16_test: False
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
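+
+# A pure-fp16 (O2) variant of the block above would flip the level and usually
+# keep dynamic loss scaling; an untested sketch, not an official recipe:
+# AMP:
+#   use_amp: True
+#   level: O2
+#   scale_loss: 128.0
+#   use_dynamic_loss_scaling: True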
+
+# model architecture
+Arch:
+  name: PPHGNet_base
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.5
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00004
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+            interpolation: bicubic
+            backend: pil
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            config_str: rand-m15-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.4
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+      batch_transform_ops:
+        - OpSampler:
+            MixupOperator:
+              alpha: 0.4
+              prob: 0.5
+            CutmixOperator:
+              alpha: 1.0
+              prob: 0.5
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 16
+      use_shared_memory: True
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 236
+            interpolation: bicubic
+            backend: pil
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 16
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 236
+        interpolation: bicubic
+        backend: pil
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml
new file mode 100755
index 000000000..9cc9dd9cd
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNet/PPHGNet_small.yaml
@@ -0,0 +1,170 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 600
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+  use_dali: False
+
+# mixed precision training
+AMP:
+  use_amp: True
+  use_fp16_test: False
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+# model architecture
+Arch:
+  name: PPHGNet_small
+  class_num: 1000
+  pretrained: True
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.5
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00004
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+            interpolation: bicubic
+            backend: pil
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            config_str: rand-m7-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.25
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+      batch_transform_ops:
+        - OpSampler:
+            MixupOperator:
+              alpha: 0.2
+              prob: 0.5
+            CutmixOperator:
+              alpha: 1.0
+              prob: 0.5
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 16
+      use_shared_memory: True
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 236
+            interpolation: bicubic
+            backend: pil
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 16
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 236
+        interpolation: bicubic
+        backend: pil
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPHGNet/PPHGNet_tiny.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNet/PPHGNet_tiny.yaml
new file mode 100755
index 000000000..77f1e7a5f
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNet/PPHGNet_tiny.yaml
@@ -0,0 +1,170 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 600
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+  use_dali: False
+
+# mixed precision training
+AMP:
+  use_amp: True
+  use_fp16_test: False
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+# model architecture
+Arch:
+  name: PPHGNet_tiny
+  class_num: 1000
+  pretrained: True
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.5
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00004
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+            interpolation: bicubic
+            backend: pil
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            config_str: rand-m7-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.25
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+      batch_transform_ops:
+        - OpSampler:
+            MixupOperator:
+              alpha: 0.2
+              prob: 0.5
+            CutmixOperator:
+              alpha: 1.0
+              prob: 0.5
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 16
+      use_shared_memory: True
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 232
+            interpolation: bicubic
+            backend: pil
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 16
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 232
+        interpolation: bicubic
+        backend: pil
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B0.yaml
new file mode 100755
index 000000000..d5a7ab49e
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B0.yaml
@@ -0,0 +1,164 @@
+## Note: This config is only used for fine-tuning. The ImageNet metrics reported in PaddleClas were not produced with this config.
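+## A typical fine-tune launch with this config might look like the following
+## (GPU list and override values are illustrative placeholders, not an official recipe):
+##   python3 -m paddle.distributed.launch --gpus="0,1,2,3" tools/train.py \
+##     -c ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B0.yaml \
+##     -o Optimizer.lr.learning_rate=0.05 -o Global.epochs=30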
+# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B0 + class_num: 1000 + pretrained: True # ssld pretrained + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # for global bs 1024, when finetune training, you need to reduce learning_rate manually + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B1.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B1.yaml new file mode 100755 index 000000000..bdae7decb --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B1.yaml @@ -0,0 +1,164 @@ +## Note: This config is only used for finetune training. The ImageNet metrics in PaddleClas are not trained through this config. 
+# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B1 + class_num: 1000 + pretrained: True # ssld pretrained + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # for global bs 1024, when finetune training, you need to reduce learning_rate manually + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B2.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B2.yaml new file mode 100755 index 000000000..a30d1dd65 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B2.yaml @@ -0,0 +1,164 @@ +## Note: This config is only used for finetune training. The ImageNet metrics in PaddleClas are not trained through this config. 
+# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B2 + class_num: 1000 + pretrained: True # ssld pretrained + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # for global bs 1024, when finetune training, you need to reduce learning_rate manually + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B3.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B3.yaml new file mode 100755 index 000000000..536659111 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B3.yaml @@ -0,0 +1,164 @@ +## Note: This config is only used for finetune training. The ImageNet metrics in PaddleClas are not trained through this config. 
+# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B3 + class_num: 1000 + pretrained: True # ssld pretrained + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # for global bs 1024, when finetune training, you need to reduce learning_rate manually + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4.yaml new file mode 100755 index 000000000..167a48092 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4.yaml @@ -0,0 +1,164 @@ +## Note: This config is only used for finetune training. The ImageNet metrics in PaddleClas are not trained through this config. 
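The Cosine learning rate of 0.5 in these finetune configs assumes a global batch size of 1024 (e.g. 8 cards x 128). A small sketch of the linear-scaling convention the comment hints at (the rule itself is an assumption here, not something ppcls applies automatically):

```python
def scaled_lr(base_lr: float, global_bs: int, ref_bs: int = 1024) -> float:
    """Linear-scaling rule: scale the LR with the actual global batch size."""
    return base_lr * global_bs / ref_bs

# finetuning on 2 cards with batch_size 64 -> global batch size 128
print(scaled_lr(0.5, 2 * 64))   # 0.0625, far below the from-scratch 0.5
```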
+# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B4 + class_num: 1000 + pretrained: True # ssld pretrained + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # for global bs 1024, when finetune training, you need to reduce learning_rate manually + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage1.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage1.yaml new file mode 100755 index 000000000..c8cd5fcf7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage1.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: 
"./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + use_dali: false + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: PPHGNet_small + class_num: *class_num + pretrained: True + use_ssld: True + - Student: + name: PPHGNetV2_B4 + class_num: *class_num + pretrained: False + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationCELoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/" + # ImageNet_5M label path, the training process does not use real labels. + cls_label_path: "./dataset/train_list_imagenet_5M.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage2.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage2.yaml new file mode 100755 index 000000000..074a77b42 --- /dev/null +++ 
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage2.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage2.yaml new file mode 100755 index 000000000..074a77b42 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage2.yaml @@ -0,0 +1,173 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its length should be the same as that of models + pretrained_list: + # if not null, its length should be the same as that of models + freeze_params_list: + - True + - False + models: + - Teacher: + name: PPHGNet_small + class_num: *class_num + pretrained: True + use_ssld: True + - Student: + name: PPHGNetV2_B4 + class_num: *class_num + pretrained: path/to/stage1_best_model_student + + infer_model_name: "Student" + + +# loss function config for training/eval process +Loss: + Train: + - DistillationCELoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # stage 2 uses a much smaller learning rate than stage 1 + learning_rate: 0.005 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + # ImageNet-1k label path; the training process does not use real labels + cls_label_path: "./dataset/ILSVRC2012/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5]
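freeze_params_list [True, False] keeps the Teacher fixed while only the Student learns. In Paddle this is typically done by setting stop_gradient on the frozen model's parameters; a hedged sketch with stand-in models, not the ppcls model builder:

```python
import paddle

def set_frozen(model: paddle.nn.Layer, frozen: bool) -> None:
    """Mirror freeze_params_list: block gradient flow for a frozen sub-model."""
    for p in model.parameters():
        p.stop_gradient = frozen

teacher = paddle.vision.models.resnet50()   # stand-in for PPHGNet_small
student = paddle.vision.models.resnet18()   # stand-in for PPHGNetV2_B4
set_frozen(teacher, True)    # freeze_params_list[0]: True
set_frozen(student, False)   # freeze_params_list[1]: False
```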
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B5.yaml new file mode 100755 index 000000000..57ca91199 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B5.yaml @@ -0,0 +1,164 @@ +## Note: This config is only used for finetune training. The ImageNet metrics reported in PaddleClas are not produced with this config. +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B5 + class_num: 1000 + pretrained: True # ssld pretrained + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # tuned for a global batch size of 1024; reduce learning_rate manually when finetuning + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git
a/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B6.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B6.yaml new file mode 100755 index 000000000..da9cb04bd --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPHGNetV2/PPHGNetV2_B6.yaml @@ -0,0 +1,164 @@ +## Note: This config is only used for finetune training. The ImageNet metrics in PaddleClas are not trained through this config. +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B6 + class_num: 1000 + pretrained: True # ssld pretrained + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # for global bs 1024, when finetune training, you need to reduce learning_rate manually + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_25.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_25.yaml new file mode 100755 index 000000000..f700f6c6e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_25.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x0_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_35.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_35.yaml new file mode 100755 index 000000000..c83c504a7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_35.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# 
mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x0_35 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_5.yaml new file mode 100755 index 000000000..00eae55c4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_5.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + 
cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_75.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_75.yaml new file mode 100755 index 000000000..96a43650b --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x0_75.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + 
resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml new file mode 100755 index 000000000..97291077f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + 
topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0_ampo2_ultra.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0_ampo2_ultra.yaml new file mode 100755 index 000000000..336fb3345 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0_ampo2_ultra.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + use_dali: True + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O2 + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 3.2 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 1024 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0_fp32_ultra.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0_fp32_ultra.yaml new file mode 100755 index 000000000..9f7abadd9 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_0_fp32_ultra.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ 
+ device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + use_dali: True + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.6 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_5.yaml new file mode 100755 index 000000000..367689197 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x1_5.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x1_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + 
name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x2_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x2_0.yaml new file mode 100755 index 000000000..87fb92eee --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x2_0.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x2_0 + class_num: 1000 +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + 
num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x2_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x2_5.yaml new file mode 100755 index 000000000..762d21fed --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNet/PPLCNet_x2_5.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x2_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + 
transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml new file mode 100755 index 000000000..4195efebc --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 480 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNetV2_base + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + # scales may also specify width and height separately, e.g.: + # scales: [(160,160), (192,192), (224,224), (288,288), (320,320)] + sampler: + name: MultiScaleSampler + scales: [160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divided_factor: ensures the sampled width and height are divisible by the network's downsampling multiple + first_bs: 500 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5]
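MultiScaleSampler draws each training batch at one resolution from scales, with first_bs defined at the first (160 px) entry. A sketch of one plausible MobileViT-style policy, in which the batch size shrinks as resolution grows so the per-batch pixel budget stays roughly constant (an assumption about the sampler's behavior, for illustration only):

```python
import random

def multi_scale_batches(scales, first_bs, steps=5, seed=0):
    """Yield (resolution, batch_size) pairs with a roughly constant pixel budget."""
    rng = random.Random(seed)
    base = scales[0]                # first_bs is defined at this resolution
    for _ in range(steps):
        s = rng.choice(scales)
        bs = max(1, round(first_bs * base * base / (s * s)))
        yield s, bs

for size, bs in multi_scale_batches([160, 192, 224, 288, 320], first_bs=500):
    print(f"{size}x{size} -> batch_size {bs}")
```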
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_large.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_large.yaml new file mode 100755 index 000000000..1551e673f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_large.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 480 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNetV2_large + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.4 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + # scales may also specify width and height separately, e.g.: + # scales: [(160,160), (192,192), (224,224), (288,288), (320,320)] + sampler: + name: MultiScaleSampler + scales: [160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divided_factor: ensures the sampled width and height are divisible by the network's downsampling multiple + first_bs: 250 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5]
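divided_factor: 32 keeps every sampled resolution divisible by the network's total downsampling stride (five stride-2 stages, 2^5 = 32), so no stage produces a fractional feature-map size. A minimal sketch of that rounding (the helper name is hypothetical, not the ppcls implementation):

```python
def make_divisible(v: int, divisor: int = 32) -> int:
    """Round v up to the nearest multiple of divisor (hypothetical helper)."""
    return -(-v // divisor) * divisor    # ceiling division, then rescale

assert make_divisible(224) == 224        # already a multiple of 32
assert make_divisible(300) == 320        # 300 would break a stride-32 backbone
```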
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_small.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_small.yaml new file mode 100755 index 000000000..bb937ba78 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PPLCNetV2/PPLCNetV2_small.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 480 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNetV2_small + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + # scales may also specify width and height separately, e.g.: + # scales: [(160,160), (192,192), (224,224), (288,288), (320,320)] + sampler: + name: MultiScaleSampler + scales: [160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divided_factor: ensures the sampled width and height are divisible by the network's downsampling multiple + first_bs: 500 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B0.yaml new file mode 100755 index 000000000..447326ac7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B0.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 +
use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PVT_V2_B0 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B1.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B1.yaml new file mode 100755 index 000000000..35ab6507a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B1.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + 
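The PVT_V2 optimizer blocks pair AdamW with a warmup-then-cosine schedule: linear warmup from warmup_start_lr 1e-6 to learning_rate 1e-3 over warmup_epoch 20, then cosine decay down to eta_min 1e-5. A minimal per-epoch sketch of that curve, assuming the standard formulation (the actual Paddle scheduler steps per iteration):

```python
import math

def pvt_lr(epoch, total=300, base=1e-3, eta_min=1e-5,
           warmup_epoch=20, warmup_start=1e-6):
    """Per-epoch LR matching the PVT_V2 'lr' block: linear warmup,
    then cosine decay to the eta_min floor."""
    if epoch < warmup_epoch:
        return warmup_start + (base - warmup_start) * epoch / warmup_epoch
    t = (epoch - warmup_epoch) / (total - warmup_epoch)
    return eta_min + 0.5 * (base - eta_min) * (1 + math.cos(math.pi * t))

print(pvt_lr(0))    # 1e-06 (warmup_start_lr)
print(pvt_lr(20))   # 0.001 (warmup done, peak LR)
print(pvt_lr(300))  # 1e-05 (eta_min floor)
```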
use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PVT_V2_B1 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2.yaml new file mode 100755 index 000000000..ec93edc94 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + 
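Note that only the Train branch of these CELoss blocks sets epsilon: 0.1; evaluation uses plain cross entropy. With label smoothing, the target distribution puts 1 - epsilon on the true class and spreads epsilon uniformly over the classes. A sketch, assuming that usual formulation:

```python
import numpy as np

def smoothed_ce(logits, label, epsilon=0.1):
    """Assumed reading of 'CELoss ... epsilon: 0.1': cross entropy against
    (1 - epsilon) on the true class plus epsilon spread over all classes."""
    logp = logits - np.log(np.exp(logits).sum())          # log-softmax
    target = np.full(logits.shape, epsilon / logits.size)
    target[label] += 1.0 - epsilon
    return float(-(target * logp).sum())

logits = np.array([4.0, 1.0, 0.5])
print(smoothed_ce(logits, label=0))                        # smoothed loss
print(float(-(logits - np.log(np.exp(logits).sum()))[0]))  # plain CE (epsilon=0)
```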
use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PVT_V2_B2 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml new file mode 100755 index 000000000..77cfbad84 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + 
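The batch_transform_ops block is shared by all the PVT_V2 configs: OpSampler draws one batch-level operator per step, here Mixup (alpha 0.8) or CutMix (alpha 1.0), each with probability 0.5. A rough sketch of that dispatch, with textbook operator bodies rather than the ppcls implementations:

```python
import numpy as np

def op_sampler_batch(images, labels, rng):
    """Illustrative OpSampler: pick ONE of Mixup/CutMix per batch (prob 0.5
    each, as configured). Operator details are simplified."""
    perm = rng.permutation(len(images))
    if rng.random() < 0.5:                        # MixupOperator, alpha=0.8
        lam = rng.beta(0.8, 0.8)
        mixed = lam * images + (1 - lam) * images[perm]
    else:                                         # CutmixOperator, alpha=1.0
        lam = rng.beta(1.0, 1.0)
        n, c, h, w = images.shape
        rh, rw = int(h * np.sqrt(1 - lam)), int(w * np.sqrt(1 - lam))
        y, x = rng.integers(0, h - rh + 1), rng.integers(0, w - rw + 1)
        mixed = images.copy()
        mixed[:, :, y:y + rh, x:x + rw] = images[perm, :, y:y + rh, x:x + rw]
        lam = 1.0 - (rh * rw) / (h * w)           # actual mixed area
    return mixed, (labels, labels[perm], lam)     # loss = lam*CE(y_a) + (1-lam)*CE(y_b)

imgs = np.zeros((8, 3, 224, 224), dtype=np.float32)
mixed, (y_a, y_b, lam) = op_sampler_batch(imgs, np.arange(8), np.random.default_rng(0))
```

This per-batch mixing is also why the PVT_V2 configs define the TopkAcc metric only for Eval: train-time labels are soft mixtures.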
print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PVT_V2_B2_Linear + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B3.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B3.yaml new file mode 100755 index 000000000..9d4443491 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B3.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 
+ print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PVT_V2_B3 + class_num: 1000 + drop_path_rate: 0.3 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: 1.0 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B4.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B4.yaml new file mode 100755 index 000000000..78e08fa7a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B4.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + 
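Starting with PVT_V2_B3, the optimizer adds clip_grad: 1.0, which the deeper B4/B5 variants keep. Assuming this maps to global-norm clipping (the usual reading), the effect is to rescale all gradients jointly whenever their combined L2 norm exceeds the threshold:

```python
import numpy as np

def clip_by_global_norm(grads, clip_norm=1.0):
    """Sketch of global-norm clipping as implied by 'clip_grad: 1.0':
    if the global L2 norm exceeds clip_norm, scale every gradient down."""
    global_norm = np.sqrt(sum(float((g * g).sum()) for g in grads))
    scale = clip_norm / max(global_norm, clip_norm)   # scale <= 1.0
    return [g * scale for g in grads], global_norm

grads = [np.ones(10), 2 * np.ones(5)]
clipped, norm = clip_by_global_norm(grads)  # norm ~ 5.48 -> rescaled to 1.0
```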
epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PVT_V2_B4 + class_num: 1000 + drop_path_rate: 0.3 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: 1.0 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B5.yaml new file mode 100755 index 000000000..b8f4da0d3 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PVTV2/PVT_V2_B5.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + 
eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PVT_V2_B5 + class_num: 1000 + drop_path_rate: 0.3 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: 1.0 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/PeleeNet/PeleeNet.yaml b/example/low_precision_training/ppcls/configs/ImageNet/PeleeNet/PeleeNet.yaml new file mode 100755 index 000000000..06b0059bc --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/PeleeNet/PeleeNet.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + 
eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PeleeNet + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.18 # for total batch size 512 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bilinear + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 # for 2 cards + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_1_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_1_0.yaml new file mode 100755 index 000000000..c4fa39e9b --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_1_0.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ReXNet_1_0 + 
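The PeleeNet schedule documents its assumption inline: learning_rate 0.18 is tuned "for total batch size 512" (e.g. 4 cards x the per-card batch_size of 128). Under the common linear-scaling heuristic, a different effective batch size would move the base LR proportionally; a rule of thumb, not something this patch enforces:

```python
def scaled_lr(total_batch_size, base_lr=0.18, base_bs=512):
    """Linear-scaling heuristic implied by the PeleeNet comment:
    LR grows in proportion to the effective (all-cards) batch size."""
    return base_lr * total_batch_size / base_bs

print(scaled_lr(128 * 4))  # 0.18 (the documented setting)
print(scaled_lr(128 * 8))  # 0.36 (8 cards at the same per-card batch)
```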
class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_1_3.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_1_3.yaml new file mode 100755 index 000000000..8bfe5c3c0 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_1_3.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ReXNet_1_3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False 
+ - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_1_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_1_5.yaml new file mode 100755 index 000000000..66a8497cb --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_1_5.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ReXNet_1_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_2_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_2_0.yaml new file mode 100755 index 000000000..80a80c179 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_2_0.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ReXNet_2_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + 
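All five ReXNet configs share the same Piecewise schedule: values has one more entry than decay_epochs, and the LR steps down at epochs 30, 60, and 90 over the 120-epoch run. A compact sketch of the lookup (assuming the boundary epoch itself switches to the next value, as step schedulers typically do):

```python
from bisect import bisect_right

def piecewise_lr(epoch, decay_epochs=(30, 60, 90),
                 values=(0.1, 0.01, 0.001, 0.0001)):
    """Step schedule from the ReXNet configs: values[i] applies until
    epoch reaches decay_epochs[i]; len(values) == len(decay_epochs) + 1."""
    return values[bisect_right(decay_epochs, epoch)]

assert piecewise_lr(0) == 0.1
assert piecewise_lr(30) == 0.01     # boundary epoch moves to the next value
assert piecewise_lr(119) == 0.0001  # final segment
```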
PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_3_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_3_0.yaml new file mode 100755 index 000000000..7b896d20f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ReXNet/ReXNet_3_0.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ReXNet_3_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet101.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet101.yaml new file mode 100755 index 000000000..fa793e4d4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet101.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ 
+ device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RedNet101 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet152.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet152.yaml new file mode 100755 index 000000000..1b7bc86e9 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet152.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RedNet152 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + 
momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet26.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet26.yaml new file mode 100755 index 000000000..d8e88eb90 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet26.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RedNet26 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + 
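Unlike most configs in this patch, the RedNet files normalize with scale: 1.0 and 0-255 mean/std values. This is the same transform as the scale: 1.0/255.0 + [0.485, 0.456, 0.406] variant used elsewhere, just parameterized in raw-pixel units, since mean and std are simply multiplied by 255:

```python
# Arithmetic check that the two NormalizeImage parameterizations agree.
mean01, std01 = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
mean255 = [round(m * 255, 3) for m in mean01]  # [123.675, 116.28, 103.53]
std255 = [round(s * 255, 3) for s in std01]    # [58.395, 57.12, 57.375]

x = 200.0  # a raw red-channel pixel value
a = (x / 255.0 - mean01[0]) / std01[0]         # scale: 1.0/255.0 variant
b = (x - mean255[0]) / std255[0]               # scale: 1.0 (RedNet) variant
assert abs(a - b) < 1e-9
```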
drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet38.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet38.yaml new file mode 100755 index 000000000..b1326d647 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet38.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RedNet38 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: 
docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet50.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet50.yaml new file mode 100755 index 000000000..5e2a5cf06 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RedNet/RedNet50.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RedNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_12GF.yaml 
b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_12GF.yaml new file mode 100755 index 000000000..7d9135427 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_12GF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_12GF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_1600MF.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_1600MF.yaml new file mode 100755 index 000000000..b14298f5e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_1600MF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + 
use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_1600MF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_16GF.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_16GF.yaml new file mode 100755 index 000000000..7de217298 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_16GF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_16GF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + 
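# NOTE (editor): ImageNetDataset consumes cls_label_path (below) as a plain-text index, one sample per line: an image path relative to image_root, a space, and an integer class id, e.g. + #   train/n01440764/n01440764_10026.JPEG 0 + # (illustrative entry only; the real list is generated from the dataset). +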
image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_200MF.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_200MF.yaml new file mode 100755 index 000000000..1b87cabc0 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_200MF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_200MF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - 
DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_3200MF.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_3200MF.yaml new file mode 100755 index 000000000..702296394 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_3200MF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_3200MF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: 
[0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_32GF.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_32GF.yaml new file mode 100755 index 000000000..0c770b0c3 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_32GF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_32GF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_400MF.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_400MF.yaml new file mode 100755 index 000000000..aad3bb23f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_400MF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + 
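# NOTE (editor): checkpoints resumes a full training state (weights plus optimizer and LR-scheduler state), while pretrained_model below loads weights only; with both left null, training starts from scratch. +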
pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_400MF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_600MF.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_600MF.yaml new file mode 100755 index 000000000..54dbbcca4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_600MF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_600MF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + 
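# NOTE (editor): weight is the coefficient this term contributes when several losses are summed; with a single CELoss it is effectively a no-op. For the Piecewise schedule in the Optimizer section below, decay_epochs [30, 60, 90] with values [0.1, 0.01, 0.001, 0.0001] give: + #   epochs  0-29  -> lr 0.1 + #   epochs 30-59  -> lr 0.01 + #   epochs 60-89  -> lr 0.001 + #   epochs 90-119 -> lr 0.0001 +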
Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_6400MF.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_6400MF.yaml new file mode 100755 index 000000000..eae2c6a4d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_6400MF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_6400MF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 
0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_800MF.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_800MF.yaml new file mode 100755 index 000000000..8dd6d19db --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_800MF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_800MF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: 
False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_8GF.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_8GF.yaml new file mode 100755 index 000000000..3cb65e961 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RegNet/RegNetX_8GF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_8GF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_A0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_A0.yaml new file mode 100755 index 000000000..ecec64644 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_A0.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_A0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_A1.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_A1.yaml new file mode 100755 index 000000000..8ce5a8ad4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_A1.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + 
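# NOTE (editor): use_fp16_test only takes effect once use_amp is True, in which case evaluation also runs in fp16. At level O1 only white-listed ops (matmul/conv-like) execute in fp16 while numerically sensitive ops stay in fp32; O2 casts nearly the whole model to fp16. +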
scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_A1 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_A2.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_A2.yaml new file mode 100755 index 000000000..7c8e4bc61 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_A2.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_A2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 
224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B0.yaml new file mode 100755 index 000000000..5449d698d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B0.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + 
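# NOTE (editor): batch_size here, as in the Train sampler above, is per device; with DistributedBatchSampler the effective global batch is batch_size times the number of cards (e.g. 64 x 4 = 256 on four GPUs), which is worth keeping in mind if the 0.1 base learning rate is rescaled. +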
batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B1.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B1.yaml new file mode 100755 index 000000000..37f926299 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B1.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B1 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B1g2.yaml 
b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B1g2.yaml new file mode 100755 index 000000000..a78d60d6d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B1g2.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B1g2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B1g4.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B1g4.yaml new file mode 100755 index 000000000..4b2678409 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B1g4.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: 
False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B1g4 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B2.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B2.yaml new file mode 100755 index 000000000..d6c3f36a3 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B2.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + 
flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B2g4.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B2g4.yaml new file mode 100755 index 000000000..d596211bb --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B2g4.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B2g4 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B3.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B3.yaml new file mode 100755 index 000000000..a15cbdc6c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B3.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + 
std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B3g4.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B3g4.yaml new file mode 100755 index 000000000..d2921bdb5 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_B3g4.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B3g4 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_D2se.yaml b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_D2se.yaml new file mode 100755 index 000000000..0b7f105f1 --- /dev/null +++ 
b/example/low_precision_training/ppcls/configs/ImageNet/RepVGG/RepVGG_D2se.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 320, 320] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_D2se + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + interpolation: bicubic + backend: pil + - CropImage: + size: 320 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + interpolation: bicubic + backend: pil + - CropImage: + size: 320 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net101_vd_26w_4s.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net101_vd_26w_4s.yaml new file mode 100755 index 000000000..155a5caee --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net101_vd_26w_4s.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + 
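# note: scale_loss is the initial fp16 loss-scaling factor; with use_dynamic_loss_scaling it is adjusted automatically to avoid gradient underflow once AMP is enabled +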
scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Res2Net101_vd_26w_4s + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net200_vd_26w_4s.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net200_vd_26w_4s.yaml new file mode 100755 index 000000000..db0076cf3 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net200_vd_26w_4s.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Res2Net200_vd_26w_4s + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net50_14w_8s.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net50_14w_8s.yaml new file mode 100755 index 000000000..e90812e03 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net50_14w_8s.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Res2Net50_14w_8s + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + 
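# note: decoding presumably yields BGR (OpenCV convention), so to_rgb: True converts images to RGB before the later transforms +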
to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net50_26w_4s.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net50_26w_4s.yaml new file mode 100755 index 000000000..eabf48041 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net50_26w_4s.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Res2Net50_26w_4s + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: 
[0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net50_vd_26w_4s.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net50_vd_26w_4s.yaml new file mode 100755 index 000000000..4f72ecbb4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Res2Net/Res2Net50_vd_26w_4s.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Res2Net50_vd_26w_4s + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt101.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt101.yaml new file mode 100755 index 000000000..315b7c74e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt101.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: 
gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeSt101 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt200.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt200.yaml new file mode 100755 index 000000000..cb3721264 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt200.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 320, 320] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeSt200 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum 
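+ # note: SGD with momentum; the Cosine schedule below decays the 0.1 base learning rate over the 300 training epochs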
+ momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 320 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 320 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 320 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt269.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt269.yaml new file mode 100755 index 000000000..985ff98e7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt269.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 416, 416] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeSt269 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 416 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 
64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 416 + - CropImage: + size: 416 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 416 + - CropImage: + size: 416 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt50.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt50.yaml new file mode 100755 index 000000000..50529aecd --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt50.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeSt50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: 
docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt50_fast_1s1x64d.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt50_fast_1s1x64d.yaml new file mode 100755 index 000000000..c5db97b10 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeSt/ResNeSt50_fast_1s1x64d.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeSt50_fast_1s1x64d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_32x4d.yaml 
b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_32x4d.yaml new file mode 100755 index 000000000..08a2f7bf1 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_32x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt101_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_64x4d.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_64x4d.yaml new file mode 100755 index 000000000..be20d009d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_64x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + 
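# note: AMP is disabled by default in these recipes; setting use_amp: True with level O1 enables mixed-fp16 training +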
use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt101_64x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00015 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_32x4d.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_32x4d.yaml new file mode 100755 index 000000000..385103801 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_32x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt101_vd_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: 
./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_64x4d.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_64x4d.yaml new file mode 100755 index 000000000..fa1bf5b14 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt101_vd_64x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt101_vd_64x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_32x4d.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_32x4d.yaml new file mode 100755 index 000000000..2c25ada95 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_32x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt152_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_64x4d.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_64x4d.yaml new file mode 100755 index 000000000..6c065a95a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_64x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt152_64x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00018 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_vd_32x4d.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_vd_32x4d.yaml new file mode 100755 index 000000000..5ad91320c --- /dev/null +++ 
b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_vd_32x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt152_vd_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_vd_64x4d.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_vd_64x4d.yaml new file mode 100755 index 000000000..44a354cbc --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt152_vd_64x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model 
architecture +Arch: + name: ResNeXt152_vd_64x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_32x4d.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_32x4d.yaml new file mode 100755 index 000000000..ae3163dac --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_32x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt50_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - 
RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_64x4d.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_64x4d.yaml new file mode 100755 index 000000000..b1af78e03 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_64x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt50_64x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + 
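# note: the scale value appears to be parsed as the expression 1/255, mapping uint8 pixels to [0, 1] before mean/std normalization +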
mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_vd_32x4d.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_vd_32x4d.yaml new file mode 100755 index 000000000..be5881070 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_vd_32x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt50_vd_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: 
ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_vd_64x4d.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_vd_64x4d.yaml new file mode 100755 index 000000000..e040dd378 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt/ResNeXt50_vd_64x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt50_vd_64x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml new file mode 100755 index 000000000..e0d496e83 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + 
eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt101_32x16d_wsl + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml new file mode 100755 index 000000000..40f82b5fd --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt101_32x32d_wsl + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + 
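+# Editor's sketch (illustrative comment, not part of the original recipe): the
+# Piecewise schedule configured below holds each entry of `values` until the
+# matching boundary in `decay_epochs`; conceptually,
+#   lr(epoch) = values[bisect_right(decay_epochs, epoch)]
+# i.e. lr 0.1 for epochs [0, 30), 0.01 for [30, 60), 0.001 for [60, 90),
+# and 0.0001 from epoch 90 onward.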
+Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml new file mode 100755 index 000000000..5974ca719 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt101_32x48d_wsl + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] 
+ std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml new file mode 100755 index 000000000..d2f4cd852 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt101_32x8d_wsl + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: 
DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet101.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet101.yaml new file mode 100755 index 000000000..afd6329b3 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet101.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet101 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + 
topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet101_vd.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet101_vd.yaml new file mode 100755 index 000000000..748dfc12f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet101_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet101_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet152.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet152.yaml new file mode 100755 index 000000000..993d52e5d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet152.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 
224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet152 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet152_vd.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet152_vd.yaml new file mode 100755 index 000000000..8daf0f806 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet152_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet152_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for 
train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+      batch_transform_ops:
+        - MixupOperator:
+            alpha: 0.2
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18.yaml
new file mode 100755
index 000000000..d1675b8c4
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18.yaml
@@ -0,0 +1,146 @@
+# global configs
+Global:
+  checkpoints: null
+  # pretrained_model: ./exp_11.21/output/epoch_120.pdparams
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+
+# mixed precision
+AMP:
+  use_amp: False
+  use_fp16_test: False
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+
+# model architecture
+Arch:
+  name: ResNet18
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    # name: Cosine
+    # learning_rate: 0.1
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  # weight_decay: 0.00005
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: /data/ILSVRC2012/
+      cls_label_path: /data/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: /data/ILSVRC2012/
+      cls_label_path: /data/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_custom.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_custom.yaml
new file mode 100755
index 000000000..b38bac2ee
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_custom.yaml
@@ -0,0 +1,147 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: ./output/latest.pdparams
+  # pretrained_model: null
+  output_dir: ./eval/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+
+# mixed precision
+AMP:
+  use_amp: False
+  use_fp16_test: False
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+
+# model architecture
+Arch:
+  name: ResNet18
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine  # check: is the LR updated every iteration?
+    learning_rate: 0.1
+    # name: Piecewise
+    # learning_rate: 0.1
+    # decay_epochs: [30, 60, 90]
+    # values: [0.1, 0.01, 0.001, 0.0001]
+  weight_decay: 5e-5
+  # regularizer:
+  #   name: 'L2'
+  #   coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: /data/ILSVRC2012/
+      cls_label_path: /data/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: /data/ILSVRC2012/
+      cls_label_path: /data/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 
64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_dbb.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_dbb.yaml new file mode 100755 index 000000000..35250d83a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_dbb.yaml @@ -0,0 +1,153 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet18 + class_num: 1000 + layer_type: DiverseBranchBlock + use_first_short_conv: False + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + backend: pil + interpolation: bilinear + - RandFlipImage: + flip_code: 1 + - ColorJitter: + brightness: 0.4 + saturation: 0.4 + hue: 0.4 + - PCALighting: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + backend: pil + interpolation: bilinear + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + 
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_exactmatch.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_exactmatch.yaml
new file mode 100755
index 000000000..83bfe2861
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_exactmatch.yaml
@@ -0,0 +1,142 @@
+# global configs
+Global:
+  checkpoints: null
+  # pretrained_model: ./output/epoch_120.pdparams
+  pretrained_model: null
+  output_dir: ./test_output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+
+# mixed precision
+# AMP:
+#   use_amp: False
+#   use_fp16_test: False
+#   scale_loss: 128.0
+#   use_dynamic_loss_scaling: True
+#   use_promote: False
+#   # O1: mixed fp16, O2: pure fp16
+#   level: O1
+
+
+# model architecture
+Arch:
+  name: ResNet18
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.1
+  weight_decay: 0.00005
+  # regularizer:
+  #   name: 'L2'
+  #   coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: /data/ILSVRC2012/
+      cls_label_path: /data/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: /data/ILSVRC2012/
+      cls_label_path: /data/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_vd.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_vd.yaml
new file mode 100755
index 000000000..591ef505b
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet18_vd.yaml
@@ -0,0 +1,142 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  
save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet18_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet200_vd.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet200_vd.yaml new file mode 100755 index 000000000..135388ae7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet200_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet200_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 
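+  # Editor's sketch (illustrative comment, not part of the original config):
+  # the Cosine schedule below anneals the LR from its base value toward zero
+  # over the whole run, roughly
+  #   lr(t) = 0.5 * 0.1 * (1 + cos(pi * t / T))
+  # with t the current epoch (or step) and T the total length (200 epochs here).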
+ lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet34.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet34.yaml new file mode 100755 index 000000000..88af8fe0e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet34.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet34 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 
4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet34_vd.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet34_vd.yaml new file mode 100755 index 000000000..feb6c4266 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet34_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet34_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + 
transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50.yaml new file mode 100755 index 000000000..a6dd0f5f4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml new 
file mode 100755 index 000000000..4981efe91 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: True + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + multi_precision: True + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1_ultra.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1_ultra.yaml new file mode 100755 index 000000000..47aa4c0e4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1_ultra.yaml @@ -0,0 +1,150 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + image_channel: &image_channel 4 + # used for static mode and model export + image_shape: [*image_channel, 224, 224] + 
save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: True + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + input_image_channel: *image_channel + data_format: "NHWC" + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + multi_precision: True + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O2_ultra.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O2_ultra.yaml new file mode 100755 index 000000000..7f4de6640 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_amp_O2_ultra.yaml @@ -0,0 +1,150 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + image_channel: &image_channel 4 + # used for static mode and model export + image_shape: [*image_channel, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: True + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O2 + +# model architecture +Arch: 
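+# The fields below reuse the 4-channel anchor and switch the layout to
+# NHWC; padding the input to four channels and using NHWC is a common
+# alignment trick for fp16 tensor-core kernels, which is presumably why
+# these "ultra" configs set image_channel to 4.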
+ name: ResNet50 + class_num: 1000 + input_image_channel: *image_channel + data_format: "NHWC" + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + multi_precision: True + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + output_fp16: True + channel_num: *image_channel + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_ampo2_ultra.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_ampo2_ultra.yaml new file mode 100755 index 000000000..ee606707a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_ampo2_ultra.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + use_dali: True + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O2 + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.8 + decay_epochs: [30, 60, 90] + values: [0.8, 0.08, 0.008, 0.0008] + warmup_epoch: 5 + regularizer: + name:
'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_fp32_ultra.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_fp32_ultra.yaml new file mode 100755 index 000000000..9fb8da0d6 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_fp32_ultra.yaml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + use_dali: True + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.8 + decay_epochs: [30, 60, 90] + values: [0.8, 0.08, 0.008, 0.0008] + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False +
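+# batch_size here is per device, so the global batch grows with the number
+# of cards. The larger 256-per-card batch is paired with a higher initial
+# lr (0.8 vs 0.1 in the baseline ResNet50 config) and the 5-epoch warmup
+# above, in the spirit of the linear lr-scaling heuristic for large-batch
+# training.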
shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml new file mode 100755 index 000000000..5f30746b1 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: 
docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SENet/SENet154_vd.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SENet154_vd.yaml new file mode 100755 index 000000000..33e72300a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SENet154_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SENet154_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d.yaml new file mode 100755 
index 000000000..df3d9c561 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SE_ResNeXt101_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2_ultra.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2_ultra.yaml new file mode 100755 index 000000000..835782185 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2_ultra.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_channel: &image_channel 4 + image_shape: [*image_channel, 224, 224] + save_inference_dir: ./inference + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + 
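+# The next two keys implement loss scaling: the loss is multiplied by
+# scale_loss before backward so small fp16 gradients do not underflow,
+# and gradients are divided by the same factor before the update. With
+# use_dynamic_loss_scaling the factor adapts automatically when overflow
+# is detected, so 128.0 is only the initial scale.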
scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O2 + +# model architecture +Arch: + name: SE_ResNeXt101_32x4d + class_num: 1000 + input_image_channel: *image_channel + data_format: "NHWC" + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + multi_precision: True + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + output_fp16: True + channel_num: *image_channel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt50_32x4d.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt50_32x4d.yaml new file mode 100755 index 000000000..192b5aafd --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt50_32x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SE_ResNeXt50_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for 
train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt50_vd_32x4d.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt50_vd_32x4d.yaml new file mode 100755 index 000000000..e82bfe42c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNeXt50_vd_32x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SE_ResNeXt50_vd_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: 
ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNet18_vd.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNet18_vd.yaml new file mode 100755 index 000000000..5cfed65d2 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNet18_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SE_ResNet18_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: 
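+# Standard ImageNet eval/infer protocol: resize the short side to 256
+# keeping aspect ratio, then center-crop 224x224; e.g. a 500x375 input
+# becomes 341x256 after the resize and 224x224 after the crop.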
+ resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNet34_vd.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNet34_vd.yaml new file mode 100755 index 000000000..857300d51 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNet34_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SE_ResNet34_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNet50_vd.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNet50_vd.yaml new file mode 100755 index 000000000..e7b94138c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SENet/SE_ResNet50_vd.yaml @@ -0,0 +1,142 @@ +# 
global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SE_ResNet50_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_swish.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_swish.yaml new file mode 100755 index 000000000..4b0393cce --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_swish.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ShuffleNetV2_swish + class_num: 1000 + +# loss function config for traing/eval 
process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_25.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_25.yaml new file mode 100755 index 000000000..a00c648cf --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_25.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ShuffleNetV2_x0_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + 
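+# scale 1.0/255.0 first maps uint8 pixels to [0, 1]; the ImageNet mean/std
+# above are then applied per channel, i.e. out = (pixel/255 - mean) / std.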
order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_33.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_33.yaml new file mode 100755 index 000000000..e3b82cf90 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_33.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ShuffleNetV2_x0_33 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 
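+# use_shared_memory exchanges batches between loader workers via shared
+# memory (typically /dev/shm), which is usually faster than pipes; it may
+# need to be disabled where shared memory is scarce, e.g. slim containers.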
+ use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_5.yaml new file mode 100755 index 000000000..c228b57a6 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_5.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ShuffleNetV2_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml 
b/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml new file mode 100755 index 000000000..f0d5d46f0 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ShuffleNetV2_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_5.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_5.yaml new file mode 100755 index 000000000..202514c99 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_5.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: 
False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ShuffleNetV2_x1_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.25 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x2_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x2_0.yaml new file mode 100755 index 000000000..d633a754e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/ShuffleNet/ShuffleNetV2_x2_0.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ShuffleNetV2_x2_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.25 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SqueezeNet/SqueezeNet1_0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SqueezeNet/SqueezeNet1_0.yaml new file mode 100755 index 000000000..7224467c7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SqueezeNet/SqueezeNet1_0.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SqueezeNet1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 
224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SqueezeNet/SqueezeNet1_1.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SqueezeNet/SqueezeNet1_1.yaml new file mode 100755 index 000000000..ed5ef8ecb --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SqueezeNet/SqueezeNet1_1.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SqueezeNet1_1 + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file:
ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S1.yaml b/example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S1.yaml new file mode 100755 index 000000000..a25df46d4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S1.yaml @@ -0,0 +1,165 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: StarNet_S1 + drop_rate: 0 + drop_path_rate: 0 + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: None + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 3e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m1-mstd0.5-inc1 + interpolation: random + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 0.2 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: False + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 20 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: False + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S2.yaml
b/example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S2.yaml new file mode 100755 index 000000000..f57360e59 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S2.yaml @@ -0,0 +1,165 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: StarNet_S2 + drop_rate: 0 + drop_path_rate: 0.01 + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: None + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 3e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m1-mstd0.5-inc1 + interpolation: random + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 0.2 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: False + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 20 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: False + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S3.yaml b/example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S3.yaml new file mode 100755 index 000000000..af99efbc4 --- /dev/null +++
b/example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S3.yaml @@ -0,0 +1,166 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: StarNet_S3 + drop_rate: 0 + drop_path_rate: 0.01 + class_num: 1000 + + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: None + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 3e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m1-mstd0.5-inc1 + interpolation: random + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 0.2 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: False + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 20 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: False + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S4.yaml b/example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S4.yaml new file mode 100755 index 000000000..608f76c7f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/StarNet/StarNet_S4.yaml @@ -0,0 +1,165 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device:
gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: StarNet_S4 + drop_rate: 0 + drop_path_rate: 0.02 + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: None + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 3e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m1-mstd0.5-inc1 + interpolation: random + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 0.2 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: False + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 20 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: False + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml new file mode 100755 index 000000000..c212dab12 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 +
print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformer_base_patch4_window12_384 + class_num: 1000 + # fused op can be used in AMP O2 mode only + use_fused_attn: False + use_fused_linear: False + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 20 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml new file mode 100755 index 000000000..6941543d8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml
@@ -0,0 +1,176 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformer_base_patch4_window7_224 + class_num: 1000 + pretrained: True + # fused op can be used in AMP O2 mode only + use_fused_attn: False + use_fused_linear: False + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 20 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml
b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml new file mode 100755 index 000000000..3e335b24e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformer_large_patch4_window12_384 + class_num: 1000 + # fused op can be used in AMP O2 mode only + use_fused_attn: False + use_fused_linear: False + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 20 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file:
ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml new file mode 100755 index 000000000..89bdd0108 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformer_large_patch4_window7_224 + class_num: 1000 + # fused op can be used in AMP O2 mode only + use_fused_attn: False + use_fused_linear: False + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 20 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil +
- CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml new file mode 100755 index 000000000..ee134d5f0 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformer_small_patch4_window7_224 + class_num: 1000 + # fused op can be used in AMP O2 mode only + use_fused_attn: False + use_fused_linear: False + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 20 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs:
docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100755 index 000000000..e359afcef --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: True + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O2 + + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 1000 + # fused op can be used in AMP O2 mode only + use_fused_attn: False + use_fused_linear: False + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 20 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std:
[0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window16_256.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window16_256.yaml new file mode 100755 index 000000000..966bb2cd1 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window16_256.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_base_patch4_window16_256 + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 +
interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window24_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window24_384.yaml new file mode 100755 index 000000000..b71ad0ec4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window24_384.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_base_patch4_window24_384 + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path:
./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window8_256.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window8_256.yaml new file mode 100755 index 000000000..2337b5d8d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window8_256.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_base_patch4_window8_256 + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4
+ use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window16_256.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window16_256.yaml new file mode 100755 index 000000000..ed60a4b82 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window16_256.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_large_patch4_window16_256 + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 +
prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window24_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window24_384.yaml new file mode 100755 index 000000000..48972c48f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window24_384.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_large_patch4_window24_384 + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect:
True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window16_256.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window16_256.yaml new file mode 100755 index 000000000..e8c4c7c4c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window16_256.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_small_patch4_window16_256 + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std:
[0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window8_256.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window8_256.yaml new file mode 100755 index 000000000..c0e1e0c7e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window8_256.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_small_patch4_window8_256 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: 
rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window16_256.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window16_256.yaml new file mode 100755 index 000000000..18678441e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window16_256.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_tiny_patch4_window16_256 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - 
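The config_str rand-m9-mstd0.5-inc1 passed to TimmAutoAugment throughout these files is a timm-style RandAugment spec: magnitude 9, magnitude std 0.5, the "increasing" transform set. A toy parser for that convention (the field names are my reading of the timm format, not the ppcls API):

# Sketch: decode a timm-style RandAugment config string such as
# "rand-m9-mstd0.5-inc1". Treat this as an illustration of the convention,
# not the actual ppcls parser.
def parse_rand_augment(config_str: str) -> dict:
    parts = config_str.split("-")
    assert parts[0] == "rand", "only RandAugment-style strings handled here"
    cfg = {"policy": parts[0]}
    for tok in parts[1:]:
        if tok.startswith("mstd"):          # check "mstd" before bare "m"
            cfg["magnitude_std"] = float(tok[4:])
        elif tok.startswith("inc"):
            cfg["increasing"] = bool(int(tok[3:]))
        elif tok.startswith("m"):
            cfg["magnitude"] = int(tok[1:])
        elif tok.startswith("n"):
            cfg["num_layers"] = int(tok[1:])
    return cfg

print(parse_rand_augment("rand-m9-mstd0.5-inc1"))
# {'policy': 'rand', 'magnitude': 9, 'magnitude_std': 0.5, 'increasing': True}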
RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window8_256.yaml b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window8_256.yaml new file mode 100755 index 000000000..a1b69b3d7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window8_256.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_tiny_patch4_window8_256 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + 
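The AdamW block above filters weight decay by name: any parameter whose name contains a token from no_weight_decay_name, or any 1-D parameter when one_dim_param_no_weight_decay is True, is exempt from the 0.05 decay. A sketch of that grouping logic under those assumed semantics:

# Sketch of how the AdamW options are typically interpreted (my reading of
# the config, not the ppcls optimizer code): a parameter skips weight decay
# if its name contains a listed token, or if it is one-dimensional.
NO_DECAY_TOKENS = "absolute_pos_embed relative_position_bias_table .bias norm".split()

def uses_weight_decay(name: str, ndim: int, one_dim_no_decay: bool = True) -> bool:
    if any(tok in name for tok in NO_DECAY_TOKENS):
        return False
    if one_dim_no_decay and ndim == 1:
        return False
    return True

params = [("blocks.0.attn.qkv.weight", 2), ("blocks.0.attn.qkv.bias", 1),
          ("absolute_pos_embed", 3), ("norm.weight", 1)]
print([n for n, d in params if uses_weight_decay(n, d)])      # decay group
print([n for n, d in params if not uses_weight_decay(n, d)])  # no-decay group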
cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/TNT/TNT_base.yaml b/example/low_precision_training/ppcls/configs/ImageNet/TNT/TNT_base.yaml new file mode 100755 index 000000000..c2c7766be --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/TNT/TNT_base.yaml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: TNT_base + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 
0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/TNT/TNT_small.yaml b/example/low_precision_training/ppcls/configs/ImageNet/TNT/TNT_small.yaml new file mode 100755 index 000000000..2ab6cb60f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/TNT/TNT_small.yaml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: TNT_small + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: 
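The TNT configs use a Piecewise schedule: decay_epochs [30, 60, 90] split the 120 epochs into four intervals matched one-to-one with values. A small sketch of the assumed lookup:

import bisect

# Sketch of the Piecewise schedule: epochs [0,30) -> 0.1, [30,60) -> 0.01,
# [60,90) -> 0.001, [90,120] -> 0.0001 (assumed decay_epochs/values semantics).
DECAY_EPOCHS = [30, 60, 90]
VALUES = [0.1, 0.01, 0.001, 0.0001]

def piecewise_lr(epoch: int) -> float:
    return VALUES[bisect.bisect_right(DECAY_EPOCHS, epoch)]

assert piecewise_lr(0) == 0.1 and piecewise_lr(30) == 0.01 and piecewise_lr(119) == 0.0001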
DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_A.yaml b/example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_A.yaml new file mode 100755 index 000000000..4e6c365bd --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_A.yaml @@ -0,0 +1,167 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 450 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 192] + save_inference_dir: ./inference + +# model ema +EMA: + decay: 0.9999 + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: TinyNet_A + class_num: 1000 + override_params: + batch_norm_momentum: 0.9 + batch_norm_epsilon: 1e-5 + depth_trunc: round + drop_connect_rate: 0.1 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + one_dim_param_no_weight_decay: True + lr: + name: Step + learning_rate: 0.048 + step_size: 2.4 + gamma: 0.97 + warmup_epoch: 3 + warmup_start_lr: 1e-6 + regularizer: + name: 'L2' + coeff: 1e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 192 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - ColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 219 + interpolation: bicubic + backend: pil + - CropImage: + size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: 
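The TinyNet optimizer pairs RMSProp with a Step schedule: step_size 2.4 and gamma 0.97 is the EfficientNet-style "multiply by 0.97 every 2.4 epochs", preceded by a 3-epoch warmup from 1e-6. A sketch of that shape (whether ppcls steps the decay per epoch or per iteration is an assumption here):

# Rough shape of the Step schedule in the TinyNet configs (assuming stepped
# decay every step_size epochs after warmup; ppcls may differ in details
# such as where the warmup hands off).
BASE_LR, GAMMA, STEP_SIZE = 0.048, 0.97, 2.4
WARMUP_EPOCHS, WARMUP_START_LR = 3, 1e-6

def step_lr(epoch: float) -> float:
    if epoch < WARMUP_EPOCHS:  # linear warmup
        t = epoch / WARMUP_EPOCHS
        return WARMUP_START_LR + t * (BASE_LR - WARMUP_START_LR)
    return BASE_LR * GAMMA ** int(epoch // STEP_SIZE)

print(round(step_lr(450), 6))  # well under 1e-3 by the end of the 450 epochs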
docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + - ResizeImage: + resize_short: 219 + interpolation: bicubic + backend: pil + - CropImage: + size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_B.yaml b/example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_B.yaml new file mode 100755 index 000000000..96fd0d06b --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_B.yaml @@ -0,0 +1,167 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 450 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 188, 188] + save_inference_dir: ./inference + +# model ema +EMA: + decay: 0.9999 + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: TinyNet_B + class_num: 1000 + override_params: + batch_norm_momentum: 0.9 + batch_norm_epsilon: 1e-5 + depth_trunc: round + drop_connect_rate: 0.1 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + one_dim_param_no_weight_decay: True + lr: + name: Step + learning_rate: 0.048 + step_size: 2.4 + gamma: 0.97 + warmup_epoch: 3 + warmup_start_lr: 1e-6 + regularizer: + name: 'L2' + coeff: 1e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 188 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - ColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 214 + interpolation: bicubic + backend: pil + - CropImage: + size: 188 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + - ResizeImage: + resize_short: 214 + 
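The eval-time resize_short values in these TinyNet configs all follow the usual 87.5% center-crop convention, resize_short ≈ crop_size / 0.875, which is worth knowing if you change the input resolution:

# The resize_short values match resize_short = int(crop_size / 0.875):
def resize_short_for(crop_size: int, crop_pct: float = 0.875) -> int:
    return int(crop_size / crop_pct)

for crop in (192, 188, 184, 152, 106):
    print(crop, resize_short_for(crop))
# 192 219, 188 214, 184 210, 152 173, 106 121 -- the values in these configs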
interpolation: bicubic + backend: pil + - CropImage: + size: 188 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_C.yaml b/example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_C.yaml new file mode 100755 index 000000000..addaec8c1 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_C.yaml @@ -0,0 +1,167 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 450 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 184, 184] + save_inference_dir: ./inference + +# model ema +EMA: + decay: 0.9999 + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: TinyNet_C + class_num: 1000 + override_params: + batch_norm_momentum: 0.9 + batch_norm_epsilon: 1e-5 + depth_trunc: round + drop_connect_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + one_dim_param_no_weight_decay: True + lr: + name: Step + learning_rate: 0.048 + step_size: 2.4 + gamma: 0.97 + warmup_epoch: 3 + warmup_start_lr: 1e-6 + regularizer: + name: 'L2' + coeff: 1e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 184 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - ColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 210 + interpolation: bicubic + backend: pil + - CropImage: + size: 184 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + - ResizeImage: + resize_short: 210 + interpolation: bicubic + backend: pil + - CropImage: + size: 184 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - 
ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_D.yaml b/example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_D.yaml new file mode 100755 index 000000000..a868af59b --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_D.yaml @@ -0,0 +1,167 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 450 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 152, 152] + save_inference_dir: ./inference + +# model ema +EMA: + decay: 0.9999 + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: TinyNet_D + class_num: 1000 + override_params: + batch_norm_momentum: 0.9 + batch_norm_epsilon: 1e-5 + depth_trunc: round + drop_connect_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + one_dim_param_no_weight_decay: True + lr: + name: Step + learning_rate: 0.048 + step_size: 2.4 + gamma: 0.97 + warmup_epoch: 3 + warmup_start_lr: 1e-6 + regularizer: + name: 'L2' + coeff: 1e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 152 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - ColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 173 + interpolation: bicubic + backend: pil + - CropImage: + size: 152 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + - ResizeImage: + resize_short: 173 + interpolation: bicubic + backend: pil + - CropImage: + size: 152 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + 
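For reference, the textbook form of the RMSProp update the TinyNet configs select (momentum 0.9, rho 0.9, epsilon 0.001); paddle's exact variant may differ, for example in where epsilon enters the denominator:

# Sketch of a momentum-RMSProp step with the hyperparameters above
# (rho: 0.9, epsilon: 0.001, momentum: 0.9) -- illustrative, not paddle's code.
def rmsprop_step(w, g, state, lr, rho=0.9, eps=1e-3, mu=0.9):
    state["avg_sq"] = rho * state["avg_sq"] + (1 - rho) * g * g   # running E[g^2]
    state["mom"] = mu * state["mom"] + lr * g / (state["avg_sq"] ** 0.5 + eps)
    return w - state["mom"]

w, state = 1.0, {"avg_sq": 0.0, "mom": 0.0}
for _ in range(3):
    w = rmsprop_step(w, g=0.5, state=state, lr=0.048)
print(w)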
topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_E.yaml b/example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_E.yaml new file mode 100755 index 000000000..02617db16 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/TinyNet/TinyNet_E.yaml @@ -0,0 +1,167 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 450 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 106, 106] + save_inference_dir: ./inference + +# model ema +EMA: + decay: 0.9999 + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: TinyNet_E + class_num: 1000 + override_params: + batch_norm_momentum: 0.9 + batch_norm_epsilon: 1e-5 + depth_trunc: round + drop_connect_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + one_dim_param_no_weight_decay: True + lr: + name: Step + learning_rate: 0.048 + step_size: 2.4 + gamma: 0.97 + warmup_epoch: 3 + warmup_start_lr: 1e-6 + regularizer: + name: 'L2' + coeff: 1e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 106 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - ColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 121 + interpolation: bicubic + backend: pil + - CropImage: + size: 106 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + - ResizeImage: + resize_short: 121 + interpolation: bicubic + backend: pil + - CropImage: + size: 106 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml 
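The TinyNet configs also enable a model EMA with decay 0.9999: shadow weights track the training weights and are what you would evaluate or export. The update is the standard exponential moving average:

# Sketch of the EMA the TinyNet configs enable (decay: 0.9999).
def ema_update(shadow: dict, params: dict, decay: float = 0.9999) -> None:
    for name, value in params.items():
        shadow[name] = decay * shadow[name] + (1.0 - decay) * value

shadow = {"w": 1.0}
for step in range(10_000):        # after ~1/(1-decay) steps the EMA catches up
    ema_update(shadow, {"w": 0.0})
print(shadow["w"])                # ~0.37 = 0.9999**10000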
new file mode 100755 index 000000000..216bda288 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Twins/alt_gvt_base.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: alt_gvt_base + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.3 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml 
b/example/low_precision_training/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml new file mode 100755 index 000000000..ff2e62520 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Twins/alt_gvt_large.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: alt_gvt_large + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.5 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/example/low_precision_training/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml new file mode 100755 index 000000000..7de1133c1 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Twins/alt_gvt_small.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: alt_gvt_small + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.2 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + 
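Throughout the Twins configs, OpSampler draws one batch-level op per batch: with prob 0.5 for MixupOperator and 0.5 for CutmixOperator there is no identity branch. A numpy sketch under that assumed sampling semantics, operating on one-hot (or soft) targets:

import numpy as np

rng = np.random.default_rng(0)

def mixup(images, targets, alpha=0.8):
    # MixupOperator: blend the batch with a shuffled copy of itself
    lam = rng.beta(alpha, alpha)
    perm = rng.permutation(len(images))
    mixed = lam * images + (1 - lam) * images[perm]
    return mixed, lam * targets + (1 - lam) * targets[perm]

def cutmix(images, targets, alpha=1.0):
    # CutmixOperator: paste a random box from a shuffled copy of the batch
    lam = rng.beta(alpha, alpha)
    n, _, h, w = images.shape
    perm = rng.permutation(n)
    rh, rw = int(h * np.sqrt(1 - lam)), int(w * np.sqrt(1 - lam))
    y0, x0 = rng.integers(0, h - rh + 1), rng.integers(0, w - rw + 1)
    out = images.copy()
    out[:, :, y0:y0 + rh, x0:x0 + rw] = images[perm][:, :, y0:y0 + rh, x0:x0 + rw]
    lam_adj = 1 - rh * rw / (h * w)    # re-weight targets by the pasted area
    return out, lam_adj * targets + (1 - lam_adj) * targets[perm]

def op_sampler(images, targets):
    if rng.random() < 0.5:
        return mixup(images, targets, alpha=0.8)
    return cutmix(images, targets, alpha=1.0)

images = rng.normal(size=(8, 3, 224, 224)).astype("float32")
targets = np.eye(1000, dtype="float32")[rng.integers(0, 1000, size=8)]
mixed, soft = op_sampler(images, targets)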
- TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml new file mode 100755 index 000000000..b3fbeca13 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Twins/pcpvt_base.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: pcpvt_base + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.3 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: 
ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml new file mode 100755 index 000000000..4d91ea255 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Twins/pcpvt_large.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: pcpvt_large + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.5 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + 
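The Twins learning-rate block describes linear warmup from 2e-6 to 1e-3 over 5 epochs, then cosine decay to eta_min 2e-5 over the remaining 295. The assumed per-epoch shape (ppcls may compute this per step rather than per epoch):

import math

LR, ETA_MIN = 1e-3, 2e-5
WARMUP_EPOCHS, WARMUP_START_LR, EPOCHS = 5, 2e-6, 300

def cosine_lr(epoch: float) -> float:
    if epoch < WARMUP_EPOCHS:  # linear warmup
        t = epoch / WARMUP_EPOCHS
        return WARMUP_START_LR + t * (LR - WARMUP_START_LR)
    t = (epoch - WARMUP_EPOCHS) / (EPOCHS - WARMUP_EPOCHS)
    return ETA_MIN + 0.5 * (LR - ETA_MIN) * (1 + math.cos(math.pi * t))

assert abs(cosine_lr(5) - LR) < 1e-9 and abs(cosine_lr(300) - ETA_MIN) < 1e-9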
topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml new file mode 100755 index 000000000..97d5f1e19 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Twins/pcpvt_small.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: pcpvt_small + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.2 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + 
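The "# for 8 cards" comment on the lr block means batch_size 128 is per device, so the 1e-3 learning rate was tuned for a global batch of 128 * 8 = 1024. If you run on a different card count, a common rule of thumb (an assumption on my part, not something these configs automate) is linear scaling:

# Linear LR scaling relative to the 8-card reference global batch of 1024.
def scaled_lr(base_lr: float, global_batch: int, ref_batch: int = 1024) -> float:
    return base_lr * global_batch / ref_batch

print(scaled_lr(1e-3, 128 * 4))  # 0.0005 when running the same config on 4 cards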
PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_base.yaml b/example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_base.yaml new file mode 100755 index 000000000..58ef6931f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_base.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: UniFormer_base + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + 
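The train-time CELoss with epsilon 0.1 is label-smoothed cross entropy: the target puts 1 - eps + eps/K on the true class and eps/K on each other class. A numpy sketch of that standard formulation, assumed to match ppcls' CELoss:

import numpy as np

def smoothed_ce(logits: np.ndarray, label: int, eps: float = 0.1) -> float:
    num_classes = logits.shape[-1]
    target = np.full(num_classes, eps / num_classes)
    target[label] += 1.0 - eps
    # numerically stable log-softmax: logits - logsumexp(logits)
    log_probs = logits - np.log(np.exp(logits - logits.max()).sum()) - logits.max()
    return float(-(target * log_probs).sum())

print(smoothed_ce(np.random.randn(1000), label=3))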
name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_base_ls.yaml b/example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_base_ls.yaml new file mode 100755 index 000000000..29a3d4261 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_base_ls.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: UniFormer_base_ls + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + 
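+# (editorial note, not part of the original patch) OpSampler in the training
+# pipeline above picks one batch-level op per batch according to its prob fields;
+# since 0.5 + 0.5 = 1.0 here, every batch gets either Mixup or Cutmix (any leftover
+# probability mass would fall through to an identity op). A rough sketch:
+#   r = uniform(0, 1)
+#   if r < 0.5: batch = mixup(batch, lam ~ Beta(0.8, 0.8))
+#   else:       batch = cutmix(batch, lam ~ Beta(1.0, 1.0))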
name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_small.yaml b/example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_small.yaml new file mode 100755 index 000000000..5d1860d89 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_small.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: UniFormer_small + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk 
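+# (editorial note, not part of the original patch) epsilon: 0.1 in the Train
+# CELoss above enables label smoothing: the one-hot target is replaced by roughly
+#   y_smooth = 0.9 * one_hot(label) + 0.1 / class_num
+# spreading 10% of the probability mass uniformly over the 1000 classes. The Eval
+# CELoss omits epsilon on purpose, so the reported validation loss is unsmoothed.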
+ topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_small_plus.yaml b/example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_small_plus.yaml new file mode 100755 index 000000000..f08fb716a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_small_plus.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: UniFormer_small_plus + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + 
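+# (editorial sketch, not part of the original patch) no_weight_decay_name is a
+# space-separated substring filter: parameters whose names contain pos_embed,
+# cls_token, .bias or norm are excluded from the 0.05 AdamW weight decay, and
+# one_dim_param_no_weight_decay additionally exempts every 1-D tensor (biases,
+# LayerNorm scales). Roughly:
+#   skip = any(k in param_name for k in ["pos_embed", "cls_token", ".bias", "norm"])
+#   wd = 0.0 if skip or param.ndim == 1 else 0.05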
name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_small_plus_dim64.yaml b/example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_small_plus_dim64.yaml new file mode 100755 index 000000000..bff77c140 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/UniFormer/UniFormer_small_plus_dim64.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: UniFormer_small_plus_dim64 + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' 
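+# (editorial note, not part of the original patch) the RandomErasing op used in
+# the training pipelines of these configs erases, with probability EPSILON=0.25,
+# a rectangle whose area fraction is drawn from [sl, sh] = [0.02, 1/3] and whose
+# aspect ratio lies between r1=0.3 and 1/r1 (sampled log-uniformly, per
+# use_log_aspect), retrying up to attempt=10 times to fit the box inside the
+# image; mode: pixel fills the erased region with per-pixel random noise rather
+# than a constant value.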
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B0.yaml b/example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B0.yaml
new file mode 100755
index 000000000..f121dc57d
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B0.yaml
@@ -0,0 +1,170 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 300
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+
+# mixed precision
+AMP:
+  use_amp: False
+  use_fp16_test: False
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+
+# model architecture
+Arch:
+  name: VAN_B0
+  class_num: 1000
+  drop_path_rate: 0.1
+  drop_rate: 0.0
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.05
+  one_dim_param_no_weight_decay: True
+  lr:
+    name: Cosine
+    learning_rate: 1e-3
+    eta_min: 1e-6
+    warmup_epoch: 5
+    warmup_start_lr: 1e-6
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+            interpolation: random
+            backend: pil
+        - RandFlipImage:
+            flip_code: 1
+        - TimmAutoAugment:
+            config_str: rand-m9-mstd0.5-inc1
+            interpolation: random
+            img_size: 224
+            mean: [0.5, 0.5, 0.5]
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.5, 0.5, 0.5]
+            std: [0.5, 0.5, 0.5]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.25
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+      batch_transform_ops:
+        - OpSampler:
+            MixupOperator:
+              alpha: 0.8
+              prob: 0.5
+            CutmixOperator:
+              alpha: 1.0
+              prob: 0.5
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 256
+      drop_last: True
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 248
+            interpolation: bicubic
+            backend: pil
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.5, 0.5, 0.5]
+            std: [0.5, 0.5, 0.5]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 256
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 248
+        interpolation: bicubic
+        backend: pil
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.5, 0.5, 0.5]
+        std: [0.5, 0.5, 0.5]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Eval:
+    -
TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B1.yaml b/example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B1.yaml new file mode 100755 index 000000000..fbc30f6f5 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B1.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: VAN_B1 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-6 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: random + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: random + img_size: 224 + mean: [0.5, 0.5, 0.5] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B2.yaml 
b/example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B2.yaml new file mode 100755 index 000000000..1dca03a2d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B2.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: VAN_B2 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-6 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: random + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: random + img_size: 224 + mean: [0.5, 0.5, 0.5] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B3.yaml b/example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B3.yaml new file mode 100755 index 000000000..6bdef9a20 --- /dev/null +++ 
b/example/low_precision_training/ppcls/configs/ImageNet/VAN/VAN_B3.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: VAN_B3 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-6 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: random + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: random + img_size: 224 + mean: [0.5, 0.5, 0.5] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG11.yaml b/example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG11.yaml new file mode 100755 index 000000000..25318c680 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG11.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + 
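+# (editorial note, not part of the original patch) checkpoints resumes a full
+# training state (weights plus optimizer/epoch state) from an earlier run, while
+# pretrained_model only initializes network weights; leaving both null, as all of
+# these configs do, trains from scratch.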
pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 90
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+
+# mixed precision
+AMP:
+  use_amp: False
+  use_fp16_test: False
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+
+# model architecture
+Arch:
+  name: VGG11
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.1
+  regularizer:
+    name: 'L2'
+    coeff: 0.0002
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG13.yaml b/example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG13.yaml
new file mode 100755
index 000000000..3720cf8fe
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG13.yaml
@@ -0,0 +1,140 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 90
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+
+# mixed precision
+AMP:
+  use_amp: False
+  use_fp16_test: False
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+
+# model architecture
+Arch:
+  name: VGG13
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate:
0.01 + regularizer: + name: 'L2' + coeff: 0.0003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG16.yaml b/example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG16.yaml new file mode 100755 index 000000000..38f92e106 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG16.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 90 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: VGG16 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.0004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG19.yaml b/example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG19.yaml new file mode 100755 index 000000000..77f6360a4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/VGG/VGG19.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 150 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: VGG19 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.0004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 
'' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch16_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch16_224.yaml new file mode 100755 index 000000000..5b9c518a7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch16_224.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ViT_base_patch16_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch16_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch16_384.yaml new file mode 100755 index 000000000..a8792a036 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch16_384.yaml @@ -0,0 +1,142 @@ +# global configs +Global: 
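+# (editorial note, not part of the original patch) the Piecewise schedule shared
+# by these ViT configs holds the learning rate constant between the decay_epochs
+# boundaries: epochs [0, 30) -> 0.1, [30, 60) -> 0.01, [60, 90) -> 0.001, and
+# [90, 120] -> 0.0001, i.e. values[i] applies once the i-th boundary is passed.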
+ checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ViT_base_patch16_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch32_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch32_384.yaml new file mode 100755 index 000000000..477d9edd0 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_base_patch32_384.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ViT_base_patch32_384 + class_num: 1000 + +# loss function config for traing/eval 
process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch16_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch16_224.yaml new file mode 100755 index 000000000..7174f151f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch16_224.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ViT_large_patch16_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch16_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch16_384.yaml new file mode 100755 index 000000000..195cd2293 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch16_384.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ViT_large_patch16_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + 
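+# (editorial note, not part of the original patch) ViT sequence length grows as
+# (image_size / patch_size)^2: patch16 @ 224 gives 14*14 = 196 tokens, patch16 @
+# 384 gives 24*24 = 576, and patch32 @ 384 gives 12*12 = 144 (each plus one class
+# token), which is why the 384-input variants cost far more memory and compute
+# than their 224 counterparts at the same depth.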
sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch32_384.yaml b/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch32_384.yaml new file mode 100755 index 000000000..afa78dacf --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_large_patch32_384.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ViT_large_patch32_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + 
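+# (editorial note, not part of the original patch) with scale: 1.0/255.0 and
+# mean/std of 0.5, NormalizeImage maps uint8 pixels to roughly [-1, 1]:
+#   out = (p / 255 - 0.5) / 0.5 = 2 * p / 255 - 1
+# whereas the CNN configs standardize against the ImageNet channel statistics
+# (mean [0.485, 0.456, 0.406], std [0.229, 0.224, 0.225]) instead.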
topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_small_patch16_224.yaml b/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_small_patch16_224.yaml new file mode 100755 index 000000000..7eafe2acd --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/VisionTransformer/ViT_small_patch16_224.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ViT_small_patch16_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception41.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception41.yaml new file mode 100755 index 000000000..c622617f7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception41.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: 
False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Xception41 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception41_deeplab.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception41_deeplab.yaml new file mode 100755 index 000000000..d03b6bc77 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception41_deeplab.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Xception41_deeplab + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for 
train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception65.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception65.yaml new file mode 100755 index 000000000..c134331a0 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception65.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Xception65 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + 
cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception65_deeplab.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception65_deeplab.yaml new file mode 100755 index 000000000..05e88336e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception65_deeplab.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Xception65_deeplab + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 
1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception71.yaml b/example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception71.yaml new file mode 100755 index 000000000..3fabd6595 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ImageNet/Xception/Xception71.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Xception71 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0225 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/Logo/ResNet50_ReID.yaml b/example/low_precision_training/ppcls/configs/Logo/ResNet50_ReID.yaml new file mode 100755 index 000000000..bfbedf8f9 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/Logo/ResNet50_ReID.yaml @@ -0,0 +1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: 
"./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "CircleMargin" + margin: 0.35 + scale: 64 + embedding_size: 512 + class_num: 3000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - PairwiseCosface: + margin: 0.35 + gamma: 64 + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: "L2" + coeff: 0.0001 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "dataset/LogoDet-3K-crop/train/" + cls_label_path: "dataset/LogoDet-3K-crop/train_list.txt" + relabel: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + - RandomErasing: + EPSILON: 0.5 + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: True + + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: ImageNetDataset + image_root: "dataset/LogoDet-3K-crop/val/" + cls_label_path: "dataset/LogoDet-3K-crop/query_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + + Gallery: + dataset: + name: ImageNetDataset + image_root: "dataset/LogoDet-3K-crop/train/" + cls_label_path: "dataset/LogoDet-3K-crop/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/CLIP_vit_base_patch16_448_ml_decoder_448.yaml b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/CLIP_vit_base_patch16_448_ml_decoder_448.yaml new file mode 100755 index 000000000..435842f89 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/CLIP_vit_base_patch16_448_ml_decoder_448.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: ./inference + # training model under 
@to_static + to_static: False + use_multilabel: True + +# model ema +EMA: + decay: 0.9997 + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O2 + +# model architecture +Arch: + name: CLIP_vit_base_patch16_224 + class_num: 80 + return_embed: False + use_fused_attn: False # fused attn can be used in AMP O2 mode only + pretrained: True + use_ml_decoder: True + +# ml-decoder head +MLDecoder: + query_num: 80 # default: 80, query_num <= class_num + in_channels: 768 + remove_layers: [] + replace_layer: 'head' + +# loss function config for training/eval process +Loss: + Train: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + + Eval: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 1e-4 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-5 + eta_min: 1e-10 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/train.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - Cutout: + length: 224 + fill_value: none + - RandAugmentV4: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/val.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 8 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/coco_000000570688.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/COCO2017_label_list.txt + +Metric: + Train: + Eval: + - MultiLabelMAP: + # support list: integral, 11point + # default: integral + map_type: integral diff --git a/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B0_ml_decoder_448.yaml b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B0_ml_decoder_448.yaml new file mode 100755 index 000000000..00b3abb16 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B0_ml_decoder_448.yaml @@ -0,0 +1,168 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 1 + epochs: 40 + 
print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_multilabel: True + +# model ema +EMA: + decay: 0.9997 + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B0 + class_num: 80 + pretrained: True # ssld pretrained + use_ml_decoder: True + +# ml-decoder head +MLDecoder: + query_num: 80 # default: 80, query_num <= class_num + in_channels: 2048 + +# loss function config for training/eval process +Loss: + Train: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + + Eval: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 1e-4 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-4 + eta_min: 1e-10 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/train.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - Cutout: + length: 224 + fill_value: none + - RandAugmentV4: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/val.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/coco_000000570688.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/COCO2017_label_list.txt + +Metric: + Train: + Eval: + - MultiLabelMAP: + # support list: integral, 11point + # default: integral + map_type: integral diff --git a/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B4_ml_decoder_448.yaml b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B4_ml_decoder_448.yaml new file mode 100755 index 000000000..a92c89bc1 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B4_ml_decoder_448.yaml @@ -0,0 +1,168 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + 
eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_multilabel: True + +# model ema +EMA: + decay: 0.9997 + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B4 + class_num: 80 + pretrained: True # ssld pretrained + use_ml_decoder: True + +# ml-decoder head +MLDecoder: + query_num: 80 # default: 80, query_num <= class_num + in_channels: 2048 + +# loss function config for training/eval process +Loss: + Train: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + + Eval: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 1e-4 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-4 + eta_min: 1e-10 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/train.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - Cutout: + length: 224 + fill_value: none + - RandAugmentV4: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/val.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/coco_000000570688.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/COCO2017_label_list.txt + +Metric: + Train: + Eval: + - MultiLabelMAP: + # support list: integral, 11point + # default: integral + map_type: integral diff --git a/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B6_ml_decoder_448.yaml b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B6_ml_decoder_448.yaml new file mode 100755 index 000000000..d21453c31 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B6_ml_decoder_448.yaml @@ -0,0 +1,168 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: 
./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_multilabel: True + +# model ema +EMA: + decay: 0.9997 + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B6 + class_num: 80 + pretrained: True # ssld pretrained + use_ml_decoder: True + +# ml-decoder head +MLDecoder: + query_num: 80 # default: 80, query_num <= class_num + in_channels: 2048 + +# loss function config for training/eval process +Loss: + Train: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + + Eval: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 1e-4 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-4 + eta_min: 1e-10 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/train.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - Cutout: + length: 224 + fill_value: none + - RandAugmentV4: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/val.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 8 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/coco_000000570688.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/COCO2017_label_list.txt + +Metric: + Train: + Eval: + - MultiLabelMAP: + # support list: integral, 11point + # default: integral + map_type: integral diff --git a/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-LCNet_x1_0_ml_decoder_448.yaml b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-LCNet_x1_0_ml_decoder_448.yaml new file mode 100755 index 000000000..15aaee0a2 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/PP-LCNet_x1_0_ml_decoder_448.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null 
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 10
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 40
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 448, 448]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+  use_multilabel: True
+
+# model ema
+EMA:
+  decay: 0.9997
+
+# mixed precision
+AMP:
+  use_amp: True
+  use_fp16_test: False
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+# model architecture
+Arch:
+  name: PPLCNet_x1_0
+  class_num: 80
+  pretrained: True
+  use_ml_decoder: True
+
+# ml-decoder head
+MLDecoder:
+  query_num: 80 # default: 80, query_num <= class_num
+  class_num: 80
+  in_channels: 1280
+
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - MultiLabelAsymmetricLoss:
+        weight: 1.0
+        gamma_pos: 0
+        gamma_neg: 4
+        clip: 0.05
+        disable_focal_loss_grad: True
+
+  Eval:
+    - MultiLabelAsymmetricLoss:
+        weight: 1.0
+        gamma_pos: 0
+        gamma_neg: 4
+        clip: 0.05
+        disable_focal_loss_grad: True
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 1e-4
+  one_dim_param_no_weight_decay: True
+  lr:
+    name: Cosine
+    learning_rate: 1e-4
+    eta_min: 1e-10
+    warmup_epoch: 5
+    warmup_start_lr: 1e-6
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: MultiLabelDataset
+      image_root: dataset/coco_ml/images
+      cls_label_path: dataset/coco_ml/train.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            size: 448
+            interpolation: bilinear
+            backend: pil
+        - Cutout:
+            length: 224
+            fill_value: none
+        - RandAugmentV4:
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: MultiLabelDataset
+      image_root: dataset/coco_ml/images
+      cls_label_path: dataset/coco_ml/val.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            size: 448
+            interpolation: bilinear
+            backend: pil
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 32
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: deploy/images/coco_000000570688.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        size: 448
+        interpolation: bilinear
+        backend: pil
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: MultiLabelThreshOutput
+    threshold: 0.5
+    class_id_map_file: ppcls/utils/COCO2017_label_list.txt
+
+Metric:
+  Train:
+  Eval:
+    - MultiLabelMAP:
+        # support list: integral, 11point
+        # default: integral
+        map_type: integral
diff --git a/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/README.md b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/README.md
new file mode 100755
index 000000000..e4e4263c1
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/README.md
@@ -0,0 +1,272 @@
+# ML-Decoder Multi-Label Classification
+
+## Contents
+
+* [1. Model Introduction](#1)
+* [2. Data and Model Preparation](#2)
+* [3. Model Training](#3)
+* [4. Model Evaluation](#4)
+* [5. Model Prediction](#5)
+* [6. Prediction with the Inference Engine](#6)
+  * [6.1 Export the inference model](#6.1)
+  * [6.2 Inference with the Python inference engine](#6.2)
+* [7. Citation](#7)
+
+
+## 1. Model Introduction
+
+ML-Decoder is a new attention-based classification head. It predicts the presence of class labels via queries and makes better use of spatial information than global average pooling. Its main features are:
+
+1. ML-Decoder redesigns the decoder architecture and uses a novel group-decoding scheme, which makes it efficient and scalable, so it handles classification tasks with thousands of classes well.
+2. ML-Decoder offers a consistent speed-accuracy trade-off and delivers better performance than simply switching to a larger backbone.
+3. ML-Decoder is versatile: it can serve as a drop-in replacement for various classification heads, and it generalizes to unseen classes when word queries are used. A novel query-augmentation method further improves its generalization.
+
+With ML-Decoder, the authors achieved state-of-the-art results on several classification tasks:
+1. 91.4% mAP on MS-COCO multi-label classification;
+2. 31.1% ZSL mAP on NUS-WIDE zero-shot classification;
+3. a new high of 80.7% on ImageNet single-label classification with a vanilla ResNet50 backbone, without extra data or distillation.
+
+`PaddleClas` currently supports ML-Decoder for both single-label and multi-label classification tasks, and enabling it is simple: set `use_ml_decoder: True` under the `Arch` scope of the config file and provide the parameter configuration for ML-Decoder:
+```yaml
+# model architecture
+Arch:
+  name: ResNet101
+  class_num: 80
+  pretrained: True
+  # use ml-decoder head to replace avg_pool and fc
+  use_ml_decoder: True
+
+# ml-decoder head
+MLDecoder:
+  query_num: 80 # default: 80, query_num <= class_num
+  in_channels: 2048
+  # optional args
+  # class_num: 80
+  # remove_layers: ['avg_pool', 'flatten']
+  # replace_layer: 'fc'
+```
+Notes:
+1. When the chosen backbone cannot provide a `class_num` attribute, the `class_num` field must be added manually to the `MLDecoder` section of the config file.
+2. In practice, adjust the `remove_layers` and `replace_layer` arguments to match the actual layer names in the backbone. To find those names, look up the implementation of the chosen backbone under `ppcls/arch/backbone` and check the names of that model's output layers.
+
+The following takes `RepVGG_A0` as an example: after checking `ppcls/arch/backbone/model_zoo/repvgg.py` and adapting `class_num`, `remove_layers` and `replace_layer` accordingly, the MLDecoder configuration becomes:
+```yaml
+# model architecture
+Arch:
+  name: RepVGG_A0
+  class_num: 80
+  pretrained: True
+  # use ml-decoder head to replace avg_pool and fc
+  use_ml_decoder: True
+
+# ml-decoder head
+MLDecoder:
+  query_num: 80 # default: 80, query_num <= class_num
+  in_channels: 1280
+  # optional args
+  class_num: 80
+  remove_layers: ['gap']
+  replace_layer: 'linear'
+```
+
+Developers are encouraged to try combining other backbones with ML-Decoder; the sketch below illustrates the mechanism behind the head.
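+
+To make the mechanism concrete, here is a minimal, illustrative sketch of a query-based decoder head written with Paddle. It is not the actual PaddleClas `MLDecoder` implementation — the class name `SimpleMLDecoderHead` and the `embed_dim`/`num_heads` defaults are assumptions of this sketch — but it shows the core idea: learned queries cross-attend to the backbone's spatial tokens, and a grouped projection turns each attended query into a block of class logits, so no global average pooling or fully-connected layer is needed.
+
+```python
+import paddle
+import paddle.nn as nn
+
+class SimpleMLDecoderHead(nn.Layer):
+    """Illustrative query-based decoder head (not the PaddleClas implementation)."""
+
+    def __init__(self, class_num=80, query_num=80, in_channels=2048,
+                 embed_dim=768, num_heads=8):
+        super().__init__()
+        assert class_num % query_num == 0, "each query predicts class_num // query_num classes"
+        self.group_size = class_num // query_num
+        # project backbone channels to the decoder embedding size
+        self.input_proj = nn.Linear(in_channels, embed_dim)
+        # learned, input-independent queries, one group of classes per query
+        self.query_embed = nn.Embedding(query_num, embed_dim)
+        self.cross_attn = nn.MultiHeadAttention(embed_dim, num_heads)
+        # grouped fully-connected: every attended query emits group_size logits
+        self.group_fc = nn.Linear(embed_dim, self.group_size)
+
+    def forward(self, feat_map):
+        # feat_map: [N, C, H, W] backbone feature map, e.g. the last ResNet stage
+        n = feat_map.shape[0]
+        tokens = self.input_proj(feat_map.flatten(2).transpose([0, 2, 1]))  # [N, H*W, D]
+        queries = self.query_embed.weight.unsqueeze(0).expand([n, -1, -1])  # [N, Q, D]
+        attended = self.cross_attn(queries, tokens, tokens)                 # [N, Q, D]
+        logits = self.group_fc(attended)                                    # [N, Q, group_size]
+        return logits.flatten(1)                                            # [N, class_num]
+
+# quick shape check: 2 images, 2048x14x14 features -> 80 class logits
+head = SimpleMLDecoderHead(class_num=80, query_num=80, in_channels=2048)
+print(head(paddle.randn([2, 2048, 14, 14])).shape)  # [2, 80]
+```
+
+When `query_num` equals `class_num`, the group size is 1 and each query owns a single class; the group-decoding trick only pays off when `query_num < class_num`, which is what keeps the head scalable to thousands of classes.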
+
+The performance of several backbones combined with ML-Decoder on the COCO2017 multi-label classification task is currently as follows:
+
+| Model | Backbone | Resolution | mAP | Links |
+|:--------------------:|:---------:|:----------:|:---:|:-----------------------------------------:|
+| PP-LCNet_x1_0_ml_decoder_448 | PP-LCNet_x1_0 | 448x448 | 77.96% | [config](./PP-LCNet_x1_0_ml_decoder_448.yaml) |
+| PP-HGNetV2-B0_ml_decoder_448 | PP-HGNetV2-B0 | 448x448 | 80.98% | [config](./PP-HGNetV2-B0_ml_decoder_448.yaml) |
+| PP-HGNetV2-B4_ml_decoder_448 | PP-HGNetV2-B4 | 448x448 | 87.96% | [config](./PP-HGNetV2-B4_ml_decoder_448.yaml) |
+| PP-HGNetV2-B6_ml_decoder_448 | PP-HGNetV2-B6 | 448x448 | 91.25% | [config](./PP-HGNetV2-B6_ml_decoder_448.yaml) |
+| ResNet50_ml_decoder_448 | ResNet50 | 448x448 | 83.50% | [config](./ResNet50_ml_decoder_448.yaml) |
+| ResNet101_ml_decoder_448 | ResNet101 | 448x448 | 91.40% | [config](./ResNet101_ml_decoder_448.yaml) |
+| CLIP_vit_base_patch16_448_ml_decoder_448 | CLIP_vit_base_patch16_448 | 448x448 | 89.15% | [config](./CLIP_vit_base_patch16_448_ml_decoder_448.yaml) |
+
+Based on the [COCO2017](https://cocodataset.org/) dataset, the following sections walk through training, evaluation and prediction for multi-label classification with ML-Decoder. Please install PaddlePaddle and PaddleClas first; see [Installation](../installation.md) for the detailed steps.
+
+
+
+## 2. Data and Model Preparation
+
+* Enter the `PaddleClas` directory.
+
+```
+cd path_to_PaddleClas
+```
+
+* Create and enter the `dataset/COCO2017` directory, then download and extract the COCO2017 dataset.
+
+```shell
+mkdir dataset/COCO2017 && cd dataset/COCO2017
+wget http://images.cocodataset.org/zips/train2017.zip -O t.zip && unzip t.zip -d . && rm t.zip
+wget http://images.cocodataset.org/zips/val2017.zip -O t.zip && unzip t.zip -d . && rm t.zip
+wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip -O t.zip && unzip t.zip -d . && rm t.zip
+```
+
+* Return to the `PaddleClas` root directory
+
+
+```shell
+cd ../../
+# convert the training set and generate `COCO2017_labels.txt`
+python3 ./ppcls/utils/create_coco_multilabel_lists.py \
+        --dataset_dir dataset/COCO2017 \
+        --image_dir train2017 \
+        --anno_path annotations/instances_train2017.json \
+        --save_name multilabel_train_list --save_label_name
+# convert the validation set
+python3 ./ppcls/utils/create_coco_multilabel_lists.py \
+        --dataset_dir dataset/COCO2017 \
+        --image_dir val2017 \
+        --anno_path annotations/instances_val2017.json \
+        --save_name multilabel_val_list
+```
+
+## 3. Model Training
+
+```shell
+# multi-GPU
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python3 -m paddle.distributed.launch \
+    --gpus="0,1,2,3" \
+    tools/train.py \
+        -c ./ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml
+# single GPU
+python3 tools/train.py \
+        -c ./ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml
+```
+
+**Note:**
+1. Multi-label classification currently uses `MultiLabelAsymmetricLoss` as the default loss function; a sketch of this loss is given after this list.
+2. Multi-label classification currently uses `MultiLabelMAP(integral)` as the default evaluation metric.
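+
+For intuition, the sketch below is a simplified, illustrative implementation of the asymmetric loss (ASL) idea behind `MultiLabelAsymmetricLoss`; it is not the PaddleClas class itself. The function name `asymmetric_loss` is an assumption of this sketch, and the defaults mirror the `gamma_pos: 0`, `gamma_neg: 4`, `clip: 0.05` values used in the configs above. Positive and negative labels are focused asymmetrically, and easy negatives are additionally hard-clipped out of the loss via probability shifting.
+
+```python
+import paddle
+import paddle.nn.functional as F
+
+def asymmetric_loss(logits, targets, gamma_pos=0.0, gamma_neg=4.0, clip=0.05, eps=1e-8):
+    """Illustrative multi-label asymmetric loss (simplified ASL sketch).
+
+    logits:  [N, num_classes] raw scores
+    targets: [N, num_classes] binary labels (0. / 1.)
+    """
+    prob_pos = F.sigmoid(logits)
+    prob_neg = 1.0 - prob_pos
+    # probability shifting: discard very easy negatives from the loss
+    if clip > 0:
+        prob_neg = paddle.clip(prob_neg + clip, max=1.0)
+    # asymmetric focusing: negatives are down-weighted more aggressively
+    # (the real implementation can also stop gradients through these focusing
+    # weights when disable_focal_loss_grad is True)
+    loss_pos = targets * paddle.log(prob_pos.clip(min=eps)) * (1.0 - prob_pos) ** gamma_pos
+    loss_neg = (1.0 - targets) * paddle.log(prob_neg.clip(min=eps)) * (1.0 - prob_neg) ** gamma_neg
+    return -(loss_pos + loss_neg).sum(axis=1).mean()
+
+# toy usage: 4 samples, 80 classes
+logits = paddle.randn([4, 80])
+targets = (paddle.rand([4, 80]) > 0.9).astype("float32")
+print(asymmetric_loss(logits, targets))
+```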
+
+
+## 4. Model Evaluation
+
+```bash
+python3 tools/eval.py \
+    -c ./ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml \
+    -o Global.pretrained_model="./output/ResNet101_ml_decoder_448/best_model"
+```
+
+## 5. Model Prediction
+
+```bash
+python3 tools/infer.py \
+    -c ./ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml \
+    -o Global.pretrained_model="./output/ResNet101_ml_decoder_448/best_model"
+```
+
+You will get output similar to the following:
+```
+[{'class_ids': [0, 2, 7, 24, 25, 26, 33, 56], 'scores': [0.99998, 0.52104, 0.51953, 0.59292, 0.64329, 0.63605, 0.99994, 0.7054], 'label_names': ['person', 'car', 'truck', 'backpack', 'umbrella', 'handbag', 'kite', 'chair'], 'file_name': 'deploy/images/coco_000000570688.jpg'}]
+```
+
+## 6. Prediction with the Inference Engine
+
+
+### 6.1 Export the inference model
+
+```bash
+python3 tools/export_model.py \
+    -c ./ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml \
+    -o Global.pretrained_model="./output/ResNet101_ml_decoder_448/best_model"
+```
+By default, the inference model is saved under `./inference` in the current directory.
+The `./inference` folder should have the following structure:
+
+```
+├── inference
+│   ├── inference.pdiparams
+│   ├── inference.pdiparams.info
+│   └── inference.pdmodel
+```
+
+
+### 6.2 Inference with the Python inference engine
+
+Switch to the `deploy` directory. Before running the scripts in `deploy`, make sure `paddleclas` is installed in non-local mode; if it was installed locally (editable mode), switch to a non-local install, otherwise package import errors will occur.
+
+```shell
+# local (editable) install
+pip install -e .
+# non-local install
+python setup.py install
+
+# enter the deploy directory
+cd deploy
+```
+
+
+
+#### 6.2.1 Predict a single image
+
+Run the following command to classify the image `./images/coco_000000570688.jpg`.
+
+```shell
+# use `python3` on Linux and `python (-m)` on Windows to run the scripts
+# run prediction on GPU with the following command
+python3 python/predict_cls.py \
+     -c configs/inference_cls_multilabel.yaml \
+     -o Global.inference_model_dir=../inference/ \
+     -o Global.infer_imgs=images/coco_000000570688.jpg \
+     -o PostProcess.MultiLabelThreshOutput.class_id_map_file=../ppcls/utils/COCO2017_label_list.txt
+# run prediction on CPU with the following command
+python3 python/predict_cls.py \
+     -c configs/inference_cls_multilabel.yaml \
+     -o Global.inference_model_dir=../inference/ \
+     -o Global.infer_imgs=images/coco_000000570688.jpg \
+     -o PostProcess.MultiLabelThreshOutput.class_id_map_file=../ppcls/utils/COCO2017_label_list.txt \
+     -o Global.use_gpu=False
+```
+
+The output is as follows:
+
+```
+coco_000000570688.jpg: class id(s): [0, 2, 3, 4, 7, 9, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 38, 39, 45, 46, 47, 48, 49, 51, 52, 53, 54, 57, 58, 60, 61, 62, 63, 64, 65, 67, 69, 70, 71, 72, 73, 75], score(s): [0.84, 0.68, 0.93, 0.54, 0.74, 0.90, 0.56, 0.60, 0.63, 0.77, 0.64, 0.70, 0.94, 0.82, 0.99, 0.71, 0.86, 0.81, 0.81, 0.65, 0.65, 0.92, 0.67, 0.53, 0.83, 0.63, 0.58, 0.52, 0.83, 0.55, 0.92, 0.72, 0.74, 0.59, 0.82, 0.50, 0.62, 0.77, 0.87, 0.64, 0.84, 0.67], label_name(s): ['person', 'car', 'motorcycle', 'airplane', 'truck', 'traffic light', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'tie', 'suitcase', 'frisbee', 'skis', 'kite', 'tennis racket', 'bottle', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'carrot', 'hot dog', 'pizza', 'donut', 'couch', 'potted plant', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'cell phone', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'vase']
+```
+
+
+
+#### 6.2.2 Batch prediction on a folder
+
+To predict the images inside a folder, you can modify the `Global.infer_imgs` field in the config file directly, or override the corresponding setting with the `-o` option as shown below.
+
+```shell
+# use `python3` on Linux and `python (-m)` on Windows to run the scripts
+# the following command predicts on GPU; to predict on CPU, append -o Global.use_gpu=False
+python3 python/predict_cls.py \
+     -c configs/inference_cls_multilabel.yaml \
+     -o Global.inference_model_dir=../inference/ \
+     -o PostProcess.MultiLabelThreshOutput.class_id_map_file=../ppcls/utils/COCO2017_label_list.txt \
+     -o Global.infer_imgs=images/ImageNet/
+```
+
+The classification results for all images in the folder will be printed in the terminal, as shown below.
+
+```
+ILSVRC2012_val_00000010.jpeg: class id(s): [0, 2, 3, 7, 9, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 38, 39, 40, 41, 45, 46, 47, 48, 49, 52, 53, 54, 58, 60, 61, 62, 63, 64, 65, 69, 70, 71, 72, 73, 75], score(s): [0.80, 0.58, 0.89, 0.74, 0.86, 0.66, 0.56, 0.60, 0.81, 0.64, 0.73, 0.94, 0.75, 0.99, 0.70, 0.86, 0.78, 0.63, 0.57, 0.76, 0.66, 0.60, 0.94, 0.65, 0.90, 0.63, 0.52, 0.79, 0.50, 0.93, 0.72, 0.70, 0.60, 0.83, 0.61, 0.75, 0.86, 0.67, 0.87, 0.64], label_name(s): ['person', 'car', 'motorcycle', 'truck', 'traffic light', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'tie', 'suitcase', 'frisbee', 'skis', 'kite', 'tennis racket', 'bottle', 'wine glass', 'cup', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'hot dog', 'pizza', 'donut', 'potted plant', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'vase']
+ILSVRC2012_val_00010010.jpeg: class id(s): [0, 2, 3, 6, 7, 8, 9, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 38, 39, 40, 45, 46, 47, 48, 49, 51, 52, 53, 54, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 75], score(s): [0.78, 0.66, 0.93, 0.54, 0.77, 0.50, 0.86, 0.69, 0.63, 0.50, 0.78, 0.56, 0.71, 0.93, 0.78, 0.99, 0.64, 0.85,
0.80, 0.53, 0.85, 0.71, 0.66, 0.96, 0.70, 0.62, 0.85, 0.58, 0.57, 0.57, 0.78, 0.50, 0.92, 0.64, 0.73, 0.71, 0.77, 0.53, 0.66, 0.52, 0.73, 0.87, 0.69, 0.85, 0.66], label_name(s): ['person', 'car', 'motorcycle', 'train', 'truck', 'boat', 'traffic light', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'tie', 'suitcase', 'frisbee', 'skis', 'kite', 'tennis racket', 'bottle', 'wine glass', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'carrot', 'hot dog', 'pizza', 'donut', 'couch', 'potted plant', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'vase'] +ILSVRC2012_val_00020010.jpeg: class id(s): [0, 2, 3, 7, 9, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 38, 39, 40, 41, 45, 46, 47, 48, 49, 51, 52, 53, 54, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 75], score(s): [0.85, 0.62, 0.94, 0.69, 0.89, 0.56, 0.56, 0.62, 0.77, 0.67, 0.71, 0.93, 0.78, 0.99, 0.65, 0.86, 0.75, 0.57, 0.60, 0.76, 0.66, 0.58, 0.95, 0.73, 0.50, 0.88, 0.63, 0.59, 0.62, 0.84, 0.59, 0.82, 0.74, 0.74, 0.62, 0.86, 0.53, 0.53, 0.60, 0.73, 0.88, 0.65, 0.85, 0.70], label_name(s): ['person', 'car', 'motorcycle', 'truck', 'traffic light', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'tie', 'suitcase', 'frisbee', 'skis', 'kite', 'tennis racket', 'bottle', 'wine glass', 'cup', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'carrot', 'hot dog', 'pizza', 'donut', 'couch', 'potted plant', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'vase'] +ILSVRC2012_val_00030010.jpeg: class id(s): [0, 2, 3, 4, 7, 9, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 38, 39, 40, 41, 45, 46, 47, 48, 49, 51, 52, 53, 58, 60, 61, 62, 63, 64, 65, 69, 70, 71, 72, 73, 75], score(s): [0.82, 0.60, 0.92, 0.54, 0.67, 0.87, 0.57, 0.63, 0.57, 0.84, 0.70, 0.80, 0.92, 0.82, 0.99, 0.72, 0.86, 0.80, 0.59, 0.55, 0.84, 0.73, 0.60, 0.94, 0.75, 0.53, 0.89, 0.51, 0.84, 0.56, 0.90, 0.87, 0.67, 0.70, 0.85, 0.59, 0.82, 0.91, 0.62, 0.89, 0.67], label_name(s): ['person', 'car', 'motorcycle', 'airplane', 'truck', 'traffic light', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'tie', 'suitcase', 'frisbee', 'skis', 'kite', 'tennis racket', 'bottle', 'wine glass', 'cup', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'carrot', 'hot dog', 'pizza', 'potted plant', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'vase'] +``` + + +## 7. 
Citation
+```
+@misc{ridnik2021mldecoder,
+      title={ML-Decoder: Scalable and Versatile Classification Head},
+      author={Tal Ridnik and Gilad Sharir and Avi Ben-Cohen and Emanuel Ben-Baruch and Asaf Noy},
+      year={2021},
+      eprint={2111.12933},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
\ No newline at end of file
diff --git a/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml
new file mode 100755
index 000000000..8bafbb23d
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml
@@ -0,0 +1,168 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 40
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 448, 448]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+  use_multilabel: True
+
+# model ema
+EMA:
+  decay: 0.9997
+
+# mixed precision
+AMP:
+  use_amp: True
+  use_fp16_test: False
+  scale_loss: 128.0
+  use_dynamic_loss_scaling: True
+  use_promote: False
+  # O1: mixed fp16, O2: pure fp16
+  level: O1
+
+# model architecture
+Arch:
+  name: ResNet101
+  class_num: 80
+  pretrained: True
+  # use ml-decoder head to replace avg_pool and fc
+  use_ml_decoder: True
+
+# ml-decoder head
+MLDecoder:
+  query_num: 80 # default: 80, query_num <= class_num
+  in_channels: 2048
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - MultiLabelAsymmetricLoss:
+        weight: 1.0
+        gamma_pos: 0
+        gamma_neg: 4
+        clip: 0.05
+        disable_focal_loss_grad: True
+
+  Eval:
+    - MultiLabelAsymmetricLoss:
+        weight: 1.0
+        gamma_pos: 0
+        gamma_neg: 4
+        clip: 0.05
+        disable_focal_loss_grad: True
+
+Optimizer:
+  name: AdamW
+  one_dim_param_no_weight_decay: True
+  weight_decay: 0.0001 # 1e-4
+  lr:
+    name: OneCycleLR
+    max_learning_rate: 0.0001 # 1e-4
+    divide_factor: 25.0
+    end_learning_rate: 0.0000000001 # 1e-10
+    phase_pct: 0.2
+    anneal_strategy: cos
+    three_phase: False
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: MultiLabelDataset
+      image_root: ./dataset/COCO2017/train2017
+      cls_label_path: ./dataset/COCO2017/multilabel_train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            size: 448
+            interpolation: bilinear
+            backend: pil
+        - Cutout:
+            length: 224
+            fill_value: none
+        - RandAugmentV4:
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 56
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: MultiLabelDataset
+      image_root: ./dataset/COCO2017/val2017
+      cls_label_path: ./dataset/COCO2017/multilabel_val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            size: 448
+            interpolation: bilinear
+            backend: pil
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: deploy/images/coco_000000570688.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/COCO2017_label_list.txt + +Metric: + Train: + Eval: + - MultiLabelMAP: + # support list: integral, 11point + # default: integral + map_type: integral diff --git a/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet50_ml_decoder_448.yaml b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet50_ml_decoder_448.yaml new file mode 100755 index 000000000..067c5be1c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet50_ml_decoder_448.yaml @@ -0,0 +1,168 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_multilabel: True + +# model ema +EMA: + decay: 0.9997 + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: ResNet50 + class_num: 80 + pretrained: True + use_ml_decoder: True + +# ml-decoder head +MLDecoder: + query_num: 80 # default: 80, query_num <= class_num + in_channels: 2048 + +# loss function config for training/eval process +Loss: + Train: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + + Eval: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 1e-4 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-4 + eta_min: 1e-10 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/train.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - Cutout: + length: 224 + fill_value: none + - RandAugmentV4: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/val.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/coco_000000570688.jpg + batch_size: 10 + transforms: + - 
DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/COCO2017_label_list.txt + +Metric: + Train: + Eval: + - MultiLabelMAP: + # support list: integral, 11point + # default: integral + map_type: integral diff --git a/example/low_precision_training/ppcls/configs/PULC/car_exists/MobileNetV3_small_x0_35.yaml b/example/low_precision_training/ppcls/configs/PULC/car_exists/MobileNetV3_small_x0_35.yaml new file mode 100755 index 000000000..911b8edec --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/car_exists/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,139 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + class_num: 2 + pretrained: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.05 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/car_exists/objects365_00001507.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: no_car + label_1: contains_car + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + max_fpr: 0.01 + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/car_exists/PPLCNet_x1_0.yaml 
b/example/low_precision_training/ppcls/configs/PULC/car_exists/PPLCNet_x1_0.yaml new file mode 100755 index 000000000..247f655b5 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/car_exists/PPLCNet_x1_0.yaml @@ -0,0 +1,152 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + use_ssld: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0125 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 192 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.5 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists + cls_label_path: ./dataset/car_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/car_exists/objects365_00001507.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.9 + label_0: no_car + label_1: contains_car + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + max_fpr: 0.01 + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/car_exists/PPLCNet_x1_0_distillation.yaml b/example/low_precision_training/ppcls/configs/PULC/car_exists/PPLCNet_x1_0_distillation.yaml new file mode 100755 index 000000000..4c11802d6 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/car_exists/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 1 + 
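+  # NOTE: with eval_during_train enabled, evaluation is expected to start at
+  # `start_eval_epoch` and then run every `eval_interval` epochs; a typical
+  # launch (assuming the standard ppcls entrypoint is shipped with this
+  # example) would be:
+  #   python3 tools/train.py \
+  #       -c ppcls/configs/PULC/car_exists/PPLCNet_x1_0_distillation.yaml \
+  #       -o Global.output_dir=./output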
eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 2 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + use_sync_bn: True + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + + infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/train_list_for_distill.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 192 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.1 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/car_exists/objects365_00001507.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: no_car + label_1: contains_car + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 2] + Eval: + - TprAtFpr: + max_fpr: 0.01 + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/car_exists/PPLCNet_x1_0_search.yaml b/example/low_precision_training/ppcls/configs/PULC/car_exists/PPLCNet_x1_0_search.yaml new file mode 100755 index 000000000..c263f2309 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/car_exists/PPLCNet_x1_0_search.yaml @@ -0,0 +1,152 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + 
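+  # evaluation is skipped for the first epochs (start_eval_epoch below),
+  # presumably to keep the short 20-epoch search trials cheap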
start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + use_ssld: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/car_exists/objects365_00001507.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: no_car + label_1: contains_car + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + max_fpr: 0.01 + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/car_exists/SwinTransformer_tiny_patch4_window7_224.yaml b/example/low_precision_training/ppcls/configs/PULC/car_exists/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100755 index 000000000..a75fda4b4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/car_exists/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model 
architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 2 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-4 + eta_min: 2e-6 + warmup_epoch: 5 + warmup_start_lr: 2e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/car_exists/objects365_00001507.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: no_car + label_1: contains_car + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + max_fpr: 0.01 + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/car_exists/search.yaml b/example/low_precision_training/ppcls/configs/PULC/car_exists/search.yaml new file mode 100755 index 000000000..820337c02 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/car_exists/search.yaml @@ -0,0 +1,40 @@ +base_config_file: ppcls/configs/PULC/person_exists/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/person_exists/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_person_cls +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.0075, 0.01, 0.0125] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.RandCropImage.size + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.img_size + search_values: [176, 192, 224] + - search_key: ra_probs + replace_config: + - 
DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.prob + search_values: [0.0, 0.1, 0.5] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.5.RandomErasing.EPSILON + search_values: [0.0, 0.1, 0.5] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git a/example/low_precision_training/ppcls/configs/PULC/clarity_assessment/PPLCNet_x1_0.yaml b/example/low_precision_training/ppcls/configs/PULC/clarity_assessment/PPLCNet_x1_0.yaml new file mode 100755 index 000000000..0f766380f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/clarity_assessment/PPLCNet_x1_0.yaml @@ -0,0 +1,133 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: PPLCNet_x1_0 + pretrained: True + use_ssld: True + class_num: 2 + use_last_conv: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.14 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: CustomLabelDataset + image_root: ./dataset/ + sample_list_path: ./dataset/ImageNet_OCR_det.txt + label_key: blur_image + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - BlurImage: + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/blur/ + cls_label_path: ./dataset/blur/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: ./test_img/ + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 1 + +Metric: + Train: + - TopkAcc: + topk: [1] + Eval: + - TopkAcc: + topk: [1] diff --git a/example/low_precision_training/ppcls/configs/PULC/code_exists/MobileNetV3_small_x0_35.yaml b/example/low_precision_training/ppcls/configs/PULC/code_exists/MobileNetV3_small_x0_35.yaml new file mode 100755 index 000000000..8566b0c5c --- /dev/null +++ 
b/example/low_precision_training/ppcls/configs/PULC/code_exists/MobileNetV3_small_x0_35.yaml
@@ -0,0 +1,137 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  start_eval_epoch: 1
+  epochs: 20
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+  use_dali: False
+
+# model architecture
+Arch:
+  name: MobileNetV3_small_x0_35
+  class_num: 2
+  pretrained: True
+  use_sync_bn: True
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.05
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/code_exists/
+      cls_label_path: ./dataset/code_exists/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 512
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/code_exists/
+      cls_label_path: ./dataset/code_exists/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: test_images/
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: ThreshOutput
+    threshold: 0.5
+    label_0: no_code
+    label_1: contains_code
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 2]
+  Eval:
+    - TopkAcc:
+        topk: [1, 2]
diff --git a/example/low_precision_training/ppcls/configs/PULC/code_exists/PPLCNet_x1_0.yaml b/example/low_precision_training/ppcls/configs/PULC/code_exists/PPLCNet_x1_0.yaml
new file mode 100755
index 000000000..02a960116
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/PULC/code_exists/PPLCNet_x1_0.yaml
@@ -0,0 +1,145 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 5
+  eval_during_train: True
+  eval_interval: 1
+  start_eval_epoch: 1
+  epochs: 20
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+  use_dali: False
+
+
+# model architecture
+Arch:
+  name: PPLCNet_x1_0
+  class_num: 2
+  pretrained: True
+  use_ssld: True
+  use_sync_bn: True
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.01
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00004
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/code_exists/
+      cls_label_path: ./dataset/code_exists/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 192
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.1
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/code_exists/
+      cls_label_path: ./dataset/code_exists/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: test_images/
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: ThreshOutput
+    threshold: 0.5
+    label_0: no_code
+    label_1: contains_code
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 2]
+  Eval:
+    - TopkAcc:
+        topk: [1, 2]
diff --git a/example/low_precision_training/ppcls/configs/PULC/code_exists/PPLCNet_x1_0_distillation.yaml b/example/low_precision_training/ppcls/configs/PULC/code_exists/PPLCNet_x1_0_distillation.yaml
new file mode 100755
index 000000000..2ba2dcd1b
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/PULC/code_exists/PPLCNet_x1_0_distillation.yaml
@@ -0,0 +1,167 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  start_eval_epoch: 1
+  eval_interval: 1
+  epochs: 20
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+  use_dali: False
+
+# model architecture
+Arch:
+  name: "DistillationModel"
+  class_num: &class_num 2
+  # if not null, its lengths should be same as models
+  pretrained_list:
+  # if not null, its lengths should be same as models
+  freeze_params_list:
+    - True
+    - False
+  use_sync_bn: True
+  models:
+    - Teacher:
+        name: ResNet101_vd
+        class_num: *class_num
+    - Student:
+        name: PPLCNet_x1_0
+        class_num: *class_num
+        pretrained: True
+        use_ssld: True
+
+  infer_model_name: "Student"
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - DistillationDMLLoss:
+        weight: 1.0
+        model_name_pairs:
+          - ["Student", "Teacher"]
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.01
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00004
+
+
+# data loader for
train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 192 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.1 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/code_exists/objects365_02035329.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: no_code + label_1: contains_code + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/code_exists/PPLCNet_x1_0_search.yaml b/example/low_precision_training/ppcls/configs/PULC/code_exists/PPLCNet_x1_0_search.yaml new file mode 100755 index 000000000..b6b627e10 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/code_exists/PPLCNet_x1_0_search.yaml @@ -0,0 +1,150 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 5 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + use_ssld: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 
1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_exists/objects365_02035329.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: no_code + label_1: contains_code + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/code_exists/SwinTransformer_tiny_patch4_window7_224.yaml b/example/low_precision_training/ppcls/configs/PULC/code_exists/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100755 index 000000000..7600a4361 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/code_exists/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,167 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 2 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-5 + eta_min: 1e-6 + warmup_epoch: 5 + warmup_start_lr: 1e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + 
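+        # EPSILON is assumed to be the per-sample probability of applying
+        # the erasing op (0.25 here)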
EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: test_images + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: no_code + label_1: contains_code + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/code_exists/search.yaml b/example/low_precision_training/ppcls/configs/PULC/code_exists/search.yaml new file mode 100755 index 000000000..585450e5b --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/code_exists/search.yaml @@ -0,0 +1,40 @@ +base_config_file: ppcls/configs/PULC/code_exists/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/code_exists/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_code_exists +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.005, 0.0075, 0.01, 0.015, 0.02] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.RandCropImage.size + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.img_size + search_values: [176, 192, 224] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.prob + search_values: [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.5.RandomErasing.EPSILON + search_values: [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git a/example/low_precision_training/ppcls/configs/PULC/image_orientation/PPLCNet_x1_0.yaml b/example/low_precision_training/ppcls/configs/PULC/image_orientation/PPLCNet_x1_0.yaml new file mode 100755 index 000000000..c55ad1a92 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/image_orientation/PPLCNet_x1_0.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 10 + epochs: 60 + print_batch_step: 10 + 
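+  # NOTE: save_interval and eval_interval count epochs, while
+  # print_batch_step counts training iterations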
use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: PPLCNet_x1_0 + pretrained: True + use_ssld: True + class_num: 4 + use_last_conv: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.14 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: CustomLabelDataset + image_root: ./dataset/OrientationDataset/ + sample_list_path: ./dataset/OrientationDataset/train_list.txt + label_key: random_rot90_orientation + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + - RandomRot90: + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/OrientationDataset/ + cls_label_path: ./dataset/OrientationDataset/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: ./test_img/ + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 1 + +Metric: + Train: + - TopkAcc: + topk: [1] + Eval: + - TopkAcc: + topk: [1] diff --git a/example/low_precision_training/ppcls/configs/PULC/language_classification/MobileNetV3_small_x0_35.yaml b/example/low_precision_training/ppcls/configs/PULC/language_classification/MobileNetV3_small_x0_35.yaml new file mode 100755 index 000000000..c3973ff42 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/language_classification/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + start_eval_epoch: 20 + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + class_num: 10 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader 
for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/language_classification_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/language_classification/PPLCNet_x1_0.yaml b/example/low_precision_training/ppcls/configs/PULC/language_classification/PPLCNet_x1_0.yaml new file mode 100755 index 000000000..081d8d23f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/language_classification/PPLCNet_x1_0.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 80, 160] + save_inference_dir: ./inference + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 10 + pretrained: True + use_ssld: True + stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]] + lr_mult_list : [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 1.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 1.0 + sl: 0.02 + sh: 1.0/3.0 + 
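+        # sl/sh bound the fraction of the image area to erase and r1 the
+        # minimum aspect ratio of the patch (per the Random Erasing paper);
+        # EPSILON: 1.0 applies the op to every sample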
r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/language_classification/word_35404.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/language_classification_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/language_classification/PPLCNet_x1_0_distillation.yaml b/example/low_precision_training/ppcls/configs/PULC/language_classification/PPLCNet_x1_0_distillation.yaml new file mode 100755 index 000000000..d792c573d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/language_classification/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,164 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 10 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + use_sync_bn: True + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]] + lr_mult_list : [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + + + infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/train_list_for_distill.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 1.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 
0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 1.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/language_classification/word_35404.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/language_classification_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/language_classification/PPLCNet_x1_0_search.yaml b/example/low_precision_training/ppcls/configs/PULC/language_classification/PPLCNet_x1_0_search.yaml new file mode 100755 index 000000000..49a5f1702 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/language_classification/PPLCNet_x1_0_search.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 48, 192] + save_inference_dir: ./inference + start_eval_epoch: 20 + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 10 + pretrained: True + use_ssld: True + stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]] + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.4 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 48] + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [192, 48] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: 
./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 48] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 32 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/language_classification/word_35404.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 48] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/language_classification_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/language_classification/SwinTransformer_tiny_patch4_window7_224.yaml b/example/low_precision_training/ppcls/configs/PULC/language_classification/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100755 index 000000000..4e1a45a9e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/language_classification/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,160 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 10 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: 
./dataset/language_classification/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/language_classification/word_35404.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/language_classification_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/language_classification/search.yaml b/example/low_precision_training/ppcls/configs/PULC/language_classification/search.yaml new file mode 100755 index 000000000..a4b3dde56 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/language_classification/search.yaml @@ -0,0 +1,40 @@ +base_config_file: ppcls/configs/PULC/language_classification/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/language_classification/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_language_classification +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.2, 0.4, 0.8] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.ResizeImage.size + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.img_size + - DataLoader.Eval.dataset.transform_ops.1.ResizeImage.size + search_values: [[192, 48], [180, 60], [160, 80]] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.prob + search_values: [0.0, 0.5, 1.0] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.5.RandomErasing.EPSILON + search_values: [0.0, 0.5, 1.0] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/PULC/person_attribute/MobileNetV3_small_x0_35.yaml b/example/low_precision_training/ppcls/configs/PULC/person_attribute/MobileNetV3_small_x0_35.yaml new file mode 100755 index 000000000..94b443832 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/person_attribute/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,135 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 192] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: 
"MobileNetV3_small_x0_35" + pretrained: True + class_num: 26 + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + #clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - Padv2: + size: [212, 276] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [192, 256] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_attribute/090004.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: PersonAttribute + threshold: 0.5 #default threshold + glasses_threshold: 0.3 #threshold only for glasses + hold_threshold: 0.6 #threshold only for hold + +Metric: + Eval: + - ATTRMetric: + + diff --git a/example/low_precision_training/ppcls/configs/PULC/person_attribute/PPLCNet_x1_0.yaml b/example/low_precision_training/ppcls/configs/PULC/person_attribute/PPLCNet_x1_0.yaml new file mode 100755 index 000000000..b042ad757 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/person_attribute/PPLCNet_x1_0.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 192] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "PPLCNet_x1_0" + pretrained: True + use_ssld: True + class_num: 26 + + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: 
"dataset/pa100k/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - TimmAutoAugment: + prob: 0.8 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [192, 256] + - Padv2: + size: [212, 276] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [192, 256] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.4 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_attribute/090004.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: PersonAttribute + threshold: 0.5 #default threshold + glasses_threshold: 0.3 #threshold only for glasses + hold_threshold: 0.6 #threshold only for hold + +Metric: + Eval: + - ATTRMetric: + + diff --git a/example/low_precision_training/ppcls/configs/PULC/person_attribute/PPLCNet_x1_0_Distillation.yaml b/example/low_precision_training/ppcls/configs/PULC/person_attribute/PPLCNet_x1_0_Distillation.yaml new file mode 100755 index 000000000..bd6503488 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/person_attribute/PPLCNet_x1_0_Distillation.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 1 + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 192] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + use_multilabel: True + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 26 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + use_sync_bn: True + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + + infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + - DistillationMultiLabelLoss: + weight: 1.0 + weight_ratio: True + model_names: ["Student"] + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: 
Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - TimmAutoAugment: + prob: 0.8 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [192, 256] + - Padv2: + size: [212, 276] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [192, 256] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.4 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_attribute/090004.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: PersonAttribute + threshold: 0.5 #default threshold + glasses_threshold: 0.3 #threshold only for glasses + hold_threshold: 0.6 #threshold only for hold + +Metric: + Eval: + - ATTRMetric: diff --git a/example/low_precision_training/ppcls/configs/PULC/person_attribute/PPLCNet_x1_0_search.yaml b/example/low_precision_training/ppcls/configs/PULC/person_attribute/PPLCNet_x1_0_search.yaml new file mode 100755 index 000000000..8f6b0d7fe --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/person_attribute/PPLCNet_x1_0_search.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 192] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "PPLCNet_x1_0" + pretrained: True + use_ssld: True + class_num: 26 + + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: 
False + - ResizeImage: + size: [192, 256] + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [192, 256] + - Padv2: + size: [212, 276] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [192, 256] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k" + cls_label_path: "dataset/pa100k/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_attribute/090004.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: PersonAttribute + threshold: 0.5 #default threshold + glasses_threshold: 0.3 #threshold only for glasses + hold_threshold: 0.6 #threshold only for hold + +Metric: + Eval: + - ATTRMetric: + + diff --git a/example/low_precision_training/ppcls/configs/PULC/person_attribute/Res2Net200_vd_26w_4s.yaml b/example/low_precision_training/ppcls/configs/PULC/person_attribute/Res2Net200_vd_26w_4s.yaml new file mode 100755 index 000000000..4f7dc273c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/person_attribute/Res2Net200_vd_26w_4s.yaml @@ -0,0 +1,134 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 192] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "Res2Net200_vd_26w_4s" + pretrained: True + class_num: 26 + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - Padv2: + size: [212, 276] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [192, 256] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + 
shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_attribute/090004.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: PersonAttribute + threshold: 0.5 #default threshold + glasses_threshold: 0.3 #threshold only for glasses + hold_threshold: 0.6 #threshold only for hold + +Metric: + Eval: + - ATTRMetric: + + diff --git a/example/low_precision_training/ppcls/configs/PULC/person_attribute/SwinTransformer_tiny_patch4_window7_224.yaml b/example/low_precision_training/ppcls/configs/PULC/person_attribute/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100755 index 000000000..36c3d6aae --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/person_attribute/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,135 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "SwinTransformer_tiny_patch4_window7_224" + pretrained: True + class_num: 26 + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + #clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - Padv2: + size: [244, 244] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [224, 224] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: 
False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_attribute/090004.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: PersonAttribute + threshold: 0.5 #default threshold + glasses_threshold: 0.3 #threshold only for glasses + hold_threshold: 0.6 #threshold only for hold + +Metric: + Eval: + - ATTRMetric: + + diff --git a/example/low_precision_training/ppcls/configs/PULC/person_attribute/search.yaml b/example/low_precision_training/ppcls/configs/PULC/person_attribute/search.yaml new file mode 100755 index 000000000..78192d113 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/person_attribute/search.yaml @@ -0,0 +1,41 @@ +base_config_file: ppcls/configs/PULC/person_attribute/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/person_attribute/PPLCNet_x1_0_Distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_attr +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.0001, 0.005, 0.01, 0.02, 0.05] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.ResizeImage.size + - DataLoader.Train.dataset.transform_ops.4.RandomCropImage.size + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.img_size + search_values: [[192, 256]] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.prob + search_values: [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.7.RandomErasing.EPSILON + search_values: [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git a/example/low_precision_training/ppcls/configs/PULC/person_exists/MobileNetV3_small_x0_35.yaml b/example/low_precision_training/ppcls/configs/PULC/person_exists/MobileNetV3_small_x0_35.yaml new file mode 100755 index 000000000..9510ec258 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/person_exists/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,138 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + class_num: 2 + pretrained: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.05 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + 
dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_exists/objects365_02035329.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: nobody + label_1: someone + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/person_exists/PPLCNet_x1_0.yaml b/example/low_precision_training/ppcls/configs/PULC/person_exists/PPLCNet_x1_0.yaml new file mode 100755 index 000000000..93e9841d9 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/person_exists/PPLCNet_x1_0.yaml @@ -0,0 +1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + use_ssld: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 192 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.1 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True 
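+      # num_workers sets how many worker subprocesses preprocess batches in parallel;
+      # use_shared_memory passes the prepared batches to the trainer via shared memory.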
+ loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_exists/objects365_02035329.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.9 + label_0: nobody + label_1: someone + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/person_exists/PPLCNet_x1_0_distillation.yaml b/example/low_precision_training/ppcls/configs/PULC/person_exists/PPLCNet_x1_0_distillation.yaml new file mode 100755 index 000000000..3d3aa3258 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/person_exists/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,168 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 1 + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 2 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + use_sync_bn: True + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + + infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/train_list_for_distill.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 192 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.1 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + 
num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_exists/objects365_02035329.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: nobody + label_1: someone + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 2] + Eval: + - TprAtFpr: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/person_exists/PPLCNet_x1_0_search.yaml b/example/low_precision_training/ppcls/configs/PULC/person_exists/PPLCNet_x1_0_search.yaml new file mode 100755 index 000000000..86c25a05b --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/person_exists/PPLCNet_x1_0_search.yaml @@ -0,0 +1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + use_ssld: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + 
order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_exists/objects365_02035329.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: nobody + label_1: someone + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/person_exists/SwinTransformer_tiny_patch4_window7_224.yaml b/example/low_precision_training/ppcls/configs/PULC/person_exists/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100755 index 000000000..be10d67b7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/person_exists/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,168 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 2 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-5 + eta_min: 1e-6 + warmup_epoch: 5 + warmup_start_lr: 1e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 
0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_exists/objects365_02035329.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: nobody + label_1: someone + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/person_exists/search.yaml b/example/low_precision_training/ppcls/configs/PULC/person_exists/search.yaml new file mode 100755 index 000000000..820337c02 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/person_exists/search.yaml @@ -0,0 +1,40 @@ +base_config_file: ppcls/configs/PULC/person_exists/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/person_exists/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_person_cls +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.0075, 0.01, 0.0125] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.RandCropImage.size + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.img_size + search_values: [176, 192, 224] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.prob + search_values: [0.0, 0.1, 0.5] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.5.RandomErasing.EPSILON + search_values: [0.0, 0.1, 0.5] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git a/example/low_precision_training/ppcls/configs/PULC/safety_helmet/MobileNetV3_small_x0_35.yaml b/example/low_precision_training/ppcls/configs/PULC/safety_helmet/MobileNetV3_small_x0_35.yaml new file mode 100755 index 000000000..9ef4beb79 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/safety_helmet/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,134 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + pretrained: True + class_num: 2 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.08 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: 
./dataset/safety_helmet/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/safety_helmet/safety_helmet_test_1.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: wearing_helmet + label_1: unwearing_helmet + +Metric: + Train: + - TopkAcc: + topk: [1] + Eval: + - TprAtFpr: + max_fpr: 0.0001 + - TopkAcc: + topk: [1] diff --git a/example/low_precision_training/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0.yaml b/example/low_precision_training/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0.yaml new file mode 100755 index 000000000..4c3c8642d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: PPLCNet_x1_0 + pretrained: True + use_ssld: True + class_num: 2 + use_sync_bn: True + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.025 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 176 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.5 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 176 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.1 + r1: 0.3 + sh: 1.0/3.0 + sl: 0.02 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: 
./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/safety_helmet/safety_helmet_test_1.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: wearing_helmet + label_1: unwearing_helmet + +Metric: + Train: + - TopkAcc: + topk: [1] + Eval: + - TprAtFpr: + max_fpr: 0.0001 + - TopkAcc: + topk: [1] diff --git a/example/low_precision_training/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_distillation.yaml b/example/low_precision_training/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_distillation.yaml new file mode 100755 index 000000000..254db5df4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,185 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 1 + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 2 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - False + - False + use_sync_bn: True + models: + - Teacher: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + return_stages: True + return_patterns: ["blocks3", "blocks4", "blocks5", "blocks6"] + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + return_stages: True + return_patterns: ["blocks3", "blocks4", "blocks5", "blocks6"] + + infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + key: logits + model_names: ["Student", "Teacher"] + - DistillationDMLLoss: + weight: 1.0 + key: logits + model_name_pairs: + - ["Student", "Teacher"] + - DistillationDistanceLoss: + weight: 1.0 + key: "blocks4" + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.015 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 192 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.5 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 192 + - NormalizeImage: 
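+          # NormalizeImage multiplies pixels by scale (1.0/255.0, mapping uint8 values into [0, 1]),
+          # then standardizes each channel with the ImageNet mean/std given below.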
+ scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/safety_helmet/safety_helmet_test_1.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: wearing_helmet + label_1: unwearing_helmet + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1] + Eval: + - TprAtFpr: + max_fpr: 0.0001 + - TopkAcc: + topk: [1] diff --git a/example/low_precision_training/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_search.yaml b/example/low_precision_training/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_search.yaml new file mode 100755 index 000000000..98f63a613 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_search.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference +# model architecture +Arch: + name: PPLCNet_x1_0 + pretrained: True + use_ssld: True + class_num: 2 + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.10 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 192 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: 
./dataset/safety_helmet/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/safety_helmet/safety_helmet_test_1.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: wearing_helmet + label_1: unwearing_helmet + +Metric: + Train: + - TopkAcc: + topk: [1] + Eval: + - TprAtFpr: + max_fpr: 0.0001 + - TopkAcc: + topk: [1] diff --git a/example/low_precision_training/ppcls/configs/PULC/safety_helmet/Res2Net200_vd_26w_4s.yaml b/example/low_precision_training/ppcls/configs/PULC/safety_helmet/Res2Net200_vd_26w_4s.yaml new file mode 100755 index 000000000..5b987d510 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/safety_helmet/Res2Net200_vd_26w_4s.yaml @@ -0,0 +1,137 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: Res2Net200_vd_26w_4s + class_num: 2 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.005 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/safety_helmet/safety_helmet_test_1.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 
0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: wearing_helmet + label_1: unwearing_helmet + +Metric: + Train: + - TopkAcc: + topk: [1] + Eval: + - TprAtFpr: + max_fpr: 0.0001 + - TopkAcc: + topk: [1] + diff --git a/example/low_precision_training/ppcls/configs/PULC/safety_helmet/SwinTransformer_tiny_patch4_window7_224.yaml b/example/low_precision_training/ppcls/configs/PULC/safety_helmet/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100755 index 000000000..5863ee17e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/safety_helmet/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,159 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + pretrained: True + class_num: 2 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-5 + eta_min: 1e-7 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/safety_helmet/safety_helmet_test_1.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: 
ThreshOutput + threshold: 0.5 + label_0: wearing_helmet + label_1: unwearing_helmet + +Metric: + Eval: + - TprAtFpr: + max_fpr: 0.0001 + - TopkAcc: + topk: [1] diff --git a/example/low_precision_training/ppcls/configs/PULC/safety_helmet/search.yaml b/example/low_precision_training/ppcls/configs/PULC/safety_helmet/search.yaml new file mode 100755 index 000000000..e8c1c933d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/safety_helmet/search.yaml @@ -0,0 +1,36 @@ +base_config_file: ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_safety_helmet +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10, 0.11, 0.11, 0.12] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.RandCropImage.size + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.img_size + search_values: [176, 192, 224] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.prob + search_values: [0.0, 0.1, 0.5] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.5.RandomErasing.EPSILON + search_values: [0.0, 0.1, 0.5] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + algorithm: "udml" +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git a/example/low_precision_training/ppcls/configs/PULC/table_attribute/PPLCNet_x1_0.yaml b/example/low_precision_training/ppcls/configs/PULC/table_attribute/PPLCNet_x1_0.yaml new file mode 100755 index 000000000..2c1e9b253 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/table_attribute/PPLCNet_x1_0.yaml @@ -0,0 +1,133 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "PPLCNet_x1_0" + pretrained: True + use_ssld: True + class_num: 6 + + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/table_attribute/" + cls_label_path: "dataset/table_attribute/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: 
"dataset/table_attribute/" + cls_label_path: "dataset/table_attribute/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/table_attribute/val_3610.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: TableAttribute + source_threshold: 0.5 + number_threshold: 0.5 + color_threshold: 0.5 + clarity_threshold : 0.5 + obstruction_threshold: 0.5 + angle_threshold: 0.5 + +Metric: + Eval: + - ATTRMetric: + + diff --git a/example/low_precision_training/ppcls/configs/PULC/table_attribute/PPLCNet_x1_0_distillation.yaml b/example/low_precision_training/ppcls/configs/PULC/table_attribute/PPLCNet_x1_0_distillation.yaml new file mode 100755 index 000000000..5b48f4a1f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/table_attribute/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,155 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 6 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + infer_model_name: "Student" + freeze_params_list: + - True + - False + use_ssld: True + models: + - Teacher: + name: ResNet50_vd + class_num: *class_num + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + +# loss function config for traing/eval process +Loss: + Train: + - DistillationMultiLabelLoss: + weight: 1.0 + model_names: ["Student"] + weight_ratio: True + size_sum: True + - DistillationDMLLoss: + weight: 1.0 + weight_ratio: True + sum_across_class_dim: False + model_name_pairs: + - ["Student", "Teacher"] + + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/table_attribute/" + cls_label_path: "dataset/table_attribute/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/table_attribute/" + cls_label_path: "dataset/table_attribute/val_list.txt" + 
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.02
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.0005
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: MultiLabelDataset
+      image_root: "dataset/table_attribute/"
+      cls_label_path: "dataset/table_attribute/train_list.txt"
+      label_ratio: True
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            size: [224, 224]
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: True
+      shuffle: True
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+  Eval:
+    dataset:
+      name: MultiLabelDataset
+      image_root: "dataset/table_attribute/"
+      cls_label_path: "dataset/table_attribute/val_list.txt"
+      label_ratio: True
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            size: [224, 224]
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: deploy/images/PULC/table_attribute/val_3253.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        size: [224, 224]
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: TableAttribute
+    source_threshold: 0.5
+    number_threshold: 0.5
+    color_threshold: 0.5
+    clarity_threshold: 0.5
+    obstruction_threshold: 0.5
+    angle_threshold: 0.5
+
+Metric:
+  Eval:
+    - ATTRMetric:
+
+
diff --git a/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/MobileNetV3_small_x0_35.yaml b/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/MobileNetV3_small_x0_35.yaml
new file mode 100755
index 000000000..7eaff9768
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/MobileNetV3_small_x0_35.yaml
@@ -0,0 +1,132 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 60
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  start_eval_epoch: 40
+
+# model architecture
+Arch:
+  name: MobileNetV3_small_x0_35
+  class_num: 4
+  pretrained: True
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 1.3
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/text_image_orientation/
+      cls_label_path: ./dataset/text_image_orientation/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 256
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 16
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/text_image_orientation/
+      cls_label_path: ./dataset/text_image_orientation/test_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 16
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: deploy/images/PULC/text_image_orientation/img_rot0_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456,
0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0.yaml b/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0.yaml new file mode 100755 index 000000000..c8ded908e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 4 + pretrained: True + use_ssld: True + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.4 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/text_image_orientation/img_rot0_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_distillation.yaml b/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_distillation.yaml new file mode 100755 
index 000000000..b8fd0b108 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,164 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 4 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + use_sync_bn: True + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + + + infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.4 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/train_list_for_distill.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/text_image_orientation/img_rot0_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_search.yaml 
b/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_search.yaml new file mode 100755 index 000000000..0ba788156 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_search.yaml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + start_eval_epoch: 40 + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 4 + pretrained: True + use_ssld: True + + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/text_image_orientation/img_rot0_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/SwinTransformer_tiny_patch4_window7_224.yaml b/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100755 index 000000000..4d123cd4b --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,157 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 
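+  # NOTE (our reading of these Global keys, not a documented guarantee):
+  # save_interval and eval_interval are epoch counts; a checkpoint is written
+  # every save_interval epochs, and validation runs every eval_interval epochs
+  # while eval_during_train is True.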
+ eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 4 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 2.5e-4 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/text_image_orientation/img_rot0_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/search.yaml b/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/search.yaml new file mode 100755 index 000000000..d8e65f5f0 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/text_image_orientation/search.yaml @@ -0,0 +1,41 @@ +base_config_file: ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: 
output/search_text_image_orientation +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.1, 0.2, 0.4, 0.8] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.RandCropImage.size + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.img_size + search_values: [176, 192, 224] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.prob + search_values: [0.0, 0.1, 0.5] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.4.RandomErasing.EPSILON + search_values: [0.0, 0.1, 0.5] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.0, 0.3, 0.5, 0.8, 1.0] + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git a/example/low_precision_training/ppcls/configs/PULC/textline_orientation/MobileNetV3_small_x0_35.yaml b/example/low_precision_training/ppcls/configs/PULC/textline_orientation/MobileNetV3_small_x0_35.yaml new file mode 100755 index 000000000..040868378 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/textline_orientation/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,134 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 18 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + class_num: 2 + pretrained: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.13 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png + batch_size: 10 + 
transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 1 + class_id_map_file: ppcls/utils/PULC_label_list/textline_orientation_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml b/example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml new file mode 100755 index 000000000..3ab3657d8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 18 + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 80, 160] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + use_ssld: True + stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]] + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - TimmAutoAugment: + prob: 1.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 1 + class_id_map_file: ppcls/utils/PULC_label_list/textline_orientation_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - 
TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_224x224.yaml b/example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_224x224.yaml new file mode 100755 index 000000000..17b9cbb15 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_224x224.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 18 + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 1 + class_id_map_file: ppcls/utils/PULC_label_list/textline_orientation_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_distillation.yaml b/example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_distillation.yaml new file mode 100755 index 000000000..2cc57e637 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,162 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 18 + eval_interval: 1 + epochs: 20 + print_batch_step: 
10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 80, 160] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 2 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + use_sync_bn: True + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]] + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]] + pretrained: True + use_ssld: True + + infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - TimmAutoAugment: + prob: 1.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 1 + class_id_map_file: ppcls/utils/PULC_label_list/textline_orientation_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_search.yaml b/example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_search.yaml new file mode 100755 index 000000000..e9e186377 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_search.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + 
eval_during_train: True + start_eval_epoch: 18 + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 48, 192] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + use_ssld: True + stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]] + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 48] + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [192, 48] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 48] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 48] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 1 + class_id_map_file: ppcls/utils/PULC_label_list/textline_orientation_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/example/low_precision_training/ppcls/configs/PULC/textline_orientation/SwinTransformer_tiny_patch4_window7_224.yaml b/example/low_precision_training/ppcls/configs/PULC/textline_orientation/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100755 index 000000000..a466d5e08 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/textline_orientation/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,164 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + scale_loss: 
128.0
+  use_dynamic_loss_scaling: True
+  # O1: mixed fp16
+  level: O1
+
+# model architecture
+Arch:
+  name: SwinTransformer_tiny_patch4_window7_224
+  class_num: 2
+  pretrained: True
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+Optimizer:
+  name: AdamW
+  beta1: 0.9
+  beta2: 0.999
+  epsilon: 1e-8
+  weight_decay: 0.05
+  no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm
+  one_dim_param_no_weight_decay: True
+  lr:
+    name: Cosine
+    learning_rate: 1e-4
+    eta_min: 2e-6
+    warmup_epoch: 5
+    warmup_start_lr: 2e-7
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/textline_orientation/
+      cls_label_path: ./dataset/textline_orientation/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+            interpolation: bicubic
+            backend: pil
+        - TimmAutoAugment:
+            config_str: rand-m9-mstd0.5-inc1
+            interpolation: bicubic
+            img_size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - RandomErasing:
+            EPSILON: 0.25
+            sl: 0.02
+            sh: 1.0/3.0
+            r1: 0.3
+            attempt: 10
+            use_log_aspect: True
+            mode: pixel
+      batch_transform_ops:
+        - OpSampler:
+            MixupOperator:
+              alpha: 0.8
+              prob: 0.5
+            CutmixOperator:
+              alpha: 1.0
+              prob: 0.5
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/textline_orientation/
+      cls_label_path: ./dataset/textline_orientation/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 1
+    class_id_map_file: ppcls/utils/PULC_label_list/textline_orientation_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 2]
+  Eval:
+    - TopkAcc:
+        topk: [1, 2]
diff --git a/example/low_precision_training/ppcls/configs/PULC/textline_orientation/search.yaml b/example/low_precision_training/ppcls/configs/PULC/textline_orientation/search.yaml
new file mode 100755
index 000000000..4419949bc
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/PULC/textline_orientation/search.yaml
@@ -0,0 +1,41 @@
+base_config_file: ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml
+distill_config_file: ppcls/configs/PULC/textline_orientation/PPLCNet_x1_0_distillation.yaml
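+# NOTE: both paths above are assumed to resolve under textline_orientation/
+# (this task was formerly named text_direction, which no directory in this tree
+# provides); the dotted transform indices below line up with the transform_ops
+# order in PPLCNet_x1_0.yaml.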
+
+gpus: 0,1,2,3
+output_dir: output/search_text
+search_times: 1
+search_dict:
+  - search_key: lrs
+    replace_config:
+      - Optimizer.lr.learning_rate
+    search_values: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
+  - search_key: resolutions
+    replace_config:
+      - DataLoader.Train.dataset.transform_ops.1.ResizeImage.size
+      - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.img_size
+      - DataLoader.Eval.dataset.transform_ops.1.ResizeImage.size
+    search_values: [[192, 48], [180, 60], [160, 80]]
+  - search_key: ra_probs
+    replace_config:
+      - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.prob
+    search_values: [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
+  - search_key: re_probs
+    replace_config:
+      - DataLoader.Train.dataset.transform_ops.4.RandomErasing.EPSILON
+    search_values: [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
+  - search_key: lr_mult_list
+    replace_config:
+      - Arch.lr_mult_list
+    search_values:
+      - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
+      - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0]
+      - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
+teacher:
+  rm_keys:
+    - Arch.lr_mult_list
+  search_values:
+    - ResNet101_vd
+    - ResNet50_vd
+final_replace:
+  Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list
+
diff --git a/example/low_precision_training/ppcls/configs/PULC/traffic_sign/MobileNetV3_small_x0_35.yaml b/example/low_precision_training/ppcls/configs/PULC/traffic_sign/MobileNetV3_small_x0_35.yaml
new file mode 100755
index 000000000..5ebe7441e
--- /dev/null
+++ b/example/low_precision_training/ppcls/configs/PULC/traffic_sign/MobileNetV3_small_x0_35.yaml
@@ -0,0 +1,132 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 10
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+
+# model architecture
+Arch:
+  name: MobileNetV3_small_x0_35
+  class_num: 232
+  pretrained: True
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+        epsilon: 0.1
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Cosine
+    learning_rate: 0.01
+    warmup_epoch: 5
+  regularizer:
+    name: 'L2'
+    coeff: 0.00001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/
+      cls_label_path: ./dataset/traffic_sign/label_list_train.txt
+      delimiter: "\t"
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: True
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/
+      cls_label_path: ./dataset/traffic_sign/label_list_test.txt
+      delimiter: "\t"
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 128
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 8
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/inference_deployment/whl_demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/PULC_label_list/traffic_sign_label_list.txt
+
+Metric:
+  Train:
+    - 
TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/example/low_precision_training/ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0.yaml b/example/low_precision_training/ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0.yaml new file mode 100755 index 000000000..5362d07b7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 0 + epochs: 10 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 232 + pretrained: True + use_ssld: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_train.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - TimmAutoAugment: + prob: 0.5 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_test.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/traffic_sign/99603_17806.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/PULC_label_list/traffic_sign_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0_distillation.yaml b/example/low_precision_training/ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0_distillation.yaml new file mode 100755 index 000000000..b00c250e1 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + 
pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 10 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 232 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + pretrained: False + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + + infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_train_for_distillation.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_test.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/PULC_label_list/traffic_sign_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0_search.yaml b/example/low_precision_training/ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0_search.yaml new file mode 100755 index 000000000..27fbc4b86 --- /dev/null +++ 
b/example/low_precision_training/ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0_search.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 0 + epochs: 10 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 232 + pretrained: True + # use_ssld: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_train.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_test.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/traffic_sign/99603_17806.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/PULC_label_list/traffic_sign_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/PULC/traffic_sign/SwinTransformer_tiny_patch4_window7_224.yaml b/example/low_precision_training/ppcls/configs/PULC/traffic_sign/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100755 index 000000000..ae86ae622 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/traffic_sign/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 0 + epochs: 10 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 
224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 232 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 2e-4 + eta_min: 2e-6 + warmup_epoch: 5 + warmup_start_lr: 2e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_train.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_test.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/PULC_label_list/traffic_sign_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + + diff --git a/example/low_precision_training/ppcls/configs/PULC/traffic_sign/search.yaml b/example/low_precision_training/ppcls/configs/PULC/traffic_sign/search.yaml new file mode 100755 index 000000000..029d042df --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/traffic_sign/search.yaml @@ -0,0 +1,41 @@ +base_config_file: ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_traffic_sign +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.0075, 0.01, 
0.0125] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.RandCropImage.size + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.img_size + search_values: [176, 192, 224] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.prob + search_values: [0.0, 0.1, 0.5] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.4.RandomErasing.EPSILON + search_values: [0.0, 0.1, 0.5] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + algorithm: "skl-ugi" + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git a/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/MobileNetV3_small_x0_35.yaml b/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/MobileNetV3_small_x0_35.yaml new file mode 100755 index 000000000..a35bc6114 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,115 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 256] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "MobileNetV3_small_x0_35" + pretrained: True + class_num: 19 + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - Padv2: + size: [276, 212] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [256, 192] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/test_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + + +Metric: + Eval: + - ATTRMetric: + + diff --git a/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0.yaml 
b/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0.yaml new file mode 100755 index 000000000..a3369a9ee --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 256] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "PPLCNet_x1_0" + pretrained: True + class_num: 19 + use_ssld: True + lr_mult_list: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0125 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [256, 192] + - Padv2: + size: [276, 212] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [256, 192] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/test_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: ./deploy/images/PULC/vehicle_attribute/0002_c002_00030670_0.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: VehicleAttribute + color_threshold: 0.5 + type_threshold: 0.5 + +Metric: + Eval: + - ATTRMetric: + + diff --git a/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0_distillation.yaml b/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0_distillation.yaml new file mode 100755 index 000000000..d098ca81f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,171 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + 
output_dir: "./output/" + device: "gpu" + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 256] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 19 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + infer_model_name: "Student" + freeze_params_list: + - True + - False + use_ssld: True + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + +# loss function config for traing/eval process +Loss: + Train: + - DistillationMultiLabelLoss: + weight: 1.0 + model_names: ["Student"] + weight_ratio: True + size_sum: True + - DistillationDMLLoss: + weight: 1.0 + weight_ratio: True + sum_across_class_dim: False + model_name_pairs: + - ["Student", "Teacher"] + + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [256, 192] + - Padv2: + size: [276, 212] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [256, 192] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/test_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: ./deploy/images/PULC/vehicle_attribute/0002_c002_00030670_0.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: VehicleAttribute + color_threshold: 0.5 + type_threshold: 0.5 + + +Metric: + Eval: + - ATTRMetric: + + diff --git a/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0_search.yaml b/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0_search.yaml new file mode 100755 index 000000000..5f84c2a65 --- /dev/null +++ 
b/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0_search.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 256] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "PPLCNet_x1_0" + pretrained: True + use_ssld: True + class_num: 19 + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [256, 192] + - Padv2: + size: [276, 212] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [256, 192] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/test_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + + +Metric: + Eval: + - ATTRMetric: + + diff --git a/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/Res2Net200_vd_26w_4s.yaml b/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/Res2Net200_vd_26w_4s.yaml new file mode 100755 index 000000000..c6618f960 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/Res2Net200_vd_26w_4s.yaml @@ -0,0 +1,122 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/mo" + device: "gpu" + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 256] + save_inference_dir: "./inference" + use_multilabel: True + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: "Res2Net200_vd_26w_4s" + pretrained: True + class_num: 19 + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 
1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - Padv2: + size: [276, 212] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [256, 192] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/test_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + + +Metric: + Eval: + - ATTRMetric: + + diff --git a/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/ResNet50.yaml b/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/ResNet50.yaml new file mode 100755 index 000000000..9218769c6 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/ResNet50.yaml @@ -0,0 +1,116 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 256] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "ResNet50" + pretrained: True + class_num: 19 + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - Padv2: + size: [276, 212] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [256, 192] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/test_list.txt" + label_ratio: True + 
transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + + +Metric: + Eval: + - ATTRMetric: + + diff --git a/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/search.yaml b/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/search.yaml new file mode 100755 index 000000000..2a16266bf --- /dev/null +++ b/example/low_precision_training/ppcls/configs/PULC/vehicle_attribute/search.yaml @@ -0,0 +1,35 @@ +base_config_file: ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/vehicle_attribute/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_vehicle_attr +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.0075, 0.01, 0.0125] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.prob + search_values: [0.0, 0.1, 0.5] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.7.RandomErasing.EPSILON + search_values: [0.0, 0.1, 0.5] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + algorithm: "skl-ugi" + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list diff --git a/example/low_precision_training/ppcls/configs/Products/MV3_Large_1x_Aliproduct_DLBHC.yaml b/example/low_precision_training/ppcls/configs/Products/MV3_Large_1x_Aliproduct_DLBHC.yaml new file mode 100755 index 000000000..ad77ea950 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/Products/MV3_Large_1x_Aliproduct_DLBHC.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output_dlbhc/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + # eval_mode: "retrieval" + print_batch_step: 10 + use_visualdl: False + + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + # feature postprocess + feature_normalize: False + feature_binarize: "round" + +# model architecture +Arch: + name: "RecModel" + Backbone: + name: "MobileNetV3_large_x1_0" + pretrained: True + class_num: 512 + Head: + name: "FC" + class_num: 50030 + embedding_size: 512 + + infer_output_key: "features" + infer_add_softmax: False + +# loss function config for train/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [50, 150] + values: [0.1, 0.01, 0.001] + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 256 + - RandCropImage: + size: 227 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] +
std: [0.2023, 0.1994, 0.2010] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 227 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 227 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + +# switch to the metrics below when evaluating by retrieval +# - Recallk: +# topk: [1] +# - mAP: +# - Precisionk: +# topk: [1] + diff --git a/example/low_precision_training/ppcls/configs/Products/ResNet50_vd_Aliproduct.yaml b/example/low_precision_training/ppcls/configs/Products/ResNet50_vd_Aliproduct.yaml new file mode 100755 index 000000000..70f805647 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/Products/ResNet50_vd_Aliproduct.yaml @@ -0,0 +1,119 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 10 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: classification + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: ResNet50_vd + pretrained: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 2048 + class_num: 512 + Head: + name: FC + embedding_size: 512 + class_num: 50030 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.05 + regularizer: + name: 'L2' + coeff: 0.00007 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] +
std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/example/low_precision_training/ppcls/configs/Products/ResNet50_vd_Inshop.yaml b/example/low_precision_training/ppcls/configs/Products/ResNet50_vd_Inshop.yaml new file mode 100755 index 000000000..18ddfa3a8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/Products/ResNet50_vd_Inshop.yaml @@ -0,0 +1,157 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/product_ResNet50_vd_Aliproduct_v1.0_pretrained.pdparams" + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 10 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: ResNet50_vd + pretrained: False + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 2048 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 3997 + margin: 0.15 + scale: 30 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - TripletLossV2: + weight: 1.0 + margin: 0.5 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: MultiStepDecay + learning_rate: 0.04 + milestones: [30, 60, 70, 80, 90, 100] + gamma: 0.5 + verbose: False + last_epoch: -1 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/Inshop/ + cls_label_path: ./dataset/Inshop/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.]
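+ # NOTE: PKSampler is assumed to draw batch_size / sample_per_id identities per batch
+ # (64 / 2 = 32 ids x 2 images here), so each identity contributes the in-batch
+ # positive pair that the TripletLossV2 term above mines.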
+ sampler: + name: PKSampler + batch_size: 64 + sample_per_id: 2 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: ImageNetDataset + image_root: ./dataset/Inshop/ + cls_label_path: ./dataset/Inshop/query_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: ImageNetDataset + image_root: ./dataset/Inshop/ + cls_label_path: ./dataset/Inshop/gallery_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + diff --git a/example/low_precision_training/ppcls/configs/Products/ResNet50_vd_SOP.yaml b/example/low_precision_training/ppcls/configs/Products/ResNet50_vd_SOP.yaml new file mode 100755 index 000000000..7728a6678 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/Products/ResNet50_vd_SOP.yaml @@ -0,0 +1,156 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/product_ResNet50_vd_Aliproduct_v1.0_pretrained.pdparams" + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 10 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + +# model architecture +Arch: + name: RecModel + Backbone: + name: ResNet50_vd + pretrained: False + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 2048 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 11319 + margin: 0.15 + scale: 30 + infer_output_key: features + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - TripletLossV2: + weight: 1.0 + margin: 0.5 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: MultiStepDecay + learning_rate: 0.01 + milestones: [30, 60, 70, 80, 90, 100] + gamma: 0.5 + verbose: False + last_epoch: -1 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: VeriWild + image_root: ./dataset/SOP/ + cls_label_path: ./dataset/SOP/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
+ + sampler: + name: PKSampler + batch_size: 64 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/SOP/ + cls_label_path: ./dataset/SOP/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/SOP/ + cls_label_path: ./dataset/SOP/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/example/low_precision_training/ppcls/configs/ResNet50_UReID_infer.yaml b/example/low_precision_training/ppcls/configs/ResNet50_UReID_infer.yaml new file mode 100755 index 000000000..750851d7c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ResNet50_UReID_infer.yaml @@ -0,0 +1,152 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + # pretrained_model: "./pd_model_trace/ISE/ISE_M_model" # pretrained ISE model for Market1501 + # pretrained_model: "./pd_model_trace/ISE/ISE_MS_model" # pretrained ISE model for MSMT17 + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 128, 256] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "BNNeck" + num_features: 2048 + Head: + name: "FC" + embedding_size: 2048 + class_num: 751 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "Market1501" # ["Market1501", "MSMT17"] + image_root: "./dataset" + cls_label_path: "bounding_box_train" + transform_ops: + - ResizeImage: + size: [128, 256] + interpolation: 'bicubic' + backend: 'pil' + - RandFlipImage: + flip_code: 1 + - Pad: + padding: 10 + fill: 0 + - RandomCrop: + size: [128, 256] + pad_if_needed: False + - NormalizeImage: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0.485, 0.456, 0.406] + + sampler: + name: PKSampler + batch_size: 16 + sample_per_id: 4 + drop_last: True + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: "Market1501" # ["Market1501", "MSMT17"] + image_root: "./dataset" + cls_label_path: "query" + 
transform_ops: + - ResizeImage: + size: [128, 256] + interpolation: 'bicubic' + backend: 'pil' + - NormalizeImage: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + + Gallery: + dataset: + name: "Market1501" # ["Market1501", "MSMT17"] + image_root: "./dataset" + cls_label_path: "bounding_box_test" + transform_ops: + - ResizeImage: + size: [128, 256] + interpolation: 'bicubic' + backend: 'pil' + - NormalizeImage: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/example/low_precision_training/ppcls/configs/SVTR/svtr_base.yml b/example/low_precision_training/ppcls/configs/SVTR/svtr_base.yml new file mode 100755 index 000000000..9ab15d5c4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/SVTR/svtr_base.yml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 48, 320] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SVTR_base + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: False + channel_first: False + - ResizeImage: + size: + - 320 + - 48 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: + - 320 + - 48 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] 
+ order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/SVTR/svtr_large.yml b/example/low_precision_training/ppcls/configs/SVTR/svtr_large.yml new file mode 100755 index 000000000..a3b556e97 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/SVTR/svtr_large.yml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 48, 320] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SVTR_large + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: False + channel_first: False + - ResizeImage: + size: + - 320 + - 48 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: + - 320 + - 48 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/SVTR/svtr_tiny.yml b/example/low_precision_training/ppcls/configs/SVTR/svtr_tiny.yml new file mode 100755 index 000000000..21d6788af --- /dev/null +++ b/example/low_precision_training/ppcls/configs/SVTR/svtr_tiny.yml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: 
True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 48, 320] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SVTR_tiny + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: False + channel_first: False + - ResizeImage: + size: + - 320 + - 48 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: + - 320 + - 48 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/StrategySearch/person.yaml b/example/low_precision_training/ppcls/configs/StrategySearch/person.yaml new file mode 100755 index 000000000..906635595 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/StrategySearch/person.yaml @@ -0,0 +1,40 @@ +base_config_file: ppcls/configs/PULC/person/PPLCNet/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/person/Distillation/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_person +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.0075, 0.01, 0.0125] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.RandCropImage.size + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.img_size + search_values: [176, 192, 224] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.prob + search_values: [0.0, 0.1, 0.5] + - search_key: re_probs + replace_config: + - 
DataLoader.Train.dataset.transform_ops.5.RandomErasing.EPSILON + search_values: [0.0, 0.1, 0.5] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git a/example/low_precision_training/ppcls/configs/Vehicle/PPLCNet_2.5x_ReID.yaml b/example/low_precision_training/ppcls/configs/Vehicle/PPLCNet_2.5x_ReID.yaml new file mode 100755 index 000000000..eb9f145a1 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/Vehicle/PPLCNet_2.5x_ReID.yaml @@ -0,0 +1,158 @@ +# global configs +# pretrained_model: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/vehicle_reid_PPLCNet2.5x_VERIWild_v1.0_pretrained.pdparams +# VeriWild v1 small: recall1: 0.93736, recall5: 0.98427, mAP: 0.82125 +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output_reid/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 160 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "PPLCNet_x2_5" + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: "FC" + embedding_size: 1280 + class_num: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 30671 + margin: 0.15 + scale: 32 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images/" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/train_list_start0.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.]
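+ # NOTE: with sample_per_id: 2 below, each identity is assumed to appear twice per
+ # batch; SupConLoss (views: 2) treats those two crops as its positive views, so a
+ # batch of 128 covers roughly 64 identities.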
+ + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: True + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id_query.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + + Gallery: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/example/low_precision_training/ppcls/configs/Vehicle/ResNet50.yaml b/example/low_precision_training/ppcls/configs/Vehicle/ResNet50.yaml new file mode 100755 index 000000000..6b6172475 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/Vehicle/ResNet50.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 160 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 431 + margin: 0.15 + scale: 32 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.0005 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + bbox_crop: True + cls_label_path: "./dataset/CompCars/train_test_split/classification/train_label.txt" + transform_ops: + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.]
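+ # NOTE: no DecodeImage op appears in the transform list above; the CompCars dataset
+ # (with bbox_crop: True) is assumed to decode and bbox-crop each image itself before
+ # these transforms run.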
+ + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + cls_label_path: "./dataset/CompCars/train_test_split/classification/test_label.txt" + bbox_crop: True + transform_ops: + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/example/low_precision_training/ppcls/configs/Vehicle/ResNet50_ReID.yaml b/example/low_precision_training/ppcls/configs/Vehicle/ResNet50_ReID.yaml new file mode 100755 index 000000000..c13d59afd --- /dev/null +++ b/example/low_precision_training/ppcls/configs/Vehicle/ResNet50_ReID.yaml @@ -0,0 +1,155 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 160 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 30671 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images/" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/train_list_start0.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
+ + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: True + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id_query.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + + Gallery: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/example/low_precision_training/ppcls/configs/metric_learning/adaface_ir18.yaml b/example/low_precision_training/ppcls/configs/metric_learning/adaface_ir18.yaml new file mode 100755 index 000000000..2cbfe5da4 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/metric_learning/adaface_ir18.yaml @@ -0,0 +1,105 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 26 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 112, 112] + save_inference_dir: "./inference" + eval_mode: "adaface" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "AdaFace_IR_18" + input_size: [112, 112] + Head: + name: "AdaMargin" + embedding_size: 512 + class_num: 70722 + m: 0.4 + s: 64 + h: 0.333 + t_alpha: 0.01 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [12, 20, 24] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "AdaFaceDataset" + root_dir: "dataset/face/" + label_path: "dataset/face/train_filter_label.txt" + transform: + - CropWithPadding: + prob: 0.2 + padding_num: 0 + size: [112, 112] + scale: [0.2, 1.0] + ratio: [0.75, 1.3333333333333333] + - RandomInterpolationAugment: + prob: 0.2 + - ColorJitter: + prob: 0.2 + brightness: 0.5 + contrast: 0.5 + saturation: 0.5 + hue: 0 + - RandomHorizontalFlip: + - ToTensor: + - Normalize: + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + + Eval: + dataset: + name: FiveValidationDataset + val_data_path: dataset/face/faces_emore + concat_mem_file_name: dataset/face/faces_emore/concat_validation_memfile + sampler: + name: BatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True
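+ # NOTE: eval_mode "adaface" above is assumed to run face-verification evaluation on
+ # the FiveValidationDataset, which is why only a Train-side TopkAcc metric follows.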
+Metric: + Train: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/metric_learning/xbm_resnet50.yaml b/example/low_precision_training/ppcls/configs/metric_learning/xbm_resnet50.yaml new file mode 100755 index 000000000..44dcfe065 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/metric_learning/xbm_resnet50.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 35 + iter_per_epoch: &iter_per_epoch 1000 + print_batch_step: 20 + use_visualdl: False + eval_mode: retrieval + retrieval_feature_from: features # 'backbone' or 'features' + re_ranking: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: ResNet50_adaptive_max_pool2d + pretrained: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/resnet50-19c8e357_torch2paddle.pdparams + stem_act: null + BackboneStopLayer: + name: flatten + Neck: + name: FC + embedding_size: 2048 + class_num: &feat_dim 128 + weight_attr: + initializer: + name: KaimingNormal + fan_in: *feat_dim + negative_slope: 0.0 + nonlinearity: leaky_relu + bias_attr: + initializer: + name: Constant + value: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - ContrastiveLoss_XBM: + weight: 1.0 + xbm_size: 55000 + xbm_weight: 1.0 + start_iter: 1000 + margin: 0.5 + embedding_size: *feat_dim + epsilon: 1.0e-5 + normalize_feature: True + feature_from: features + Eval: + - ContrastiveLoss: + weight: 1.0 + margin: 0.5 + embedding_size: *feat_dim + normalize_feature: True + epsilon: 1.0e-5 + feature_from: features + +Optimizer: + name: Adam + lr: + name: ReduceOnPlateau + learning_rate: 0.0001 + mode: max + factor: 0.1 + patience: 4 + threshold: 0.001 + threshold_mode: rel + cooldown: 2 + min_lr: 0.000005 + epsilon: 1e-8 + by_epoch: True + regularizer: + name: L2 + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: VeriWild + image_root: ./dataset/SOP + cls_label_path: ./dataset/SOP/train_list.txt + backend: pil + transform_ops: + - Resize: + size: 256 + - RandomResizedCrop: + scale: [0.2, 1] + size: 224 + - RandomHorizontalFlip: + prob: 0.5 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedRandomIdentitySampler + batch_size: 64 + num_instances: 4 + drop_last: False + shuffle: True + max_iters: *iter_per_epoch + loader: + num_workers: 8 + use_shared_memory: True + Eval: + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/SOP + cls_label_path: ./dataset/SOP/test_list.txt + backend: pil + transform_ops: + - Resize: + size: 256 + - CenterCrop: + size: 224 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Query: + dataset: + name: VeriWild + image_root: ./dataset/SOP + cls_label_path: ./dataset/SOP/test_list.txt + backend: pil + transform_ops: + - Resize: + size: 256 + - CenterCrop: + size: 224 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 256 + 
drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/example/low_precision_training/ppcls/configs/multi_scale/MobileNetV1_multi_scale.yaml b/example/low_precision_training/ppcls/configs/multi_scale/MobileNetV1_multi_scale.yaml new file mode 100755 index 000000000..530e75075 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/multi_scale/MobileNetV1_multi_scale.yaml @@ -0,0 +1,138 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: MobileNetV1 + class_num: 1000 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + # support to specify width and height respectively: + # scales: [(160,160), (192,192), (224,224), (288,288), (320,320)] + sampler: + name: MultiScaleSampler + scales: [160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple + first_bs: 64 + divided_factor: 32 + is_training: True + + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/whl/demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/practical_models/.gitkeep b/example/low_precision_training/ppcls/configs/practical_models/.gitkeep new file mode 100755 index 000000000..8b1378917 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/practical_models/.gitkeep @@ -0,0 +1
@@ + diff --git a/example/low_precision_training/ppcls/configs/practical_models/CLIP_large_patch14_224_aesthetic.yaml b/example/low_precision_training/ppcls/configs/practical_models/CLIP_large_patch14_224_aesthetic.yaml new file mode 100755 index 000000000..8d0a72371 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/practical_models/CLIP_large_patch14_224_aesthetic.yaml @@ -0,0 +1,78 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: CLIP_large_patch14_224_aesthetic + pretrained: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: False + +Infer: + infer_imgs: deploy/images/practical/aesthetic_score_predictor/Highscore.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ScoreOutput + decimal_places: 2 + +Metric: + Eval: + - TopkAcc: + topk: [1, 2] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/practical_models/EfficientNetB3_watermark.yaml b/example/low_precision_training/ppcls/configs/practical_models/EfficientNetB3_watermark.yaml new file mode 100755 index 000000000..2fd142ae7 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/practical_models/EfficientNetB3_watermark.yaml @@ -0,0 +1,81 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: EfficientNetB3_watermark + pretrained: True + class_num: 2 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: False + +Infer: + infer_imgs: deploy/images/practical/watermark_exists/watermark_example.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + 
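+      # Editorial note (hedged, assumed PaddleClas semantics): to_rgb above
+      # converts the decoded BGR image to RGB, and channel_first: False below
+      # keeps arrays in HWC layout through the numpy transforms until
+      # ToCHWImage transposes them at the end of the pipeline.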
channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: contains_watermark + label_1: no_watermark + +Metric: + Eval: + - TopkAcc: + topk: [1, 2] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/practical_models/PPHGNet_tiny_calling_halfbody.yaml b/example/low_precision_training/ppcls/configs/practical_models/PPHGNet_tiny_calling_halfbody.yaml new file mode 100755 index 000000000..c6415cd47 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/practical_models/PPHGNet_tiny_calling_halfbody.yaml @@ -0,0 +1,150 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: PPHGNet_tiny + class_num: 2 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.05 + warmup_epoch: 3 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/phone_train_list_halfbody.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 2 + use_shared_memory: False + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/phone_val_list_halfbody.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + interpolation: bicubic + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 2 + use_shared_memory: False + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: dataset/phone_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 1] + Eval: + - TopkAcc: + topk: [1, 1] diff --git 
a/example/low_precision_training/ppcls/configs/quick_start/MobileNetV1_retrieval.yaml b/example/low_precision_training/ppcls/configs/quick_start/MobileNetV1_retrieval.yaml new file mode 100755 index 000000000..bac477392 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/quick_start/MobileNetV1_retrieval.yaml @@ -0,0 +1,157 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: MobileNetV1 + pretrained: False + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1024 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 101 + margin: 0.15 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - TripletLossV2: + weight: 1.0 + margin: 0.5 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: MultiStepDecay + learning_rate: 0.01 + milestones: [20, 30, 40] + gamma: 0.5 + verbose: False + last_epoch: -1 + regularizer: + name: "L2" + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: VeriWild + image_root: ./dataset/CUB_200_2011/ + cls_label_path: ./dataset/CUB_200_2011/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
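+    # Editorial note (hedged): the sampler below pairs with TripletLossV2 above.
+    # Assuming the usual identity-sampler semantics, batch_size: 64 with
+    # num_instances: 2 yields 64 / 2 = 32 distinct identities per batch, two
+    # images each, so every anchor has at least one in-batch positive.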
+ sampler: + name: DistributedRandomIdentitySampler + batch_size: 64 + num_instances: 2 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/CUB_200_2011/ + cls_label_path: ./dataset/CUB_200_2011/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/CUB_200_2011/ + cls_label_path: ./dataset/CUB_200_2011/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/example/low_precision_training/ppcls/configs/quick_start/MobileNetV3_large_x1_0.yaml b/example/low_precision_training/ppcls/configs/quick_start/MobileNetV3_large_x1_0.yaml new file mode 100755 index 000000000..d87dc0991 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/quick_start/MobileNetV3_large_x1_0.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_large_x1_0 + class_num: 102 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.00375 + warmup_epoch: 5 + last_epoch: -1 + regularizer: + name: 'L2' + coeff: 0.000001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + 
channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ./dataset/flowers102/flowers102_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/quick_start/ResNet50_vd.yaml b/example/low_precision_training/ppcls/configs/quick_start/ResNet50_vd.yaml new file mode 100755 index 000000000..90b2c88d6 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/quick_start/ResNet50_vd.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50_vd + class_num: 102 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0125 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ./dataset/flowers102/flowers102_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/quick_start/kunlun/HRNet_W18_C_finetune_kunlun.yaml b/example/low_precision_training/ppcls/configs/quick_start/kunlun/HRNet_W18_C_finetune_kunlun.yaml new file mode 100755 index 000000000..6a461ccfa --- /dev/null +++ b/example/low_precision_training/ppcls/configs/quick_start/kunlun/HRNet_W18_C_finetune_kunlun.yaml @@ -0,0 +1,68 @@ +mode: 'train' +ARCHITECTURE: + name: 'HRNet_W18_C' +pretrained_model: 
"./pretrained/HRNet_W18_C_pretrained" +model_save_dir: "./output/" +classes_num: 102 +total_images: 1020 +save_interval: 1 +validate: True +valid_interval: 1 +epochs: 10 +topk: 5 +image_shape: [3, 224, 224] + +LEARNING_RATE: + function: 'Cosine' + params: + lr: 0.00375 + +OPTIMIZER: + function: 'Momentum' + params: + momentum: 0.9 + regularizer: + function: 'L2' + factor: 0.000001 + +TRAIN: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/train_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + +VALID: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/val_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: diff --git a/example/low_precision_training/ppcls/configs/quick_start/kunlun/ResNet50_vd_finetune_kunlun.yaml b/example/low_precision_training/ppcls/configs/quick_start/kunlun/ResNet50_vd_finetune_kunlun.yaml new file mode 100755 index 000000000..7fad5eebe --- /dev/null +++ b/example/low_precision_training/ppcls/configs/quick_start/kunlun/ResNet50_vd_finetune_kunlun.yaml @@ -0,0 +1,69 @@ +mode: 'train' +ARCHITECTURE: + name: 'ResNet50_vd' +pretrained_model: "./pretrained/ResNet50_vd_pretrained" +load_static_weights: true +model_save_dir: "./output/" +classes_num: 102 +total_images: 1020 +save_interval: 1 +validate: True +valid_interval: 1 +epochs: 20 +topk: 5 +image_shape: [3, 224, 224] + +LEARNING_RATE: + function: 'Cosine' + params: + lr: 0.00375 + +OPTIMIZER: + function: 'Momentum' + params: + momentum: 0.9 + regularizer: + function: 'L2' + factor: 0.000001 + +TRAIN: + batch_size: 20 + num_workers: 1 + file_list: "./dataset/flowers102/train_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + +VALID: + batch_size: 20 + num_workers: 1 + file_list: "./dataset/flowers102/val_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: diff --git a/example/low_precision_training/ppcls/configs/quick_start/kunlun/VGG16_finetune_kunlun.yaml b/example/low_precision_training/ppcls/configs/quick_start/kunlun/VGG16_finetune_kunlun.yaml new file mode 100755 index 000000000..389a5f35f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/quick_start/kunlun/VGG16_finetune_kunlun.yaml @@ -0,0 +1,70 @@ +mode: 'train' +ARCHITECTURE: + name: 'VGG16' + params: + stop_grad_layers: 5 +pretrained_model: "./pretrained/VGG16_pretrained" +model_save_dir: "./output/" +classes_num: 102 +total_images: 1020 +save_interval: 1 +validate: True +valid_interval: 1 +epochs: 20 +topk: 5 +image_shape: [3, 224, 224] + +LEARNING_RATE: + function: 'Cosine' + params: + lr: 0.0005 + +OPTIMIZER: + function: 'Momentum' + params: + momentum: 0.9 + regularizer: + function: 'L2' + factor: 0.00001 + +TRAIN: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/train_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + +VALID: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/val_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: diff --git a/example/low_precision_training/ppcls/configs/quick_start/kunlun/VGG19_finetune_kunlun.yaml b/example/low_precision_training/ppcls/configs/quick_start/kunlun/VGG19_finetune_kunlun.yaml new file mode 100755 index 000000000..6ba38b974 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/quick_start/kunlun/VGG19_finetune_kunlun.yaml @@ -0,0 +1,70 @@ +mode: 'train' +ARCHITECTURE: + name: 'VGG19' + params: + stop_grad_layers: 5 +pretrained_model: "./pretrained/VGG19_pretrained" +model_save_dir: "./output/" +classes_num: 102 +total_images: 1020 +save_interval: 1 +validate: True +valid_interval: 1 +epochs: 20 +topk: 5 +image_shape: [3, 224, 224] + +LEARNING_RATE: + function: 'Cosine' + params: + lr: 0.0005 + +OPTIMIZER: + function: 'Momentum' + params: + momentum: 0.9 + regularizer: + function: 'L2' + factor: 0.00001 + +TRAIN: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/train_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1./255. 
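+      # Editorial note (hedged): stop_grad_layers: 5 in the ARCHITECTURE block
+      # above is assumed to stop gradients through the earliest VGG layers, so
+      # only the later layers and the new 102-way classifier are finetuned on
+      # flowers102; the exact layer granularity is implementation-defined.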
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + +VALID: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/val_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: diff --git a/example/low_precision_training/ppcls/configs/quick_start/new_user/ShuffleNetV2_x0_25.yaml b/example/low_precision_training/ppcls/configs/quick_start/new_user/ShuffleNetV2_x0_25.yaml new file mode 100755 index 000000000..124636690 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/quick_start/new_user/ShuffleNetV2_x0_25.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: cpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ShuffleNetV2_x0_25 + class_num: 102 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0125 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ./dataset/flowers102/flowers102_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/quick_start/professional/MobileNetV1_multilabel.yaml b/example/low_precision_training/ppcls/configs/quick_start/professional/MobileNetV1_multilabel.yaml new file mode 100755 index 000000000..7c64ae3b3 --- /dev/null +++ 
b/example/low_precision_training/ppcls/configs/quick_start/professional/MobileNetV1_multilabel.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 10 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + use_multilabel: True +# model architecture +Arch: + name: MobileNetV1 + class_num: 33 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + Eval: + - MultiLabelLoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: ./dataset/NUS-WIDE-SCENE/NUS-SCENE-dataset/images/ + cls_label_path: ./dataset/NUS-WIDE-SCENE/NUS-SCENE-dataset/multilabel_train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: MultiLabelDataset + image_root: ./dataset/NUS-WIDE-SCENE/NUS-SCENE-dataset/images/ + cls_label_path: ./dataset/NUS-WIDE-SCENE/NUS-SCENE-dataset/multilabel_test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/0517_2715693311.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: "ppcls/utils/NUS-WIDE-SCENE_label_list.txt" + delimiter: " " + +Metric: + Train: + - AccuracyScore: + - HammingDistance: + Eval: + - AccuracyScore: + - HammingDistance: diff --git a/example/low_precision_training/ppcls/configs/quick_start/professional/MobileNetV3_large_x1_0_CIFAR100_finetune.yaml b/example/low_precision_training/ppcls/configs/quick_start/professional/MobileNetV3_large_x1_0_CIFAR100_finetune.yaml new file mode 100755 index 000000000..423a45389 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/quick_start/professional/MobileNetV3_large_x1_0_CIFAR100_finetune.yaml @@ -0,0 +1,127 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 32, 32] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_large_x1_0 + class_num: 100 + +# loss function 
config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 32 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/quick_start/professional/R50_vd_distill_MV3_large_x1_0_CIFAR100.yaml b/example/low_precision_training/ppcls/configs/quick_start/professional/R50_vd_distill_MV3_large_x1_0_CIFAR100.yaml new file mode 100755 index 000000000..a27068d16 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/quick_start/professional/R50_vd_distill_MV3_large_x1_0_CIFAR100.yaml @@ -0,0 +1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 32, 32] + save_inference_dir: "./inference" + +# model architecture +Arch: + name: "DistillationModel" + # if not null, its length should be the same as models + pretrained_list: + # if not null, its length should be the same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: ResNet50_vd + class_num: 100 + pretrained: "./pretrained/best_model" + - Student: + name: MobileNetV3_large_x1_0 + class_num: 100 + pretrained: True + + infer_model_name: "Student" + + +# loss function config for training/eval process +Loss: + Train: + - DistillationCELoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/CIFAR100/" + 
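+      # Editorial note (hedged): with freeze_params_list [True, False] above,
+      # the ResNet50_vd Teacher is frozen and only the MobileNetV3 Student
+      # trains; DistillationCELoss on the ["Student", "Teacher"] pair is
+      # assumed to distill the teacher's logits into the student, while eval
+      # scores the Student alone against ground truth via DistillationGTCELoss.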
cls_label_path: "./dataset/CIFAR100/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 32 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/CIFAR100/" + cls_label_path: "./dataset/CIFAR100/test_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: DistillationPostProcess + func: Topk + topk: 5 + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/quick_start/professional/ResNet50_vd_CIFAR100.yaml b/example/low_precision_training/ppcls/configs/quick_start/professional/ResNet50_vd_CIFAR100.yaml new file mode 100755 index 000000000..10c326a40 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/quick_start/professional/ResNet50_vd_CIFAR100.yaml @@ -0,0 +1,127 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 32, 32] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50_vd + class_num: 100 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 32 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' 
+ sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/quick_start/professional/ResNet50_vd_mixup_CIFAR100_finetune.yaml b/example/low_precision_training/ppcls/configs/quick_start/professional/ResNet50_vd_mixup_CIFAR100_finetune.yaml new file mode 100755 index 000000000..d8ff817f8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/quick_start/professional/ResNet50_vd_mixup_CIFAR100_finetune.yaml @@ -0,0 +1,127 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 32, 32] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50_vd + class_num: 100 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 32 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/quick_start/professional/VGG19_CIFAR10_DeepHash.yaml b/example/low_precision_training/ppcls/configs/quick_start/professional/VGG19_CIFAR10_DeepHash.yaml new file mode 100755 index 
000000000..97228828a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/quick_start/professional/VGG19_CIFAR10_DeepHash.yaml @@ -0,0 +1,147 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + eval_mode: "retrieval" + epochs: 128 + print_batch_step: 10 + use_visualdl: False + + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + # feature postprocess + feature_normalize: False + feature_binarize: "round" + +# model architecture +Arch: + name: "RecModel" + Backbone: + name: "VGG19Sigmoid" + pretrained: True + class_num: 48 + Head: + name: "FC" + class_num: 10 + embedding_size: 48 + + infer_output_key: "features" + infer_add_softmax: False + +# loss function config for train/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.01 + decay_epochs: [200] + values: [0.01, 0.001] + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/cifar10/ + cls_label_path: ./dataset/cifar10/cifar10-2/train.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 256 + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: ImageNetDataset + image_root: ./dataset/cifar10/ + cls_label_path: ./dataset/cifar10/cifar10-2/test.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: ImageNetDataset + image_root: ./dataset/cifar10/ + cls_label_path: ./dataset/cifar10/cifar10-2/database.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - mAP: + - Precisionk: + topk: [1, 5] + diff --git a/example/low_precision_training/ppcls/configs/reid/MetaBIN_ResNet50_cross_domain.yaml b/example/low_precision_training/ppcls/configs/reid/MetaBIN_ResNet50_cross_domain.yaml new file mode 100755 index 000000000..19c9adafc --- /dev/null +++ b/example/low_precision_training/ppcls/configs/reid/MetaBIN_ResNet50_cross_domain.yaml @@ -0,0 +1,277 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + iter_per_epoch: &iter_per_epoch 50 + warmup_iter: 10 + save_interval: 8 + eval_during_train: True + eval_interval: 8 + epochs: &epochs 348 # 348*50 = 120*145 = 17400 iters + print_batch_step: 25 + use_visualdl: False + eval_mode: 
"retrieval" + retrieval_feature_from: "features" # 'backbone' or 'features' + re_ranking: False + # used for static mode and model export + image_shape: [3, 256, 128] + save_inference_dir: "./inference" + train_mode: "metabin" + +AMP: + scale_loss: 65536 + use_dynamic_loss_scaling: True + +# model architecture +Arch: + name: "RecModel" + Backbone: + name: "ResNet50_metabin" + pretrained: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/resnet50-19c8e357_torch2paddle.pdparams + bias_lr_factor: 2.0 + BackboneStopLayer: + name: "flatten" + Neck: + name: MetaBNNeck + num_features: &feat_dim 2048 + use_global_stats: True + Head: + name: "FC" + embedding_size: *feat_dim + class_num: 751 + weight_attr: + initializer: + name: KaimingUniform + negative_slope: 2.23606 # math.sqrt(5) + nonlinearity: "leaky_relu" + bias_attr: False + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_train" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bicubic" + backend: "pil" + - RandFlipImage: + flip_code: 1 + - Pad: + padding: 10 + - RandCropImageV2: + size: [128, 256] + - ColorJitter: + brightness: 0.15 + contrast: 0.15 + saturation: 0.1 + hue: 0.1 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: NaiveIdentityBatchSampler + batch_size: 96 + num_instances: 4 + drop_last: True + loader: + num_workers: 4 + use_shared_memory: True + + Metalearning: + Train: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_train" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bicubic" + backend: "pil" + - RandFlipImage: + flip_code: 1 + - Pad: + padding: 10 + - RandCropImageV2: + size: [128, 256] + - ColorJitter: + brightness: 0.15 + contrast: 0.15 + saturation: 0.1 + hue: 0.1 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DomainShuffleBatchSampler + batch_size: 96 + num_instances: 4 + drop_last: True + camera_to_domain: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: "DukeMTMC" + image_root: "./dataset/" + cls_label_path: "query" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bicubic" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: "DukeMTMC" + image_root: "./dataset/" + cls_label_path: "bounding_box_test" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bicubic" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +# loss function config for traing/eval process +Loss: + Train: + - CELossForMetaBIN: + weight: 1.0 + epsilon: 0.1 + - TripletLossForMetaBIN: + weight: 1.0 + margin: 0.3 + feature_from: "backbone" + - IntraDomainScatterLoss: + weight: 1.0 + normalize_feature: True + feature_from: "backbone" + - InterDomainShuffleLoss: + weight: 
1.0 + normalize_feature: False + feature_from: "backbone" + Basic: + - CELossForMetaBIN: + weight: 1.0 + epsilon: 0.1 + - TripletLossForMetaBIN: + weight: 1.0 + margin: 0.3 + feature_from: "backbone" + MetaTrain: + - CELossForMetaBIN: + weight: 1.0 + epsilon: 0.1 + - TripletLossForMetaBIN: + weight: 1.0 + margin: 0.3 + feature_from: "backbone" + - IntraDomainScatterLoss: + weight: 1.0 + normalize_feature: True + feature_from: "backbone" + - InterDomainShuffleLoss: + weight: 1.0 + normalize_feature: False + feature_from: "backbone" + MetaTest: + - CELossForMetaBIN: + weight: 1.0 + epsilon: 0.1 + - TripletLossForMetaBIN: + weight: 1.0 + margin: 0.3 + feature_from: "backbone" + Eval: + - TripletLossForMetaBIN: + weight: 1.0 + margin: 0.3 + feature_from: "backbone" + +Optimizer: + - Momentum: + scope: ".*(conv|batch_norm|instance_norm|feat_bn|fc)" + lr: + name: MultiStepDecay + epochs: *epochs + learning_rate: 0.01 + step_each_epoch: *iter_per_epoch + milestones: [145, 261] + gamma: 0.1 + warmup_epoch: 29 + warmup_start_lr: 0.0001 + by_epoch: False + last_epoch: -1 + momentum: 0.9 + regularizer: + name: "L2" + coeff: 0.0005 + - SGD: + scope: "backbone.*gate" + lr: + name: Constant + learning_rate: 0.2 + last_epoch: -1 + - SGD: + scope: "RecModel" + lr: + name: Cyclic + epochs: *epochs + step_each_epoch: *iter_per_epoch + base_learning_rate: 0.001 + max_learning_rate: 0.1 + warmup_epoch: 0 + warmup_start_lr: 1 + step_size_up: 1095 + step_size_down: 1095 + by_epoch: False + last_epoch: 0 + +Metric: + Eval: + - Recallk: + topk: [1, 5, 10] + - mAP: {} diff --git a/example/low_precision_training/ppcls/configs/reid/strong_baseline/baseline.yaml b/example/low_precision_training/ppcls/configs/reid/strong_baseline/baseline.yaml new file mode 100755 index 000000000..5c83b8da8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/reid/strong_baseline/baseline.yaml @@ -0,0 +1,158 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 40 + eval_during_train: True + eval_interval: 10 + epochs: 120 + print_batch_step: 20 + use_visualdl: False + eval_mode: "retrieval" + retrieval_feature_from: "backbone" # 'backbone' or 'neck' + re_ranking: False + # used for static mode and model export + image_shape: [3, 256, 128] + save_inference_dir: "./inference" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50" + pretrained: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/resnet50-19c8e357_torch2paddle.pdparams + stem_act: null + BackboneStopLayer: + name: "flatten" + Head: + name: "FC" + embedding_size: 2048 + class_num: 751 + weight_attr: + initializer: + name: KaimingUniform + fan_in: 12288 # 6*embedding_size + bias_attr: + initializer: + name: KaimingUniform + fan_in: 12288 # 6*embedding_size + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - TripletLossV2: + weight: 1.0 + margin: 0.3 + normalize_feature: False + feature_from: "backbone" + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Adam + lr: + name: Piecewise + decay_epochs: [40, 70] + values: [0.00035, 0.000035, 0.0000035] + by_epoch: True + last_epoch: 0 + regularizer: + name: "L2" + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_train" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 
256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - RandFlipImage: + flip_code: 1 + - Pad: + padding: 10 + - RandCropImageV2: + size: [128, 256] + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedRandomIdentitySampler + batch_size: 64 + num_instances: 4 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + Query: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "query" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_test" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/example/low_precision_training/ppcls/configs/reid/strong_baseline/softmax_triplet.yaml b/example/low_precision_training/ppcls/configs/reid/strong_baseline/softmax_triplet.yaml new file mode 100755 index 000000000..43f1de62f --- /dev/null +++ b/example/low_precision_training/ppcls/configs/reid/strong_baseline/softmax_triplet.yaml @@ -0,0 +1,176 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 40 + eval_during_train: True + eval_interval: 10 + epochs: 120 + print_batch_step: 20 + use_visualdl: False + eval_mode: "retrieval" + retrieval_feature_from: "features" # 'backbone' or 'features' + re_ranking: False + # used for static mode and model export + image_shape: [3, 256, 128] + save_inference_dir: "./inference" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/resnet50-19c8e357_torch2paddle.pdparams + stem_act: null + BackboneStopLayer: + name: "flatten" + Neck: + name: BNNeck + num_features: &feat_dim 2048 + weight_attr: + initializer: + name: Constant + value: 1.0 + bias_attr: + initializer: + name: Constant + value: 0.0 + learning_rate: 1.0e-20 # NOTE: Temporarily set lr small enough to freeze the bias to zero + Head: + name: "FC" + embedding_size: *feat_dim + class_num: 751 + weight_attr: + initializer: + name: Normal + std: 0.001 + bias_attr: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + - TripletLossV2: + weight: 1.0 + margin: 0.3 + normalize_feature: False + feature_from: "backbone" + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Adam + lr: + name: Piecewise + decay_epochs: [30, 60] + values: [0.00035, 0.000035, 0.0000035] + warmup_epoch: 10 + warmup_start_lr: 0.0000035 + by_epoch: True + last_epoch: 0 + regularizer: + name: "L2" + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + 
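+    # Editorial note (hedged): the BNNeck above follows the reID strong-baseline
+    # recipe: the backbone feature is batch-normalized before the FC classifier,
+    # gamma/beta are initialized to 1/0, and the 1.0e-20 bias learning rate is
+    # the config's own workaround to keep beta pinned at zero during training.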
dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_train" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - RandFlipImage: + flip_code: 1 + - Pad: + padding: 10 + - RandCropImageV2: + size: [128, 256] + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0.485, 0.456, 0.406] + sampler: + name: DistributedRandomIdentitySampler + batch_size: 64 + num_instances: 4 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + Query: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "query" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_test" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/example/low_precision_training/ppcls/configs/reid/strong_baseline/softmax_triplet_with_center.yaml b/example/low_precision_training/ppcls/configs/reid/strong_baseline/softmax_triplet_with_center.yaml new file mode 100755 index 000000000..70c70a99b --- /dev/null +++ b/example/low_precision_training/ppcls/configs/reid/strong_baseline/softmax_triplet_with_center.yaml @@ -0,0 +1,187 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 40 + eval_during_train: True + eval_interval: 10 + epochs: 120 + print_batch_step: 20 + use_visualdl: False + eval_mode: "retrieval" + retrieval_feature_from: "features" # 'backbone' or 'features' + re_ranking: False + # used for static mode and model export + image_shape: [3, 256, 128] + save_inference_dir: "./inference" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/resnet50-19c8e357_torch2paddle.pdparams + stem_act: null + BackboneStopLayer: + name: "flatten" + Neck: + name: BNNeck + num_features: &feat_dim 2048 + weight_attr: + initializer: + name: Constant + value: 1.0 + bias_attr: + initializer: + name: Constant + value: 0.0 + learning_rate: 1.0e-20 # NOTE: Temporarily set lr small enough to freeze the bias to zero + Head: + name: "FC" + embedding_size: *feat_dim + class_num: &class_num 751 + weight_attr: + initializer: + name: Normal + std: 0.001 + bias_attr: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + - TripletLossV2: + weight: 1.0 + margin: 0.3 + normalize_feature: False + feature_from: "backbone" + - CenterLoss: + weight: 0.0005 
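+        # Editorial note: this 0.0005 weight pairs with the CenterLoss SGD
+        # group below; its learning_rate of 1000.0 is assumed to come from
+        # ori_lr / weight = 0.5 / 0.0005, restoring an effective step of ~0.5
+        # on the centers despite the small loss weight.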
+ num_classes: *class_num + feat_dim: *feat_dim + feature_from: "backbone" + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + - Adam: + scope: RecModel + lr: + name: Piecewise + decay_epochs: [30, 60] + values: [0.00035, 0.000035, 0.0000035] + warmup_epoch: 10 + warmup_start_lr: 0.0000035 + by_epoch: True + last_epoch: 0 + regularizer: + name: "L2" + coeff: 0.0005 + - SGD: + scope: CenterLoss + lr: + name: ConstLR + learning_rate: 1000.0 # NOTE: set to ori_lr*(1/centerloss_weight) to avoid manually scaling centers' gradients. + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_train" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - RandFlipImage: + flip_code: 1 + - Pad: + padding: 10 + - RandCropImageV2: + size: [128, 256] + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0.485, 0.456, 0.406] + sampler: + name: DistributedRandomIdentitySampler + batch_size: 64 + num_instances: 4 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + Query: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "query" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_test" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/example/low_precision_training/ppcls/configs/slim/GeneralRecognition_PPLCNet_x2_5_quantization.yaml b/example/low_precision_training/ppcls/configs/slim/GeneralRecognition_PPLCNet_x2_5_quantization.yaml new file mode 100755 index 000000000..7b21d0ba8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/slim/GeneralRecognition_PPLCNet_x2_5_quantization.yaml @@ -0,0 +1,154 @@ +# global configs +Global: + checkpoints: null + pretrained_model: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/general_PPLCNet_x2_5_pretrained_v1.0.pdparams + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# for quantization or prune model +Slim: + ## for quantization + quant: + name: pact + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: PPLCNet_x2_5 + pretrained: False + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size:
1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.002 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/slim/MobileNetV3_large_x1_0_prune.yaml b/example/low_precision_training/ppcls/configs/slim/MobileNetV3_large_x1_0_prune.yaml new file mode 100755 index 000000000..6655c3a0a --- /dev/null +++ b/example/low_precision_training/ppcls/configs/slim/MobileNetV3_large_x1_0_prune.yaml @@ -0,0 +1,139 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# for quantization or prune model +Slim: + ## for prune + prune: + name: fpgm + pruned_ratio: 0.3 + +# model architecture +Arch: + name: MobileNetV3_large_x1_0 + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.65 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 
1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/slim/MobileNetV3_large_x1_0_quantization.yaml b/example/low_precision_training/ppcls/configs/slim/MobileNetV3_large_x1_0_quantization.yaml new file mode 100755 index 000000000..517c4677c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/slim/MobileNetV3_large_x1_0_quantization.yaml @@ -0,0 +1,138 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# for quantalization or prune model +Slim: + ## for quantization + quant: + name: pact + +# model architecture +Arch: + name: MobileNetV3_large_x1_0 + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.065 + warmup_epoch: 0 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: 
False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/slim/PPLCNet_x1_0_quantization.yaml b/example/low_precision_training/ppcls/configs/slim/PPLCNet_x1_0_quantization.yaml new file mode 100755 index 000000000..40111a036 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/slim/PPLCNet_x1_0_quantization.yaml @@ -0,0 +1,138 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# for quantalization or prune model +Slim: + ## for quantization + quant: + name: pact + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 0 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/slim/ResNet50_vd_prune.yaml 
b/example/low_precision_training/ppcls/configs/slim/ResNet50_vd_prune.yaml new file mode 100755 index 000000000..7bfc537b1 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/slim/ResNet50_vd_prune.yaml @@ -0,0 +1,138 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# for quantization or prune model +Slim: + ## for prune + prune: + name: fpgm + pruned_ratio: 0.3 + +# model architecture +Arch: + name: ResNet50_vd + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/slim/ResNet50_vd_quantization.yaml b/example/low_precision_training/ppcls/configs/slim/ResNet50_vd_quantization.yaml new file mode 100755 index 000000000..f9db41020 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/slim/ResNet50_vd_quantization.yaml @@ -0,0 +1,137 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# for quantalization or prune model +Slim: + ## for quantization + quant: + name: pact + +# model architecture +Arch: + name: ResNet50_vd + class_num: 1000 + pretrained: 
True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_cls_prune.yaml b/example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_cls_prune.yaml new file mode 100755 index 000000000..1f6fea887 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_cls_prune.yaml @@ -0,0 +1,135 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/vehicle_cls_ResNet50_CompCars_v1.2_pretrained.pdparams" + output_dir: "./output_vehicle_cls_prune/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 160 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + +Slim: + prune: + name: fpgm + pruned_ratio: 0.3 + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 431 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.0005 + + +# data loader for train and 
eval +DataLoader: + Train: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + bbox_crop: True + cls_label_path: "./dataset/CompCars/train_test_split/classification/train_label.txt" + transform_ops: + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] + + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + cls_label_path: "./dataset/CompCars/train_test_split/classification/test_label.txt" + bbox_crop: True + transform_ops: + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_cls_quantization.yaml b/example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_cls_quantization.yaml new file mode 100755 index 000000000..026b86547 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_cls_quantization.yaml @@ -0,0 +1,134 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/vehicle_cls_ResNet50_CompCars_v1.2_pretrained.pdparams" + output_dir: "./output_vehicle_cls_pact/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 80 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + +Slim: + quant: + name: pact + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 431 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + coeff: 0.0005 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + bbox_crop: True + cls_label_path: "./dataset/CompCars/train_test_split/classification/train_label.txt" + transform_ops: + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
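+      # NOTE: RandomErasing here follows Zhong et al. (2017): EPSILON is the
+      # probability of erasing a patch, [sl, sh] bounds the erased area as a
+      # fraction of the image, r1 is the minimum aspect ratio of the patch,
+      # and `mean` is the per-channel fill value for the erased pixels.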
+ + sampler: + name: PKSampler + batch_size: 64 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + cls_label_path: "./dataset/CompCars/train_test_split/classification/test_label.txt" + bbox_crop: True + transform_ops: + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_reid_prune.yaml b/example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_reid_prune.yaml new file mode 100755 index 000000000..63b87f1ca --- /dev/null +++ b/example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_reid_prune.yaml @@ -0,0 +1,162 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/vehicle_reid_ResNet50_VERIWild_v1.1_pretrained.pdparams" + output_dir: "./output_vehicle_reid_prune/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 160 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# for quantization or prune model +Slim: + ## for prune + prune: + name: fpgm + pruned_ratio: 0.3 + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 30671 + margin: 0.15 + scale: 32 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images/" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/train_list_start0.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.]
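+      # NOTE: PKSampler below builds identity-balanced batches: batch_size: 128
+      # with sample_per_id: 2 gives P = 64 identities x K = 2 images per batch,
+      # matching the two views consumed by SupConLoss (views: 2) above.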
+ + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id_query.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + + Gallery: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_reid_quantization.yaml b/example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_reid_quantization.yaml new file mode 100755 index 000000000..cca9915e2 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/slim/ResNet50_vehicle_reid_quantization.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/vehicle_reid_ResNet50_VERIWild_v1.1_pretrained.pdparams" + output_dir: "./output_vehicle_reid_pact/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# for quantization or prune model +Slim: + ## for quantization + quant: + name: pact + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 30671 + margin: 0.15 + scale: 32 + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images/" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/train_list_start0.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.]
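+      # An identity-balanced alternative to PKSampler, mirroring the Market1501
+      # re-id configs earlier in this patch; shown commented out as a sketch only
+      # (DistributedRandomIdentitySampler takes batch_size, num_instances and
+      # drop_last, see ppcls/data/dataloader/DistributedRandomIdentitySampler.py
+      # added below):
+      # sampler:
+      #   name: DistributedRandomIdentitySampler
+      #   batch_size: 64
+      #   num_instances: 2
+      #   drop_last: False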
+ + sampler: + name: PKSampler + batch_size: 64 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id_query.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + + Gallery: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/example/low_precision_training/ppcls/configs/ssl/CCSSL/FixMatchCCSSL_cifar100_10000_4gpu.yaml b/example/low_precision_training/ppcls/configs/ssl/CCSSL/FixMatchCCSSL_cifar100_10000_4gpu.yaml new file mode 100755 index 000000000..a2382817d --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ssl/CCSSL/FixMatchCCSSL_cifar100_10000_4gpu.yaml @@ -0,0 +1,209 @@ +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: -1 + eval_during_train: true + eval_interval: 1 + epochs: 1024 + iter_per_epoch: 1024 + print_batch_step: 20 + use_visualdl: false + use_dali: false + train_mode: fixmatch_ccssl + image_shape: [3, 32, 32] + save_inference_dir: ./inference + +SSL: + T: 1 + threshold: 0.95 + +EMA: + decay: 0.999 + +Arch: + name: RecModel + infer_output_key: logits + infer_add_softmax: false + head_feature_from: backbone + Backbone: + name: WideResNet + widen_factor: 8 + depth: 28 + dropout: 0 + num_classes: 100 + low_dim: 64 + proj: false + proj_after: false + BackboneStopLayer: + name: bn1 + Neck: + name: FRFNNeck + num_features: 512 + low_dim: 64 + Head: + name: FC + embedding_size: 512 + class_num: 100 + + use_sync_bn: true + +Loss: + Train: + - CELoss: + weight: 1.0 + reduction: "mean" + Eval: + - CELoss: + weight: 1.0 + +UnLabelLoss: + Train: + - CCSSLCELoss: + weight: 1. 
+ - SoftSupConLoss: + weight: 1.0 + temperature: 0.07 + +Optimizer: + name: Momentum + momentum: 0.9 + use_nesterov: true + weight_decay: 0.001 + lr: + name: 'CosineFixmatch' + learning_rate: 0.03 + num_warmup_steps: 0 + +DataLoader: + mean: [0.5071, 0.4867, 0.4408] + std: [0.2675, 0.2565, 0.2761] + Train: + dataset: + name: Cifar100 + data_file: null + mode: 'train' + download: true + backend: 'pil' + sample_per_label: 100 + expand_labels: 1 + transform_ops: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5071, 0.4867, 0.4408] + std: [0.2675, 0.2565, 0.2761] + order: hwc + + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: true + shuffle: true + loader: + num_workers: 4 + use_shared_memory: true + + UnLabelTrain: + dataset: + name: Cifar100 + data_file: null + mode: 'train' + backend: 'pil' + download: true + + transform_ops_weak: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5071, 0.4867, 0.4408] + std: [0.2675, 0.2565, 0.2761] + order: hwc + + transform_ops_strong: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - RandAugment: + num_layers: 2 + magnitude: 10 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5071, 0.4867, 0.4408] + std: [0.2675, 0.2565, 0.2761] + order: hwc + + transform_ops_strong2: + - RandomResizedCrop: + size: [32, 32] + - RandFlipImage: + flip_code: 1 + - RandomApply: + transforms: + - RawColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0.1 + p: 1.0 # refer to official settings + - RandomGrayscale: + p: 0.2 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0., 0., 0.] + std: [1., 1., 1.] 
+ order: hwc + + + sampler: + name: DistributedBatchSampler + batch_size: 112 + drop_last: true + shuffle: true + loader: + num_workers: 4 + use_shared_memory: true + + Eval: + dataset: + name: Cifar100 + mode: 'test' + backend: 'pil' + download: true + data_file: null + transform_ops: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5071, 0.4867, 0.4408] + std: [0.2675, 0.2565, 0.2761] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: true + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/ssl/CCSSL/FixMatchCCSSL_cifar10_4000_4gpu.yaml b/example/low_precision_training/ppcls/configs/ssl/CCSSL/FixMatchCCSSL_cifar10_4000_4gpu.yaml new file mode 100755 index 000000000..79667edc6 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ssl/CCSSL/FixMatchCCSSL_cifar10_4000_4gpu.yaml @@ -0,0 +1,208 @@ +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: -1 + eval_during_train: true + eval_interval: 1 + epochs: 1024 + iter_per_epoch: 1024 + print_batch_step: 20 + use_visualdl: false + use_dali: false + train_mode: fixmatch_ccssl + image_shape: [3, 32, 32] + save_inference_dir: ./inference + +SSL: + T: 1 + threshold: 0.95 + +EMA: + decay: 0.999 + +Arch: + name: RecModel + infer_output_key: logits + infer_add_softmax: false + head_feature_from: backbone + Backbone: + name: WideResNet + widen_factor: 2 + depth: 28 + dropout: 0 + num_classes: 10 + low_dim: 64 + proj: false + proj_after: false + BackboneStopLayer: + name: bn1 + Neck: + name: FRFNNeck + num_features: 128 + low_dim: 64 + Head: + name: FC + embedding_size: 128 + class_num: 10 + + use_sync_bn: true + +Loss: + Train: + - CELoss: + weight: 1.0 + reduction: "mean" + Eval: + - CELoss: + weight: 1.0 + +UnLabelLoss: + Train: + - CCSSLCELoss: + weight: 1. 
+ - SoftSupConLoss: + weight: 1.0 + temperature: 0.07 + +Optimizer: + name: Momentum + momentum: 0.9 + use_nesterov: true + weight_decay: 0.001 + lr: + name: 'CosineFixmatch' + learning_rate: 0.03 + num_warmup_steps: 0 + +DataLoader: + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + Train: + dataset: + name: Cifar10 + data_file: null + mode: 'train' + download: true + backend: 'pil' + sample_per_label: 400 + expand_labels: 1 + transform_ops: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: true + shuffle: true + loader: + num_workers: 4 + use_shared_memory: true + + UnLabelTrain: + dataset: + name: Cifar10 + data_file: null + mode: 'train' + backend: 'pil' + download: true + + transform_ops_weak: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + + transform_ops_strong: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - RandAugment: + num_layers: 2 + magnitude: 10 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + + transform_ops_strong2: + - RandCropImageV2: + size: [32, 32] + - RandFlipImage: + flip_code: 1 + - RandomApply: + transforms: + - RawColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0.1 + p: 1.0 # refer to official settings + - RandomGrayscale: + p: 0.2 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + + sampler: + name: DistributedBatchSampler + batch_size: 448 + drop_last: true + shuffle: true + loader: + num_workers: 4 + use_shared_memory: true + + Eval: + dataset: + name: Cifar10 + mode: 'test' + backend: 'pil' + download: true + data_file: null + transform_ops: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: true + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_250.yaml b/example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_250.yaml new file mode 100755 index 000000000..e8e00e8c8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_250.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: '../test/torch2paddle_cifar10' + output_dir: ./output_25 + device: gpu + save_interval: -1 + eval_during_train: True + eval_interval: 1 + epochs: 1024 + iter_per_epoch: 1024 + print_batch_step: 20 + use_visualdl: False + use_dali: False + train_mode: fixmatch + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +SSL: + tempture: 1 + threshold: 0.95 + +EMA: + decay: 0.999 + +# AMP: +# scale_loss: 65536 +# use_dynamic_loss_scaling: True +# # O1: mixed fp16 +# level: O1 + +# model architecture 
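+# (WideResNet with depth 28 and widen_factor 2, i.e. WRN-28-2, is the backbone
+# used for the CIFAR-10 experiments in the original FixMatch paper)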
+Arch: + name: WideResNet + depth: 28 + widen_factor: 2 + dropout: 0 + num_classes: 10 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + reduction: "mean" + Eval: + - CELoss: + weight: 1.0 +UnLabelLoss: + Train: + - CELoss: + weight: 1.0 + reduction: "none" + +Optimizer: + name: Momentum + momentum: 0.9 + use_nesterov: True + no_weight_decay_name: bn bias + weight_decay: 0.0005 + lr: + name: CosineFixmatch + learning_rate: 0.03 + num_warmup_steps: 0 + num_cycles: 0.4375 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: 25 + expand_labels: 263 + transform_ops: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + UnLabelTrain: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: None + transform_ops_weak: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + transform_ops_strong: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - RandAugment: + num_layers: 2 + magnitude: 10 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 448 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + + Eval: + dataset: + name: Cifar10 + data_file: None + mode: 'test' + download: True + backend: 'pil' + sample_per_label: None + transform_ops: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_40.yaml b/example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_40.yaml new file mode 100755 index 000000000..0327fcd9c --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_40.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/torch2paddle_weight/torch2paddle_initialize_cifar10_WideResNet_depth28_widenfactor2_classnum10.pdparams' + output_dir: ./output + device: gpu + save_interval: -1 + eval_during_train: True + eval_interval: 1 + epochs: 1024 + iter_per_epoch: 1024 + print_batch_step: 20 + use_visualdl: False + use_dali: False + train_mode: fixmatch + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +SSL: + tempture: 1 + threshold: 0.95 + +EMA: + decay: 0.999 + +# AMP: +# scale_loss: 65536 +# use_dynamic_loss_scaling: True +# # O1: 
mixed fp16 +# level: O1 + +# model architecture +Arch: + name: WideResNet + depth: 28 + widen_factor: 2 + dropout: 0 + num_classes: 10 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + reduction: "mean" + Eval: + - CELoss: + weight: 1.0 +UnLabelLoss: + Train: + - CELoss: + weight: 1.0 + reduction: "none" + +Optimizer: + name: Momentum + momentum: 0.9 + use_nesterov: True + no_weight_decay_name: bn bias + weight_decay: 0.0005 + lr: + name: CosineFixmatch + learning_rate: 0.03 + num_warmup_steps: 0 + num_cycles: 0.4375 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: 4 + expand_labels: 1639 + transform_ops: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + UnLabelTrain: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: None + transform_ops_weak: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + transform_ops_strong: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - RandAugment: + num_layers: 2 + magnitude: 10 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 448 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + + Eval: + dataset: + name: Cifar10 + data_file: None + mode: 'test' + download: True + backend: 'pil' + sample_per_label: None + transform_ops: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_4000.yaml b/example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_4000.yaml new file mode 100755 index 000000000..0989fc89e --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_4000.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/torch2paddle_weight/torch2paddle_initialize_cifar10_WideResNet_depth28_widenfactor2_classnum10.pdparams' + output_dir: ./output + device: gpu + save_interval: -1 + eval_during_train: True + eval_interval: 1 + epochs: 1024 + iter_per_epoch: 1024 + print_batch_step: 20 + use_visualdl: False + use_dali: False + train_mode: fixmatch + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +SSL: + tempture: 1 + threshold: 0.95 + +EMA: + decay: 0.999 + +# AMP: +# 
scale_loss: 65536 +# use_dynamic_loss_scaling: True +# # O1: mixed fp16 +# level: O1 + +# model architecture +Arch: + name: WideResNet + depth: 28 + widen_factor: 2 + dropout: 0 + num_classes: 10 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + reduction: "mean" + Eval: + - CELoss: + weight: 1.0 +UnLabelLoss: + Train: + - CELoss: + weight: 1.0 + reduction: "none" + +Optimizer: + name: Momentum + momentum: 0.9 + use_nesterov: True + no_weight_decay_name: bn bias + weight_decay: 0.0005 + lr: + name: CosineFixmatch + learning_rate: 0.03 + num_warmup_steps: 0 + num_cycles: 0.4375 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: 400 + expand_labels: 17 + transform_ops: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + UnLabelTrain: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: None + transform_ops_weak: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + transform_ops_strong: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - RandAugment: + num_layers: 2 + magnitude: 10 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 448 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + + Eval: + dataset: + name: Cifar10 + data_file: None + mode: 'test' + download: True + backend: 'pil' + sample_per_label: None + transform_ops: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_40_4gpu.yaml b/example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_40_4gpu.yaml new file mode 100755 index 000000000..feeb1a9e8 --- /dev/null +++ b/example/low_precision_training/ppcls/configs/ssl/FixMatch/FixMatch_cifar10_40_4gpu.yaml @@ -0,0 +1,176 @@ +# global configs +Global: + checkpoints: null + pretrained_model: 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/torch2paddle_weight/torch2paddle_initialize_cifar10_WideResNet_depth28_widenfactor2_classnum10.pdparams' + output_dir: ./output + device: gpu + save_interval: -1 + eval_during_train: True + eval_interval: 1 + epochs: 1024 + iter_per_epoch: 1024 + print_batch_step: 20 + use_visualdl: False + use_dali: False + train_mode: fixmatch + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +SSL: + 
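# FixMatch pseudo-labeling hyper-parameters: `tempture` is the pseudo-label
+# sharpening temperature (T in the FixMatch paper) and `threshold` is the
+# confidence cut-off tau. +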
tempture: 1 + threshold: 0.95 + +EMA: + decay: 0.999 + +# AMP: +# scale_loss: 65536 +# use_dynamic_loss_scaling: True +# # O1: mixed fp16 +# level: O1 + +# model architecture +Arch: + name: WideResNet + depth: 28 + widen_factor: 2 + dropout: 0 + num_classes: 10 + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + reduction: "mean" + Eval: + - CELoss: + weight: 1.0 +UnLabelLoss: + Train: + - CELoss: + weight: 1.0 + reduction: "none" + +Optimizer: + name: Momentum + momentum: 0.9 + use_nesterov: True + no_weight_decay_name: bn bias + weight_decay: 0.0005 + lr: + name: CosineFixmatch + learning_rate: 0.03 + num_warmup_steps: 0 + num_cycles: 0.4375 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: 4 + expand_labels: 1639 + transform_ops: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + UnLabelTrain: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: None + transform_ops_weak: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + transform_ops_strong: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - RandAugment: + num_layers: 2 + magnitude: 10 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 112 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + + Eval: + dataset: + name: Cifar10 + data_file: None + mode: 'test' + download: True + backend: 'pil' + sample_per_label: None + transform_ops: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/example/low_precision_training/ppcls/data/__init__.py b/example/low_precision_training/ppcls/data/__init__.py new file mode 100755 index 000000000..44adc7c1a --- /dev/null +++ b/example/low_precision_training/ppcls/data/__init__.py @@ -0,0 +1,197 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
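+#
+# This module builds paddle.io.DataLoader objects from the `DataLoader`
+# section of the YAML configs above. A minimal usage sketch (assuming
+# `config` is the parsed config dict and that paddle.set_device returns
+# the Place handed to the loader):
+#
+#   device = paddle.set_device("gpu")
+#   train_loader = build_dataloader(config["DataLoader"], "Train", device, seed=42)
+#   for images, labels in train_loader:
+#       ...  # one mini-batch per optimization step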
+ +import inspect +import copy +import random +import paddle +import numpy as np +import paddle.distributed as dist +from functools import partial +from paddle.io import DistributedBatchSampler, BatchSampler, DataLoader +from ppcls.utils import logger + +from ppcls.data import dataloader +# dataset +from ppcls.data.dataloader.imagenet_dataset import ImageNetDataset +from ppcls.data.dataloader.multilabel_dataset import MultiLabelDataset +from ppcls.data.dataloader.common_dataset import create_operators +from ppcls.data.dataloader.vehicle_dataset import CompCars, VeriWild +from ppcls.data.dataloader.logo_dataset import LogoDataset +from ppcls.data.dataloader.icartoon_dataset import ICartoonDataset +from ppcls.data.dataloader.mix_dataset import MixDataset +from ppcls.data.dataloader.multi_scale_dataset import MultiScaleDataset +from ppcls.data.dataloader.person_dataset import Market1501, MSMT17, DukeMTMC +from ppcls.data.dataloader.face_dataset import FiveValidationDataset, AdaFaceDataset +from ppcls.data.dataloader.custom_label_dataset import CustomLabelDataset +from ppcls.data.dataloader.cifar import Cifar10, Cifar100 +from ppcls.data.dataloader.metabin_sampler import DomainShuffleBatchSampler, NaiveIdentityBatchSampler + +# sampler +from ppcls.data.dataloader.DistributedRandomIdentitySampler import DistributedRandomIdentitySampler +from ppcls.data.dataloader.pk_sampler import PKSampler +from ppcls.data.dataloader.mix_sampler import MixSampler +from ppcls.data.dataloader.multi_scale_sampler import MultiScaleSampler +from ppcls.data.dataloader.ra_sampler import RASampler +from ppcls.data import preprocess +from ppcls.data.preprocess import transform + + +def create_operators(params, class_num=None): + """ + create operators based on the config + + Args: + params(list): a dict list, used to create some operators + """ + assert isinstance(params, list), ('operator config should be a list') + ops = [] + for operator in params: + assert isinstance(operator, + dict) and len(operator) == 1, "yaml format error" + op_name = list(operator)[0] + param = {} if operator[op_name] is None else operator[op_name] + op_func = getattr(preprocess, op_name) + if "class_num" in inspect.getfullargspec(op_func).args: + param.update({"class_num": class_num}) + op = op_func(**param) + ops.append(op) + + return ops + + +def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int): + """callback function on each worker subprocess after seeding and before data loading. + + Args: + worker_id (int): Worker id in [0, num_workers - 1] + num_workers (int): Number of subprocesses to use for data loading. + rank (int): Rank of process in distributed environment. If in non-distributed environment, it is a constant number `0`. 
+ seed (int): Random seed + """ + # The seed of each worker equals to + # num_worker * rank + worker_id + user_seed + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) + + +def build_dataloader(config, mode, device, use_dali=False, seed=None): + assert mode in [ + 'Train', 'Eval', 'Test', 'Gallery', 'Query', 'UnLabelTrain' + ], "Dataset mode should be Train, Eval, Test, Gallery, Query, UnLabelTrain" + assert mode in config.keys(), "{} config not in yaml".format(mode) + # build dataset + if use_dali and paddle.device.is_compiled_with_cuda(): + from ppcls.data.dataloader.dali import dali_dataloader + return dali_dataloader( + config, + mode, + paddle.device.get_device(), + num_threads=config[mode]['loader']["num_workers"], + seed=seed, + enable_fuse=True) + + class_num = config.get("class_num", None) + epochs = config.get("epochs", None) + config_dataset = config[mode]['dataset'] + config_dataset = copy.deepcopy(config_dataset) + dataset_name = config_dataset.pop('name') + if 'batch_transform_ops' in config_dataset: + batch_transform = config_dataset.pop('batch_transform_ops') + else: + batch_transform = None + + dataset = eval(dataset_name)(**config_dataset) + + logger.debug("build dataset({}) success...".format(dataset)) + + # build sampler + config_sampler = config[mode]['sampler'] + if config_sampler and "name" not in config_sampler: + batch_sampler = None + batch_size = config_sampler["batch_size"] + drop_last = config_sampler["drop_last"] + shuffle = config_sampler["shuffle"] + else: + sampler_name = config_sampler.pop("name") + sampler_argspec = inspect.getfullargspec(eval(sampler_name) + .__init__).args + if "total_epochs" in sampler_argspec: + config_sampler.update({"total_epochs": epochs}) + batch_sampler = eval(sampler_name)(dataset, **config_sampler) + + logger.debug("build batch_sampler({}) success...".format(batch_sampler)) + + # build batch operator + def mix_collate_fn(batch): + batch = transform(batch, batch_ops) + # batch each field + slots = [] + for items in batch: + for i, item in enumerate(items): + if len(slots) < len(items): + slots.append([item]) + else: + slots[i].append(item) + return [np.stack(slot, axis=0) for slot in slots] + + if isinstance(batch_transform, list): + batch_ops = create_operators(batch_transform, class_num) + batch_collate_fn = mix_collate_fn + else: + batch_collate_fn = None + + # build dataloader + config_loader = config[mode]['loader'] + num_workers = config_loader["num_workers"] + use_shared_memory = config_loader["use_shared_memory"] + + init_fn = partial( + worker_init_fn, + num_workers=num_workers, + rank=dist.get_rank(), + seed=seed) if seed is not None else None + + if batch_sampler is None: + data_loader = DataLoader( + dataset=dataset, + places=device, + num_workers=num_workers, + return_list=True, + use_shared_memory=use_shared_memory, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last, + collate_fn=batch_collate_fn, + worker_init_fn=init_fn) + else: + data_loader = DataLoader( + dataset=dataset, + places=device, + num_workers=num_workers, + return_list=True, + use_shared_memory=use_shared_memory, + batch_sampler=batch_sampler, + collate_fn=batch_collate_fn, + worker_init_fn=init_fn) + + logger.debug("build data_loader({}) success...".format(data_loader)) + return data_loader + + +# for PaddleX +ClsDataset = ImageNetDataset +ShiTuRecDataset = ImageNetDataset +MLClsDataset = MultiLabelDataset diff --git 
a/example/low_precision_training/ppcls/data/dataloader/DistributedRandomIdentitySampler.py b/example/low_precision_training/ppcls/data/dataloader/DistributedRandomIdentitySampler.py
new file mode 100755
index 000000000..b4d77a4fc
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/dataloader/DistributedRandomIdentitySampler.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+
+import copy
+import random
+from collections import defaultdict
+
+import numpy as np
+from paddle.io import DistributedBatchSampler
+
+
+class DistributedRandomIdentitySampler(DistributedBatchSampler):
+    """Randomly sample N identities, then for each identity,
+    randomly sample K instances, so the batch size equals N * K.
+    Args:
+        dataset(Dataset): Dataset which contains list of (img_path, pid, camid)
+        batch_size (int): batch size
+        num_instances (int): number of instance(s) within a class
+        drop_last (bool): whether to discard the data at the end
+        max_iters (int): max iteration(s). Defaults to None.
+    """
+
+    def __init__(self,
+                 dataset,
+                 batch_size,
+                 num_instances,
+                 drop_last,
+                 max_iters=None,
+                 **args):
+        assert batch_size % num_instances == 0, \
+            f"batch_size({batch_size}) must be divisible by num_instances({num_instances}) when using DistributedRandomIdentitySampler"
+        self.dataset = dataset
+        self.batch_size = batch_size
+        self.num_instances = num_instances
+        self.drop_last = drop_last
+        self.max_iters = max_iters
+        self.num_pids_per_batch = self.batch_size // self.num_instances
+        self.index_dic = defaultdict(list)
+        for index, pid in enumerate(self.dataset.labels):
+            self.index_dic[pid].append(index)
+        self.pids = list(self.index_dic.keys())
+        # estimate number of examples in an epoch
+        self.length = 0
+        for pid in self.pids:
+            idxs = self.index_dic[pid]
+            num = len(idxs)
+            if num < self.num_instances:
+                num = self.num_instances
+            self.length += num - num % self.num_instances
+
+    def _prepare_batch(self):
+        batch_idxs_dict = defaultdict(list)
+        count = []
+        for pid in self.pids:
+            idxs = copy.deepcopy(self.index_dic[pid])
+            if len(idxs) < self.num_instances:
+                idxs = np.random.choice(
+                    idxs, size=self.num_instances, replace=True)
+            random.shuffle(idxs)
+            batch_idxs = []
+            for idx in idxs:
+                batch_idxs.append(idx)
+                if len(batch_idxs) == self.num_instances:
+                    batch_idxs_dict[pid].append(batch_idxs)
+                    batch_idxs = []
+        count = [len(batch_idxs_dict[pid]) for pid in self.pids]
+        count = np.array(count)
+        avai_pids = copy.deepcopy(self.pids)
+        return batch_idxs_dict, avai_pids, count
+
+    def __iter__(self):
+        # prepare
+        batch_idxs_dict, avai_pids, count = self._prepare_batch()
+
+        # sample
+        if self.max_iters is not None:
+            for _ in range(self.max_iters):
+                final_idxs = []
+                if len(avai_pids) < self.num_pids_per_batch:
+                    batch_idxs_dict, avai_pids, count = self._prepare_batch()
+
+                selected_pids = np.random.choice(avai_pids,
self.num_pids_per_batch, + False, count / count.sum()) + for pid in selected_pids: + batch_idxs = batch_idxs_dict[pid].pop(0) + final_idxs.extend(batch_idxs) + pid_idx = avai_pids.index(pid) + if len(batch_idxs_dict[pid]) == 0: + avai_pids.pop(pid_idx) + count = np.delete(count, pid_idx) + else: + count[pid_idx] = len(batch_idxs_dict[pid]) + yield final_idxs + else: + final_idxs = [] + while len(avai_pids) >= self.num_pids_per_batch: + selected_pids = random.sample(avai_pids, + self.num_pids_per_batch) + for pid in selected_pids: + batch_idxs = batch_idxs_dict[pid].pop(0) + final_idxs.extend(batch_idxs) + if len(batch_idxs_dict[pid]) == 0: + avai_pids.remove(pid) + _sample_iter = iter(final_idxs) + batch_indices = [] + for idx in _sample_iter: + batch_indices.append(idx) + if len(batch_indices) == self.batch_size: + yield batch_indices + batch_indices = [] + if not self.drop_last and len(batch_indices) > 0: + yield batch_indices + + def __len__(self): + if self.max_iters is not None: + return self.max_iters + elif self.drop_last: + return self.length // self.batch_size + else: + return (self.length + self.batch_size - 1) // self.batch_size diff --git a/example/low_precision_training/ppcls/data/dataloader/__init__.py b/example/low_precision_training/ppcls/data/dataloader/__init__.py new file mode 100755 index 000000000..391dcef65 --- /dev/null +++ b/example/low_precision_training/ppcls/data/dataloader/__init__.py @@ -0,0 +1,16 @@ +from ppcls.data.dataloader.imagenet_dataset import ImageNetDataset +from ppcls.data.dataloader.multilabel_dataset import MultiLabelDataset +from ppcls.data.dataloader.common_dataset import create_operators +from ppcls.data.dataloader.vehicle_dataset import CompCars, VeriWild +from ppcls.data.dataloader.logo_dataset import LogoDataset +from ppcls.data.dataloader.icartoon_dataset import ICartoonDataset +from ppcls.data.dataloader.mix_dataset import MixDataset +from ppcls.data.dataloader.multi_scale_dataset import MultiScaleDataset +from ppcls.data.dataloader.mix_sampler import MixSampler +from ppcls.data.dataloader.multi_scale_sampler import MultiScaleSampler +from ppcls.data.dataloader.pk_sampler import PKSampler +from ppcls.data.dataloader.person_dataset import Market1501, MSMT17, DukeMTMC +from ppcls.data.dataloader.face_dataset import AdaFaceDataset, FiveValidationDataset +from ppcls.data.dataloader.custom_label_dataset import CustomLabelDataset +from ppcls.data.dataloader.cifar import Cifar10, Cifar100 +from ppcls.data.dataloader.metabin_sampler import DomainShuffleBatchSampler, NaiveIdentityBatchSampler diff --git a/example/low_precision_training/ppcls/data/dataloader/cifar.py b/example/low_precision_training/ppcls/data/dataloader/cifar.py new file mode 100755 index 000000000..614f4f3e9 --- /dev/null +++ b/example/low_precision_training/ppcls/data/dataloader/cifar.py @@ -0,0 +1,136 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
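+# --------------------------------------------------------------------------
+# Illustrative sketch (not part of the original patch): how the SSL-oriented
+# CIFAR wrappers below are typically driven by the YAML config above.
+# `sample_per_label` keeps only N images per class (the labeled split) and
+# `expand_labels` repeats that subset so an epoch yields enough labeled
+# batches; the weak/strong transform pairs produce the two FixMatch views.
+# The `[...]` lists are placeholders for the operators named in the YAML:
+#
+#     labeled = Cifar10(mode='train', backend='pil',
+#                       sample_per_label=4, expand_labels=1639,
+#                       transform_ops=[...])           # -> (image, label)
+#     unlabeled = Cifar10(mode='train', backend='pil',
+#                         transform_ops_weak=[...],
+#                         transform_ops_strong=[...])  # -> (weak, strong, label)
+# --------------------------------------------------------------------------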
+ +from __future__ import print_function +import numpy as np +import cv2 +from ppcls.data import preprocess +from ppcls.data.preprocess import transform +from ppcls.data.dataloader.common_dataset import create_operators +from paddle.vision.datasets import Cifar10 as Cifar10_paddle +from paddle.vision.datasets import Cifar100 as Cifar100_paddle + + +class Cifar10(Cifar10_paddle): + def __init__(self, + data_file=None, + mode='train', + download=True, + backend='cv2', + sample_per_label=None, + expand_labels=1, + transform_ops=None, + transform_ops_weak=None, + transform_ops_strong=None, + transform_ops_strong2=None): + super().__init__(data_file, mode, None, download, backend) + assert isinstance(expand_labels, int) + self._transform_ops = create_operators(transform_ops) + self._transform_ops_weak = create_operators(transform_ops_weak) + self._transform_ops_strong = create_operators(transform_ops_strong) + self._transform_ops_strong2 = create_operators(transform_ops_strong2) + self.class_num = 10 + labels = [] + for x in self.data: + labels.append(x[1]) + labels = np.array(labels) + if isinstance(sample_per_label, int): + index = [] + for i in range(self.class_num): + idx = np.where(labels == i)[0] + idx = np.random.choice(idx, sample_per_label, False) + index.extend(idx) + index = index * expand_labels + data = [self.data[x] for x in index] + self.data = data + + def __getitem__(self, idx): + (image, label) = super().__getitem__(idx) + if self._transform_ops: + image1 = transform(image, self._transform_ops) + image1 = image1.transpose((2, 0, 1)) + return (image1, np.int64(label)) + elif self._transform_ops_weak and self._transform_ops_strong and self._transform_ops_strong2: + image2 = transform(image, self._transform_ops_weak) + image2 = image2.transpose((2, 0, 1)) + image3 = transform(image, self._transform_ops_strong) + image3 = image3.transpose((2, 0, 1)) + image4 = transform(image, self._transform_ops_strong2) + image4 = image4.transpose((2, 0, 1)) + return (image2, image3, image4, np.int64(label)) + + elif self._transform_ops_weak and self._transform_ops_strong: + image2 = transform(image, self._transform_ops_weak) + image2 = image2.transpose((2, 0, 1)) + image3 = transform(image, self._transform_ops_strong) + image3 = image3.transpose((2, 0, 1)) + + return (image2, image3, np.int64(label)) + + +class Cifar100(Cifar100_paddle): + def __init__(self, + data_file=None, + mode='train', + download=True, + backend='pil', + sample_per_label=None, + expand_labels=1, + transform_ops=None, + transform_ops_weak=None, + transform_ops_strong=None, + transform_ops_strong2=None): + super().__init__(data_file, mode, None, download, backend) + assert isinstance(expand_labels, int) + self._transform_ops = create_operators(transform_ops) + self._transform_ops_weak = create_operators(transform_ops_weak) + self._transform_ops_strong = create_operators(transform_ops_strong) + self._transform_ops_strong2 = create_operators(transform_ops_strong2) + self.class_num = 100 + + labels = [] + for x in self.data: + labels.append(x[1]) + labels = np.array(labels) + if isinstance(sample_per_label, int): + index = [] + for i in range(self.class_num): + idx = np.where(labels == i)[0] + idx = np.random.choice(idx, sample_per_label, False) + index.extend(idx) + index = index * expand_labels + data = [self.data[x] for x in index] + self.data = data + + def __getitem__(self, idx): + (image, label) = super().__getitem__(idx) + if self._transform_ops: + image1 = transform(image, self._transform_ops) + image1 = 
image1.transpose((2, 0, 1))
+            return (image1, np.int64(label))
+        elif self._transform_ops_weak and self._transform_ops_strong and self._transform_ops_strong2:
+            image2 = transform(image, self._transform_ops_weak)
+            image2 = image2.transpose((2, 0, 1))
+            image3 = transform(image, self._transform_ops_strong)
+            image3 = image3.transpose((2, 0, 1))
+            image4 = transform(image, self._transform_ops_strong2)
+            image4 = image4.transpose((2, 0, 1))
+            return (image2, image3, image4, np.int64(label))
+        elif self._transform_ops_weak and self._transform_ops_strong:
+            image2 = transform(image, self._transform_ops_weak)
+            image2 = image2.transpose((2, 0, 1))
+            image3 = transform(image, self._transform_ops_strong)
+            image3 = image3.transpose((2, 0, 1))
+
+            return (image2, image3, np.int64(label))
\ No newline at end of file
diff --git a/example/low_precision_training/ppcls/data/dataloader/common_dataset.py b/example/low_precision_training/ppcls/data/dataloader/common_dataset.py
new file mode 100755
index 000000000..7530137eb
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/dataloader/common_dataset.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+
+from paddle.io import Dataset
+import cv2
+
+from ppcls.data import preprocess
+from ppcls.data.preprocess import transform
+from ppcls.utils import logger
+
+
+def create_operators(params):
+    """
+    create operators based on the config
+    Args:
+        params(list): a dict list, used to create some operators
+    """
+    if params is None:
+        return None
+    assert isinstance(params, list), ('operator config should be a list')
+    ops = []
+    for operator in params:
+        assert isinstance(operator,
+                          dict) and len(operator) == 1, "yaml format error"
+        op_name = list(operator)[0]
+        param = {} if operator[op_name] is None else operator[op_name]
+        op = getattr(preprocess, op_name)(**param)
+        ops.append(op)
+
+    return ops
+
+
+class CommonDataset(Dataset):
+    def __init__(self,
+                 image_root,
+                 cls_label_path,
+                 transform_ops=None,
+                 label_ratio=False):
+        self._img_root = image_root
+        self._cls_path = cls_label_path
+        self._transform_ops = create_operators(transform_ops)
+
+        self.images = []
+        self.labels = []
+        if label_ratio:
+            self.label_ratio = self._load_anno(label_ratio=label_ratio)
+        else:
+            self._load_anno()
+
+    def _load_anno(self):
+        pass
+
+    def __getitem__(self, idx):
+        try:
+            with open(self.images[idx], 'rb') as f:
+                img = f.read()
+            if self._transform_ops:
+                img = transform(img, self._transform_ops)
+            img = img.transpose((2, 0, 1))
+            return (img, self.labels[idx])
+
+        except Exception as ex:
+            logger.error("Exception occurred when parsing line: {} with msg: {}".
+                         format(self.images[idx], ex))
+            rnd_idx = np.random.randint(self.__len__())
+            return self.__getitem__(rnd_idx)
+
+    def __len__(self):
+        return len(self.images)
+
+    @property
+    def class_num(self):
+        return len(set(self.labels))
diff --git a/example/low_precision_training/ppcls/data/dataloader/custom_label_dataset.py b/example/low_precision_training/ppcls/data/dataloader/custom_label_dataset.py
new file mode 100755
index 000000000..eeae8ad09
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/dataloader/custom_label_dataset.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import numpy as np
+
+from ppcls.data.preprocess import transform
+from ppcls.utils import logger
+
+from .common_dataset import CommonDataset
+
+
+class CustomLabelDataset(CommonDataset):
+    """CustomLabelDataset
+
+    Args:
+        image_root (str): image root directory
+        sample_list_path (str): path to the file with samples listed.
+        transform_ops (list, optional): list of transform op(s). Defaults to None.
+        label_key (str, optional): Defaults to None.
+        delimiter (str, optional): delimiter. Defaults to None.
+    """
+
+    def __init__(self,
+                 image_root,
+                 sample_list_path,
+                 transform_ops=None,
+                 label_key=None,
+                 delimiter=None):
+        self.delimiter = delimiter
+        super().__init__(image_root, sample_list_path, transform_ops)
+        if self._transform_ops is None and label_key is not None:
+            label_key = None
+            msg = "Unable to get label by label_key when transform_ops is None. The label_key has been set to None."
+            logger.warning(msg)
+        self.label_key = label_key
+
+    def _load_anno(self, seed=None):
+        assert os.path.exists(
+            self._cls_path), f"path {self._cls_path} does not exist."
+        assert os.path.exists(
+            self._img_root), f"path {self._img_root} does not exist."
+        self.images = []
+
+        with open(self._cls_path) as fd:
+            lines = fd.readlines()
+
+        if seed is not None:
+            np.random.RandomState(seed).shuffle(lines)
+        for line in lines:
+            line = line.strip()
+            if self.delimiter is not None:
+                line = line.split(self.delimiter)[0]
+            self.images.append(os.path.join(self._img_root, line))
+            assert os.path.exists(self.images[
+                -1]), f"path {self.images[-1]} does not exist."
+
+    def __getitem__(self, idx):
+        try:
+            with open(self.images[idx], 'rb') as f:
+                img = f.read()
+            if self._transform_ops:
+                processed_sample = transform({"img": img}, self._transform_ops)
+                img = processed_sample["img"].transpose((2, 0, 1))
+            if self.label_key is not None:
+                label = processed_sample[self.label_key]
+                sample = (img, label)
+                return sample
+            return (img)
+
+        except Exception as ex:
+            logger.error("Exception occurred when parsing line: {} with msg: {}".
+                         format(self.images[idx], ex))
+            rnd_idx = np.random.randint(self.__len__())
+            return self.__getitem__(rnd_idx)
diff --git a/example/low_precision_training/ppcls/data/dataloader/dali.py b/example/low_precision_training/ppcls/data/dataloader/dali.py
new file mode 100755
index 000000000..9720262c6
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/dataloader/dali.py
@@ -0,0 +1,795 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import copy
+import os
+from collections import defaultdict
+from typing import Any, Callable, Dict, List, Tuple, Union, Optional
+
+import numpy as np
+import nvidia.dali.fn as fn
+import nvidia.dali.ops as ops
+import nvidia.dali.pipeline as pipeline
+import nvidia.dali.types as types
+import paddle
+from nvidia.dali.plugin.paddle import DALIGenericIterator
+from nvidia.dali.plugin.base_iterator import LastBatchPolicy
+from ppcls.data.preprocess.ops.dali_operators import ColorJitter
+from ppcls.data.preprocess.ops.dali_operators import CropImage
+from ppcls.data.preprocess.ops.dali_operators import CropMirrorNormalize
+from ppcls.data.preprocess.ops.dali_operators import DecodeImage
+from ppcls.data.preprocess.ops.dali_operators import DecodeRandomResizedCrop
+from ppcls.data.preprocess.ops.dali_operators import NormalizeImage
+from ppcls.data.preprocess.ops.dali_operators import Pad
+from ppcls.data.preprocess.ops.dali_operators import RandCropImage
+from ppcls.data.preprocess.ops.dali_operators import RandCropImageV2
+from ppcls.data.preprocess.ops.dali_operators import RandFlipImage
+from ppcls.data.preprocess.ops.dali_operators import RandomCropImage
+from ppcls.data.preprocess.ops.dali_operators import RandomRot90
+from ppcls.data.preprocess.ops.dali_operators import RandomRotation
+from ppcls.data.preprocess.ops.dali_operators import ResizeImage
+from ppcls.data.preprocess.ops.dali_operators import ToCHWImage
+from ppcls.engine.train.utils import type_name
+from ppcls.utils import logger
+
+INTERP_MAP = {
+    "nearest": types.DALIInterpType.INTERP_NN,  # cv2.INTER_NEAREST
+    "bilinear": types.DALIInterpType.INTERP_LINEAR,  # cv2.INTER_LINEAR
+    "bicubic": types.DALIInterpType.INTERP_CUBIC,  # cv2.INTER_CUBIC
+    "lanczos": types.DALIInterpType.INTERP_LANCZOS3,  # cv2.INTER_LANCZOS4
+}
+
+
+def make_pair(x: Union[Any, Tuple[Any], List[Any]]) -> Tuple[Any]:
+    """repeat input x to be a tuple if x is a single element, else return x directly
+
+    Args:
+        x (Union[Any, Tuple[Any], List[Any]]): input x
+
+    Returns:
+        Tuple[Any]: tupled input
+    """
+    return x if isinstance(x, (tuple, list)) else (x, x)
+
+
+def parse_value_with_key(content: Union[Dict, List[Dict]],
+                         key: str) -> Union[None, Any]:
+    """parse value according to given key recursively, return None if not found
+
+    Args:
+        content (Union[Dict, List[Dict]]): content to be parsed
+        key (str): given key
+
+    Returns:
+        Union[None, Any]: result
+    """
+    if isinstance(content, dict):
+        if key in content:
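+            # direct hit: `key` exists at this level, return its value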
return content[key] + for content_ in content.values(): + value = parse_value_with_key(content_, key) + if value is not None: + return value + elif isinstance(content, (tuple, list)): + for content_ in content: + value = parse_value_with_key(content_, key) + if value is not None: + return value + return None + + +def convert_cfg_to_dali(op_name: str, device: str, **op_cfg) -> Dict[str, Any]: + """convert original preprocess op params into DALI-based op params + + Args: + op_name (str): name of operator + device (str): device which operator applied on + + Returns: + Dict[str, Any]: converted arguments for DALI initialization + """ + assert device in ["cpu", "gpu" + ], f"device({device}) must in [\"cpu\", \"gpu\"]" + dali_op_cfg = {} + if op_name == "DecodeImage": + device = "cpu" if device == "cpu" else "mixed" + to_rgb = op_cfg.get("to_rgb", True) + channel_first = op_cfg.get("channel_first", False) + assert channel_first is False, \ + f"`channel_first` must set to False when using DALI, but got {channel_first}" + dali_op_cfg.update({"device": device}) + dali_op_cfg.update({ + "output_type": types.DALIImageType.RGB + if to_rgb else types.DALIImageType.BGR + }) + dali_op_cfg.update({ + "device_memory_padding": + op_cfg.get("device_memory_padding", 211025920) + }) + dali_op_cfg.update({ + "host_memory_padding": op_cfg.get("host_memory_padding", 140544512) + }) + elif op_name == "ResizeImage": + size = op_cfg.get("size", None) + resize_short = op_cfg.get("resize_short", None) + interpolation = op_cfg.get("interpolation", None) + if size is not None: + size = make_pair(size) + dali_op_cfg.update({"resize_y": size[0], "resize_x": size[1]}) + if resize_short is not None: + dali_op_cfg.update({"resize_shorter": resize_short}) + if interpolation is not None: + dali_op_cfg.update({"interp_type": INTERP_MAP[interpolation]}) + elif op_name == "CropImage": + size = op_cfg.get("size", 224) + size = make_pair(size) + dali_op_cfg.update({"crop_h": size[1], "crop_w": size[0]}) + dali_op_cfg.update({"crop_pos_x": 0.5, "crop_pos_y": 0.5}) + elif op_name == "RandomCropImage": + size = op_cfg.get("size", 224) + if size is not None: + size = make_pair(size) + dali_op_cfg.update({"crop_h": size[1], "crop_w": size[0]}) + elif op_name == "RandCropImage": + size = op_cfg.get("size", 224) + size = make_pair(size) + scale = op_cfg.get("scale", [0.08, 1.0]) + ratio = op_cfg.get("ratio", [3.0 / 4, 4.0 / 3]) + interpolation = op_cfg.get("interpolation", "bilinear") + dali_op_cfg.update({"size": size}) + if scale is not None: + dali_op_cfg.update({"random_area": scale}) + if ratio is not None: + dali_op_cfg.update({"random_aspect_ratio": ratio}) + if interpolation is not None: + dali_op_cfg.update({"interp_type": INTERP_MAP[interpolation]}) + elif op_name == "RandCropImageV2": + size = op_cfg.get("size", 224) + size = make_pair(size) + dali_op_cfg.update({"crop_h": size[1], "crop_w": size[0]}) + elif op_name == "RandFlipImage": + prob = op_cfg.get("prob", 0.5) + flip_code = op_cfg.get("flip_code", 1) + dali_op_cfg.update({"prob": prob}) + dali_op_cfg.update({"flip_code": flip_code}) + elif op_name == "NormalizeImage": + # scale * (in - mean) / stddev + shift + scale = op_cfg.get("scale", 1.0 / 255.0) + if isinstance(scale, str): + scale = eval(scale) + mean = op_cfg.get("mean", [0.485, 0.456, 0.406]) + std = op_cfg.get("std", [0.229, 0.224, 0.225]) + mean = [v / scale for v in mean] + std = [v / scale for v in std] + order = op_cfg.get("order", "chw") + channel_num = op_cfg.get("channel_num", 3) + output_fp16 = 
op_cfg.get("output_fp16", False) + dali_op_cfg.update({ + "mean": np.reshape( + np.array( + mean, dtype="float32"), [channel_num, 1, 1] + if order == "chw" else [1, 1, channel_num]) + }) + dali_op_cfg.update({ + "stddev": np.reshape( + np.array( + std, dtype="float32"), [channel_num, 1, 1] + if order == "chw" else [1, 1, channel_num]) + }) + if output_fp16: + dali_op_cfg.update({"dtype": types.FLOAT16}) + elif op_name == "ToCHWImage": + dali_op_cfg.update({"perm": [2, 0, 1]}) + elif op_name == "ColorJitter": + prob = op_cfg.get("prob", 1.0) + brightness = op_cfg.get("brightness", 0.0) + contrast = op_cfg.get("contrast", 0.0) + saturation = op_cfg.get("saturation", 0.0) + hue = op_cfg.get("hue", 0.0) + dali_op_cfg.update({"prob": prob}) + dali_op_cfg.update({"brightness_factor": brightness}) + dali_op_cfg.update({"contrast_factor": contrast}) + dali_op_cfg.update({"saturation_factor": saturation}) + dali_op_cfg.update({"hue_factor": hue}) + elif op_name == "RandomRotation": + prob = op_cfg.get("prob", 0.5) + degrees = op_cfg.get("degrees", 90) + interpolation = op_cfg.get("interpolation", "bilinear") + dali_op_cfg.update({"prob": prob}) + dali_op_cfg.update({"angle": degrees}) + dali_op_cfg.update({"interp_type": INTERP_MAP[interpolation]}) + elif op_name == "Pad": + size = op_cfg.get("size", 224) + size = make_pair(size) + padding = op_cfg.get("padding", 0) + fill = op_cfg.get("fill", 0) + dali_op_cfg.update({ + "crop_h": padding + size[1] + padding, + "crop_w": padding + size[0] + padding + }) + dali_op_cfg.update({"fill_values": fill}) + dali_op_cfg.update({"out_of_bounds_policy": "pad"}) + elif op_name == "RandomRot90": + interpolation = op_cfg.get("interpolation", "nearest") + elif op_name == "DecodeRandomResizedCrop": + device = "cpu" if device == "cpu" else "mixed" + output_type = op_cfg.get("output_type", types.DALIImageType.RGB) + device_memory_padding = op_cfg.get("device_memory_padding", 211025920) + host_memory_padding = op_cfg.get("host_memory_padding", 140544512) + scale = op_cfg.get("scale", [0.08, 1.0]) + ratio = op_cfg.get("ratio", [3.0 / 4, 4.0 / 3]) + num_attempts = op_cfg.get("num_attempts", 100) + size = op_cfg.get("size", 224) + dali_op_cfg.update({"device": device}) + if output_type is not None: + dali_op_cfg.update({"output_type": output_type}) + if device_memory_padding is not None: + dali_op_cfg.update({ + "device_memory_padding": device_memory_padding + }) + if host_memory_padding is not None: + dali_op_cfg.update({"host_memory_padding": host_memory_padding}) + if scale is not None: + dali_op_cfg.update({"random_area": scale}) + if ratio is not None: + dali_op_cfg.update({"random_aspect_ratio": ratio}) + if num_attempts is not None: + dali_op_cfg.update({"num_attempts": num_attempts}) + if size is not None: + dali_op_cfg.update({"resize_x": size, "resize_y": size}) + elif op_name == "CropMirrorNormalize": + dtype = types.FLOAT16 if op_cfg.get("output_fp16", + False) else types.FLOAT + output_layout = op_cfg.get("output_layout", "CHW") + size = op_cfg.get("size", None) + scale = op_cfg.get("scale", 1 / 255.0) + if isinstance(scale, str): + scale = eval(scale) + mean = op_cfg.get("mean", [0.485, 0.456, 0.406]) + mean = [v / scale for v in mean] + std = op_cfg.get("std", [0.229, 0.224, 0.225]) + std = [v / scale for v in std] + pad_output = op_cfg.get("channel_num", 3) == 4 + prob = op_cfg.get("prob", 0.5) + dali_op_cfg.update({"dtype": dtype}) + if output_layout is not None: + dali_op_cfg.update({"output_layout": output_layout}) + if size is not None: + 
dali_op_cfg.update({"crop": (size, size)}) + if mean is not None: + dali_op_cfg.update({"mean": mean}) + if std is not None: + dali_op_cfg.update({"std": std}) + if pad_output is not None: + dali_op_cfg.update({"pad_output": pad_output}) + if prob is not None: + dali_op_cfg.update({"prob": prob}) + else: + raise ValueError( + f"DALI operator \"{op_name}\" in PaddleClas is not implemented now. please refer to docs/zh_CN/training/config_description/develop_with_DALI.md" + ) + if "device" not in dali_op_cfg: + dali_op_cfg.update({"device": device}) + return dali_op_cfg + + +def build_dali_transforms(op_cfg_list: List[Dict[str, Any]], + mode: str, + device: str="gpu", + enable_fuse: bool=True) -> List[Callable]: + """create dali operators based on the config + Args: + op_cfg_list (List[Dict[str, Any]]): a dict list, used to create some operators, such as config below + -------------------------------- + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + -------------------------------- + mode (str): mode. + device (str): device which dali operator(s) applied in. Defaults to "gpu". + enable_fuse (bool): whether to use fused dali operators instead of single operators, such as DecodeRandomResizedCrop. Defaults to True. + Returns: + List[Callable]: Callable DALI operators in list. + """ + assert isinstance(op_cfg_list, list), "operator config should be a list" + # build dali transforms list + + dali_op_list = [] + idx = 0 + num_cfg_node = len(op_cfg_list) + while idx < num_cfg_node: + op_cfg = op_cfg_list[idx] + op_name = list(op_cfg)[0] + op_param = {} if op_cfg[op_name] is None else copy.deepcopy(op_cfg[ + op_name]) + fused_success = False + if enable_fuse: + # fuse operators if enabled + if idx + 1 < num_cfg_node: + op_name_nxt = list(op_cfg_list[idx + 1])[0] + if (op_name == "DecodeImage" and + op_name_nxt == "RandCropImage"): + fused_op_name = "DecodeRandomResizedCrop" + fused_op_param = convert_cfg_to_dali( + fused_op_name, device, **{ + ** op_param, ** (op_cfg_list[idx + 1][op_name_nxt]) + }) + fused_dali_op = eval(fused_op_name)(**fused_op_param) + idx += 2 + dali_op_list.append(fused_dali_op) + fused_success = True + logger.info( + f"DALI fused Operator conversion({mode}): [DecodeImage, RandCropImage] -> {type_name(dali_op_list[-1])}: {fused_op_param}" + ) + if not fused_success and 0 < idx and idx + 1 < num_cfg_node: + op_name_pre = list(op_cfg_list[idx - 1])[0] + op_name_nxt = list(op_cfg_list[idx + 1])[0] + if (op_name_pre == "RandCropImage" and + op_name == "RandFlipImage" and + op_name_nxt == "NormalizeImage"): + fused_op_name = "CropMirrorNormalize" + fused_op_param = convert_cfg_to_dali( + fused_op_name, device, **{ + ** op_param, ** + (op_cfg_list[idx - 1][op_name_pre]), ** + (op_cfg_list[idx + 1][op_name_nxt]) + }) + fused_dali_op = eval(fused_op_name)(**fused_op_param) + idx += 2 + dali_op_list.append(fused_dali_op) + fused_success = True + logger.info( + f"DALI fused Operator conversion({mode}): [RandCropImage, RandFlipImage, NormalizeImage] -> {type_name(dali_op_list[-1])}: {fused_op_param}" + ) + if not fused_success and idx + 1 < num_cfg_node: + op_name_nxt = list(op_cfg_list[idx + 1])[0] + if (op_name == "CropImage" and + op_name_nxt == "NormalizeImage"): + fused_op_name = "CropMirrorNormalize" + fused_op_param = convert_cfg_to_dali( + fused_op_name, device, **{ + ** + op_param, + ** + (op_cfg_list[idx + 1][op_name_nxt]), + "prob": 
0.0 + }) + fused_dali_op = eval(fused_op_name)(**fused_op_param) + idx += 2 + dali_op_list.append(fused_dali_op) + fused_success = True + logger.info( + f"DALI fused Operator conversion({mode}): [CropImage, NormalizeImage] -> {type_name(dali_op_list[-1])}: {fused_op_param}" + ) + if not enable_fuse or not fused_success: + assert isinstance(op_cfg, + dict) and len(op_cfg) == 1, "yaml format error" + if op_name == "Pad": + # NOTE: Argument `size` must be provided for DALI operator + op_param.update({ + "size": parse_value_with_key(op_cfg_list[:idx], "size") + }) + dali_param = convert_cfg_to_dali(op_name, device, **op_param) + dali_op = eval(op_name)(**dali_param) + dali_op_list.append(dali_op) + idx += 1 + logger.info( + f"DALI Operator conversion({mode}): {op_name} -> {type_name(dali_op_list[-1])}: {dali_param}" + ) + return dali_op_list + + +class ExternalSource_RandomIdentity(object): + """PKsampler implemented with ExternalSource + + Args: + batch_size (int): batch size + sample_per_id (int): number of instance(s) within an class + device_id (int): device id + shard_id (int): shard id + num_gpus (int): number of gpus + image_root (str): image root directory + cls_label_path (str): path to annotation file, such as `train_list.txt` or `val_list.txt` + delimiter (Optional[str], optional): delimiter. Defaults to None. + relabel (bool, optional): whether do relabel when original label do not starts from 0 or are discontinuous. Defaults to False. + sample_method (str, optional): sample method when generating prob_list. Defaults to "sample_avg_prob". + id_list (List[int], optional): list of (start_id, end_id, start_id, end_id) for set of ids to duplicated. Defaults to None. + ratio (List[Union[int, float]], optional): list of (ratio1, ratio2..) the duplication number for ids in id_list. Defaults to None. + shuffle (bool): whether to shuffle label list. Defaults to True. + """ + + def __init__(self, + batch_size: int, + sample_per_id: int, + device_id: int, + shard_id: int, + num_gpus: int, + image_root: str, + cls_label_path: str, + delimiter: Optional[str]=None, + relabel: bool=False, + sample_method: str="sample_avg_prob", + id_list: List[int]=None, + ratio: List[Union[int, float]]=None, + shuffle: bool=True): + self.batch_size = batch_size + self.sample_per_id = sample_per_id + self.label_per_batch = self.batch_size // self.sample_per_id + self.device_id = device_id + self.shard_id = shard_id + self.num_gpus = num_gpus + self._img_root = image_root + self._cls_path = cls_label_path + self.delimiter = delimiter if delimiter is not None else " " + self.relabel = relabel + self.sample_method = sample_method + self.image_paths = [] + self.labels = [] + self.epoch = 0 + + # NOTE: code from ImageNetDataset below + with open(self._cls_path, "r") as fd: + lines = fd.readlines() + if self.relabel: + label_set = set() + for line in lines: + line = line.strip().split(self.delimiter) + label_set.add(np.int64(line[1])) + label_map = { + oldlabel: newlabel + for newlabel, oldlabel in enumerate(label_set) + } + + for line in lines: + line = line.strip().split(self.delimiter) + self.image_paths.append(os.path.join(self._img_root, line[0])) + if self.relabel: + self.labels.append(label_map[np.int64(line[1])]) + else: + self.labels.append(np.int64(line[1])) + assert os.path.exists(self.image_paths[ + -1]), f"path {self.image_paths[-1]} does not exist." 
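+        # Everything below mirrors PKSampler: indices are bucketed by label,
+        # then every batch draws `label_per_batch` identities and
+        # `sample_per_id` images per identity (P*K sampling), and hands the
+        # raw encoded bytes plus labels to DALI via fn.external_source.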
+
+        # NOTE: code from PKSampler below
+        # group sample indexes into their label bucket
+        self.label_dict = defaultdict(list)
+        for idx, label in enumerate(self.labels):
+            self.label_dict[label].append(idx)
+        # get all label
+        self.label_list = list(self.label_dict)
+        assert len(self.label_list) * self.sample_per_id >= self.batch_size, \
+            f"batch size({self.batch_size}) should not be bigger than #classes({len(self.label_list)})*sample_per_id({self.sample_per_id})"
+
+        if self.sample_method == "id_avg_prob":
+            self.prob_list = np.array([1 / len(self.label_list)] *
+                                      len(self.label_list))
+        elif self.sample_method == "sample_avg_prob":
+            counter = []
+            for label_i in self.label_list:
+                counter.append(len(self.label_dict[label_i]))
+            self.prob_list = np.array(counter) / sum(counter)
+
+        # reweight prob_list according to id_list and ratio if provided
+        if id_list and ratio:
+            assert len(id_list) % 2 == 0 and len(id_list) == len(ratio) * 2
+            for i in range(len(self.prob_list)):
+                for j in range(len(ratio)):
+                    if i >= id_list[j * 2] and i <= id_list[j * 2 + 1]:
+                        self.prob_list[i] = self.prob_list[i] * ratio[j]
+                        break
+            self.prob_list = self.prob_list / sum(self.prob_list)
+
+        assert os.path.exists(
+            self._cls_path), f"path {self._cls_path} does not exist."
+        assert os.path.exists(
+            self._img_root), f"path {self._img_root} does not exist."
+
+        diff = np.abs(sum(self.prob_list) - 1)
+        if diff > 0.00000001:
+            self.prob_list[-1] = 1 - sum(self.prob_list[:-1])
+            if self.prob_list[-1] > 1 or self.prob_list[-1] < 0:
+                logger.error("PKSampler prob list error")
+            else:
+                logger.info(
+                    "sum of prob list is not equal to 1, diff is {}, changing the last prob".
+                    format(diff))
+
+        # whole dataset size
+        self.data_set_len = len(self.image_paths)
+
+        # get sharded size
+        self.sharded_data_set_len = self.data_set_len // self.num_gpus
+
+        # iteration log
+        self.shuffle = shuffle
+        self.total_iter = self.sharded_data_set_len // batch_size
+        self.iter_count = 0
+
+    def __iter__(self):
+        if self.shuffle:
+            seed = self.shard_id * 12345 + self.epoch
+            np.random.RandomState(seed).shuffle(self.label_list)
+            np.random.RandomState(seed).shuffle(self.prob_list)
+            self.epoch += 1
+        return self
+
+    def __next__(self):
+        if self.iter_count >= self.total_iter:
+            self.__iter__()
+            self.iter_count = 0
+
+        batch_indexes = []
+        for _ in range(self.sharded_data_set_len):
+            batch_label_list = np.random.choice(
+                self.label_list,
+                size=self.label_per_batch,
+                replace=False,
+                p=self.prob_list)
+            for label_i in batch_label_list:
+                label_i_indexes = self.label_dict[label_i]
+                if self.sample_per_id <= len(label_i_indexes):
+                    batch_indexes.extend(
+                        np.random.choice(
+                            label_i_indexes,
+                            size=self.sample_per_id,
+                            replace=False))
+                else:
+                    batch_indexes.extend(
+                        np.random.choice(
+                            label_i_indexes,
+                            size=self.sample_per_id,
+                            replace=True))
+            if len(batch_indexes) == self.batch_size:
+                break
+            batch_indexes = []
+
+        batch_raw_images = []
+        batch_labels = []
+        for index in batch_indexes:
+            batch_raw_images.append(
+                np.fromfile(
+                    self.image_paths[index], dtype="uint8"))
+            batch_labels.append(self.labels[index])
+
+        self.iter_count += 1
+        return (batch_raw_images, np.array(batch_labels, dtype="int64"))
+
+    def __len__(self):
+        return self.sharded_data_set_len
+
+
+class HybridPipeline(pipeline.Pipeline):
+    """Hybrid Pipeline
+
+    Args:
+        device (str): device
+        batch_size (int): batch size
+        py_num_workers (int): number of python worker(s)
+        num_threads (int): number of thread(s)
+        device_id (int): device id
+        seed (int): random
seed + file_root (str): file root path + file_list (str): path to annotation file, such as `train_list.txt` or `val_list.txt` + transform_list (List[Callable]): List of DALI transform operator(s) + shard_id (int, optional): shard id. Defaults to 0. + num_shards (int, optional): number of shard(s). Defaults to 1. + random_shuffle (bool, optional): whether shuffle data during training. Defaults to True. + ext_src (optional): custom external source. Defaults to None. + """ + + def __init__(self, + device: str, + batch_size: int, + py_num_workers: int, + num_threads: int, + device_id: int, + seed: int, + file_root: str, + file_list: str, + transform_list: List[Callable], + shard_id: int=0, + num_shards: int=1, + random_shuffle: bool=True, + ext_src=None): + super(HybridPipeline, self).__init__( + batch_size=batch_size, + device_id=device_id, + seed=seed, + py_start_method="fork" if ext_src is None else "spawn", + py_num_workers=py_num_workers, + num_threads=num_threads) + self.device = device + self.ext_src = ext_src + if ext_src is None: + self.reader = ops.readers.File( + file_root=file_root, + file_list=file_list, + shard_id=shard_id, + num_shards=num_shards, + random_shuffle=random_shuffle) + self.transforms = ops.Compose(transform_list) + self.cast = ops.Cast(dtype=types.DALIDataType.INT64, device=device) + + def define_graph(self): + if self.ext_src: + raw_images, labels = fn.external_source( + source=self.ext_src, + num_outputs=2, + dtype=[types.DALIDataType.UINT8, types.DALIDataType.INT64], + batch=True, + parallel=True) + else: + raw_images, labels = self.reader(name="Reader") + images = self.transforms(raw_images) + return [ + images, self.cast(labels.gpu() if self.device == "gpu" else labels) + ] + + def __len__(self): + if self.ext_src is not None: + return len(self.ext_src) + return self.epoch_size(name="Reader") + + +class DALIImageNetIterator(DALIGenericIterator): + def __init__(self, *kargs, **kwargs): + super(DALIImageNetIterator, self).__init__(*kargs, **kwargs) + self.in_dynamic_mode = paddle.in_dynamic_mode() + + def __next__(self) -> List[paddle.Tensor]: + data_batch = super(DALIImageNetIterator, + self).__next__() # List[Dict[str, Tensor], ...] + # reformat to List[Tensor1, Tensor2, ...] + data_batch = [ + paddle.to_tensor(data_batch[0][key]) + if self.in_dynamic_mode else data_batch[0][key] + for key in self.output_map + ] + return data_batch + + +def dali_dataloader(config: Dict[str, Any], + mode: str, + device: str, + py_num_workers: int=1, + num_threads: int=4, + seed: Optional[int]=None, + enable_fuse: bool=True) -> DALIImageNetIterator: + """build and return HybridPipeline + + Args: + config (Dict[str, Any]): train/eval dataloader configuration + mode (str): mode + device (str): device string + py_num_workers (int, optional): number of python worker(s). Defaults to 1. + num_threads (int, optional): number of thread(s). Defaults to 4. + seed (Optional[int], optional): random seed. Defaults to None. + enable_fuse (bool, optional): enable fused operator(s). Defaults to True. 
+ + Returns: + DALIImageNetIterator: Iterable DALI dataloader + """ + assert "gpu" in device, f"device must be \"gpu\" when running with DALI, but got {device}" + config_dataloader = config[mode] + device_id = int(device.split(":")[1]) + device = "gpu" + seed = 42 if seed is None else seed + env = os.environ + num_gpus = paddle.distributed.get_world_size() + + batch_size = config_dataloader["sampler"]["batch_size"] + file_root = config_dataloader["dataset"]["image_root"] + file_list = config_dataloader["dataset"]["cls_label_path"] + sampler_name = config_dataloader["sampler"].get("name", + "DistributedBatchSampler") + transform_ops_cfg = config_dataloader["dataset"]["transform_ops"] + random_shuffle = config_dataloader["sampler"].get("shuffle", None) + dali_transforms = build_dali_transforms( + transform_ops_cfg, mode, device, enable_fuse=enable_fuse) + if "ToCHWImage" not in [type_name(op) for op in dali_transforms] and ( + "CropMirrorNormalize" not in + [type_name(op) for op in dali_transforms]): + dali_transforms.append(ToCHWImage(perm=[2, 0, 1], device=device)) + logger.info( + "Append DALI operator \"ToCHWImage\" at the end of dali_transforms for getting output in \"CHW\" shape" + ) + + if mode.lower() in ["train"]: + if "PADDLE_TRAINER_ID" in env and "PADDLE_TRAINERS_NUM" in env and "FLAGS_selected_gpus" in env: + shard_id = int(env["PADDLE_TRAINER_ID"]) + num_shards = int(env["PADDLE_TRAINERS_NUM"]) + device_id = int(env["FLAGS_selected_gpus"]) + else: + shard_id = 0 + num_shards = 1 + logger.info( + f"Building DALI {mode} pipeline with num_shards: {num_shards}, num_gpus: {num_gpus}" + ) + + random_shuffle = random_shuffle if random_shuffle is not None else True + if sampler_name in ["PKSampler", "DistributedRandomIdentitySampler"]: + ext_src = ExternalSource_RandomIdentity( + batch_size=batch_size, + sample_per_id=config_dataloader["sampler"][ + "sample_per_id" + if sampler_name == "PKSampler" else "num_instances"], + device_id=device_id, + shard_id=shard_id, + num_gpus=num_gpus, + image_root=file_root, + cls_label_path=file_list, + delimiter=None, + relabel=config_dataloader["dataset"].get("relabel", False), + sample_method=config_dataloader["sampler"].get( + "sample_method", "sample_avg_prob"), + id_list=config_dataloader["sampler"].get("id_list", None), + ratio=config_dataloader["sampler"].get("ratio", None), + shuffle=random_shuffle) + logger.info( + f"Building DALI {mode} pipeline with ext_src({type_name(ext_src)})" + ) + else: + ext_src = None + + pipe = HybridPipeline(device, batch_size, py_num_workers, num_threads, + device_id, seed + shard_id, file_root, file_list, + dali_transforms, shard_id, num_shards, + random_shuffle, ext_src) + pipe.build() + pipelines = [pipe] + if ext_src is None: + return DALIImageNetIterator( + pipelines, ["data", "label"], reader_name="Reader") + else: + return DALIImageNetIterator( + pipelines, + ["data", "label"], + size=len(ext_src), + last_batch_policy=LastBatchPolicy. 
+ DROP # make reset() successfully + ) + elif mode.lower() in ["eval", "gallery", "query"]: + assert sampler_name in ["DistributedBatchSampler"], \ + f"sampler_name({sampler_name}) must in [\"DistributedBatchSampler\"]" + if "PADDLE_TRAINER_ID" in env and "PADDLE_TRAINERS_NUM" in env and "FLAGS_selected_gpus" in env: + shard_id = int(env["PADDLE_TRAINER_ID"]) + num_shards = int(env["PADDLE_TRAINERS_NUM"]) + device_id = int(env["FLAGS_selected_gpus"]) + else: + shard_id = 0 + num_shards = 1 + logger.info( + f"Building DALI {mode} pipeline with num_shards: {num_shards}, num_gpus: {num_gpus}..." + ) + + random_shuffle = random_shuffle if random_shuffle is not None else False + pipe = HybridPipeline(device, batch_size, py_num_workers, num_threads, + device_id, seed + shard_id, file_root, file_list, + dali_transforms, shard_id, num_shards, + random_shuffle) + pipe.build() + pipelines = [pipe] + return DALIImageNetIterator( + pipelines, ["data", "label"], reader_name="Reader") + else: + raise ValueError(f"Invalid mode({mode}) when building DALI pipeline") diff --git a/example/low_precision_training/ppcls/data/dataloader/face_dataset.py b/example/low_precision_training/ppcls/data/dataloader/face_dataset.py new file mode 100755 index 000000000..a32cc2c5f --- /dev/null +++ b/example/low_precision_training/ppcls/data/dataloader/face_dataset.py @@ -0,0 +1,163 @@ +import os +import json +import numpy as np +from PIL import Image +import cv2 +import paddle +import paddle.vision.datasets as datasets +from paddle.vision import transforms +from paddle.vision.transforms import functional as F +from paddle.io import Dataset +from .common_dataset import create_operators +from ppcls.data.preprocess import transform as transform_func + +# code is based on AdaFace: https://github.com/mk-minchul/AdaFace + + +class AdaFaceDataset(Dataset): + def __init__(self, root_dir, label_path, transform=None): + self.root_dir = root_dir + self.transform = create_operators(transform) + + with open(label_path) as fd: + lines = fd.readlines() + self.samples = [] + for l in lines: + l = l.strip().split() + self.samples.append([os.path.join(root_dir, l[0]), int(l[1])]) + + def __len__(self): + return len(self.samples) + + def __getitem__(self, index): + """ + Args: + index (int): Index + + Returns: + tuple: (sample, target) where target is class_index of the target class. 
+ """ + [path, target] = self.samples[index] + with open(path, 'rb') as f: + img = Image.open(f) + sample = img.convert('RGB') + + # if 'WebFace' in self.root: + # # swap rgb to bgr since image is in rgb for webface + # sample = Image.fromarray(np.asarray(sample)[:, :, ::-1] + if self.transform is not None: + sample = transform_func(sample, self.transform) + return sample, target + + +class FiveValidationDataset(Dataset): + def __init__(self, val_data_path, concat_mem_file_name): + ''' + concatenates all validation datasets from emore + val_data_dict = { + 'agedb_30': (agedb_30, agedb_30_issame), + "cfp_fp": (cfp_fp, cfp_fp_issame), + "lfw": (lfw, lfw_issame), + "cplfw": (cplfw, cplfw_issame), + "calfw": (calfw, calfw_issame), + } + agedb_30: 0 + cfp_fp: 1 + lfw: 2 + cplfw: 3 + calfw: 4 + ''' + val_data = get_val_data(val_data_path) + age_30, cfp_fp, lfw, age_30_issame, cfp_fp_issame, lfw_issame, cplfw, cplfw_issame, calfw, calfw_issame = val_data + val_data_dict = { + 'agedb_30': (age_30, age_30_issame), + "cfp_fp": (cfp_fp, cfp_fp_issame), + "lfw": (lfw, lfw_issame), + "cplfw": (cplfw, cplfw_issame), + "calfw": (calfw, calfw_issame), + } + self.dataname_to_idx = { + "agedb_30": 0, + "cfp_fp": 1, + "lfw": 2, + "cplfw": 3, + "calfw": 4 + } + + self.val_data_dict = val_data_dict + # concat all dataset + all_imgs = [] + all_issame = [] + all_dataname = [] + key_orders = [] + for key, (imgs, issame) in val_data_dict.items(): + all_imgs.append(imgs) + dup_issame = [ + ] # hacky way to make the issame length same as imgs. [1, 1, 0, 0, ...] + for same in issame: + dup_issame.append(same) + dup_issame.append(same) + all_issame.append(dup_issame) + all_dataname.append([self.dataname_to_idx[key]] * len(imgs)) + key_orders.append(key) + assert key_orders == ['agedb_30', 'cfp_fp', 'lfw', 'cplfw', 'calfw'] + + if isinstance(all_imgs[0], np.memmap): + self.all_imgs = read_memmap(concat_mem_file_name) + else: + self.all_imgs = np.concatenate(all_imgs) + + self.all_issame = np.concatenate(all_issame) + self.all_dataname = np.concatenate(all_dataname) + + def __getitem__(self, index): + x_np = self.all_imgs[index].copy() + x = paddle.to_tensor(x_np) + y = self.all_issame[index] + dataname = self.all_dataname[index] + return x, y, dataname, index + + def __len__(self): + return len(self.all_imgs) + + +def read_memmap(mem_file_name): + # r+ mode: Open existing file for reading and writing + with open(mem_file_name + '.conf', 'r') as file: + memmap_configs = json.load(file) + return np.memmap(mem_file_name, mode='r+', \ + shape=tuple(memmap_configs['shape']), \ + dtype=memmap_configs['dtype']) + + +def get_val_pair(path, name, use_memfile=True): + # installing bcolz should set proxy to access internet + import bcolz + if use_memfile: + mem_file_dir = os.path.join(path, name, 'memfile') + mem_file_name = os.path.join(mem_file_dir, 'mem_file.dat') + if os.path.isdir(mem_file_dir): + print('laoding validation data memfile') + np_array = read_memmap(mem_file_name) + else: + os.makedirs(mem_file_dir) + carray = bcolz.carray(rootdir=os.path.join(path, name), mode='r') + np_array = np.array(carray) + # mem_array = make_memmap(mem_file_name, np_array) + # del np_array, mem_array + del np_array + np_array = read_memmap(mem_file_name) + else: + np_array = bcolz.carray(rootdir=os.path.join(path, name), mode='r') + + issame = np.load(os.path.join(path, '{}_list.npy'.format(name))) + return np_array, issame + + +def get_val_data(data_path): + agedb_30, agedb_30_issame = get_val_pair(data_path, 'agedb_30') + cfp_fp, 
cfp_fp_issame = get_val_pair(data_path, 'cfp_fp') + lfw, lfw_issame = get_val_pair(data_path, 'lfw') + cplfw, cplfw_issame = get_val_pair(data_path, 'cplfw') + calfw, calfw_issame = get_val_pair(data_path, 'calfw') + return agedb_30, cfp_fp, lfw, agedb_30_issame, cfp_fp_issame, lfw_issame, cplfw, cplfw_issame, calfw, calfw_issame diff --git a/example/low_precision_training/ppcls/data/dataloader/icartoon_dataset.py b/example/low_precision_training/ppcls/data/dataloader/icartoon_dataset.py new file mode 100755 index 000000000..18e3b4b7f --- /dev/null +++ b/example/low_precision_training/ppcls/data/dataloader/icartoon_dataset.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import os + +from .common_dataset import CommonDataset + + +class ICartoonDataset(CommonDataset): + def _load_anno(self, seed=None): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + self.images = [] + self.labels = [] + + with open(self._cls_path) as fd: + lines = fd.readlines() + for l in lines: + l = l.strip().split("\t") + self.images.append(os.path.join(self._img_root, l[0])) + self.labels.append(np.int64(l[1])) + assert os.path.exists(self.images[-1]) diff --git a/example/low_precision_training/ppcls/data/dataloader/imagenet_dataset.py b/example/low_precision_training/ppcls/data/dataloader/imagenet_dataset.py new file mode 100755 index 000000000..cc66007d9 --- /dev/null +++ b/example/low_precision_training/ppcls/data/dataloader/imagenet_dataset.py @@ -0,0 +1,75 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import os + +from .common_dataset import CommonDataset + + +class ImageNetDataset(CommonDataset): + """ImageNetDataset + + Args: + image_root (str): image root, path to `ILSVRC2012` + cls_label_path (str): path to annotation file `train_list.txt` or `val_list.txt` + transform_ops (list, optional): list of transform op(s). Defaults to None. + delimiter (str, optional): delimiter. Defaults to None. + relabel (bool, optional): whether do relabel when original label do not starts from 0 or are discontinuous. Defaults to False. 
+ """ + + def __init__(self, + image_root, + cls_label_path, + transform_ops=None, + delimiter=None, + relabel=False): + self.delimiter = delimiter if delimiter is not None else " " + self.relabel = relabel + super(ImageNetDataset, self).__init__(image_root, cls_label_path, + transform_ops) + + def _load_anno(self, seed=None): + assert os.path.exists( + self._cls_path), f"path {self._cls_path} does not exist." + assert os.path.exists( + self._img_root), f"path {self._img_root} does not exist." + self.images = [] + self.labels = [] + + with open(self._cls_path) as fd: + lines = fd.readlines() + if self.relabel: + label_set = set() + for line in lines: + line = line.strip().split(self.delimiter) + label_set.add(np.int64(line[1])) + label_map = { + oldlabel: newlabel + for newlabel, oldlabel in enumerate(label_set) + } + + if seed is not None: + np.random.RandomState(seed).shuffle(lines) + for line in lines: + line = line.strip().split(self.delimiter) + self.images.append(os.path.join(self._img_root, line[0])) + if self.relabel: + self.labels.append(label_map[np.int64(line[1])]) + else: + self.labels.append(np.int64(line[1])) + assert os.path.exists(self.images[ + -1]), f"path {self.images[-1]} does not exist." diff --git a/example/low_precision_training/ppcls/data/dataloader/logo_dataset.py b/example/low_precision_training/ppcls/data/dataloader/logo_dataset.py new file mode 100755 index 000000000..132ead989 --- /dev/null +++ b/example/low_precision_training/ppcls/data/dataloader/logo_dataset.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import io +import tarfile +import numpy as np +from PIL import Image #all use default backend + +import paddle +from paddle.io import Dataset +import pickle +import os +import cv2 +import random + +from .common_dataset import CommonDataset + + +class LogoDataset(CommonDataset): + def _load_anno(self): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + self.images = [] + self.labels = [] + with open(self._cls_path) as fd: + lines = fd.readlines() + for l in lines: + l = l.strip().split("\t") + if l[0] == 'image_id': + continue + self.images.append(os.path.join(self._img_root, l[3])) + self.labels.append(np.int64(l[1]) - 1) + assert os.path.exists(self.images[-1]) diff --git a/example/low_precision_training/ppcls/data/dataloader/metabin_sampler.py b/example/low_precision_training/ppcls/data/dataloader/metabin_sampler.py new file mode 100755 index 000000000..f5cb29f75 --- /dev/null +++ b/example/low_precision_training/ppcls/data/dataloader/metabin_sampler.py @@ -0,0 +1,290 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import itertools +from collections import defaultdict +import numpy as np +from paddle.io import Sampler, BatchSampler + + +class DomainShuffleSampler(Sampler): + """ + Domain shuffle sampler + Args: + dataset(Dataset): Dataset for sampling + batch_size (int): Number of examples in a batch. + num_instances (int): Number of instances per identity in a batch. + camera_to_domain (bool): If True, consider each camera as an individual domain + + Code was heavily based on https://github.com/bismex/MetaBIN + reference: https://arxiv.org/abs/2011.14670v2 + """ + + def __init__(self, + dataset, + batch_size, + num_instances, + camera_to_domain=True): + self.dataset = dataset + self.batch_size = batch_size + self.num_instances = num_instances + self.num_pids_per_batch = batch_size // self.num_instances + + self.index_pid = defaultdict(list) + self.pid_domain = defaultdict(list) + self.pid_index = defaultdict(list) + # data_source: [(img_path, pid, camera, domain), ...] (camera_to_domain = True) + if camera_to_domain: + data_source = zip(dataset.images, dataset.labels, dataset.cameras, + dataset.cameras) + else: + data_source = zip(dataset.images, dataset.labels, dataset.cameras, + dataset.domains) + for index, info in enumerate(data_source): + domainid = info[3] + if camera_to_domain: + pid = 'p' + str(info[1]) + '_d' + str(domainid) + else: + pid = 'p' + str(info[1]) + self.index_pid[index] = pid + self.pid_domain[pid] = domainid + self.pid_index[pid].append(index) + + self.pids = list(self.pid_index.keys()) + self.domains = list(self.pid_domain.values()) + + self.num_identities = len(self.pids) + self.num_domains = len(set(self.domains)) + + self.batch_size //= self.num_domains + self.num_pids_per_batch //= self.num_domains + + val_pid_index = [len(x) for x in self.pid_index.values()] + + val_pid_index_upper = [] + for x in val_pid_index: + v_remain = x % self.num_instances + if v_remain == 0: + val_pid_index_upper.append(x) + else: + val_pid_index_upper.append(x - v_remain + self.num_instances) + + cnt_domains = [0 for x in range(self.num_domains)] + for val, index in zip(val_pid_index_upper, self.domains): + cnt_domains[index] += val + self.max_cnt_domains = max(cnt_domains) + self.total_images = self.num_domains * ( + self.max_cnt_domains - + (self.max_cnt_domains % self.batch_size) - self.batch_size) + + def _get_epoch_indices(self): + def _get_batch_idxs(pids, pid_index, num_instances): + batch_idxs_dict = defaultdict(list) + for pid in pids: + idxs = copy.deepcopy(pid_index[pid]) + if len( + idxs + ) < self.num_instances: # if idxs is smaller than num_instance, choice redundantly + idxs = np.random.choice( + idxs, size=self.num_instances, replace=True) + elif (len(idxs) % self.num_instances) != 0: + idxs.extend( + np.random.choice( + idxs, + size=self.num_instances - len(idxs) % + self.num_instances, + replace=False)) + + np.random.shuffle(idxs) + batch_idxs = [] + for idx in idxs: + batch_idxs.append(int(idx)) + if len(batch_idxs) == num_instances: + batch_idxs_dict[pid].append(batch_idxs) + batch_idxs = [] + return batch_idxs_dict + + batch_idxs_dict = 
_get_batch_idxs(self.pids, self.pid_index, + self.num_instances) + + # batch_idxs_dict: dictionary, len(batch_idxs_dict) is len(pidx), each pidx, num_instance x k samples + avai_pids = copy.deepcopy(self.pids) + + local_avai_pids = \ + [[pids for pids, idx in zip(avai_pids, self.domains) if idx == i] + for i in list(set(self.domains))] + local_avai_pids_save = copy.deepcopy(local_avai_pids) + + revive_idx = [False for i in range(self.num_domains)] + final_idxs = [] + while len(avai_pids) >= self.num_pids_per_batch and not all( + revive_idx): + for i in range(self.num_domains): + selected_pids = np.random.choice( + local_avai_pids[i], self.num_pids_per_batch, replace=False) + for pid in selected_pids: + batch_idxs = batch_idxs_dict[pid].pop(0) + final_idxs.extend(batch_idxs) + if len(batch_idxs_dict[pid]) == 0: + avai_pids.remove(pid) + local_avai_pids[i].remove(pid) + for i in range(self.num_domains): + if len(local_avai_pids[i]) < self.num_pids_per_batch: + batch_idxs_dict_new = _get_batch_idxs( + self.pids, self.pid_index, self.num_instances) + + revive_idx[i] = True + cnt = 0 + for pid, val in batch_idxs_dict_new.items(): + if self.domains[cnt] == i: + batch_idxs_dict[pid] = copy.deepcopy( + batch_idxs_dict_new[pid]) + cnt += 1 + local_avai_pids[i] = copy.deepcopy(local_avai_pids_save[i]) + avai_pids.extend(local_avai_pids_save[i]) + avai_pids = list(set(avai_pids)) + return final_idxs + + def __iter__(self): + yield from itertools.islice(self._infinite_indices(), 0, None, 1) + + def _infinite_indices(self): + while True: + indices = self._get_epoch_indices() + yield from indices + + +class DomainShuffleBatchSampler(BatchSampler): + def __init__(self, dataset, batch_size, num_instances, camera_to_domain, + drop_last): + sampler = DomainShuffleSampler( + dataset=dataset, + batch_size=batch_size, + num_instances=num_instances, + camera_to_domain=camera_to_domain) + super().__init__( + sampler=sampler, batch_size=batch_size, drop_last=drop_last) + + +class NaiveIdentitySampler(Sampler): + """ + Randomly sample N identities, then for each identity, + randomly sample K instances, therefore batch size is N*K. + Args: + dataset(Dataset): Dataset for sampling + batch_size (int): Number of examples in a batch. + num_instances (int): Number of instances per identity in a batch. + + Code was heavily based on https://github.com/bismex/MetaBIN + reference: https://arxiv.org/abs/2011.14670v2 + """ + + def __init__(self, dataset, batch_size, num_instances): + self.dataset = dataset + self.batch_size = batch_size + self.num_instances = num_instances + self.num_pids_per_batch = batch_size // self.num_instances + + self.index_pid = defaultdict(list) + self.pid_cam = defaultdict(list) + self.pid_index = defaultdict(list) + + # data_source: [(img_path, pid, camera, domain), ...] 
(camera_to_domain = True)
+        data_source = zip(dataset.images, dataset.labels, dataset.cameras,
+                          dataset.cameras)
+        for index, info in enumerate(data_source):
+            pid = info[1]
+            camid = info[2]
+            self.index_pid[index] = pid
+            self.pid_cam[pid].append(camid)
+            self.pid_index[pid].append(index)
+
+        self.pids = list(self.pid_index.keys())
+        self.num_identities = len(self.pids)
+
+        val_pid_index = [len(x) for x in self.pid_index.values()]
+
+        val_pid_index_upper = []
+        for x in val_pid_index:
+            v_remain = x % self.num_instances
+            if v_remain == 0:
+                val_pid_index_upper.append(x)
+            else:
+                val_pid_index_upper.append(x - v_remain + self.num_instances)
+
+        total_images = sum(val_pid_index_upper)
+        total_images = total_images - (total_images % self.batch_size
+                                       ) - self.batch_size  # approximate
+        self.total_images = total_images
+
+    def _get_epoch_indices(self):
+        batch_idxs_dict = defaultdict(list)
+
+        for pid in self.pids:
+            idxs = copy.deepcopy(
+                self.pid_index[pid])  # whole index for each ID
+            if len(
+                    idxs
+            ) < self.num_instances:  # if there are fewer than num_instances samples, sample with replacement
+                idxs = np.random.choice(
+                    idxs, size=self.num_instances, replace=True)
+            elif (len(idxs) % self.num_instances) != 0:
+                idxs.extend(
+                    np.random.choice(
+                        idxs,
+                        size=self.num_instances - len(idxs) %
+                        self.num_instances,
+                        replace=False))
+
+            np.random.shuffle(idxs)
+            batch_idxs = []
+            for idx in idxs:
+                batch_idxs.append(int(idx))
+                if len(batch_idxs) == self.num_instances:
+                    batch_idxs_dict[pid].append(batch_idxs)
+                    batch_idxs = []
+        # batch_idxs_dict maps each pid to a list of ready-made chunks of num_instances indices
+        avai_pids = copy.deepcopy(self.pids)
+        final_idxs = []
+
+        while len(avai_pids) >= self.num_pids_per_batch:
+            selected_pids = np.random.choice(
+                avai_pids, self.num_pids_per_batch, replace=False)
+            for pid in selected_pids:
+                batch_idxs = batch_idxs_dict[pid].pop(0)
+                final_idxs.extend(batch_idxs)
+                if len(batch_idxs_dict[pid]) == 0: avai_pids.remove(pid)
+        return final_idxs
+
+    def __iter__(self):
+        yield from itertools.islice(self._infinite_indices(), 0, None, 1)
+
+    def _infinite_indices(self):
+        while True:
+            indices = self._get_epoch_indices()
+            yield from indices
+
+    def __len__(self):
+        return self.total_images
+
+
+class NaiveIdentityBatchSampler(BatchSampler):
+    def __init__(self, dataset, batch_size, num_instances, drop_last):
+        sampler = NaiveIdentitySampler(
+            dataset=dataset,
+            batch_size=batch_size,
+            num_instances=num_instances)
+        super().__init__(
+            sampler=sampler, batch_size=batch_size, drop_last=drop_last)
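# [Editor's note] A standalone sketch of the P*K idea implemented by
# NaiveIdentitySampler above: draw P identities, then K instances per identity,
# sampling with replacement when an identity has fewer than K images.
# Illustrative only; the real sampler pre-chunks indices and replenishes
# exhausted identities across an epoch.
import numpy as np
from collections import defaultdict


def pk_batch(labels, num_pids_per_batch, num_instances, rng=np.random):
    pid_index = defaultdict(list)
    for idx, pid in enumerate(labels):
        pid_index[pid].append(idx)
    pids = rng.choice(list(pid_index), num_pids_per_batch, replace=False)
    batch = []
    for pid in pids:
        idxs = pid_index[pid]
        replace = len(idxs) < num_instances  # too few samples for this id
        batch.extend(rng.choice(idxs, num_instances, replace=replace).tolist())
    return batch  # len(batch) == num_pids_per_batch * num_instances


# e.g. pk_batch([0, 0, 1, 1, 1, 2, 2, 3], num_pids_per_batch=2, num_instances=2)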
diff --git a/example/low_precision_training/ppcls/data/dataloader/mix_dataset.py b/example/low_precision_training/ppcls/data/dataloader/mix_dataset.py
new file mode 100755
index 000000000..cbf4b4028
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/dataloader/mix_dataset.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import os
+
+from paddle.io import Dataset
+from .. import dataloader
+
+
+class MixDataset(Dataset):
+    def __init__(self, datasets_config):
+        super().__init__()
+        self.dataset_list = []
+        start_idx = 0
+        end_idx = 0
+        for config_i in datasets_config:
+            dataset_name = config_i.pop('name')
+            dataset = getattr(dataloader, dataset_name)(**config_i)
+            end_idx += len(dataset)
+            self.dataset_list.append([end_idx, start_idx, dataset])
+            start_idx = end_idx
+
+        self.length = end_idx
+
+    def __getitem__(self, idx):
+        for dataset_i in self.dataset_list:
+            if dataset_i[0] > idx:
+                dataset_i_idx = idx - dataset_i[1]
+                return dataset_i[2][dataset_i_idx]
+
+    def __len__(self):
+        return self.length
+
+    def get_dataset_list(self):
+        return self.dataset_list
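# [Editor's note] MixDataset.__getitem__ above routes a global index to the
# owning sub-dataset with a linear scan over [end_idx, start_idx, dataset]
# entries. The same lookup in O(log n) with bisect, a sketch assuming
# dataset_list is sorted by end_idx (which the constructor guarantees):
import bisect


def locate(dataset_list, idx):
    ends = [entry[0] for entry in dataset_list]  # cumulative end indices
    i = bisect.bisect_right(ends, idx)           # first entry with end_idx > idx
    end_idx, start_idx, dataset = dataset_list[i]
    return dataset, idx - start_idx              # sub-dataset and local index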
return None".format( + i) + batch += [idx + self.start_list[i] for idx in batch_i] + if len(batch) == self.batch_size: + self.iter_counter += 1 + yield batch + else: + logger.info("Some dataset reaches end") + self.iter_counter = 0 + + def __len__(self): + return self.length diff --git a/example/low_precision_training/ppcls/data/dataloader/multi_scale_dataset.py b/example/low_precision_training/ppcls/data/dataloader/multi_scale_dataset.py new file mode 100755 index 000000000..27e84ceb6 --- /dev/null +++ b/example/low_precision_training/ppcls/data/dataloader/multi_scale_dataset.py @@ -0,0 +1,118 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import os + +from paddle.io import Dataset +from paddle.vision import transforms +import cv2 +import warnings + +from ppcls.data import preprocess +from ppcls.data.preprocess import transform +from ppcls.data.preprocess.ops.operators import DecodeImage +from ppcls.utils import logger +from ppcls.data.dataloader.common_dataset import create_operators + + +class MultiScaleDataset(Dataset): + """MultiScaleDataset + + Args: + image_root (str): image root + cls_label_path (str): path to annotation file `train_list.txt` or `val_list.txt` + transform_ops (list, optional): list of transform op(s). Defaults to None. + delimiter (str, optional): delimiter. Defaults to None. 
+ """ + + def __init__( + self, + image_root, + cls_label_path, + transform_ops=None, + delimiter=None, ): + self._img_root = image_root + self._cls_path = cls_label_path + self.transform_ops = transform_ops + self.delimiter = delimiter if delimiter is not None else " " + self.images = [] + self.labels = [] + self._load_anno() + self.has_crop_flag = 1 + + def _load_anno(self, seed=None): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + self.images = [] + self.labels = [] + + with open(self._cls_path) as fd: + lines = fd.readlines() + if seed is not None: + np.random.RandomState(seed).shuffle(lines) + for l in lines: + l = l.strip().split(self.delimiter) + self.images.append(os.path.join(self._img_root, l[0])) + self.labels.append(np.int64(l[1])) + assert os.path.exists(self.images[-1]) + + def __getitem__(self, properties): + # properites is a tuple, contains (width, height, index) + img_width = properties[0] + img_height = properties[1] + index = properties[2] + has_crop = False + if self.transform_ops: + for i in range(len(self.transform_ops)): + op = self.transform_ops[i] + resize_op = ['RandCropImage', 'ResizeImage', 'CropImage'] + for resize in resize_op: + if resize in op: + if self.has_crop_flag: + logger.warning( + "Multi scale dataset will crop image according to the multi scale resolution" + ) + self.transform_ops[i][resize] = { + 'size': (img_width, img_height) + } + has_crop = True + self.has_crop_flag = 0 + if has_crop == False: + logger.error("Multi scale dateset requests RandCropImage") + raise RuntimeError("Multi scale dateset requests RandCropImage") + self._transform_ops = create_operators(self.transform_ops) + + try: + with open(self.images[index], 'rb') as f: + img = f.read() + if self._transform_ops: + img = transform(img, self._transform_ops) + img = img.transpose((2, 0, 1)) + return (img, self.labels[index]) + + except Exception as ex: + logger.error("Exception occured when parse line: {} with msg: {}". + format(self.images[index], ex)) + rnd_idx = np.random.randint(self.__len__()) + return self.__getitem__(rnd_idx) + + def __len__(self): + return len(self.images) + + @property + def class_num(self): + return len(set(self.labels)) diff --git a/example/low_precision_training/ppcls/data/dataloader/multi_scale_sampler.py b/example/low_precision_training/ppcls/data/dataloader/multi_scale_sampler.py new file mode 100755 index 000000000..9208ed828 --- /dev/null +++ b/example/low_precision_training/ppcls/data/dataloader/multi_scale_sampler.py @@ -0,0 +1,133 @@ +from paddle.io import Sampler +import paddle.distributed as dist + +import math +import random +import numpy as np + +from ppcls import data + + +class MultiScaleSampler(Sampler): + def __init__(self, + data_source, + scales, + first_bs, + divided_factor=32, + is_training=True, + shuffle=True, + seed=None): + """ + multi scale samper + Args: + data_source(dataset) + scales(list): several scales for image resolution + first_bs(int): batch size for the first scale in scales + divided_factor(int): ImageNet models down-sample images by a factor, ensure that width and height dimensions are multiples are multiple of devided_factor. + is_training(boolean): mode + """ + # min. and max. 
diff --git a/example/low_precision_training/ppcls/data/dataloader/multi_scale_sampler.py b/example/low_precision_training/ppcls/data/dataloader/multi_scale_sampler.py
new file mode 100755
index 000000000..9208ed828
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/dataloader/multi_scale_sampler.py
@@ -0,0 +1,133 @@
+from paddle.io import Sampler
+import paddle.distributed as dist
+
+import math
+import random
+import numpy as np
+
+from ppcls import data
+
+
+class MultiScaleSampler(Sampler):
+    def __init__(self,
+                 data_source,
+                 scales,
+                 first_bs,
+                 divided_factor=32,
+                 is_training=True,
+                 shuffle=True,
+                 seed=None):
+        """
+        Multi-scale sampler
+        Args:
+            data_source(dataset)
+            scales(list): several scales for image resolution
+            first_bs(int): batch size for the first scale in scales
+            divided_factor(int): ImageNet models down-sample images by a factor; ensure that width and height dimensions are multiples of divided_factor.
+            is_training(boolean): mode
+        """
+        # min. and max. spatial dimensions
+        self.data_source = data_source
+        self.n_data_samples = len(self.data_source)
+
+        if isinstance(scales[0], tuple):
+            width_dims = [i[0] for i in scales]
+            height_dims = [i[1] for i in scales]
+        elif isinstance(scales[0], int):
+            width_dims = scales
+            height_dims = scales
+        base_im_w = width_dims[0]
+        base_im_h = height_dims[0]
+        base_batch_size = first_bs
+
+        # Get the GPU and node related information
+        num_replicas = dist.get_world_size()
+        rank = dist.get_rank()
+        # adjust the total samples to avoid batch dropping
+        num_samples_per_replica = int(
+            math.ceil(self.n_data_samples * 1.0 / num_replicas))
+        img_indices = [idx for idx in range(self.n_data_samples)]
+
+        self.shuffle = shuffle
+        if is_training:
+            # compute the spatial dimensions and corresponding batch size
+            # ImageNet models down-sample images by a factor of 32.
+            # Ensure that width and height dimensions are multiples of 32.
+            width_dims = [
+                int((w // divided_factor) * divided_factor) for w in width_dims
+            ]
+            height_dims = [
+                int((h // divided_factor) * divided_factor)
+                for h in height_dims
+            ]
+
+            img_batch_pairs = list()
+            base_elements = base_im_w * base_im_h * base_batch_size
+            for (h, w) in zip(height_dims, width_dims):
+                batch_size = int(max(1, (base_elements / (h * w))))
+                img_batch_pairs.append((w, h, batch_size))
+            self.img_batch_pairs = img_batch_pairs
+        else:
+            self.img_batch_pairs = [(base_im_w, base_im_h, base_batch_size)]
+
+        self.img_indices = img_indices
+        self.n_samples_per_replica = num_samples_per_replica
+        self.epoch = 0
+        self.rank = rank
+        self.num_replicas = num_replicas
+        self.seed = seed
+        self.batch_list = []
+        self.current = 0
+        indices_rank_i = self.img_indices[self.rank:len(self.img_indices):
+                                          self.num_replicas]
+        while self.current < self.n_samples_per_replica:
+            curr_w, curr_h, curr_bsz = random.choice(self.img_batch_pairs)
+
+            end_index = min(self.current + curr_bsz,
+                            self.n_samples_per_replica)
+
+            batch_ids = indices_rank_i[self.current:end_index]
+            n_batch_samples = len(batch_ids)
+            if n_batch_samples != curr_bsz:
+                batch_ids += indices_rank_i[:(curr_bsz - n_batch_samples)]
+            self.current += curr_bsz
+
+            if len(batch_ids) > 0:
+                batch = [curr_w, curr_h, len(batch_ids)]
+                self.batch_list.append(batch)
+        self.length = len(self.batch_list)
+
+    def __iter__(self):
+        if self.shuffle:
+            if self.seed is not None:
+                random.seed(self.seed)
+            else:
+                random.seed(self.epoch)
+                self.epoch += 1
+            random.shuffle(self.img_indices)
+            random.shuffle(self.img_batch_pairs)
+            indices_rank_i = self.img_indices[self.rank:len(self.img_indices):
+                                              self.num_replicas]
+        else:
+            indices_rank_i = self.img_indices[self.rank:len(self.img_indices):
+                                              self.num_replicas]
+
+        start_index = 0
+        for batch_tuple in self.batch_list:
+            curr_w, curr_h, curr_bsz = batch_tuple
+            end_index = min(start_index + curr_bsz,
+                            self.n_samples_per_replica)
+            batch_ids = indices_rank_i[start_index:end_index]
+            n_batch_samples = len(batch_ids)
+            if n_batch_samples != curr_bsz:
+                batch_ids += indices_rank_i[:(curr_bsz - n_batch_samples)]
+            start_index += curr_bsz
+
+            if len(batch_ids) > 0:
+                batch = [(curr_w, curr_h, b_id) for b_id in batch_ids]
+                yield batch
+
+    def set_epoch(self, epoch: int):
+        self.epoch = epoch
+
+    def __len__(self):
+        return self.length
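# [Editor's note] The sampler above keeps a roughly constant pixel budget per
# batch: base_im_w * base_im_h * first_bs elements, with the batch size
# recomputed for each resolution after snapping it to a multiple of
# divided_factor. A worked sketch of that arithmetic:
def batch_sizes(scales, first_bs, divided_factor=32):
    base_w, base_h = scales[0]
    base_elements = base_w * base_h * first_bs
    out = []
    for w, h in scales:
        w = (w // divided_factor) * divided_factor  # snap to multiple of 32
        h = (h // divided_factor) * divided_factor
        out.append((w, h, max(1, base_elements // (w * h))))
    return out


# batch_sizes([(224, 224), (288, 288), (320, 320)], first_bs=64)
# -> [(224, 224, 64), (288, 288, 38), (320, 320, 31)]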
diff --git a/example/low_precision_training/ppcls/data/dataloader/multilabel_dataset.py b/example/low_precision_training/ppcls/data/dataloader/multilabel_dataset.py
new file mode 100755
index 000000000..c67a5ae78
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/dataloader/multilabel_dataset.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import os
+import cv2
+
+from ppcls.data.preprocess import transform
+from ppcls.utils import logger
+
+from .common_dataset import CommonDataset
+
+
+class MultiLabelDataset(CommonDataset):
+    def _load_anno(self, label_ratio=False):
+        assert os.path.exists(self._cls_path)
+        assert os.path.exists(self._img_root)
+        self.label_ratio = label_ratio
+        self.images = []
+        self.labels = []
+        with open(self._cls_path) as fd:
+            lines = fd.readlines()
+            for l in lines:
+                l = l.strip().split("\t")
+                self.images.append(os.path.join(self._img_root, l[0]))
+
+                labels = l[1].split(',')
+                labels = [np.int64(i) for i in labels]
+
+                self.labels.append(labels)
+                assert os.path.exists(self.images[-1])
+        if self.label_ratio is not False:
+            return np.array(self.labels).mean(0).astype("float32")
+
+    def __getitem__(self, idx):
+        try:
+            with open(self.images[idx], 'rb') as f:
+                img = f.read()
+            if self._transform_ops:
+                img = transform(img, self._transform_ops)
+            img = img.transpose((2, 0, 1))
+            label = np.array(self.labels[idx]).astype("float32")
+            if self.label_ratio is not False:
+                return (img, np.array([label, self.label_ratio]))
+            else:
+                return (img, label)
+
+        except Exception as ex:
+            logger.error("Exception occurred when parsing line: {} with msg: {}".
+                         format(self.images[idx], ex))
+            rnd_idx = np.random.randint(self.__len__())
+            return self.__getitem__(rnd_idx)
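# [Editor's note] When label_ratio is enabled, _load_anno above returns the
# per-class positive frequency over the training list; downstream consumers
# (e.g. a weighted multi-label BCE loss) can use it to reweight rare
# attributes. A minimal numeric sketch of that computation:
import numpy as np

labels = np.array([[1, 0, 1],
                   [0, 0, 1],
                   [1, 1, 1]], dtype="float32")
label_ratio = labels.mean(0)  # -> array([0.6667, 0.3333, 1.0])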
diff --git a/example/low_precision_training/ppcls/data/dataloader/person_dataset.py b/example/low_precision_training/ppcls/data/dataloader/person_dataset.py
new file mode 100755
index 000000000..eac7bc782
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/dataloader/person_dataset.py
@@ -0,0 +1,269 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import paddle
+from paddle.io import Dataset
+import os
+import cv2
+
+from ppcls.data import preprocess
+from ppcls.data.preprocess import transform
+from ppcls.utils import logger
+from .common_dataset import create_operators
+import os.path as osp
+import glob
+import re
+from PIL import Image
+
+
+class Market1501(Dataset):
+    """
+    Market1501
+    Reference:
+        Zheng et al. Scalable Person Re-identification: A Benchmark. ICCV 2015.
+    URL: http://www.liangzheng.org/Project/project_reid.html
+
+    Dataset statistics:
+        # identities: 1501 (+1 for background)
+        # images: 12936 (train) + 3368 (query) + 15913 (gallery)
+    """
+    _dataset_dir = 'market1501/Market-1501-v15.09.15'
+
+    def __init__(self,
+                 image_root,
+                 cls_label_path,
+                 transform_ops=None,
+                 backend="cv2"):
+        self._img_root = image_root
+        self._cls_path = cls_label_path  # the sub folder in the dataset
+        self._dataset_dir = osp.join(image_root, self._dataset_dir,
+                                     self._cls_path)
+        self._check_before_run()
+        if transform_ops:
+            self._transform_ops = create_operators(transform_ops)
+        self.backend = backend
+        self._dtype = paddle.get_default_dtype()
+        self._load_anno(relabel=True if 'train' in self._cls_path else False)
+
+    def _check_before_run(self):
+        """Check if the file is available before going deeper"""
+        if not osp.exists(self._dataset_dir):
+            raise RuntimeError("'{}' is not available".format(
+                self._dataset_dir))
+
+    def _load_anno(self, relabel=False):
+        img_paths = glob.glob(osp.join(self._dataset_dir, '*.jpg'))
+        pattern = re.compile(r'([-\d]+)_c(\d)')
+
+        self.images = []
+        self.labels = []
+        self.cameras = []
+        pid_container = set()
+
+        for img_path in sorted(img_paths):
+            pid, _ = map(int, pattern.search(img_path).groups())
+            if pid == -1: continue  # junk images are just ignored
+            pid_container.add(pid)
+        pid2label = {pid: label for label, pid in enumerate(pid_container)}
+
+        for img_path in sorted(img_paths):
+            pid, camid = map(int, pattern.search(img_path).groups())
+            if pid == -1: continue  # junk images are just ignored
+            assert 0 <= pid <= 1501  # pid == 0 means background
+            assert 1 <= camid <= 6
+            camid -= 1  # index starts from 0
+            if relabel: pid = pid2label[pid]
+            self.images.append(img_path)
+            self.labels.append(pid)
+            self.cameras.append(camid)
+
+        self.num_pids, self.num_imgs, self.num_cams = get_imagedata_info(
+            self.images, self.labels, self.cameras, subfolder=self._cls_path)
+
+    def __getitem__(self, idx):
+        try:
+            img = Image.open(self.images[idx]).convert('RGB')
+            if self.backend == "cv2":
+                img = np.array(img, dtype="float32").astype(np.uint8)
+            if self._transform_ops:
+                img = transform(img, self._transform_ops)
+            if self.backend == "cv2":
+                img = img.transpose((2, 0, 1))
+            return (img, self.labels[idx], self.cameras[idx])
+        except Exception as ex:
+            logger.error("Exception occurred when parsing line: {} with msg: {}".
+                         format(self.images[idx], ex))
+            rnd_idx = np.random.randint(self.__len__())
+            return self.__getitem__(rnd_idx)
+
+    def __len__(self):
+        return len(self.images)
+
+    @property
+    def class_num(self):
+        return len(set(self.labels))
+
+
+class MSMT17(Dataset):
+    """
+    MSMT17
+
+    Reference:
+        Wei et al. Person Transfer GAN to Bridge Domain Gap for Person Re-Identification. CVPR 2018.
+
+    URL: http://www.pkuvmc.com/publications/msmt17.html
+
+    Dataset statistics:
+        # identities: 4101
+        # images: 32621 (train) + 11659 (query) + 82161 (gallery)
+        # cameras: 15
+    """
+    _dataset_dir = 'msmt17/MSMT17_V1'
+
+    def __init__(self, image_root, cls_label_path, transform_ops=None):
+        self._img_root = image_root
+        self._cls_path = cls_label_path  # the sub folder in the dataset
+        self._dataset_dir = osp.join(image_root, self._dataset_dir,
+                                     self._cls_path)
+        self._check_before_run()
+        if transform_ops:
+            self._transform_ops = create_operators(transform_ops)
+        self._dtype = paddle.get_default_dtype()
+        self._load_anno(relabel=True if 'train' in self._cls_path else False)
+
+    def _check_before_run(self):
+        """Check if the file is available before going deeper"""
+        if not osp.exists(self._dataset_dir):
+            raise RuntimeError("'{}' is not available".format(
+                self._dataset_dir))
+
+    def _load_anno(self, relabel=False):
+        img_paths = glob.glob(osp.join(self._dataset_dir, '*.jpg'))
+        pattern = re.compile(r'([-\d]+)_c(\d+)')
+
+        self.images = []
+        self.labels = []
+        self.cameras = []
+        pid_container = set()
+
+        for img_path in img_paths:
+            pid, _ = map(int, pattern.search(img_path).groups())
+            if pid == -1:
+                continue  # junk images are just ignored
+            pid_container.add(pid)
+        pid2label = {pid: label for label, pid in enumerate(pid_container)}
+
+        for img_path in img_paths:
+            pid, camid = map(int, pattern.search(img_path).groups())
+            if pid == -1:
+                continue  # junk images are just ignored
+            assert 1 <= camid <= 15
+            camid -= 1  # index starts from 0
+            if relabel:
+                pid = pid2label[pid]
+            self.images.append(img_path)
+            self.labels.append(pid)
+            self.cameras.append(camid)
+
+        self.num_pids, self.num_imgs, self.num_cams = get_imagedata_info(
+            self.images, self.labels, self.cameras, subfolder=self._cls_path)
+
+    def __getitem__(self, idx):
+        try:
+            img = Image.open(self.images[idx]).convert('RGB')
+            img = np.array(img, dtype="float32").astype(np.uint8)
+            if self._transform_ops:
+                img = transform(img, self._transform_ops)
+            img = img.transpose((2, 0, 1))
+            return (img, self.labels[idx], self.cameras[idx])
+        except Exception as ex:
+            logger.error("Exception occurred when parsing line: {} with msg: {}".
+                         format(self.images[idx], ex))
+            rnd_idx = np.random.randint(self.__len__())
+            return self.__getitem__(rnd_idx)
+
+    def __len__(self):
+        return len(self.images)
+
+    @property
+    def class_num(self):
+        return len(set(self.labels))
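# [Editor's note] Both Market1501 and MSMT17 above recover the identity and
# camera ids from the image file name with a regex. A quick sketch on a
# typical Market-1501 style name (the file name below is illustrative):
import re

pattern = re.compile(r'([-\d]+)_c(\d)')
pid, camid = map(int, pattern.search('0002_c1s1_000451_03.jpg').groups())
# pid == 2, camid == 1; a pid of -1 marks junk images, which the loaders skip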
+
+
+class DukeMTMC(Market1501):
+    """
+    DukeMTMC-reID.
+
+    Reference:
+        Ristani et al. Performance Measures and a Data Set for Multi-Target, Multi-Camera Tracking. ECCVW 2016.
+        Zheng et al. Unlabeled Samples Generated by GAN Improve the Person Re-identification Baseline in vitro. ICCV 2017.
+
+    URL: https://github.com/layumi/DukeMTMC-reID_evaluation
+
+    Dataset statistics:
+        # identities: 1404 (train + query)
+        # images: 16522 (train) + 2228 (query) + 17661 (gallery)
+        # cameras: 8
+    """
+    _dataset_dir = 'dukemtmc/DukeMTMC-reID'
+
+    def _load_anno(self, relabel=False):
+        img_paths = glob.glob(osp.join(self._dataset_dir, '*.jpg'))
+        pattern = re.compile(r'([-\d]+)_c(\d+)')
+
+        self.images = []
+        self.labels = []
+        self.cameras = []
+        pid_container = set()
+
+        for img_path in img_paths:
+            pid, _ = map(int, pattern.search(img_path).groups())
+            pid_container.add(pid)
+        pid2label = {pid: label for label, pid in enumerate(pid_container)}
+
+        for img_path in img_paths:
+            pid, camid = map(int, pattern.search(img_path).groups())
+            assert 1 <= camid <= 8
+            camid -= 1  # index starts from 0
+            if relabel:
+                pid = pid2label[pid]
+            self.images.append(img_path)
+            self.labels.append(pid)
+            self.cameras.append(camid)
+
+        self.num_pids, self.num_imgs, self.num_cams = get_imagedata_info(
+            self.images, self.labels, self.cameras, subfolder=self._cls_path)
+
+
+def get_imagedata_info(data, labels, cameras, subfolder='train'):
+    pids, cams = [], []
+    for _, pid, camid in zip(data, labels, cameras):
+        pids += [pid]
+        cams += [camid]
+    pids = set(pids)
+    cams = set(cams)
+    num_pids = len(pids)
+    num_cams = len(cams)
+    num_imgs = len(data)
+    print("Dataset statistics:")
+    print("  ----------------------------------------")
+    print("  subset   | # ids | # images | # cameras")
+    print("  ----------------------------------------")
+    print("  {}    | {:5d} | {:8d} | {:9d}".format(subfolder, num_pids,
+                                                   num_imgs, num_cams))
+    print("  ----------------------------------------")
+    return num_pids, num_imgs, num_cams
diff --git a/example/low_precision_training/ppcls/data/dataloader/pk_sampler.py b/example/low_precision_training/ppcls/data/dataloader/pk_sampler.py
new file mode 100755
index 000000000..11d1ac8e6
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/dataloader/pk_sampler.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+
+from collections import defaultdict
+
+import numpy as np
+import paddle.distributed as dist
+from paddle.io import DistributedBatchSampler
+
+from ppcls.utils import logger
+
+
+class PKSampler(DistributedBatchSampler):
+    """First, randomly sample P identities.
+    Then for each identity randomly sample K instances.
+    Therefore the batch size equals P * K, hence the name PKSampler.
+
+    Args:
+        dataset (Dataset): Dataset which contains list of (img_path, pid, camid)
+        batch_size (int): batch size
+        sample_per_id (int): number of instance(s) within a class
+        shuffle (bool, optional): whether to shuffle. Defaults to True.
+        id_list(list): list of (start_id, end_id, start_id, end_id) for sets of ids to be duplicated.
+        ratio(list): list of (ratio1, ratio2..) giving the duplication factor for the ids in id_list.
+        drop_last (bool, optional): whether to discard the data at the end. Defaults to True.
+        sample_method (str, optional): sample method when generating prob_list. Defaults to "sample_avg_prob".
+        total_epochs (int, optional): total epochs. Defaults to 0.
+    """
+
+    def __init__(self,
+                 dataset,
+                 batch_size,
+                 sample_per_id,
+                 shuffle=True,
+                 drop_last=True,
+                 id_list=None,
+                 ratio=None,
+                 sample_method="sample_avg_prob",
+                 total_epochs=0):
+        super().__init__(
+            dataset, batch_size, shuffle=shuffle, drop_last=drop_last)
+        assert batch_size % sample_per_id == 0, \
+            f"PKSampler configs error, sample_per_id({sample_per_id}) must be a divisor of batch_size({batch_size})."
+        assert hasattr(self.dataset,
+                       "labels"), "Dataset must have labels attribute."
+        self.sample_per_id = sample_per_id
+        self.label_dict = defaultdict(list)
+        self.sample_method = sample_method
+        self.total_epochs = total_epochs
+        for idx, label in enumerate(self.dataset.labels):
+            self.label_dict[label].append(idx)
+        self.label_list = list(self.label_dict)
+        assert len(self.label_list) * self.sample_per_id >= self.batch_size, \
+            f"batch size({self.batch_size}) should not be bigger than #classes({len(self.label_list)})*sample_per_id({self.sample_per_id})"
+        if self.sample_method == "id_avg_prob":
+            self.prob_list = np.array([1 / len(self.label_list)] *
+                                      len(self.label_list))
+        elif self.sample_method == "sample_avg_prob":
+            counter = []
+            for label_i in self.label_list:
+                counter.append(len(self.label_dict[label_i]))
+            self.prob_list = np.array(counter) / sum(counter)
+        else:
+            logger.error(
+                "PKSampler only supports the id_avg_prob and sample_avg_prob sample methods, "
+                "but received {}.".format(self.sample_method))
+
+        if id_list and ratio:
+            assert len(id_list) % 2 == 0 and len(id_list) == len(ratio) * 2
+            for i in range(len(self.prob_list)):
+                for j in range(len(ratio)):
+                    if i >= id_list[j * 2] and i <= id_list[j * 2 + 1]:
+                        self.prob_list[i] = self.prob_list[i] * ratio[j]
+                        break
+            self.prob_list = self.prob_list / sum(self.prob_list)
+
+        diff = np.abs(sum(self.prob_list) - 1)
+        if diff > 0.00000001:
+            self.prob_list[-1] = 1 - sum(self.prob_list[:-1])
+            if self.prob_list[-1] > 1 or self.prob_list[-1] < 0:
+                logger.error("PKSampler prob list error")
+            else:
+                logger.info(
+                    "PKSampler: sum of prob_list is not equal to 1, diff is {}, changing the last prob".
+                    format(diff))
+
+    def __iter__(self):
+        # shuffle manually, same as DistributedBatchSampler.__iter__
+        if self.shuffle:
+            rank = dist.get_rank()
+            np.random.RandomState(rank * self.total_epochs +
+                                  self.epoch).shuffle(self.label_list)
+            np.random.RandomState(rank * self.total_epochs +
+                                  self.epoch).shuffle(self.prob_list)
+            self.epoch += 1
+
+        label_per_batch = self.batch_size // self.sample_per_id
+        for _ in range(len(self)):
+            batch_index = []
+            batch_label_list = np.random.choice(
+                self.label_list,
+                size=label_per_batch,
+                replace=False,
+                p=self.prob_list)
+            for label_i in batch_label_list:
+                label_i_indexes = self.label_dict[label_i]
+                if self.sample_per_id <= len(label_i_indexes):
+                    batch_index.extend(
+                        np.random.choice(
+                            label_i_indexes,
+                            size=self.sample_per_id,
+                            replace=False))
+                else:
+                    batch_index.extend(
+                        np.random.choice(
+                            label_i_indexes,
+                            size=self.sample_per_id,
+                            replace=True))
+            if not self.drop_last or len(batch_index) == self.batch_size:
+                yield batch_index
diff --git a/example/low_precision_training/ppcls/data/dataloader/ra_sampler.py b/example/low_precision_training/ppcls/data/dataloader/ra_sampler.py
new file mode 100755
index 000000000..cfba5492d
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/dataloader/ra_sampler.py
@@ -0,0 +1,57 @@
+import math
+import numpy as np
+
+from paddle.io import DistributedBatchSampler
+
+
+class RASampler(DistributedBatchSampler):
+    """
+    based on https://github.com/facebookresearch/deit/blob/main/samplers.py
+    """
+
+    def __init__(self,
+                 dataset,
+                 batch_size,
+                 num_replicas=None,
+                 rank=None,
+                 shuffle=False,
+                 drop_last=False,
+                 num_repeats: int=3):
+        super().__init__(dataset, batch_size, num_replicas, rank, shuffle,
+                         drop_last)
+        self.num_repeats = num_repeats
+        self.num_samples = int(
+            math.ceil(len(self.dataset) * num_repeats / self.nranks))
+        self.total_size = self.num_samples * self.nranks
+        self.num_selected_samples = int(
+            math.floor(len(self.dataset) // 256 * 256 / self.nranks))
+
+    def __iter__(self):
+        num_samples = len(self.dataset)
+        indices = np.arange(num_samples).tolist()
+        if self.shuffle:
+            np.random.RandomState(self.epoch).shuffle(indices)
+            self.epoch += 1
+
+        indices = [ele for ele in indices for i in range(self.num_repeats)]
+        indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+
+        # subsample
+        indices = indices[self.local_rank:self.total_size:self.nranks]
+        assert len(indices) == self.num_samples
+        _sample_iter = iter(indices[:self.num_selected_samples])
+
+        batch_indices = []
+        for idx in _sample_iter:
+            batch_indices.append(idx)
+            if len(batch_indices) == self.batch_size:
+                yield batch_indices
+                batch_indices = []
+        if not self.drop_last and len(batch_indices) > 0:
+            yield batch_indices
+
+    def __len__(self):
+        num_samples = self.num_selected_samples
+        num_samples += int(not self.drop_last) * (self.batch_size - 1)
+        return num_samples // self.batch_size
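# [Editor's note] RASampler above implements repeated augmentation: every
# index is repeated num_repeats times before the per-rank striding, so each
# GPU sees several (differently augmented) copies of the same image per epoch.
# A small sketch of the index expansion for 2 ranks:
indices = [0, 1, 2, 3]
num_repeats, nranks = 3, 2
repeated = [i for i in indices for _ in range(num_repeats)]
# -> [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
rank0 = repeated[0::nranks]  # [0, 0, 1, 2, 2, 3]
rank1 = repeated[1::nranks]  # [0, 1, 1, 2, 3, 3]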
diff --git a/example/low_precision_training/ppcls/data/dataloader/vehicle_dataset.py b/example/low_precision_training/ppcls/data/dataloader/vehicle_dataset.py
new file mode 100755
index 000000000..b2dba4d49
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/dataloader/vehicle_dataset.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import paddle
+from paddle.io import Dataset
+import os
+import cv2
+from PIL import Image
+from ppcls.data.preprocess import transform
+from ppcls.utils import logger
+from .common_dataset import create_operators
+
+
+class CompCars(Dataset):
+    def __init__(self,
+                 image_root,
+                 cls_label_path,
+                 label_root=None,
+                 transform_ops=None,
+                 bbox_crop=False):
+        self._img_root = image_root
+        self._cls_path = cls_label_path
+        self._label_root = label_root
+        if transform_ops:
+            self._transform_ops = create_operators(transform_ops)
+        self._bbox_crop = bbox_crop
+        self._dtype = paddle.get_default_dtype()
+        self._load_anno()
+
+    def _load_anno(self):
+        assert os.path.exists(self._cls_path)
+        assert os.path.exists(self._img_root)
+        if self._bbox_crop:
+            assert os.path.exists(self._label_root)
+        self.images = []
+        self.labels = []
+        self.bboxes = []
+        with open(self._cls_path) as fd:
+            lines = fd.readlines()
+            for l in lines:
+                l = l.strip().split()
+                if not self._bbox_crop:
+                    self.images.append(os.path.join(self._img_root, l[0]))
+                    self.labels.append(int(l[1]))
+                else:
+                    label_path = os.path.join(self._label_root,
+                                              l[0].split('.')[0] + '.txt')
+                    assert os.path.exists(label_path)
+                    with open(label_path) as f:
+                        bbox = f.readlines()[-1].strip().split()
+                        bbox = [int(x) for x in bbox]
+                    self.images.append(os.path.join(self._img_root, l[0]))
+                    self.labels.append(int(l[1]))
+                    self.bboxes.append(bbox)
+                assert os.path.exists(self.images[-1])
+
+    def __getitem__(self, idx):
+        img = cv2.imread(self.images[idx])
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        if self._bbox_crop:
+            bbox = self.bboxes[idx]
+            img = img[bbox[1]:bbox[3], bbox[0]:bbox[2], :]
+        if self._transform_ops:
+            img = transform(img, self._transform_ops)
+        img = img.transpose((2, 0, 1))
+        return (img, self.labels[idx])
+
+    def __len__(self):
+        return len(self.images)
+
+    @property
+    def class_num(self):
+        return len(set(self.labels))
+
+
+class VeriWild(Dataset):
+    """Dataset for vehicle and other similarly structured data, such as VeRI-Wild, SOP, Inshop...
+    Args:
+        image_root (str): image root
+        cls_label_path (str): path to annotation file
+        transform_ops (List[Callable], optional): list of transform op(s). Defaults to None.
+        backend (str, optional): pil or cv2. Defaults to "cv2".
+        relabel (bool, optional): whether to relabel when the original labels do not start from 0 or are discontinuous. Defaults to False.
+    """
+
+    def __init__(self,
+                 image_root,
+                 cls_label_path,
+                 transform_ops=None,
+                 backend="cv2",
+                 relabel=False):
+        self._img_root = image_root
+        self._cls_path = cls_label_path
+        if transform_ops:
+            self._transform_ops = create_operators(transform_ops)
+        self.backend = backend
+        self._dtype = paddle.get_default_dtype()
+        self._load_anno(relabel)
+
+    def _load_anno(self, relabel):
+        assert os.path.exists(
+            self._cls_path), f"path {self._cls_path} does not exist."
+        assert os.path.exists(
+            self._img_root), f"path {self._img_root} does not exist."
+        self.images = []
+        self.labels = []
+        self.cameras = []
+        with open(self._cls_path) as fd:
+            lines = fd.readlines()
+            if relabel:
+                label_set = set()
+                for line in lines:
+                    line = line.strip().split()
+                    label_set.add(np.int64(line[1]))
+                label_map = {
+                    oldlabel: newlabel
+                    for newlabel, oldlabel in enumerate(label_set)
+                }
+            for line in lines:
+                line = line.strip().split()
+                self.images.append(os.path.join(self._img_root, line[0]))
+                if relabel:
+                    self.labels.append(label_map[np.int64(line[1])])
+                else:
+                    self.labels.append(np.int64(line[1]))
+                if len(line) >= 3:
+                    self.cameras.append(np.int64(line[2]))
+                assert os.path.exists(self.images[-1]), \
+                    f"path {self.images[-1]} does not exist."
+
+        self.has_camera = len(self.cameras) > 0
+
+    def __getitem__(self, idx):
+        try:
+            if self.backend == "cv2":
+                with open(self.images[idx], 'rb') as f:
+                    img = f.read()
+            else:
+                img = Image.open(self.images[idx]).convert("RGB")
+            if self._transform_ops:
+                img = transform(img, self._transform_ops)
+            if self.backend == "cv2":
+                img = img.transpose((2, 0, 1))
+            if self.has_camera:
+                return (img, self.labels[idx], self.cameras[idx])
+            else:
+                return (img, self.labels[idx])
+        except Exception as ex:
+            logger.error("Exception occurred when parsing line: {} with msg: {}".
+                         format(self.images[idx], ex))
+            rnd_idx = np.random.randint(self.__len__())
+            return self.__getitem__(rnd_idx)
+
+    def __len__(self):
+        return len(self.images)
+
+    @property
+    def class_num(self):
+        return len(set(self.labels))
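# [Editor's note] VeriWild above parses a whitespace-delimited list file with
# one "relative/path label [camera_id]" record per line; the camera column is
# optional. A sketch that writes a tiny, purely illustrative annotation file
# in that layout (paths and ids below are made up):
lines = [
    "images/00001/0001.jpg 0 3",
    "images/00001/0002.jpg 0 5",
    "images/00002/0001.jpg 1 2",
]
with open("train_list.txt", "w") as f:
    f.write("\n".join(lines) + "\n")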
diff --git a/example/low_precision_training/ppcls/data/postprocess/__init__.py b/example/low_precision_training/ppcls/data/postprocess/__init__.py
new file mode 100755
index 000000000..202f5be8b
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/postprocess/__init__.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import copy
+import importlib
+
+from . import topk, threshoutput
+
+from .topk import Topk
+from .threshoutput import ThreshOutput, MultiLabelThreshOutput
+from .attr_rec import VehicleAttribute, PersonAttribute, TableAttribute
+from .scoreoutput import ScoreOutput
+
+
+def build_postprocess(config):
+    config = copy.deepcopy(config)
+    model_name = config.pop("name")
+    mod = importlib.import_module(__name__)
+    postprocess_func = getattr(mod, model_name)(**config)
+    return postprocess_func
+
+
+class DistillationPostProcess(object):
+    def __init__(self, model_name="Student", key=None, func="Topk", **kargs):
+        super().__init__()
+        self.func = eval(func)(**kargs)
+        self.model_name = model_name
+        self.key = key
+
+    def __call__(self, x, file_names=None):
+        x = x[self.model_name]
+        if self.key is not None:
+            x = x[self.key]
+        return self.func(x, file_names=file_names)
diff --git a/example/low_precision_training/ppcls/data/postprocess/attr_rec.py b/example/low_precision_training/ppcls/data/postprocess/attr_rec.py
new file mode 100755
index 000000000..ff6dcee16
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/postprocess/attr_rec.py
@@ -0,0 +1,283 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+
+
+class VehicleAttribute(object):
+    def __init__(self, color_threshold=0.5, type_threshold=0.5):
+        self.color_threshold = color_threshold
+        self.type_threshold = type_threshold
+        self.color_list = [
+            "yellow", "orange", "green", "gray", "red", "blue", "white",
+            "golden", "brown", "black"
+        ]
+        self.type_list = [
+            "sedan", "suv", "van", "hatchback", "mpv", "pickup", "bus",
+            "truck", "estate"
+        ]
+
+    def __call__(self, x, file_names=None):
+        if isinstance(x, dict):
+            x = x['logits']
+        assert isinstance(x, paddle.Tensor)
+        if file_names is not None:
+            assert x.shape[0] == len(file_names)
+        x = F.sigmoid(x).numpy()
+
+        # postprocess output of predictor
+        batch_res = []
+        for idx, res in enumerate(x):
+            res = res.tolist()
+            label_res = []
+            color_idx = np.argmax(res[:10])
+            type_idx = np.argmax(res[10:])
+            if res[color_idx] >= self.color_threshold:
+                color_info = f"Color: ({self.color_list[color_idx]}, prob: {res[color_idx]})"
+            else:
+                color_info = "Color unknown"
+
+            if res[type_idx + 10] >= self.type_threshold:
+                type_info = f"Type: ({self.type_list[type_idx]}, prob: {res[type_idx + 10]})"
+            else:
+                type_info = "Type unknown"
+
+            label_res = f"{color_info}, {type_info}"
+
+            threshold_list = [self.color_threshold
+                              ] * 10 + [self.type_threshold] * 9
+            pred_res = (np.array(res) > np.array(threshold_list)
+                        ).astype(np.int8).tolist()
+            batch_res.append({
+                "attr": label_res,
+                "pred": pred_res,
+                "file_name": file_names[idx]
+            })
+        return batch_res
+
+
+class PersonAttribute(object):
+    def __init__(self,
+                 threshold=0.5,
+                 glasses_threshold=0.3,
+                 hold_threshold=0.6):
+        self.threshold = threshold
+        self.glasses_threshold = glasses_threshold
self.hold_threshold = hold_threshold + + def __call__(self, x, file_names=None): + if isinstance(x, dict): + x = x['logits'] + assert isinstance(x, paddle.Tensor) + if file_names is not None: + assert x.shape[0] == len(file_names) + x = F.sigmoid(x).numpy() + + # postprocess output of predictor + age_list = ['AgeLess18', 'Age18-60', 'AgeOver60'] + direct_list = ['Front', 'Side', 'Back'] + bag_list = ['HandBag', 'ShoulderBag', 'Backpack'] + upper_list = ['UpperStride', 'UpperLogo', 'UpperPlaid', 'UpperSplice'] + lower_list = [ + 'LowerStripe', 'LowerPattern', 'LongCoat', 'Trousers', 'Shorts', + 'Skirt&Dress' + ] + batch_res = [] + for idx, res in enumerate(x): + res = res.tolist() + label_res = [] + # gender + gender = 'Female' if res[22] > self.threshold else 'Male' + label_res.append(gender) + # age + age = age_list[np.argmax(res[19:22])] + label_res.append(age) + # direction + direction = direct_list[np.argmax(res[23:])] + label_res.append(direction) + # glasses + glasses = 'Glasses: ' + if res[1] > self.glasses_threshold: + glasses += 'True' + else: + glasses += 'False' + label_res.append(glasses) + # hat + hat = 'Hat: ' + if res[0] > self.threshold: + hat += 'True' + else: + hat += 'False' + label_res.append(hat) + # hold obj + hold_obj = 'HoldObjectsInFront: ' + if res[18] > self.hold_threshold: + hold_obj += 'True' + else: + hold_obj += 'False' + label_res.append(hold_obj) + # bag + bag = bag_list[np.argmax(res[15:18])] + bag_score = res[15 + np.argmax(res[15:18])] + bag_label = bag if bag_score > self.threshold else 'No bag' + label_res.append(bag_label) + # upper + upper_res = res[4:8] + upper_label = 'Upper:' + sleeve = 'LongSleeve' if res[3] > res[2] else 'ShortSleeve' + upper_label += ' {}'.format(sleeve) + for i, r in enumerate(upper_res): + if r > self.threshold: + upper_label += ' {}'.format(upper_list[i]) + label_res.append(upper_label) + # lower + lower_res = res[8:14] + lower_label = 'Lower: ' + has_lower = False + for i, l in enumerate(lower_res): + if l > self.threshold: + lower_label += ' {}'.format(lower_list[i]) + has_lower = True + if not has_lower: + lower_label += ' {}'.format(lower_list[np.argmax(lower_res)]) + + label_res.append(lower_label) + # shoe + shoe = 'Boots' if res[14] > self.threshold else 'No boots' + label_res.append(shoe) + + threshold_list = [0.5] * len(res) + threshold_list[1] = self.glasses_threshold + threshold_list[18] = self.hold_threshold + pred_res = (np.array(res) > np.array(threshold_list) + ).astype(np.int8).tolist() + + batch_res.append({"attributes": label_res, "output": pred_res}) + return batch_res + + +class FaceAttribute(object): + def __init__(self, threshold=0.65, convert_cn=False): + self.threshold = threshold + self.convert_cn = convert_cn + + def __call__(self, x, file_names=None): + if isinstance(x, dict): + x = x['logits'] + assert isinstance(x, paddle.Tensor) + + if file_names is not None: + assert x.shape[0] == len(file_names) + x = F.sigmoid(x).numpy() + + attribute_list = [ + ["CheekWhiskers", "刚长出的双颊胡须"], ["ArchedEyebrows", "柳叶眉"], + ["Attractive", "吸引人的"], ["BagsUnderEyes", "眼袋"], ["Bald", "秃头"], + ["Bangs", "刘海"], ["BigLips", "大嘴唇"], ["BigNose", "大鼻子"], + ["BlackHair", "黑发"], ["BlondHair", "金发"], ["Blurry", "模糊的"], + ["BrownHair", "棕发"], ["BushyEyebrows", "浓眉"], ["Chubby", "圆胖的"], + ["DoubleChin", "双下巴"], ["Eyeglasses", "带眼镜"], ["Goatee", "山羊胡子"], + ["GrayHair", "灰发或白发"], ["HeavyMakeup", "浓妆"], + ["HighCheekbones", "高颧骨"], ["Male", "男性"], + ["MouthSlightlyOpen", "微微张开嘴巴"], ["Mustache", "胡子"], + ["NarrowEyes", "细长的眼睛"], 
["NoBeard", "无胡子"], + ["OvalFace", "椭圆形的脸"], ["PaleSkin", "苍白的皮肤"], + ["PointyNose", "尖鼻子"], ["RecedingHairline", "发际线后移"], + ["RosyCheeks", "红润的双颊"], ["Sideburns", "连鬓胡子"], ["Smiling", "微笑"], + ["StraightHair", "直发"], ["WavyHair", "卷发"], + ["WearingEarrings", "戴着耳环"], ["WearingHat", "戴着帽子"], + ["WearingLipstick", "涂了唇膏"], ["WearingNecklace", "戴着项链"], + ["WearingNecktie", "戴着领带"], ["Young", "年轻人"] + ] + gender_list = [["Male", "男性"], ["Female", "女性"]] + age_list = [["Young", "年轻人"], ["Old", "老年人"]] + batch_res = [] + index = 1 if self.convert_cn else 0 + for idx, res in enumerate(x): + res = res.tolist() + label_res = [] + threshold_list = [self.threshold] * len(res) + pred_res = (np.array(res) > np.array(threshold_list) + ).astype(np.int8).tolist() + for i, value in enumerate(pred_res): + if i == 20: + label_res.append(gender_list[0][index] + if value == 1 else gender_list[1][index]) + elif i == 39: + label_res.append(age_list[0][index] + if value == 1 else age_list[1][index]) + else: + if value == 1: + label_res.append(attribute_list[i][index]) + batch_res.append({"attributes": label_res, "output": pred_res}) + return batch_res + + +class TableAttribute(object): + def __init__( + self, + source_threshold=0.5, + number_threshold=0.5, + color_threshold=0.5, + clarity_threshold=0.5, + obstruction_threshold=0.5, + angle_threshold=0.5, ): + self.source_threshold = source_threshold + self.number_threshold = number_threshold + self.color_threshold = color_threshold + self.clarity_threshold = clarity_threshold + self.obstruction_threshold = obstruction_threshold + self.angle_threshold = angle_threshold + + def __call__(self, x, file_names=None): + if isinstance(x, dict): + x = x['logits'] + assert isinstance(x, paddle.Tensor) + if file_names is not None: + assert x.shape[0] == len(file_names) + x = F.sigmoid(x).numpy() + + # postprocess output of predictor + batch_res = [] + for idx, res in enumerate(x): + res = res.tolist() + label_res = [] + source = 'Scanned' if res[0] > self.source_threshold else 'Photo' + number = 'Little' if res[1] > self.number_threshold else 'Numerous' + color = 'Black-and-White' if res[ + 2] > self.color_threshold else 'Multicolor' + clarity = 'Clear' if res[3] > self.clarity_threshold else 'Blurry' + obstruction = 'Without-Obstacles' if res[ + 4] > self.number_threshold else 'With-Obstacles' + angle = 'Horizontal' if res[ + 5] > self.number_threshold else 'Tilted' + + label_res = [source, number, color, clarity, obstruction, angle] + + threshold_list = [ + self.source_threshold, self.number_threshold, + self.color_threshold, self.clarity_threshold, + self.obstruction_threshold, self.angle_threshold + ] + pred_res = (np.array(res) > np.array(threshold_list) + ).astype(np.int8).tolist() + batch_res.append({ + "attributes": label_res, + "output": pred_res, + "file_name": file_names[idx] + }) + return batch_res diff --git a/example/low_precision_training/ppcls/data/postprocess/scoreoutput.py b/example/low_precision_training/ppcls/data/postprocess/scoreoutput.py new file mode 100755 index 000000000..d68dd54f7 --- /dev/null +++ b/example/low_precision_training/ppcls/data/postprocess/scoreoutput.py @@ -0,0 +1,18 @@ +import numpy +import numpy as np +import paddle + + +class ScoreOutput(object): + def __init__(self, decimal_places): + self.decimal_places = decimal_places + + def __call__(self, x, file_names=None): + y = [] + for idx, probs in enumerate(x): + score = np.around(x[idx].numpy(), self.decimal_places) + result = {"scores": score} + if file_names is not None: + 
result["file_name"] = file_names[idx] + y.append(result) + return y \ No newline at end of file diff --git a/example/low_precision_training/ppcls/data/postprocess/threshoutput.py b/example/low_precision_training/ppcls/data/postprocess/threshoutput.py new file mode 100755 index 000000000..b329288d9 --- /dev/null +++ b/example/low_precision_training/ppcls/data/postprocess/threshoutput.py @@ -0,0 +1,90 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import paddle.nn.functional as F + + +class ThreshOutput(object): + def __init__(self, threshold, label_0="0", label_1="1"): + self.threshold = threshold + self.label_0 = label_0 + self.label_1 = label_1 + + def __call__(self, x, file_names=None): + y = [] + x = F.softmax(x, axis=-1).numpy() + for idx, probs in enumerate(x): + score = probs[1] + if score < self.threshold: + result = {"class_ids": [0], "scores": [1 - score], "label_names": [self.label_0]} + else: + result = {"class_ids": [1], "scores": [score], "label_names": [self.label_1]} + if file_names is not None: + result["file_name"] = file_names[idx] + y.append(result) + return y + + +class MultiLabelThreshOutput(object): + def __init__(self, threshold=0.5, class_id_map_file=None, delimiter=None): + self.threshold = threshold + self.delimiter = delimiter if delimiter is not None else " " + self.class_id_map = self.parse_class_id_map(class_id_map_file) + + def parse_class_id_map(self, class_id_map_file): + if class_id_map_file is None: + return None + if not os.path.exists(class_id_map_file): + print( + "Warning: If want to use your own label_dict, please input legal path!\nOtherwise label_names will be empty!" 
+ ) + return None + + try: + class_id_map = {} + with open(class_id_map_file, "r") as fin: + lines = fin.readlines() + for line in lines: + partition = line.split("\n")[0].partition(self.delimiter) + class_id_map[int(partition[0])] = str(partition[-1]) + except Exception as ex: + print(ex) + class_id_map = None + return class_id_map + + def __call__(self, x, file_names=None): + y = [] + x = F.sigmoid(x).numpy() + for idx, probs in enumerate(x): + index = np.where(probs >= self.threshold)[0].astype("int32") + clas_id_list = [] + score_list = [] + label_name_list = [] + for i in index: + clas_id_list.append(i.item()) + score_list.append(probs[i].item()) + if self.class_id_map is not None: + label_name_list.append(self.class_id_map[i.item()]) + result = { + "class_ids": clas_id_list, + "scores": np.around( + score_list, decimals=5).tolist(), + "label_names": label_name_list + } + if file_names is not None: + result["file_name"] = file_names[idx] + y.append(result) + return y diff --git a/example/low_precision_training/ppcls/data/postprocess/topk.py b/example/low_precision_training/ppcls/data/postprocess/topk.py new file mode 100755 index 000000000..50cc40d49 --- /dev/null +++ b/example/low_precision_training/ppcls/data/postprocess/topk.py @@ -0,0 +1,84 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import paddle +import paddle.nn.functional as F + + +class Topk(object): + def __init__(self, topk=1, class_id_map_file=None, delimiter=None): + assert isinstance(topk, (int, )) + self.topk = topk + self.delimiter = delimiter if delimiter is not None else " " + self.class_id_map = self.parse_class_id_map(class_id_map_file) + + def parse_class_id_map(self, class_id_map_file): + if class_id_map_file is None: + return None + if not os.path.exists(class_id_map_file): + print( + "Warning: If want to use your own label_dict, please input legal path!\nOtherwise label_names will be empty!" 
+ ) + return None + + try: + class_id_map = {} + try: + with open(class_id_map_file, "r", encoding='utf-8') as fin: + lines = fin.readlines() + except Exception as e: + with open(class_id_map_file, "r", encoding='gbk') as fin: + lines = fin.readlines() + for line in lines: + partition = line.split("\n")[0].partition(self.delimiter) + class_id_map[int(partition[0])] = str(partition[-1]) + except Exception as ex: + print(ex) + class_id_map = None + return class_id_map + + def __call__(self, x, file_names=None): + if isinstance(x, dict): + x = x['logits'] + assert isinstance(x, paddle.Tensor) + if file_names is not None: + assert x.shape[0] == len(file_names) + x = F.softmax(x, axis=-1) + x = x.numpy() + y = [] + for idx, probs in enumerate(x): + index = probs.argsort(axis=0)[-self.topk:][::-1].astype( + "int32") + clas_id_list = [] + score_list = [] + label_name_list = [] + for i in index: + clas_id_list.append(i.item()) + score_list.append(probs[i].item()) + if self.class_id_map is not None: + label_name_list.append(self.class_id_map[i.item()]) + result = { + "class_ids": clas_id_list, + "scores": np.around( + score_list, decimals=5).tolist(), + } + if file_names is not None: + result["file_name"] = file_names[idx] + if label_name_list is not None: + result["label_names"] = label_name_list + y.append(result) + return y + diff --git a/example/low_precision_training/ppcls/data/preprocess/__init__.py b/example/low_precision_training/ppcls/data/preprocess/__init__.py new file mode 100755 index 000000000..20451d7bf --- /dev/null +++ b/example/low_precision_training/ppcls/data/preprocess/__init__.py @@ -0,0 +1,183 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
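+
+# Usage sketch (illustrative only, not invoked by the pipeline): the ops
+# re-exported below are plain callables, so a preprocessing chain can be
+# composed as a list and applied with the `transform` helper defined in this
+# module. Constructor arguments shown here are assumptions for illustration;
+# see ppcls/data/preprocess/ops/operators.py for the real signatures.
+#
+#   ops = [DecodeImage(to_rgb=True), ResizeImage(resize_short=256),
+#          CropImage(size=224), NormalizeImage(), ToCHWImage()]
+#   img = transform(raw_image_bytes, ops)  # raw bytes -> CHW float ndarray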
+ +from ppcls.data.preprocess.ops.autoaugment import ImageNetPolicy as RawImageNetPolicy +from ppcls.data.preprocess.ops.randaugment import RandAugment as RawRandAugment +from ppcls.data.preprocess.ops.randaugment import RandomApply +from ppcls.data.preprocess.ops.randaugment import RandAugmentV2 as RawRandAugmentV2 +from ppcls.data.preprocess.ops.randaugment import RandAugmentV3 as RawRandAugmentV3 +from ppcls.data.preprocess.ops.randaugment import RandAugmentV4 as RawRandAugmentV4 +from ppcls.data.preprocess.ops.timm_autoaugment import RawTimmAutoAugment +from ppcls.data.preprocess.ops.cutout import Cutout + +from ppcls.data.preprocess.ops.hide_and_seek import HideAndSeek +from ppcls.data.preprocess.ops.random_erasing import RandomErasing +from ppcls.data.preprocess.ops.grid import GridMask + +from ppcls.data.preprocess.ops.operators import DecodeImage +from ppcls.data.preprocess.ops.operators import ResizeImage +from ppcls.data.preprocess.ops.operators import CropImage +from ppcls.data.preprocess.ops.operators import CropImageAtRatio +from ppcls.data.preprocess.ops.operators import CenterCrop, Resize +from ppcls.data.preprocess.ops.operators import RandCropImage +from ppcls.data.preprocess.ops.operators import RandCropImageV2 +from ppcls.data.preprocess.ops.operators import RandFlipImage +from ppcls.data.preprocess.ops.operators import NormalizeImage +from ppcls.data.preprocess.ops.operators import ToCHWImage +from ppcls.data.preprocess.ops.operators import AugMix +from ppcls.data.preprocess.ops.operators import Pad +from ppcls.data.preprocess.ops.operators import ToTensor +from ppcls.data.preprocess.ops.operators import Normalize +from ppcls.data.preprocess.ops.operators import RandomHorizontalFlip +from ppcls.data.preprocess.ops.operators import RandomResizedCrop +from ppcls.data.preprocess.ops.operators import CropWithPadding +from ppcls.data.preprocess.ops.operators import RandomInterpolationAugment +from ppcls.data.preprocess.ops.operators import ColorJitter +from ppcls.data.preprocess.ops.operators import RandomGrayscale +from ppcls.data.preprocess.ops.operators import RandomCropImage +from ppcls.data.preprocess.ops.operators import RandomRotation +from ppcls.data.preprocess.ops.operators import Padv2 +from ppcls.data.preprocess.ops.operators import RandomRot90 +from ppcls.data.preprocess.ops.operators import PCALighting +from .ops.operators import format_data +from paddle.vision.transforms import Pad as Pad_paddle_vision + +from ppcls.data.preprocess.batch_ops.batch_operators import MixupOperator, CutmixOperator, OpSampler, FmixOperator +from ppcls.data.preprocess.batch_ops.batch_operators import MixupCutmixHybrid + +import numpy as np +from PIL import Image +import random + + +def transform(data, ops=[]): + """ transform """ + for op in ops: + data = op(data) + return data + + +class AutoAugment(RawImageNetPolicy): + """ ImageNetPolicy wrapper to auto fit different img types """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + + img = super().__call__(img) + + if isinstance(img, Image.Image): + img = np.asarray(img) + + return img + + +class RandAugment(RawRandAugment): + """ RandAugment wrapper to auto fit different img types """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = 
Image.fromarray(img)
+
+        img = super().__call__(img)
+
+        if isinstance(img, Image.Image):
+            img = np.asarray(img)
+
+        return img
+
+
+class RandAugmentV2(RawRandAugmentV2):
+    """ RandAugmentV2 wrapper to auto fit different img types """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def __call__(self, img):
+        if not isinstance(img, Image.Image):
+            img = np.ascontiguousarray(img)
+            img = Image.fromarray(img)
+
+        img = super().__call__(img)
+
+        if isinstance(img, Image.Image):
+            img = np.asarray(img)
+
+        return img
+
+
+class RandAugmentV3(RawRandAugmentV3):
+    """ RandAugmentV3 wrapper to auto fit different img types """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def __call__(self, img):
+        if not isinstance(img, Image.Image):
+            img = np.ascontiguousarray(img)
+            img = Image.fromarray(img)
+
+        img = super().__call__(img)
+
+        if isinstance(img, Image.Image):
+            img = np.asarray(img)
+
+        return img
+
+
+class RandAugmentV4(RawRandAugmentV4):
+    """ RandAugmentV4 wrapper to auto fit different img types """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def __call__(self, img):
+        if not isinstance(img, Image.Image):
+            img = np.ascontiguousarray(img)
+            img = Image.fromarray(img)
+
+        img = super().__call__(img)
+
+        if isinstance(img, Image.Image):
+            img = np.asarray(img)
+
+        return img
+
+
+class TimmAutoAugment(RawTimmAutoAugment):
+    """ TimmAutoAugment wrapper to auto fit different img types. """
+
+    def __init__(self, prob=1.0, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.prob = prob
+
+    @format_data
+    def __call__(self, img):
+        if not isinstance(img, Image.Image):
+            img = np.ascontiguousarray(img)
+            img = Image.fromarray(img)
+        if random.random() < self.prob:
+            img = super().__call__(img)
+        if isinstance(img, Image.Image):
+            img = np.asarray(img)
+
+        return img
diff --git a/example/low_precision_training/ppcls/data/preprocess/batch_ops/__init__.py b/example/low_precision_training/ppcls/data/preprocess/batch_ops/__init__.py
new file mode 100755
index 000000000..8b1378917
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/preprocess/batch_ops/__init__.py
@@ -0,0 +1 @@
+
diff --git a/example/low_precision_training/ppcls/data/preprocess/batch_ops/batch_operators.py b/example/low_precision_training/ppcls/data/preprocess/batch_ops/batch_operators.py
new file mode 100755
index 000000000..0040bda42
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/preprocess/batch_ops/batch_operators.py
@@ -0,0 +1,501 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
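+
+# The batch operators in this file consume the list of (img, label) tuples
+# produced by the sampler and return a list of (img, mixed_soft_label)
+# tuples. A minimal sketch of direct use (illustrative; in training they are
+# normally instantiated from the DataLoader batch_transform_ops config):
+#
+#   op = MixupOperator(class_num=1000, alpha=0.2)
+#   mixed = op(batch)  # batch: list of (CHW ndarray, int label)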
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +import random + +import numpy as np + +from ppcls.utils import logger +from ppcls.data.preprocess.ops.fmix import sample_mask + +import paddle +import paddle.nn.functional as F + + +class BatchOperator(object): + """ BatchOperator """ + + def __init__(self, *args, **kwargs): + pass + + def _unpack(self, batch): + """ _unpack """ + assert isinstance(batch, list), \ + 'batch should be a list filled with tuples (img, label)' + bs = len(batch) + assert bs > 0, 'size of the batch data should > 0' + #imgs, labels = list(zip(*batch)) + imgs = [] + labels = [] + for item in batch: + imgs.append(item[0]) + labels.append(item[1]) + return np.array(imgs), np.array(labels), bs + + def _one_hot(self, targets): + return np.eye(self.class_num, dtype="float32")[targets] + + def _mix_target(self, targets0, targets1, lam): + one_hots0 = self._one_hot(targets0) + one_hots1 = self._one_hot(targets1) + return one_hots0 * lam + one_hots1 * (1 - lam) + + def __call__(self, batch): + return batch + + +class MixupOperator(BatchOperator): + """ Mixup operator + reference: https://arxiv.org/abs/1710.09412 + + """ + + def __init__(self, class_num, alpha: float=1.): + """Build Mixup operator + + Args: + alpha (float, optional): The parameter alpha of mixup. Defaults to 1.. + + Raises: + Exception: The value of parameter is illegal. + """ + if alpha <= 0: + raise Exception( + f"Parameter \"alpha\" of Mixup should be greater than 0. \"alpha\": {alpha}." + ) + if not class_num: + msg = "Please set \"Arch.class_num\" in config if use \"MixupOperator\"." + logger.error(Exception(msg)) + raise Exception(msg) + + self._alpha = alpha + self.class_num = class_num + + def __call__(self, batch): + imgs, labels, bs = self._unpack(batch) + idx = np.random.permutation(bs) + lam = np.random.beta(self._alpha, self._alpha) + imgs = lam * imgs + (1 - lam) * imgs[idx] + targets = self._mix_target(labels, labels[idx], lam) + return list(zip(imgs, targets)) + + +class CutmixOperator(BatchOperator): + """ Cutmix operator + reference: https://arxiv.org/abs/1905.04899 + + """ + + def __init__(self, class_num, alpha=0.2): + """Build Cutmix operator + + Args: + alpha (float, optional): The parameter alpha of cutmix. Defaults to 0.2. + + Raises: + Exception: The value of parameter is illegal. + """ + if alpha <= 0: + raise Exception( + f"Parameter \"alpha\" of Cutmix should be greater than 0. \"alpha\": {alpha}." + ) + if not class_num: + msg = "Please set \"Arch.class_num\" in config if use \"CutmixOperator\"." + logger.error(Exception(msg)) + raise Exception(msg) + + self._alpha = alpha + self.class_num = class_num + + def _rand_bbox(self, size, lam): + """ _rand_bbox """ + w = size[2] + h = size[3] + cut_rat = np.sqrt(1. 
- lam)
+        cut_w = int(w * cut_rat)
+        cut_h = int(h * cut_rat)
+
+        # uniform
+        cx = np.random.randint(w)
+        cy = np.random.randint(h)
+
+        bbx1 = np.clip(cx - cut_w // 2, 0, w)
+        bby1 = np.clip(cy - cut_h // 2, 0, h)
+        bbx2 = np.clip(cx + cut_w // 2, 0, w)
+        bby2 = np.clip(cy + cut_h // 2, 0, h)
+
+        return bbx1, bby1, bbx2, bby2
+
+    def __call__(self, batch):
+        imgs, labels, bs = self._unpack(batch)
+        idx = np.random.permutation(bs)
+        lam = np.random.beta(self._alpha, self._alpha)
+
+        bbx1, bby1, bbx2, bby2 = self._rand_bbox(imgs.shape, lam)
+        imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2]
+        lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) /
+                   (imgs.shape[-2] * imgs.shape[-1]))
+        targets = self._mix_target(labels, labels[idx], lam)
+        return list(zip(imgs, targets))
+
+
+class FmixOperator(BatchOperator):
+    """ Fmix operator
+    reference: https://arxiv.org/abs/2002.12047
+
+    """
+
+    def __init__(self,
+                 class_num,
+                 alpha=1,
+                 decay_power=3,
+                 max_soft=0.,
+                 reformulate=False):
+        if not class_num:
+            msg = "Please set \"Arch.class_num\" in config if use \"FmixOperator\"."
+            logger.error(Exception(msg))
+            raise Exception(msg)
+
+        self._alpha = alpha
+        self._decay_power = decay_power
+        self._max_soft = max_soft
+        self._reformulate = reformulate
+        self.class_num = class_num
+
+    def __call__(self, batch):
+        imgs, labels, bs = self._unpack(batch)
+        idx = np.random.permutation(bs)
+        size = (imgs.shape[2], imgs.shape[3])
+        lam, mask = sample_mask(self._alpha, self._decay_power, \
+            size, self._max_soft, self._reformulate)
+        imgs = mask * imgs + (1 - mask) * imgs[idx]
+        targets = self._mix_target(labels, labels[idx], lam)
+        return list(zip(imgs, targets))
+
+
+class OpSampler(object):
+    """ Randomly sample an operator from the given op_dict """
+
+    def __init__(self, class_num, **op_dict):
+        """Build OpSampler
+
+        Raises:
+            Exception: The parameter "prob" of the operator(s) is set incorrectly.
+        """
+        if not class_num:
+            msg = "Please set \"Arch.class_num\" in config if use \"OpSampler\"."
+            logger.error(Exception(msg))
+            raise Exception(msg)
+
+        if len(op_dict) < 1:
+            msg = "ConfigWarning: No operator in \"OpSampler\". \"OpSampler\" has been skipped."
+            logger.warning(msg)
+
+        self.ops = {}
+        total_prob = 0
+        for op_name in op_dict:
+            param = op_dict[op_name]
+            if "prob" not in param:
+                msg = f"ConfigWarning: Parameter \"prob\" should be set when using an operator in \"OpSampler\". The prob of operator \"{op_name}\" has been set to \"0\"."
+                logger.warning(msg)
+            prob = param.pop("prob", 0)
+            total_prob += prob
+            param.update({"class_num": class_num})
+            op = eval(op_name)(**param)
+            self.ops.update({op: prob})
+
+        if total_prob > 1:
+            msg = "ConfigError: The total prob of operators in \"OpSampler\" should not exceed 1."
+            logger.error(Exception(msg))
+            raise Exception(msg)
+
+        # add "None Op" when total_prob < 1, "None Op" does nothing
+        self.ops[None] = 1 - total_prob
+
+    def __call__(self, batch):
+        op = random.choices(
+            list(self.ops.keys()), weights=list(self.ops.values()), k=1)[0]
+        # return batch directly when None Op
+        return op(batch) if op else batch
+
+
+class MixupCutmixHybrid(object):
+    """ Mixup/Cutmix that applies different params to each element or whole batch
+
+    Args:
+        mixup_alpha (float): mixup alpha value, mixup is active if > 0.
+        cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0.
+        cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None.
+        prob (float): probability of applying mixup or cutmix per batch or element
+        switch_prob (float): probability of switching to cutmix instead of mixup when both are active
+        mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element))
+        correct_lam (bool): apply lambda correction when cutmix bbox clipped by image borders
+        label_smoothing (float): apply label smoothing to the mixed target tensor
+        num_classes (int): number of classes for target
+    """
+
+    def __init__(self,
+                 mixup_alpha=1.,
+                 cutmix_alpha=0.,
+                 cutmix_minmax=None,
+                 prob=1.0,
+                 switch_prob=0.5,
+                 mode='batch',
+                 correct_lam=True,
+                 label_smoothing=0.1,
+                 num_classes=4):
+        self.mixup_alpha = mixup_alpha
+        self.cutmix_alpha = cutmix_alpha
+        self.cutmix_minmax = cutmix_minmax
+        if self.cutmix_minmax is not None:
+            assert len(self.cutmix_minmax) == 2
+            # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe
+            self.cutmix_alpha = 1.0
+        self.mix_prob = prob
+        self.switch_prob = switch_prob
+        self.label_smoothing = label_smoothing
+        self.num_classes = num_classes
+        self.mode = mode
+        self.correct_lam = correct_lam  # correct lambda based on clipped area for cutmix
+        self.mixup_enabled = True  # set to False to disable mixing (intended to be set by the train loop)
+
+    def _one_hot(self, x, num_classes, on_value=1., off_value=0.):
+        x = paddle.cast(x, dtype='int64')
+        on_value = paddle.full([x.shape[0], num_classes], on_value)
+        off_value = paddle.full([x.shape[0], num_classes], off_value)
+        return paddle.where(
+            F.one_hot(x, num_classes) == 1, on_value, off_value)
+
+    def _mixup_target(self, target, num_classes, lam=1., smoothing=0.0):
+        off_value = smoothing / num_classes
+        on_value = 1. - smoothing + off_value
+        y1 = self._one_hot(
+            target,
+            num_classes,
+            on_value=on_value,
+            off_value=off_value, )
+        y2 = self._one_hot(
+            target.flip(0),
+            num_classes,
+            on_value=on_value,
+            off_value=off_value)
+        return y1 * lam + y2 * (1. - lam)
+
+    def _rand_bbox(self, img_shape, lam, margin=0., count=None):
+        """ Standard CutMix bounding-box
+        Generates a random square bbox based on lambda value. This impl includes
+        support for enforcing a border margin as percent of bbox dimensions.
+
+        Args:
+            img_shape (tuple): Image shape as tuple
+            lam (float): Cutmix lambda value
+            margin (float): Percentage of bbox dimension to enforce as margin (reduce amount of box outside image)
+            count (int): Number of bbox to generate
+        """
+        ratio = np.sqrt(1 - lam)
+        img_h, img_w = img_shape[-2:]
+        cut_h, cut_w = int(img_h * ratio), int(img_w * ratio)
+        margin_y, margin_x = int(margin * cut_h), int(margin * cut_w)
+        cy = np.random.randint(0 + margin_y, img_h - margin_y, size=count)
+        cx = np.random.randint(0 + margin_x, img_w - margin_x, size=count)
+        yl = np.clip(cy - cut_h // 2, 0, img_h)
+        yh = np.clip(cy + cut_h // 2, 0, img_h)
+        xl = np.clip(cx - cut_w // 2, 0, img_w)
+        xh = np.clip(cx + cut_w // 2, 0, img_w)
+        return yl, yh, xl, xh
+
+    def _rand_bbox_minmax(self, img_shape, minmax, count=None):
+        """ Min-Max CutMix bounding-box
+        Inspired by Darknet cutmix impl, generates a random rectangular bbox
+        based on min/max percent values applied to each dimension of the input image.
+
+        Typical defaults for minmax are usually in the .2-.3 for min and .8-.9 range for max.
+
+        Args:
+            img_shape (tuple): Image shape as tuple
+            minmax (tuple or list): Min and max bbox ratios (as percent of image size)
+            count (int): Number of bbox to generate
+        """
+        assert len(minmax) == 2
+        img_h, img_w = img_shape[-2:]
+        cut_h = np.random.randint(
+            int(img_h * minmax[0]), int(img_h * minmax[1]), size=count)
+        cut_w = np.random.randint(
+            int(img_w * minmax[0]), int(img_w * minmax[1]), size=count)
+        yl = np.random.randint(0, img_h - cut_h, size=count)
+        xl = np.random.randint(0, img_w - cut_w, size=count)
+        yu = yl + cut_h
+        xu = xl + cut_w
+        return yl, yu, xl, xu
+
+    def _cutmix_bbox_and_lam(self,
+                             img_shape,
+                             lam,
+                             ratio_minmax=None,
+                             correct_lam=True,
+                             count=None):
+        """ Generate bbox and apply lambda correction.
+        """
+        if ratio_minmax is not None:
+            yl, yu, xl, xu = self._rand_bbox_minmax(
+                img_shape, ratio_minmax, count=count)
+        else:
+            yl, yu, xl, xu = self._rand_bbox(img_shape, lam, count=count)
+        if correct_lam or ratio_minmax is not None:
+            bbox_area = (yu - yl) * (xu - xl)
+            lam = 1. - bbox_area / float(img_shape[-2] * img_shape[-1])
+        return (yl, yu, xl, xu), lam
+
+    def _params_per_elem(self, batch_size):
+        lam = np.ones(batch_size, dtype=np.float32)
+        # use the builtin bool: the np.bool alias was removed in NumPy >= 1.24
+        use_cutmix = np.zeros(batch_size, dtype=bool)
+        if self.mixup_enabled:
+            if self.mixup_alpha > 0. and self.cutmix_alpha > 0.:
+                use_cutmix = np.random.rand(batch_size) < self.switch_prob
+                lam_mix = np.where(
+                    use_cutmix,
+                    np.random.beta(
+                        self.cutmix_alpha, self.cutmix_alpha, size=batch_size),
+                    np.random.beta(
+                        self.mixup_alpha, self.mixup_alpha, size=batch_size))
+            elif self.mixup_alpha > 0.:
+                lam_mix = np.random.beta(
+                    self.mixup_alpha, self.mixup_alpha, size=batch_size)
+            elif self.cutmix_alpha > 0.:
+                use_cutmix = np.ones(batch_size, dtype=bool)
+                lam_mix = np.random.beta(
+                    self.cutmix_alpha, self.cutmix_alpha, size=batch_size)
+            else:
+                assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true."
+            lam = np.where(
+                np.random.rand(batch_size) < self.mix_prob,
+                lam_mix.astype(np.float32), lam)
+        return lam, use_cutmix
+
+    def _params_per_batch(self):
+        lam = 1.
+        use_cutmix = False
+        if self.mixup_enabled and np.random.rand() < self.mix_prob:
+            if self.mixup_alpha > 0. and self.cutmix_alpha > 0.:
+                use_cutmix = np.random.rand() < self.switch_prob
+                lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) if use_cutmix else \
+                    np.random.beta(self.mixup_alpha, self.mixup_alpha)
+            elif self.mixup_alpha > 0.:
+                lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha)
+            elif self.cutmix_alpha > 0.:
+                use_cutmix = True
+                lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha)
+            else:
+                assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true."
+ lam = float(lam_mix) + return lam, use_cutmix + + def _mix_elem(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size) + x_orig = x.clone( + ) # need to keep an unmodified original for mixing source + for i in range(batch_size): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = self._cutmix_bbox_and_lam( + x[i].shape, + lam, + ratio_minmax=self.cutmix_minmax, + correct_lam=self.correct_lam) + if yl < yh and xl < xh: + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + return paddle.to_tensor(lam_batch, dtype=x.dtype).unsqueeze(1) + + def _mix_pair(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) + x_orig = x.clone( + ) # need to keep an unmodified original for mixing source + for i in range(batch_size // 2): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = self._cutmix_bbox_and_lam( + x[i].shape, + lam, + ratio_minmax=self.cutmix_minmax, + correct_lam=self.correct_lam) + if yl < yh and xl < xh: + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + x[j][:, yl:yh, xl:xh] = x_orig[i][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + x[j] = x[j] * lam + x_orig[i] * (1 - lam) + lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) + return paddle.to_tensor(lam_batch, dtype=x.dtype).unsqueeze(1) + + def _mix_batch(self, x): + lam, use_cutmix = self._params_per_batch() + if lam == 1.: + return 1. + if use_cutmix: + (yl, yh, xl, xh), lam = self._cutmix_bbox_and_lam( + x.shape, + lam, + ratio_minmax=self.cutmix_minmax, + correct_lam=self.correct_lam) + if yl < yh and xl < xh: + x[:, :, yl:yh, xl:xh] = x.flip(0)[:, :, yl:yh, xl:xh] + + else: + x_flipped = x.flip(0) * (1. - lam) + x[:] = x * lam + x_flipped + return lam + + def _unpack(self, batch): + """ _unpack """ + assert isinstance(batch, list), \ + 'batch should be a list filled with tuples (img, label)' + bs = len(batch) + assert bs > 0, 'size of the batch data should > 0' + #imgs, labels = list(zip(*batch)) + imgs = [] + labels = [] + for item in batch: + imgs.append(item[0]) + labels.append(item[1]) + return np.array(imgs), np.array(labels), bs + + def __call__(self, batch): + x, target, bs = self._unpack(batch) + x = paddle.to_tensor(x) + target = paddle.to_tensor(target) + assert len(x) % 2 == 0, 'Batch size should be even when using this' + if self.mode == 'elem': + lam = self._mix_elem(x) + elif self.mode == 'pair': + lam = self._mix_pair(x) + else: + lam = self._mix_batch(x) + target = self._mixup_target(target, self.num_classes, lam, + self.label_smoothing) + + return list(zip(x.numpy(), target.numpy())) diff --git a/example/low_precision_training/ppcls/data/preprocess/ops/__init__.py b/example/low_precision_training/ppcls/data/preprocess/ops/__init__.py new file mode 100755 index 000000000..8b1378917 --- /dev/null +++ b/example/low_precision_training/ppcls/data/preprocess/ops/__init__.py @@ -0,0 +1 @@ + diff --git a/example/low_precision_training/ppcls/data/preprocess/ops/autoaugment.py b/example/low_precision_training/ppcls/data/preprocess/ops/autoaugment.py new file mode 100755 index 000000000..43327950f --- /dev/null +++ b/example/low_precision_training/ppcls/data/preprocess/ops/autoaugment.py @@ -0,0 +1,265 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is based on https://github.com/DeepVoltaire/AutoAugment/blob/master/autoaugment.py +# reference: https://arxiv.org/abs/1805.09501 + +from PIL import Image, ImageEnhance, ImageOps +import numpy as np +import random + + +class ImageNetPolicy(object): + """ Randomly choose one of the best 24 Sub-policies on ImageNet. + + Example: + >>> policy = ImageNetPolicy() + >>> transformed = policy(image) + + Example as a PyTorch Transform: + >>> transform=transforms.Compose([ + >>> transforms.Resize(256), + >>> ImageNetPolicy(), + >>> transforms.ToTensor()]) + """ + + def __init__(self, fillcolor=(128, 128, 128)): + self.policies = [ + SubPolicy(0.4, "posterize", 8, 0.6, "rotate", 9, fillcolor), + SubPolicy(0.6, "solarize", 5, 0.6, "autocontrast", 5, fillcolor), + SubPolicy(0.8, "equalize", 8, 0.6, "equalize", 3, fillcolor), + SubPolicy(0.6, "posterize", 7, 0.6, "posterize", 6, fillcolor), + SubPolicy(0.4, "equalize", 7, 0.2, "solarize", 4, fillcolor), + SubPolicy(0.4, "equalize", 4, 0.8, "rotate", 8, fillcolor), + SubPolicy(0.6, "solarize", 3, 0.6, "equalize", 7, fillcolor), + SubPolicy(0.8, "posterize", 5, 1.0, "equalize", 2, fillcolor), + SubPolicy(0.2, "rotate", 3, 0.6, "solarize", 8, fillcolor), + SubPolicy(0.6, "equalize", 8, 0.4, "posterize", 6, fillcolor), + SubPolicy(0.8, "rotate", 8, 0.4, "color", 0, fillcolor), + SubPolicy(0.4, "rotate", 9, 0.6, "equalize", 2, fillcolor), + SubPolicy(0.0, "equalize", 7, 0.8, "equalize", 8, fillcolor), + SubPolicy(0.6, "invert", 4, 1.0, "equalize", 8, fillcolor), + SubPolicy(0.6, "color", 4, 1.0, "contrast", 8, fillcolor), + SubPolicy(0.8, "rotate", 8, 1.0, "color", 2, fillcolor), + SubPolicy(0.8, "color", 8, 0.8, "solarize", 7, fillcolor), + SubPolicy(0.4, "sharpness", 7, 0.6, "invert", 8, fillcolor), + SubPolicy(0.6, "shearX", 5, 1.0, "equalize", 9, fillcolor), + SubPolicy(0.4, "color", 0, 0.6, "equalize", 3, fillcolor), + SubPolicy(0.4, "equalize", 7, 0.2, "solarize", 4, fillcolor), + SubPolicy(0.6, "solarize", 5, 0.6, "autocontrast", 5, fillcolor), + SubPolicy(0.6, "invert", 4, 1.0, "equalize", 8, fillcolor), + SubPolicy(0.6, "color", 4, 1.0, "contrast", 8, fillcolor), + SubPolicy(0.8, "equalize", 8, 0.6, "equalize", 3, fillcolor) + ] + + def __call__(self, img, policy_idx=None): + if policy_idx is None or not isinstance(policy_idx, int): + policy_idx = random.randint(0, len(self.policies) - 1) + else: + policy_idx = policy_idx % len(self.policies) + return self.policies[policy_idx](img) + + def __repr__(self): + return "AutoAugment ImageNet Policy" + + +class CIFAR10Policy(object): + """ Randomly choose one of the best 25 Sub-policies on CIFAR10. 
+ + Example: + >>> policy = CIFAR10Policy() + >>> transformed = policy(image) + + Example as a PyTorch Transform: + >>> transform=transforms.Compose([ + >>> transforms.Resize(256), + >>> CIFAR10Policy(), + >>> transforms.ToTensor()]) + """ + + def __init__(self, fillcolor=(128, 128, 128)): + self.policies = [ + SubPolicy(0.1, "invert", 7, 0.2, "contrast", 6, fillcolor), + SubPolicy(0.7, "rotate", 2, 0.3, "translateX", 9, fillcolor), + SubPolicy(0.8, "sharpness", 1, 0.9, "sharpness", 3, fillcolor), + SubPolicy(0.5, "shearY", 8, 0.7, "translateY", 9, fillcolor), + SubPolicy(0.5, "autocontrast", 8, 0.9, "equalize", 2, fillcolor), + SubPolicy(0.2, "shearY", 7, 0.3, "posterize", 7, fillcolor), + SubPolicy(0.4, "color", 3, 0.6, "brightness", 7, fillcolor), + SubPolicy(0.3, "sharpness", 9, 0.7, "brightness", 9, fillcolor), + SubPolicy(0.6, "equalize", 5, 0.5, "equalize", 1, fillcolor), + SubPolicy(0.6, "contrast", 7, 0.6, "sharpness", 5, fillcolor), + SubPolicy(0.7, "color", 7, 0.5, "translateX", 8, fillcolor), + SubPolicy(0.3, "equalize", 7, 0.4, "autocontrast", 8, fillcolor), + SubPolicy(0.4, "translateY", 3, 0.2, "sharpness", 6, fillcolor), + SubPolicy(0.9, "brightness", 6, 0.2, "color", 8, fillcolor), + SubPolicy(0.5, "solarize", 2, 0.0, "invert", 3, fillcolor), + SubPolicy(0.2, "equalize", 0, 0.6, "autocontrast", 0, fillcolor), + SubPolicy(0.2, "equalize", 8, 0.8, "equalize", 4, fillcolor), + SubPolicy(0.9, "color", 9, 0.6, "equalize", 6, fillcolor), + SubPolicy(0.8, "autocontrast", 4, 0.2, "solarize", 8, fillcolor), + SubPolicy(0.1, "brightness", 3, 0.7, "color", 0, fillcolor), + SubPolicy(0.4, "solarize", 5, 0.9, "autocontrast", 3, fillcolor), + SubPolicy(0.9, "translateY", 9, 0.7, "translateY", 9, fillcolor), + SubPolicy(0.9, "autocontrast", 2, 0.8, "solarize", 3, fillcolor), + SubPolicy(0.8, "equalize", 8, 0.1, "invert", 3, fillcolor), + SubPolicy(0.7, "translateY", 9, 0.9, "autocontrast", 1, fillcolor) + ] + + def __call__(self, img, policy_idx=None): + if policy_idx is None or not isinstance(policy_idx, int): + policy_idx = random.randint(0, len(self.policies) - 1) + else: + policy_idx = policy_idx % len(self.policies) + return self.policies[policy_idx](img) + + def __repr__(self): + return "AutoAugment CIFAR10 Policy" + + +class SVHNPolicy(object): + """ Randomly choose one of the best 25 Sub-policies on SVHN. 
+
+    Example:
+        >>> policy = SVHNPolicy()
+        >>> transformed = policy(image)
+
+    Example as a PyTorch Transform:
+        >>> transform=transforms.Compose([
+        >>>     transforms.Resize(256),
+        >>>     SVHNPolicy(),
+        >>>     transforms.ToTensor()])
+    """
+
+    def __init__(self, fillcolor=(128, 128, 128)):
+        self.policies = [
+            SubPolicy(0.9, "shearX", 4, 0.2, "invert", 3, fillcolor),
+            SubPolicy(0.9, "shearY", 8, 0.7, "invert", 5, fillcolor),
+            SubPolicy(0.6, "equalize", 5, 0.6, "solarize", 6, fillcolor),
+            SubPolicy(0.9, "invert", 3, 0.6, "equalize", 3, fillcolor),
+            SubPolicy(0.6, "equalize", 1, 0.9, "rotate", 3, fillcolor),
+            SubPolicy(0.9, "shearX", 4, 0.8, "autocontrast", 3, fillcolor),
+            SubPolicy(0.9, "shearY", 8, 0.4, "invert", 5, fillcolor),
+            SubPolicy(0.9, "shearY", 5, 0.2, "solarize", 6, fillcolor),
+            SubPolicy(0.9, "invert", 6, 0.8, "autocontrast", 1, fillcolor),
+            SubPolicy(0.6, "equalize", 3, 0.9, "rotate", 3, fillcolor),
+            SubPolicy(0.9, "shearX", 4, 0.3, "solarize", 3, fillcolor),
+            SubPolicy(0.8, "shearY", 8, 0.7, "invert", 4, fillcolor),
+            SubPolicy(0.9, "equalize", 5, 0.6, "translateY", 6, fillcolor),
+            SubPolicy(0.9, "invert", 4, 0.6, "equalize", 7, fillcolor),
+            SubPolicy(0.3, "contrast", 3, 0.8, "rotate", 4, fillcolor),
+            SubPolicy(0.8, "invert", 5, 0.0, "translateY", 2, fillcolor),
+            SubPolicy(0.7, "shearY", 6, 0.4, "solarize", 8, fillcolor),
+            SubPolicy(0.6, "invert", 4, 0.8, "rotate", 4, fillcolor),
+            SubPolicy(0.3, "shearY", 7, 0.9, "translateX", 3, fillcolor),
+            SubPolicy(0.1, "shearX", 6, 0.6, "invert", 5, fillcolor),
+            SubPolicy(0.7, "solarize", 2, 0.6, "translateY", 7, fillcolor),
+            SubPolicy(0.8, "shearY", 4, 0.8, "invert", 8, fillcolor),
+            SubPolicy(0.7, "shearX", 9, 0.8, "translateY", 3, fillcolor),
+            SubPolicy(0.8, "shearY", 5, 0.7, "autocontrast", 3, fillcolor),
+            SubPolicy(0.7, "shearX", 2, 0.1, "invert", 5, fillcolor)
+        ]
+
+    def __call__(self, img, policy_idx=None):
+        if policy_idx is None or not isinstance(policy_idx, int):
+            policy_idx = random.randint(0, len(self.policies) - 1)
+        else:
+            policy_idx = policy_idx % len(self.policies)
+        return self.policies[policy_idx](img)
+
+    def __repr__(self):
+        return "AutoAugment SVHN Policy"
+
+
+class SubPolicy(object):
+    def __init__(self,
+                 p1,
+                 operation1,
+                 magnitude_idx1,
+                 p2,
+                 operation2,
+                 magnitude_idx2,
+                 fillcolor=(128, 128, 128)):
+        ranges = {
+            "shearX": np.linspace(0, 0.3, 10),
+            "shearY": np.linspace(0, 0.3, 10),
+            "translateX": np.linspace(0, 150 / 331, 10),
+            "translateY": np.linspace(0, 150 / 331, 10),
+            "rotate": np.linspace(0, 30, 10),
+            "color": np.linspace(0.0, 0.9, 10),
+            "posterize": np.round(np.linspace(8, 4, 10), 0).astype(np.int_),
+            "solarize": np.linspace(256, 0, 10),
+            "contrast": np.linspace(0.0, 0.9, 10),
+            "sharpness": np.linspace(0.0, 0.9, 10),
+            "brightness": np.linspace(0.0, 0.9, 10),
+            "autocontrast": [0] * 10,
+            "equalize": [0] * 10,
+            "invert": [0] * 10
+        }
+
+        # from https://stackoverflow.com/questions/5252170/specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand
+        def rotate_with_fill(img, magnitude):
+            rot = img.convert("RGBA").rotate(magnitude)
+            return Image.composite(rot,
+                                   Image.new("RGBA", rot.size, (128, ) * 4),
+                                   rot).convert(img.mode)
+
+        func = {
+            "shearX": lambda img, magnitude: img.transform(
+                img.size, Image.AFFINE, (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0),
+                Image.BICUBIC, fillcolor=fillcolor),
+            "shearY": lambda img, magnitude: img.transform(
+                img.size, Image.AFFINE, (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0),
+ Image.BICUBIC, fillcolor=fillcolor), + "translateX": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, magnitude * img.size[0] * random.choice([-1, 1]), 0, 1, 0), + fillcolor=fillcolor), + "translateY": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude * img.size[1] * random.choice([-1, 1])), + fillcolor=fillcolor), + "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude), + # "rotate": lambda img, magnitude: img.rotate(magnitude * random.choice([-1, 1])), + "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(1 + magnitude * random.choice([-1, 1])), + "posterize": lambda img, magnitude: ImageOps.posterize(img, magnitude), + "solarize": lambda img, magnitude: ImageOps.solarize(img, magnitude), + "contrast": lambda img, magnitude: ImageEnhance.Contrast(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "sharpness": lambda img, magnitude: ImageEnhance.Sharpness(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "brightness": lambda img, magnitude: ImageEnhance.Brightness(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "autocontrast": lambda img, magnitude: ImageOps.autocontrast(img), + "equalize": lambda img, magnitude: ImageOps.equalize(img), + "invert": lambda img, magnitude: ImageOps.invert(img) + } + + self.p1 = p1 + self.operation1 = func[operation1] + self.magnitude1 = ranges[operation1][magnitude_idx1] + self.p2 = p2 + self.operation2 = func[operation2] + self.magnitude2 = ranges[operation2][magnitude_idx2] + + def __call__(self, img): + if random.random() < self.p1: + img = self.operation1(img, self.magnitude1) + if random.random() < self.p2: + img = self.operation2(img, self.magnitude2) + return img diff --git a/example/low_precision_training/ppcls/data/preprocess/ops/cutout.py b/example/low_precision_training/ppcls/data/preprocess/ops/cutout.py new file mode 100755 index 000000000..7519ce844 --- /dev/null +++ b/example/low_precision_training/ppcls/data/preprocess/ops/cutout.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
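+
+# Illustrative sketch of standalone use (assumed values, not from a config):
+# Cutout masks out `n_holes` square regions of side `length` from an HWC
+# ndarray image, e.g.
+#
+#   aug = Cutout(n_holes=1, length=112)
+#   img = aug(img)  # img: HWC uint8 ndarray, as produced by DecodeImage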
+
+# This code is based on https://github.com/uoguelph-mlrg/Cutout
+# reference: https://arxiv.org/abs/1708.04552
+
+import random
+
+import numpy as np
+import cv2
+
+
+class Cutout(object):
+    def __init__(self, n_holes=1, length=112, fill_value=(0, 0, 0)):
+        self.n_holes = n_holes
+        self.length = length
+        # a fill_value of 'none' or None selects a random fill color per hole
+        if fill_value == 'none' or fill_value is None:
+            self.fill_value = None
+        else:
+            self.fill_value = fill_value
+
+    def __call__(self, img):
+        """ cutout_image """
+        h, w = img.shape[:2]
+
+        for n in range(self.n_holes):
+            y = np.random.randint(h)
+            x = np.random.randint(w)
+
+            y1 = np.clip(y - self.length // 2, 0, h)
+            y2 = np.clip(y + self.length // 2, 0, h)
+            x1 = np.clip(x - self.length // 2, 0, w)
+            x2 = np.clip(x + self.length // 2, 0, w)
+
+            fill_value = self.fill_value
+            if fill_value is None:
+                if img.ndim == 2:
+                    fill_value = random.randint(0, 255)
+                else:
+                    fill_value = [random.randint(0, 255),
+                                  random.randint(0, 255),
+                                  random.randint(0, 255)]
+
+            img = cv2.rectangle(np.array(img), (x1, y1), (x2, y2), fill_value, -1)
+
+        return img
diff --git a/example/low_precision_training/ppcls/data/preprocess/ops/dali_operators.py b/example/low_precision_training/ppcls/data/preprocess/ops/dali_operators.py
new file mode 100755
index 000000000..baf4b087c
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/preprocess/ops/dali_operators.py
@@ -0,0 +1,235 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
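+
+# The classes below wrap nvidia.dali operators so DALI-based pipelines can be
+# configured with the same keys as the CPU ops in operators.py. Illustrative
+# sketch only (a complete program also needs nvidia.dali.pipeline to schedule
+# these ops; the argument values shown are assumptions):
+#
+#   decode = DecodeImage(device="mixed")
+#   flip = RandFlipImage(device="gpu", prob=0.5, flip_code=1)
+#   images = flip(decode(jpegs))  # inside a DALI pipeline definition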
+ +from __future__ import division + +import nvidia.dali.fn as fn +import nvidia.dali.ops as ops +import nvidia.dali.types as types + + +class DecodeImage(ops.decoders.Image): + def __init__(self, *kargs, device="cpu", **kwargs): + super(DecodeImage, self).__init__(*kargs, device=device, **kwargs) + + def __call__(self, data, **kwargs): + return super(DecodeImage, self).__call__(data, **kwargs) + + +class ToCHWImage(ops.Transpose): + def __init__(self, *kargs, device="cpu", **kwargs): + super(ToCHWImage, self).__init__(*kargs, device=device, **kwargs) + + def __call__(self, data, **kwargs): + return super(ToCHWImage, self).__call__(data, **kwargs) + + +class ColorJitter(ops.ColorTwist): + def __init__(self, + *kargs, + device="cpu", + prob=1.0, + brightness_factor=0.0, + contrast_factor=0.0, + saturation_factor=0.0, + hue_factor=0.0, + **kwargs): + super(ColorJitter, self).__init__(*kargs, device=device, **kwargs) + self.brightness_factor = brightness_factor + self.contrast_factor = contrast_factor + self.saturation_factor = saturation_factor + self.hue_factor = hue_factor + self.rng = ops.random.CoinFlip(probability=prob) + + def __call__(self, data, **kwargs): + do_jitter = self.rng() + brightness = fn.random.uniform( + range=(max(0, 1 - self.brightness_factor), + 1 + self.brightness_factor)) * do_jitter + contrast = fn.random.uniform( + range=(max(0, 1 - self.contrast_factor), + 1 + self.contrast_factor)) * do_jitter + saturation = fn.random.uniform( + range=(max(0, 1 - self.saturation_factor), + 1 + self.saturation_factor)) * do_jitter + hue = fn.random.uniform(range=(-self.hue_factor, + self.hue_factor)) * do_jitter + return super(ColorJitter, self).__call__( + data, + brightness=brightness, + contrast=contrast, + saturation=saturation, + hue=hue, + **kwargs) + + +class DecodeRandomResizedCrop(ops.decoders.ImageRandomCrop): + def __init__(self, + *kargs, + device="cpu", + resize_x=224, + resize_y=224, + resize_short=None, + interp_type=types.DALIInterpType.INTERP_LINEAR, + **kwargs): + super(DecodeRandomResizedCrop, self).__init__( + *kargs, device=device, **kwargs) + if resize_short is None: + self.resize = ops.Resize( + device="gpu" if device == "mixed" else "cpu", + resize_x=resize_x, + resize_y=resize_y, + interp_type=interp_type) + else: + self.resize = ops.Resize( + device="gpu" if device == "mixed" else "cpu", + resize_short=resize_short, + interp_type=interp_type) + + def __call__(self, data, **kwargs): + data = super(DecodeRandomResizedCrop, self).__call__(data, **kwargs) + data = self.resize(data) + return data + + +class CropMirrorNormalize(ops.CropMirrorNormalize): + def __init__(self, *kargs, device="cpu", prob=0.5, **kwargs): + super(CropMirrorNormalize, self).__init__( + *kargs, device=device, **kwargs) + self.rng = ops.random.CoinFlip(probability=prob) + + def __call__(self, data, **kwargs): + do_mirror = self.rng() + return super(CropMirrorNormalize, self).__call__( + data, mirror=do_mirror, **kwargs) + + +class RandCropImage(ops.RandomResizedCrop): + def __init__(self, *kargs, device="cpu", **kwargs): + super(RandCropImage, self).__init__(*kargs, device=device, **kwargs) + + def __call__(self, data, **kwargs): + return super(RandCropImage, self).__call__(data, **kwargs) + + +class CropImage(ops.Crop): + def __init__(self, *kargs, device="cpu", **kwargs): + super(CropImage, self).__init__(*kargs, device=device, **kwargs) + + def __call__(self, data, **kwargs): + return super(CropImage, self).__call__(data, **kwargs) + + +class ResizeImage(ops.Resize): + def 
__init__(self, *kargs, device="cpu", **kwargs):
+        super(ResizeImage, self).__init__(*kargs, device=device, **kwargs)
+
+    def __call__(self, data, **kwargs):
+        return super(ResizeImage, self).__call__(data, **kwargs)
+
+
+class RandFlipImage(ops.Flip):
+    def __init__(self, *kargs, device="cpu", prob=0.5, flip_code=1, **kwargs):
+        super(RandFlipImage, self).__init__(*kargs, device=device, **kwargs)
+        self.flip_code = flip_code
+        self.rng = ops.random.CoinFlip(probability=prob)
+
+    def __call__(self, data, **kwargs):
+        do_flip = self.rng()
+        if self.flip_code == 1:
+            return super(RandFlipImage, self).__call__(
+                data, horizontal=do_flip, vertical=0, **kwargs)
+        elif self.flip_code == 0:
+            return super(RandFlipImage, self).__call__(
+                data, horizontal=0, vertical=do_flip, **kwargs)
+        else:
+            return super(RandFlipImage, self).__call__(
+                data, horizontal=do_flip, vertical=do_flip, **kwargs)
+
+
+class Pad(ops.Crop):
+    """
+    Use ops.Crop to implement the Pad operator, because ops.Pad always pads
+    only on the right and bottom.
+    """
+
+    def __init__(self, *kargs, device="cpu", **kwargs):
+        super(Pad, self).__init__(*kargs, device=device, **kwargs)
+
+    def __call__(self, data, **kwargs):
+        return super(Pad, self).__call__(data, **kwargs)
+
+
+class RandCropImageV2(ops.Crop):
+    def __init__(self, *kargs, device="cpu", **kwargs):
+        super(RandCropImageV2, self).__init__(*kargs, device=device, **kwargs)
+        self.rng_x = ops.random.Uniform(range=(0.0, 1.0))
+        self.rng_y = ops.random.Uniform(range=(0.0, 1.0))
+
+    def __call__(self, data, **kwargs):
+        pos_x = self.rng_x()
+        pos_y = self.rng_y()
+        return super(RandCropImageV2, self).__call__(
+            data, crop_pos_x=pos_x, crop_pos_y=pos_y, **kwargs)
+
+
+class RandomCropImage(ops.Crop):
+    def __init__(self, *kargs, device="cpu", **kwargs):
+        super(RandomCropImage, self).__init__(*kargs, device=device, **kwargs)
+        self.rng_x = ops.random.Uniform(range=(0.0, 1.0))
+        self.rng_y = ops.random.Uniform(range=(0.0, 1.0))
+
+    def __call__(self, data, **kwargs):
+        pos_x = self.rng_x()
+        pos_y = self.rng_y()
+        return super(RandomCropImage, self).__call__(
+            data, crop_pos_x=pos_x, crop_pos_y=pos_y, **kwargs)
+
+
+class RandomRotation(ops.Rotate):
+    def __init__(self, *kargs, device="cpu", prob=0.5, angle=0, **kwargs):
+        super(RandomRotation, self).__init__(*kargs, device=device, **kwargs)
+        self.rng = ops.random.CoinFlip(probability=prob)
+        discrete_angle = list(range(-angle, angle + 1))
+        self.rng_angle = ops.random.Uniform(values=discrete_angle)
+
+    def __call__(self, data, **kwargs):
+        do_rotate = self.rng()
+        angle = self.rng_angle()
+        flip_data = super(RandomRotation, self).__call__(
+            data,
+            angle=do_rotate * angle,
+            keep_size=True,
+            fill_value=0,
+            **kwargs)
+        return flip_data
+
+
+class RandomRot90(ops.Rotate):
+    def __init__(self, *kargs, device="cpu", **kwargs):
+        super(RandomRot90, self).__init__(*kargs, device=device, **kwargs)
+        self.rng_angle = ops.random.Uniform(values=[0.0, 1.0, 2.0, 3.0])
+
+    def __call__(self, data, **kwargs):
+        angle = self.rng_angle() * 90.0
+        flip_data = super(RandomRot90, self).__call__(
+            data, angle=angle, keep_size=True, fill_value=0, **kwargs)
+        return flip_data
+
+
+class NormalizeImage(ops.Normalize):
+    def __init__(self, *kargs, device="cpu", **kwargs):
+        super(NormalizeImage, self).__init__(*kargs, device=device, **kwargs)
+
+    def __call__(self, data, **kwargs):
+        return super(NormalizeImage, self).__call__(data, **kwargs)
diff --git a/example/low_precision_training/ppcls/data/preprocess/ops/fmix.py
b/example/low_precision_training/ppcls/data/preprocess/ops/fmix.py new file mode 100755 index 000000000..019f618c5 --- /dev/null +++ b/example/low_precision_training/ppcls/data/preprocess/ops/fmix.py @@ -0,0 +1,220 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is based on https://github.com/ecs-vlc/FMix +# reference: https://arxiv.org/abs/2002.12047 + +import math +import random + +import numpy as np +from scipy.stats import beta + + +def fftfreqnd(h, w=None, z=None): + """ Get bin values for discrete fourier transform of size (h, w, z) + + :param h: Required, first dimension size + :param w: Optional, second dimension size + :param z: Optional, third dimension size + """ + fz = fx = 0 + fy = np.fft.fftfreq(h) + + if w is not None: + fy = np.expand_dims(fy, -1) + + if w % 2 == 1: + fx = np.fft.fftfreq(w)[:w // 2 + 2] + else: + fx = np.fft.fftfreq(w)[:w // 2 + 1] + + if z is not None: + fy = np.expand_dims(fy, -1) + if z % 2 == 1: + fz = np.fft.fftfreq(z)[:, None] + else: + fz = np.fft.fftfreq(z)[:, None] + + return np.sqrt(fx * fx + fy * fy + fz * fz) + + +def get_spectrum(freqs, decay_power, ch, h, w=0, z=0): + """ Samples a fourier image with given size and frequencies decayed by decay power + + :param freqs: Bin values for the discrete fourier transform + :param decay_power: Decay power for frequency decay prop 1/f**d + :param ch: Number of channels for the resulting mask + :param h: Required, first dimension size + :param w: Optional, second dimension size + :param z: Optional, third dimension size + """ + scale = np.ones(1) / (np.maximum(freqs, np.array([1. / max(w, h, z)])) + **decay_power) + + param_size = [ch] + list(freqs.shape) + [2] + param = np.random.randn(*param_size) + + scale = np.expand_dims(scale, -1)[None, :] + + return scale * param + + +def make_low_freq_image(decay, shape, ch=1): + """ Sample a low frequency image from fourier space + + :param decay_power: Decay power for frequency decay prop 1/f**d + :param shape: Shape of desired mask, list up to 3 dims + :param ch: Number of channels for desired mask + """ + freqs = fftfreqnd(*shape) + spectrum = get_spectrum(freqs, decay, ch, + *shape) #.reshape((1, *shape[:-1], -1)) + spectrum = spectrum[:, 0] + 1j * spectrum[:, 1] + mask = np.real(np.fft.irfftn(spectrum, shape)) + + if len(shape) == 1: + mask = mask[:1, :shape[0]] + if len(shape) == 2: + mask = mask[:1, :shape[0], :shape[1]] + if len(shape) == 3: + mask = mask[:1, :shape[0], :shape[1], :shape[2]] + + mask = mask + mask = (mask - mask.min()) + mask = mask / mask.max() + return mask + + +def sample_lam(alpha, reformulate=False): + """ Sample a lambda from symmetric beta distribution with given alpha + + :param alpha: Alpha value for beta distribution + :param reformulate: If True, uses the reformulation of [1]. 
+ """ + if reformulate: + lam = beta.rvs(alpha + 1, alpha) + else: + lam = beta.rvs(alpha, alpha) + + return lam + + +def binarise_mask(mask, lam, in_shape, max_soft=0.0): + """ Binarises a given low frequency image such that it has mean lambda. + + :param mask: Low frequency image, usually the result of `make_low_freq_image` + :param lam: Mean value of final mask + :param in_shape: Shape of inputs + :param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask. + :return: + """ + idx = mask.reshape(-1).argsort()[::-1] + mask = mask.reshape(-1) + num = math.ceil(lam * mask.size) if random.random() > 0.5 else math.floor( + lam * mask.size) + + eff_soft = max_soft + if max_soft > lam or max_soft > (1 - lam): + eff_soft = min(lam, 1 - lam) + + soft = int(mask.size * eff_soft) + num_low = int(num - soft) + num_high = int(num + soft) + + mask[idx[:num_high]] = 1 + mask[idx[num_low:]] = 0 + mask[idx[num_low:num_high]] = np.linspace(1, 0, (num_high - num_low)) + + mask = mask.reshape((1, 1, in_shape[0], in_shape[1])) + return mask + + +def sample_mask(alpha, decay_power, shape, max_soft=0.0, reformulate=False): + """ Samples a mean lambda from beta distribution parametrised by alpha, creates a low frequency image and binarises + it based on this lambda + + :param alpha: Alpha value for beta distribution from which to sample mean of mask + :param decay_power: Decay power for frequency decay prop 1/f**d + :param shape: Shape of desired mask, list up to 3 dims + :param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask. + :param reformulate: If True, uses the reformulation of [1]. + """ + if isinstance(shape, int): + shape = (shape, ) + + # Choose lambda + lam = sample_lam(alpha, reformulate) + + # Make mask, get mean / std + mask = make_low_freq_image(decay_power, shape) + mask = binarise_mask(mask, lam, shape, max_soft) + + return float(lam), mask + + +def sample_and_apply(x, + alpha, + decay_power, + shape, + max_soft=0.0, + reformulate=False): + """ + + :param x: Image batch on which to apply fmix of shape [b, c, shape*] + :param alpha: Alpha value for beta distribution from which to sample mean of mask + :param decay_power: Decay power for frequency decay prop 1/f**d + :param shape: Shape of desired mask, list up to 3 dims + :param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask. + :param reformulate: If True, uses the reformulation of [1]. + :return: mixed input, permutation indices, lambda value of mix, + """ + lam, mask = sample_mask(alpha, decay_power, shape, max_soft, reformulate) + index = np.random.permutation(x.shape[0]) + + x1, x2 = x * mask, x[index] * (1 - mask) + return x1 + x2, index, lam + + +class FMixBase: + """ FMix augmentation + + Args: + decay_power (float): Decay power for frequency decay prop 1/f**d + alpha (float): Alpha value for beta distribution from which to sample mean of mask + size ([int] | [int, int] | [int, int, int]): Shape of desired mask, list up to 3 dims + max_soft (float): Softening value between 0 and 0.5 which smooths hard edges in the mask. + reformulate (bool): If True, uses the reformulation of [1]. 
+ """ + + def __init__(self, + decay_power=3, + alpha=1, + size=(32, 32), + max_soft=0.0, + reformulate=False): + super().__init__() + self.decay_power = decay_power + self.reformulate = reformulate + self.size = size + self.alpha = alpha + self.max_soft = max_soft + self.index = None + self.lam = None + + def __call__(self, x): + raise NotImplementedError + + def loss(self, *args, **kwargs): + raise NotImplementedError diff --git a/example/low_precision_training/ppcls/data/preprocess/ops/functional.py b/example/low_precision_training/ppcls/data/preprocess/ops/functional.py new file mode 100755 index 000000000..9f1369eef --- /dev/null +++ b/example/low_precision_training/ppcls/data/preprocess/ops/functional.py @@ -0,0 +1,138 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# encoding: utf-8 + +import numpy as np +from PIL import Image, ImageOps, ImageEnhance + + + +def int_parameter(level, maxval): + """Helper function to scale `val` between 0 and maxval . + Args: + level: Level of the operation that will be between [0, `PARAMETER_MAX`]. + maxval: Maximum value that the operation can have. This will be scaled to + level/PARAMETER_MAX. + Returns: + An int that results from scaling `maxval` according to `level`. + """ + return int(level * maxval / 10) + + +def float_parameter(level, maxval): + """Helper function to scale `val` between 0 and maxval. + Args: + level: Level of the operation that will be between [0, `PARAMETER_MAX`]. + maxval: Maximum value that the operation can have. This will be scaled to + level/PARAMETER_MAX. + Returns: + A float that results from scaling `maxval` according to `level`. + """ + return float(level) * maxval / 10. 
+ + +def sample_level(n): + return np.random.uniform(low=0.1, high=n) + + +def autocontrast(pil_img, *args): + return ImageOps.autocontrast(pil_img) + + +def equalize(pil_img, *args): + return ImageOps.equalize(pil_img) + + +def posterize(pil_img, level, *args): + level = int_parameter(sample_level(level), 4) + return ImageOps.posterize(pil_img, 4 - level) + + +def rotate(pil_img, level, *args): + degrees = int_parameter(sample_level(level), 30) + if np.random.uniform() > 0.5: + degrees = -degrees + return pil_img.rotate(degrees, resample=Image.BILINEAR) + + +def solarize(pil_img, level, *args): + level = int_parameter(sample_level(level), 256) + return ImageOps.solarize(pil_img, 256 - level) + + +def shear_x(pil_img, level): + level = float_parameter(sample_level(level), 0.3) + if np.random.uniform() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, level, 0, 0, 1, 0), + resample=Image.BILINEAR) + + +def shear_y(pil_img, level): + level = float_parameter(sample_level(level), 0.3) + if np.random.uniform() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, 0, 0, level, 1, 0), + resample=Image.BILINEAR) + + +def translate_x(pil_img, level): + level = int_parameter(sample_level(level), pil_img.size[0] / 3) + if np.random.random() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, 0, level, 0, 1, 0), + resample=Image.BILINEAR) + + +def translate_y(pil_img, level): + level = int_parameter(sample_level(level), pil_img.size[1] / 3) + if np.random.random() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, 0, 0, 0, 1, level), + resample=Image.BILINEAR) + + +# operation that overlaps with ImageNet-C's test set +def color(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return ImageEnhance.Color(pil_img).enhance(level) + + +# operation that overlaps with ImageNet-C's test set +def contrast(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return ImageEnhance.Contrast(pil_img).enhance(level) + + +# operation that overlaps with ImageNet-C's test set +def brightness(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return ImageEnhance.Brightness(pil_img).enhance(level) + + +# operation that overlaps with ImageNet-C's test set +def sharpness(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return ImageEnhance.Sharpness(pil_img).enhance(level) + + +augmentations = [ + autocontrast, equalize, posterize, rotate, solarize, shear_x, shear_y, + translate_x, translate_y +] diff --git a/example/low_precision_training/ppcls/data/preprocess/ops/grid.py b/example/low_precision_training/ppcls/data/preprocess/ops/grid.py new file mode 100755 index 000000000..1a9a76d86 --- /dev/null +++ b/example/low_precision_training/ppcls/data/preprocess/ops/grid.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This code is based on https://github.com/akuxcw/GridMask
+# reference: https://arxiv.org/abs/2001.04086.
+
+import numpy as np
+from PIL import Image
+
+# current epoch counter
+CURR_EPOCH = 0
+# epoch for the prob to be the upper limit
+NUM_EPOCHS = 240
+
+
+class GridMask(object):
+    def __init__(self, d1=96, d2=224, rotate=1, ratio=0.5, mode=0, prob=1.):
+        self.d1 = d1
+        self.d2 = d2
+        self.rotate = rotate
+        self.ratio = ratio
+        self.mode = mode
+        self.st_prob = prob
+        self.prob = prob
+        self.last_prob = -1
+
+    def set_prob(self):
+        global CURR_EPOCH
+        global NUM_EPOCHS
+        self.prob = self.st_prob * min(1, 1.0 * CURR_EPOCH / NUM_EPOCHS)
+
+    def __call__(self, img):
+        self.set_prob()
+        if abs(self.last_prob - self.prob) > 1e-10:
+            global CURR_EPOCH
+            global NUM_EPOCHS
+            print(
+                "self.prob is updated, self.prob={}, CURR_EPOCH: {}, NUM_EPOCHS: {}".
+                format(self.prob, CURR_EPOCH, NUM_EPOCHS))
+            self.last_prob = self.prob
+        if np.random.rand() > self.prob:
+            return img
+        _, h, w = img.shape
+        hh = int(1.5 * h)
+        ww = int(1.5 * w)
+        d = np.random.randint(self.d1, self.d2)
+        self.l = int(d * self.ratio + 0.5)
+        mask = np.ones((hh, ww), np.float32)
+        st_h = np.random.randint(d)
+        st_w = np.random.randint(d)
+        for i in range(-1, hh // d + 1):
+            s = d * i + st_h
+            t = s + self.l
+            s = max(min(s, hh), 0)
+            t = max(min(t, hh), 0)
+            mask[s:t, :] *= 0
+        for i in range(-1, ww // d + 1):
+            s = d * i + st_w
+            t = s + self.l
+            s = max(min(s, ww), 0)
+            t = max(min(t, ww), 0)
+            mask[:, s:t] *= 0
+        r = np.random.randint(self.rotate)
+        mask = Image.fromarray(np.uint8(mask))
+        mask = mask.rotate(r)
+        mask = np.asarray(mask)
+        mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) //
+                    2 + w]
+
+        if self.mode == 1:
+            mask = 1 - mask
+
+        mask = np.expand_dims(mask, axis=0)
+        img = (img * mask).astype(img.dtype)
+
+        return img
diff --git a/example/low_precision_training/ppcls/data/preprocess/ops/hide_and_seek.py b/example/low_precision_training/ppcls/data/preprocess/ops/hide_and_seek.py
new file mode 100755
index 000000000..16fc671cf
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/preprocess/ops/hide_and_seek.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
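+
+# --- illustration only; this comment block is not part of the upstream file ---
+# A minimal usage sketch for the HideAndSeek op defined below, assuming a
+# CHW float32 array (the layout this op indexes as img[c, h, w]):
+#
+#   import numpy as np
+#   op = HideAndSeek()
+#   img = np.zeros((3, 224, 224), dtype="float32")
+#   out = op(img)  # each grid patch is zeroed with probability 0.5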
+
+# This code is based on https://github.com/kkanshul/Hide-and-Seek
+# reference: http://krsingh.cs.ucdavis.edu/krishna_files/papers/hide_and_seek/my_files/iccv2017.pdf
+
+import numpy as np
+import random
+
+
+class HideAndSeek(object):
+    def __init__(self):
+        # possible grid size, 0 means no hiding
+        self.grid_sizes = [0, 16, 32, 44, 56]
+        # hiding probability
+        self.hide_prob = 0.5
+
+    def __call__(self, img):
+        # randomly choose one grid size
+        grid_size = np.random.choice(self.grid_sizes)
+
+        _, h, w = img.shape
+
+        # hide the patches
+        if grid_size == 0:
+            return img
+        for x in range(0, w, grid_size):
+            for y in range(0, h, grid_size):
+                x_end = min(w, x + grid_size)
+                y_end = min(h, y + grid_size)
+                if random.random() <= self.hide_prob:
+                    # img is CHW: rows are indexed by y, columns by x
+                    img[:, y:y_end, x:x_end] = 0
+
+        return img
diff --git a/example/low_precision_training/ppcls/data/preprocess/ops/operators.py b/example/low_precision_training/ppcls/data/preprocess/ops/operators.py
new file mode 100755
index 000000000..7ab50e791
--- /dev/null
+++ b/example/low_precision_training/ppcls/data/preprocess/ops/operators.py
@@ -0,0 +1,926 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from functools import partial
+import io
+import six
+import math
+import random
+import cv2
+import numpy as np
+from PIL import Image, ImageOps, __version__ as PILLOW_VERSION
+from paddle.vision.transforms import ColorJitter as RawColorJitter
+from paddle.vision.transforms import CenterCrop, Resize
+from paddle.vision.transforms import RandomRotation as RawRandomRotation
+from paddle.vision.transforms import ToTensor, Normalize, RandomHorizontalFlip, RandomResizedCrop
+from paddle.vision.transforms import functional as F
+from .autoaugment import ImageNetPolicy
+from .functional import augmentations
+from ppcls.utils import logger
+
+
+def format_data(func):
+    def wrapper(self, data):
+        if isinstance(data, dict):
+            img = data["img"]
+            result = func(self, img)
+            if not isinstance(result, dict):
+                result = {"img": result}
+            return {**data, **result}
+        else:
+            result = func(self, data)
+            if isinstance(result, dict):
+                result = result["img"]
+            return result
+
+    return wrapper
+
+
+class UnifiedResize(object):
+    def __init__(self, interpolation=None, backend="cv2", return_numpy=True):
+        _cv2_interp_from_str = {
+            'nearest': cv2.INTER_NEAREST,
+            'bilinear': cv2.INTER_LINEAR,
+            'area': cv2.INTER_AREA,
+            'bicubic': cv2.INTER_CUBIC,
+            'lanczos': cv2.INTER_LANCZOS4,
+            'random': (cv2.INTER_LINEAR, cv2.INTER_CUBIC)
+        }
+        _pil_interp_from_str = {
+            'nearest': Image.NEAREST,
+            'bilinear': Image.BILINEAR,
+            'bicubic': Image.BICUBIC,
+            'box': Image.BOX,
+            'lanczos': Image.LANCZOS,
+            'hamming': Image.HAMMING,
+            'random': (Image.BILINEAR, Image.BICUBIC)
+        }
+
+        def _cv2_resize(src, size, resample):
+            if isinstance(resample, tuple):
+                resample = random.choice(resample)
+            return cv2.resize(src, size, interpolation=resample)
+
+        def _pil_resize(src, size, resample, return_numpy=True):
+            if isinstance(resample, tuple):
+                resample = random.choice(resample)
+            if isinstance(src, np.ndarray):
+                pil_img = Image.fromarray(src)
+            else:
+                pil_img = src
+            pil_img = pil_img.resize(size, resample)
+            if return_numpy:
+                return np.asarray(pil_img)
+            return pil_img
+
+        if backend.lower() == "cv2":
+            if isinstance(interpolation, str):
+                interpolation = _cv2_interp_from_str[interpolation.lower()]
+            # compatible with opencv < version 4.4.0
+            elif interpolation is None:
+                interpolation = cv2.INTER_LINEAR
+            self.resize_func = partial(_cv2_resize, resample=interpolation)
+        elif backend.lower() == "pil":
+            if isinstance(interpolation, str):
+                interpolation = _pil_interp_from_str[interpolation.lower()]
+            elif interpolation is None:
+                interpolation = Image.BILINEAR
+            self.resize_func = partial(
+                _pil_resize, resample=interpolation, return_numpy=return_numpy)
+        else:
+            logger.warning(
+                f"The backend of Resize only supports \"cv2\" or \"PIL\". \"{backend}\" is unavailable. Use \"cv2\" instead."
+            )
+            self.resize_func = cv2.resize
+
+    def __call__(self, src, size):
+        if isinstance(size, list):
+            size = tuple(size)
+        return self.resize_func(src, size)
+
+
+class RandomInterpolationAugment(object):
+    def __init__(self, prob):
+        self.prob = prob
+
+    def _aug(self, img):
+        img_shape = img.shape
+        side_ratio = np.random.uniform(0.2, 1.0)
+        small_side = int(side_ratio * img_shape[0])
+        interpolation = np.random.choice([
+            cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA,
+            cv2.INTER_CUBIC, cv2.INTER_LANCZOS4
+        ])
+        small_img = cv2.resize(
+            img, (small_side, small_side), interpolation=interpolation)
+        interpolation = np.random.choice([
+            cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA,
+            cv2.INTER_CUBIC, cv2.INTER_LANCZOS4
+        ])
+        aug_img = cv2.resize(
+            small_img, (img_shape[1], img_shape[0]),
+            interpolation=interpolation)
+        return aug_img
+
+    def __call__(self, img):
+        if np.random.random() < self.prob:
+            if isinstance(img, np.ndarray):
+                return self._aug(img)
+            else:
+                pil_img = np.array(img)
+                aug_img = self._aug(pil_img)
+                img = Image.fromarray(aug_img.astype(np.uint8))
+            return img
+        else:
+            return img
+
+
+class OperatorParamError(ValueError):
+    """ OperatorParamError
+    """
+    pass
+
+
+class DecodeImage(object):
+    """ decode image """
+
+    def __init__(self,
+                 to_np=True,
+                 to_rgb=True,
+                 channel_first=False,
+                 backend="cv2"):
+        self.to_np = to_np  # to numpy
+        self.to_rgb = to_rgb  # only enabled when to_np is True
+        self.channel_first = channel_first  # only enabled when to_np is True
+
+        if backend.lower() not in ["cv2", "pil"]:
+            logger.warning(
+                f"The backend of DecodeImage only supports \"cv2\" or \"PIL\". \"{backend}\" is unavailable. Use \"cv2\" instead."
+            )
+            backend = "cv2"
+        self.backend = backend.lower()
+
+        if not to_np:
+            logger.warning(
+                f"\"to_rgb\" and \"channel_first\" are only enabled when to_np is True. \"to_np\" is now {to_np}."
+            )
+
+    @format_data
+    def __call__(self, img):
+        if isinstance(img, Image.Image):
+            assert self.backend == "pil", "invalid input 'img' in DecodeImage"
+        elif isinstance(img, np.ndarray):
+            assert self.backend == "cv2", "invalid input 'img' in DecodeImage"
+        elif isinstance(img, bytes):
+            if self.backend == "pil":
+                data = io.BytesIO(img)
+                img = Image.open(data).convert("RGB")
+            else:
+                data = np.frombuffer(img, dtype="uint8")
+                img = cv2.imdecode(data, 1)
+        else:
+            raise ValueError("invalid input 'img' in DecodeImage")
+
+        if self.to_np:
+            if self.backend == "pil":
+                assert img.mode == "RGB", f"invalid mode of image[{img.mode}]"
+                img = np.asarray(img)[:, :, ::-1]  # BGR
+
+            if self.to_rgb:
+                assert img.shape[
+                    2] == 3, f"invalid shape of image[{img.shape}]"
+                img = img[:, :, ::-1]
+
+            if self.channel_first:
+                img = img.transpose((2, 0, 1))
+        return img
+
+
+class ResizeImage(object):
+    """ resize image """
+
+    def __init__(self,
+                 size=None,
+                 resize_short=None,
+                 interpolation=None,
+                 backend="cv2",
+                 return_numpy=True):
+        if resize_short is not None and resize_short > 0:
+            self.resize_short = resize_short
+            self.w = None
+            self.h = None
+        elif size is not None:
+            self.resize_short = None
+            self.w = size if type(size) is int else size[0]
+            self.h = size if type(size) is int else size[1]
+        else:
+            raise OperatorParamError("invalid params for ResizeImage: "
+                                     "both 'size' and 'resize_short' are None")
+
+        self._resize_func = UnifiedResize(
+            interpolation=interpolation,
+            backend=backend,
+            return_numpy=return_numpy)
+
+    @format_data
+    def __call__(self, img):
+        if isinstance(img, np.ndarray):
+            img_h, img_w = img.shape[:2]
+        else:
+            img_w, img_h = img.size
+
+        if self.resize_short is not None:
+            percent = float(self.resize_short) / min(img_w, img_h)
+            w = int(round(img_w * percent))
+            h = int(round(img_h * percent))
+        else:
+            w = self.w
+            h = self.h
+        return self._resize_func(img, (w, h))
+
+
+class CropWithPadding(RandomResizedCrop):
+    """
+    crop image and pad it back to its original size
+    """
+
+    def __init__(self,
+                 prob=1,
+                 padding_num=0,
+                 size=224,
+                 scale=(0.08, 1.0),
+                 ratio=(3. / 4, 4. / 3),
+                 interpolation='bilinear',
+                 key=None):
+        super().__init__(size, scale, ratio, interpolation, key)
+        self.prob = prob
+        self.padding_num = padding_num
+
+    def __call__(self, img):
+        is_cv2_img = isinstance(img, np.ndarray)
+        if np.random.random() < self.prob:
+            # RandomResizedCrop augmentation
+            new = np.zeros_like(np.array(img)) + self.padding_num
+            i, j, h, w = self._get_param(img)
+            cropped = F.crop(img, i, j, h, w)
+            new[i:i + h, j:j + w, :] = np.array(cropped)
+            if not is_cv2_img:
+                # convert back to PIL when the input was a PIL image
+                new = Image.fromarray(new.astype(np.uint8))
+            return new
+        else:
+            return img
+
+    def _get_image_size(self, img):
+        if F._is_pil_image(img):
+            return img.size
+        elif F._is_numpy_image(img):
+            return img.shape[:2][::-1]
+        elif F._is_tensor_image(img):
+            return img.shape[1:][::-1]  # chw
+        else:
+            raise TypeError("Unexpected type {}".format(type(img)))
+
+
+class CropImage(object):
+    """ crop image """
+
+    def __init__(self, size):
+        if type(size) is int:
+            self.size = (size, size)
+        else:
+            self.size = size  # (h, w)
+
+    def __call__(self, img):
+        w, h = self.size
+        img_h, img_w = img.shape[:2]
+        w_start = (img_w - w) // 2
+        h_start = (img_h - h) // 2
+
+        w_end = w_start + w
+        h_end = h_start + h
+        return img[h_start:h_end, w_start:w_end, :]
+
+
+class CropImageAtRatio(object):
+    """ crop image with specified size and padding"""
+
+    def __init__(self, size: int, pad: int, interpolation="bilinear"):
+        self.size = size
+        self.ratio = size / (size + pad)
+        self.interpolation = interpolation
+
+    def __call__(self, img):
+        height, width = img.shape[:2]
+        crop_size = int(self.ratio * min(height, width))
+
+        y = (height - crop_size) // 2
+        x = (width - crop_size) // 2
+
+        crop_img = img[y:y + crop_size, x:x + crop_size, :]
+        return F.resize(crop_img, [self.size, self.size], self.interpolation)
+
+
+class Padv2(object):
+    def __init__(self,
+                 size=None,
+                 size_divisor=32,
+                 pad_mode=0,
+                 offsets=None,
+                 fill_value=(127.5, 127.5, 127.5)):
+        """
+        Pad image to a specified size or multiple of size_divisor.
+        Args:
+            size (int, list): image target size, if None, pad to multiple of size_divisor, default None
+            size_divisor (int): size divisor, default 32
+            pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. if -1, use specified offsets;
+                if 0, only pad on right and bottom; if 1, pad around center; if 2, only pad on left and top
+            offsets (list): [offset_x, offset_y], specify offset while padding, only supported when pad_mode=-1
+            fill_value (tuple): RGB value of pad area, default (127.5, 127.5, 127.5)
+        """
+
+        if size is not None and not isinstance(size, (int, list)):
+            raise TypeError(
+                "Type of size is invalid. Must be int or List, now is {}".
+                format(type(size)))
+
+        if isinstance(size, int):
+            size = [size, size]
+
+        assert pad_mode in [
+            -1, 0, 1, 2
+        ], 'currently only supports four modes [-1, 0, 1, 2]'
+        if pad_mode == -1:
+            assert offsets, 'if pad_mode is -1, offsets should not be None'
+
+        self.size = size
+        self.size_divisor = size_divisor
+        self.pad_mode = pad_mode
+        self.fill_value = fill_value
+        self.offsets = offsets
+
+    def apply_image(self, image, offsets, im_size, size):
+        x, y = offsets
+        im_h, im_w = im_size
+        h, w = size
+        canvas = np.ones((h, w, 3), dtype=np.float32)
+        canvas *= np.array(self.fill_value, dtype=np.float32)
+        canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32)
+        return canvas
+
+    def __call__(self, img):
+        im_h, im_w = img.shape[:2]
+        if self.size:
+            w, h = self.size
+            assert (
+                im_h <= h and im_w <= w
+            ), '(h, w) of target size should be greater than (im_h, im_w)'
+        else:
+            h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor)
+            w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor)
+
+        if h == im_h and w == im_w:
+            return img.astype(np.float32)
+
+        if self.pad_mode == -1:
+            offset_x, offset_y = self.offsets
+        elif self.pad_mode == 0:
+            offset_y, offset_x = 0, 0
+        elif self.pad_mode == 1:
+            offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2
+        else:
+            offset_y, offset_x = h - im_h, w - im_w
+
+        offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w]
+
+        return self.apply_image(img, offsets, im_size, size)
+
+
+class RandomCropImage(object):
+    """Random crop image only
+    """
+
+    def __init__(self, size):
+        super(RandomCropImage, self).__init__()
+        if isinstance(size, int):
+            size = [size, size]
+        self.size = size
+
+    def __call__(self, img):
+        h, w = img.shape[:2]
+        tw, th = self.size
+        i = random.randint(0, h - th)
+        j = random.randint(0, w - tw)
+
+        img = img[i:i + th, j:j + tw, :]
+        return img
+
+
+class RandCropImage(object):
+    """ random crop image """
+
+    def __init__(self,
+                 size,
+                 progress_size=None,
+                 scale=None,
+                 ratio=None,
+                 interpolation=None,
+                 use_log_aspect=False,
+                 backend="cv2"):
+        if type(size) is int:
+            self.size = (size, size)  # (h, w)
+        else:
+            self.size = size
+
+        self.progress_size = progress_size
+        self.scale = [0.08, 1.0] if scale is None else scale
+        self.ratio = [3. / 4., 4. / 3.] if ratio is None else ratio
+        self.use_log_aspect = use_log_aspect
+
+        self._resize_func = UnifiedResize(
+            interpolation=interpolation, backend=backend)
+
+    @format_data
+    def __call__(self, img):
+        size = self.size
+        scale = self.scale
+        ratio = self.ratio
+
+        if self.use_log_aspect:
+            log_ratio = list(map(math.log, ratio))
+            aspect_ratio = math.exp(random.uniform(*log_ratio))
+        else:
+            aspect_ratio = random.uniform(*ratio)
+
+        img_h, img_w = img.shape[:2]
+        bound = min((float(img_w) / img_h) / aspect_ratio,
+                    (float(img_h) / img_w) * aspect_ratio)
+        scale_max = min(scale[1], bound)
+        scale_min = min(scale[0], bound)
+
+        target_area = img_w * img_h * random.uniform(scale_min, scale_max)
+        w = int(math.sqrt(target_area * aspect_ratio))
+        h = int(math.sqrt(target_area / aspect_ratio))
+
+        i = random.randint(0, img_w - w)
+        j = random.randint(0, img_h - h)
+
+        img = self._resize_func(img[j:j + h, i:i + w, :], size)
+        return img
+
+
+class RandCropImageV2(object):
+    """ RandCropImageV2 differs from RandCropImage: it selects the crop
+    position uniformly at random and crops at the given size, without a
+    final resize."""
+
+    def __init__(self, size):
+        if type(size) is int:
+            self.size = (size, size)  # (h, w)
+        else:
+            self.size = size
+
+    def __call__(self, img):
+        if isinstance(img, np.ndarray):
+            img_h, img_w = img.shape[0], img.shape[1]
+        else:
+            img_w, img_h = img.size
+        tw, th = self.size
+
+        if img_h + 1 < th or img_w + 1 < tw:
+            raise ValueError(
+                "Required crop size {} is larger than input image size {}".
+                format((th, tw), (img_h, img_w)))
+
+        if img_w == tw and img_h == th:
+            return img
+
+        top = random.randint(0, img_h - th + 1)
+        left = random.randint(0, img_w - tw + 1)
+        if isinstance(img, np.ndarray):
+            return img[top:top + th, left:left + tw, :]
+        else:
+            return img.crop((left, top, left + tw, top + th))
+
+
+class RandFlipImage(object):
+    """ random flip image
+    flip_code:
+        1: Flipped Horizontally
+        0: Flipped Vertically
+        -1: Flipped Horizontally & Vertically
+    """
+
+    def __init__(self, flip_code=1):
+        assert flip_code in [-1, 0, 1
+                             ], "flip_code should be a value in [-1, 0, 1]"
+        self.flip_code = flip_code
+
+    @format_data
+    def __call__(self, img):
+        if random.randint(0, 1) == 1:
+            if isinstance(img, np.ndarray):
+                return cv2.flip(img, self.flip_code)
+            else:
+                if self.flip_code == 1:
+                    return img.transpose(Image.FLIP_LEFT_RIGHT)
+                elif self.flip_code == 0:
+                    return img.transpose(Image.FLIP_TOP_BOTTOM)
+                else:
+                    # flip both axes for flip_code == -1
+                    return img.transpose(Image.FLIP_LEFT_RIGHT).transpose(
+                        Image.FLIP_TOP_BOTTOM)
+        else:
+            return img
+
+
+class AutoAugment(object):
+    def __init__(self):
+        self.policy = ImageNetPolicy()
+
+    def __call__(self, img):
+        from PIL import Image
+        img = np.ascontiguousarray(img)
+        img = Image.fromarray(img)
+        img = self.policy(img)
+        img = np.asarray(img)
+        return img
+
+
+class NormalizeImage(object):
+    """ normalize image, e.g. subtract mean and divide by std
+    """
+
+    def __init__(self,
+                 scale=None,
+                 mean=None,
+                 std=None,
+                 order='chw',
+                 output_fp16=False,
+                 channel_num=3):
+        if isinstance(scale, str):
+            scale = eval(scale)
+        assert channel_num in [
+            3, 4
+        ], "channel number of input image should be set to 3 or 4."
+        self.channel_num = channel_num
+        self.output_dtype = 'float16' if output_fp16 else 'float32'
+        self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)
+        self.order = order
+        mean = mean if mean is not None else [0.485, 0.456, 0.406]
+        std = std if std is not None else [0.229, 0.224, 0.225]
+
+        shape = (3, 1, 1) if self.order == 'chw' else (1, 1, 3)
+        self.mean = np.array(mean).reshape(shape).astype('float32')
+        self.std = np.array(std).reshape(shape).astype('float32')
+
+    @format_data
+    def __call__(self, img):
+        from PIL import Image
+        if isinstance(img, Image.Image):
+            img = np.array(img)
+
+        assert isinstance(img,
+                          np.ndarray), "invalid input 'img' in NormalizeImage"
+
+        img = (img.astype('float32') * self.scale - self.mean) / self.std
+
+        if self.channel_num == 4:
+            img_h = img.shape[1] if self.order == 'chw' else img.shape[0]
+            img_w = img.shape[2] if self.order == 'chw' else img.shape[1]
+            pad_zeros = np.zeros(
+                (1, img_h, img_w)) if self.order == 'chw' else np.zeros(
+                    (img_h, img_w, 1))
+            img = (np.concatenate(
+                (img, pad_zeros), axis=0)
+                   if self.order == 'chw' else np.concatenate(
+                       (img, pad_zeros), axis=2))
+
+        img = img.astype(self.output_dtype)
+        return img
+
+
+class ToCHWImage(object):
+    """ convert hwc image to chw image
+    """
+
+    def __init__(self):
+        pass
+
+    def __call__(self, img):
+        from PIL import Image
+        if isinstance(img, Image.Image):
+            img = np.array(img)
+
+        return img.transpose((2, 0, 1))
+
+
+class AugMix(object):
+    """ Perform AugMix augmentation and compute mixture.
+    """
+
+    def __init__(self,
+                 prob=0.5,
+                 aug_prob_coeff=0.1,
+                 mixture_width=3,
+                 mixture_depth=1,
+                 aug_severity=1):
+        """
+        Args:
+            prob: Probability of applying AugMix.
+            aug_prob_coeff: Probability distribution coefficients.
+            mixture_width: Number of augmentation chains to mix per augmented example.
+            mixture_depth: Depth of augmentation chains. -1 denotes stochastic depth in [1, 3].
+            aug_severity: Severity of underlying augmentation operators (between 1 and 10).
+        """
+        # fmt: off
+        self.prob = prob
+        self.aug_prob_coeff = aug_prob_coeff
+        self.mixture_width = mixture_width
+        self.mixture_depth = mixture_depth
+        self.aug_severity = aug_severity
+        self.augmentations = augmentations
+        # fmt: on
+
+    def __call__(self, image):
+        """Perform AugMix augmentations and compute mixture.
+        Returns:
+            mixed: Augmented and mixed image.
+        """
+        if random.random() > self.prob:
+            # Avoid the warning: the given NumPy array is not writeable
+            return np.asarray(image).copy()
+
+        ws = np.float32(
+            np.random.dirichlet([self.aug_prob_coeff] * self.mixture_width))
+        m = np.float32(
+            np.random.beta(self.aug_prob_coeff, self.aug_prob_coeff))
+
+        mix = np.zeros(image.shape)
+        for i in range(self.mixture_width):
+            image_aug = image.copy()
+            image_aug = Image.fromarray(image_aug)
+            depth = self.mixture_depth if self.mixture_depth > 0 else np.random.randint(
+                1, 4)
+            for _ in range(depth):
+                op = np.random.choice(self.augmentations)
+                image_aug = op(image_aug, self.aug_severity)
+            mix += ws[i] * np.asarray(image_aug)
+
+        mixed = (1 - m) * image + m * mix
+        return mixed.astype(np.uint8)
+
+
+class ColorJitter(RawColorJitter):
+    """ColorJitter.
+ """ + + def __init__(self, prob=2, *args, **kwargs): + super().__init__(*args, **kwargs) + self.prob = prob + + def __call__(self, img): + if np.random.random() < self.prob: + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + img = super()._apply_image(img) + if isinstance(img, Image.Image): + img = np.asarray(img) + return img + + +class RandomRotation(RawRandomRotation): + """RandomRotation. + """ + + def __init__(self, prob=0.5, *args, **kwargs): + super().__init__(*args, **kwargs) + self.prob = prob + + def __call__(self, img): + if np.random.random() < self.prob: + img = super()._apply_image(img) + return img + + +class Pad(object): + """ + Pads the given PIL.Image on all sides with specified padding mode and fill value. + adapted from: https://pytorch.org/vision/stable/_modules/torchvision/transforms/transforms.html#Pad + """ + + def __init__(self, + padding: int, + fill: int=0, + padding_mode: str="constant", + backend: str="pil"): + self.padding = padding + self.fill = fill + self.padding_mode = padding_mode + self.backend = backend + assert backend in [ + "pil", "cv2" + ], f"backend must in ['pil', 'cv2'], but got {backend}" + + def _parse_fill(self, fill, img, min_pil_version, name="fillcolor"): + # Process fill color for affine transforms + major_found, minor_found = (int(v) + for v in PILLOW_VERSION.split('.')[:2]) + major_required, minor_required = ( + int(v) for v in min_pil_version.split('.')[:2]) + if major_found < major_required or (major_found == major_required and + minor_found < minor_required): + if fill is None: + return {} + else: + msg = ( + "The option to fill background area of the transformed image, " + "requires pillow>={}") + raise RuntimeError(msg.format(min_pil_version)) + + num_bands = len(img.getbands()) + if fill is None: + fill = 0 + if isinstance(fill, (int, float)) and num_bands > 1: + fill = tuple([fill] * num_bands) + if isinstance(fill, (list, tuple)): + if len(fill) != num_bands: + msg = ( + "The number of elements in 'fill' does not match the number of " + "bands of the image ({} != {})") + raise ValueError(msg.format(len(fill), num_bands)) + + fill = tuple(fill) + + return {name: fill} + + def __call__(self, img): + if self.backend == "pil": + opts = self._parse_fill(self.fill, img, "2.3.0", name="fill") + if img.mode == "P": + palette = img.getpalette() + img = ImageOps.expand(img, border=self.padding, **opts) + img.putpalette(palette) + return img + return ImageOps.expand(img, border=self.padding, **opts) + else: + img = cv2.copyMakeBorder( + img, + self.padding, + self.padding, + self.padding, + self.padding, + cv2.BORDER_CONSTANT, + value=(self.fill, self.fill, self.fill)) + return img + + +# TODO(gaotingquan): integrate into RandomRotation +class RandomRot90(object): + """RandomRot90 + """ + + def __init__(self): + pass + + @format_data + def __call__(self, img): + orientation = random.choice([0, 1, 2, 3]) + if orientation: + img = np.rot90(img, orientation) + return {"img": img, "random_rot90_orientation": orientation} + + +class BlurImage(object): + """BlurImage + """ + + def __init__(self, + ratio=0.5, + motion_max_ksize=12, + motion_max_angle=45, + gaussian_max_ksize=12): + self.ratio = ratio + self.motion_max_ksize = motion_max_ksize + self.motion_max_angle = motion_max_angle + self.gaussian_max_ksize = gaussian_max_ksize + + def _gaussian_blur(self, img, max_ksize=12): + ksize = (np.random.choice(np.arange(5, max_ksize, 2)), + np.random.choice(np.arange(5, max_ksize, 2))) + img = 
cv2.GaussianBlur(img, ksize, 0) + return img + + def _motion_blur(self, img, max_ksize=12, max_angle=45): + degree = np.random.choice(np.arange(5, max_ksize, 2)) + angle = np.random.choice(np.arange(-1 * max_angle, max_angle)) + + M = cv2.getRotationMatrix2D((degree / 2, degree / 2), angle, 1) + motion_blur_kernel = np.diag(np.ones(degree)) + motion_blur_kernel = cv2.warpAffine(motion_blur_kernel, M, + (degree, degree)) + + motion_blur_kernel = motion_blur_kernel / degree + blurred = cv2.filter2D(img, -1, motion_blur_kernel) + + cv2.normalize(blurred, blurred, 0, 255, cv2.NORM_MINMAX) + img = np.array(blurred, dtype=np.uint8) + return img + + @format_data + def __call__(self, img): + if random.random() > self.ratio: + label = 0 + else: + method = random.choice(["gaussian", "motion"]) + if method == "gaussian": + img = self._gaussian_blur(img, self.gaussian_max_ksize) + else: + img = self._motion_blur(img, self.motion_max_ksize, + self.motion_max_angle) + label = 1 + return {"img": img, "blur_image": label} + + +class RandomGrayscale(object): + """Randomly convert image to grayscale with a probability of p (default 0.1). + + Args: + p (float): probability that image should be converted to grayscale. + + Returns: + PIL Image: Grayscale version of the input image with probability p and unchanged + with probability (1-p). + - If input image is 1 channel: grayscale version is 1 channel + - If input image is 3 channel: grayscale version is 3 channel with r == g == b + + """ + + def __init__(self, p=0.1): + self.p = p + + def __call__(self, img): + """ + Args: + img (PIL Image): Image to be converted to grayscale. + + Returns: + PIL Image: Randomly grayscaled image. + """ + num_output_channels = 1 if img.mode == 'L' else 3 + if random.random() < self.p: + return F.to_grayscale(img, num_output_channels=num_output_channels) + return img + + def __repr__(self): + return self.__class__.__name__ + '()' + + +class PCALighting(object): + """ + Lighting noise(AlexNet - style PCA - based noise) + reference: https://github.com/DingXiaoH/DiverseBranchBlock + """ + + def __init__(self): + self.alphastd = 0.1 + self.eigval = [0.2175, 0.0188, 0.0045] + self.eigvec = [ + [-0.5675, 0.7192, 0.4009], + [-0.5808, -0.0045, -0.8140], + [-0.5836, -0.6948, 0.4203], + ] + self.eigval = np.array(self.eigval).astype(np.float32) + self.eigvec = np.array(self.eigvec).astype(np.float32) + + def __call__(self, img): + if self.alphastd == 0: + return img + + img = img.transpose((2, 0, 1)) + alpha = np.random.normal(0, self.alphastd, size=(3)).astype(np.float32) + rgb = self.eigvec * np.broadcast_to(alpha.reshape(1, 3), ( + 3, 3)) * np.broadcast_to(self.eigval.reshape(1, 3), (3, 3)) + rgb = rgb.sum(1).squeeze() + img = img + rgb.reshape(3, 1, 1) + return img.transpose((1, 2, 0)) diff --git a/example/low_precision_training/ppcls/data/preprocess/ops/randaugment.py b/example/low_precision_training/ppcls/data/preprocess/ops/randaugment.py new file mode 100755 index 000000000..966f1412a --- /dev/null +++ b/example/low_precision_training/ppcls/data/preprocess/ops/randaugment.py @@ -0,0 +1,477 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is based on https://github.com/heartInsert/randaugment +# reference: https://arxiv.org/abs/1909.13719 + +import random +from .operators import RawColorJitter +from .timm_autoaugment import _pil_interp +from paddle.vision.transforms import transforms as T + +import numpy as np +from PIL import Image, ImageEnhance, ImageOps + + +def solarize_add(img, add, thresh=128, **__): + lut = [] + for i in range(256): + if i < thresh: + lut.append(min(255, i + add)) + else: + lut.append(i) + if img.mode in ("L", "RGB"): + if img.mode == "RGB" and len(lut) == 256: + lut = lut + lut + lut + return img.point(lut) + else: + return img + + +def cutout(image, pad_size, replace=0): + image_np = np.array(image) + image_height, image_width, _ = image_np.shape + + # Sample the center location in the image where the zero mask will be applied. + cutout_center_height = np.random.randint(0, image_height + 1) + cutout_center_width = np.random.randint(0, image_width + 1) + + lower_pad = np.maximum(0, cutout_center_height - pad_size) + upper_pad = np.maximum(0, image_height - cutout_center_height - pad_size) + left_pad = np.maximum(0, cutout_center_width - pad_size) + right_pad = np.maximum(0, image_width - cutout_center_width - pad_size) + + cutout_shape = [ + image_height - (lower_pad + upper_pad), + image_width - (left_pad + right_pad) + ] + padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]] + mask = np.pad(np.zeros( + cutout_shape, dtype=image_np.dtype), + padding_dims, + constant_values=1) + mask = np.expand_dims(mask, -1) + mask = np.tile(mask, [1, 1, 3]) + image_np = np.where( + np.equal(mask, 0), + np.full_like( + image_np, fill_value=replace, dtype=image_np.dtype), + image_np) + return Image.fromarray(image_np) + + +class RandAugment(object): + def __init__(self, num_layers=2, magnitude=5, fillcolor=(128, 128, 128)): + self.num_layers = num_layers + self.magnitude = magnitude + self.max_level = 10 + + abso_level = self.magnitude / self.max_level + self.level_map = { + "shearX": 0.3 * abso_level, + "shearY": 0.3 * abso_level, + "translateX": 150.0 / 331 * abso_level, + "translateY": 150.0 / 331 * abso_level, + "rotate": 30 * abso_level, + "color": 0.9 * abso_level, + "posterize": int(4.0 * abso_level), + "solarize": 256.0 * abso_level, + "contrast": 0.9 * abso_level, + "sharpness": 0.9 * abso_level, + "brightness": 0.9 * abso_level, + "autocontrast": 0, + "equalize": 0, + "invert": 0 + } + + # from https://stackoverflow.com/questions/5252170/ + # specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand + def rotate_with_fill(img, magnitude): + rot = img.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new("RGBA", rot.size, (128, ) * 4), + rot).convert(img.mode) + + rnd_ch_op = random.choice + + self.func = { + "shearX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, magnitude * rnd_ch_op([-1, 1]), 0, 0, 1, 0), + Image.BICUBIC, + fillcolor=fillcolor), + "shearY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, magnitude * rnd_ch_op([-1, 1]), 1, 0), + Image.BICUBIC, + 
fillcolor=fillcolor),
+            "translateX": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, 0, magnitude * img.size[0] * rnd_ch_op([-1, 1]), 0, 1, 0),
+                fillcolor=fillcolor),
+            "translateY": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, 0, 0, 0, 1, magnitude * img.size[1] * rnd_ch_op([-1, 1])),
+                fillcolor=fillcolor),
+            "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude),
+            "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(
+                1 + magnitude * rnd_ch_op([-1, 1])),
+            "posterize": lambda img, magnitude:
+            ImageOps.posterize(img, magnitude),
+            "solarize": lambda img, magnitude:
+            ImageOps.solarize(img, magnitude),
+            "contrast": lambda img, magnitude:
+            ImageEnhance.Contrast(img).enhance(
+                1 + magnitude * rnd_ch_op([-1, 1])),
+            "sharpness": lambda img, magnitude:
+            ImageEnhance.Sharpness(img).enhance(
+                1 + magnitude * rnd_ch_op([-1, 1])),
+            "brightness": lambda img, magnitude:
+            ImageEnhance.Brightness(img).enhance(
+                1 + magnitude * rnd_ch_op([-1, 1])),
+            "autocontrast": lambda img, _:
+            ImageOps.autocontrast(img),
+            "equalize": lambda img, _: ImageOps.equalize(img),
+            "invert": lambda img, _: ImageOps.invert(img)
+        }
+
+    def __call__(self, img):
+        available_op_names = list(self.level_map.keys())
+        for _ in range(self.num_layers):
+            op_name = np.random.choice(available_op_names)
+            img = self.func[op_name](img, self.level_map[op_name])
+        return img
+
+
+class RandomApply(object):
+    def __init__(self, p, transforms):
+        self.p = p
+        ts = []
+        for t in transforms:
+            for key in t.keys():
+                ts.append(eval(key)(**t[key]))
+
+        self.trans = T.Compose(ts)
+
+    def __call__(self, img):
+        if self.p < np.random.rand(1):
+            return img
+        timg = self.trans(img)
+        return timg
+
+
+## RandAugment_EfficientNetV2 code below ##
+class RandAugmentV2(RandAugment):
+    """Customized RandAugment for EfficientNetV2"""
+
+    def __init__(self,
+                 num_layers=2,
+                 magnitude=5,
+                 progress_magnitude=None,
+                 fillcolor=(128, 128, 128)):
+        super().__init__(num_layers, magnitude, fillcolor)
+        self.progress_magnitude = progress_magnitude
+        abso_level = self.magnitude / self.max_level
+        self.level_map = {
+            "shearX": 0.3 * abso_level,
+            "shearY": 0.3 * abso_level,
+            "translateX": 100.0 * abso_level,
+            "translateY": 100.0 * abso_level,
+            "rotate": 30 * abso_level,
+            "color": 1.8 * abso_level + 0.1,
+            "posterize": int(4.0 * abso_level),
+            "solarize": int(256.0 * abso_level),
+            "solarize_add": int(110.0 * abso_level),
+            "contrast": 1.8 * abso_level + 0.1,
+            "sharpness": 1.8 * abso_level + 0.1,
+            "brightness": 1.8 * abso_level + 0.1,
+            "autocontrast": 0,
+            "equalize": 0,
+            "invert": 0,
+            "cutout": int(40 * abso_level)
+        }
+
+        # from https://stackoverflow.com/questions/5252170/
+        # specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand
+        def rotate_with_fill(img, magnitude):
+            rot = img.convert("RGBA").rotate(magnitude)
+            return Image.composite(rot,
+                                   Image.new("RGBA", rot.size, (128, ) * 4),
+                                   rot).convert(img.mode)
+
+        rnd_ch_op = random.choice
+
+        self.func = {
+            "shearX": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, magnitude * rnd_ch_op([-1, 1]), 0, 0, 1, 0),
+                Image.NEAREST,
+                fillcolor=fillcolor),
+            "shearY": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, 0, 0, magnitude * rnd_ch_op([-1, 1]), 1, 0),
+                Image.NEAREST,
+                fillcolor=fillcolor),
+            "translateX": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, 0, magnitude * rnd_ch_op([-1, 1]), 0, 1, 0),
+                Image.NEAREST,
+                fillcolor=fillcolor),
+            "translateY": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, 0, 0, 0, 1, magnitude * rnd_ch_op([-1, 1])),
+                Image.NEAREST,
+                fillcolor=fillcolor),
+            "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude * rnd_ch_op([-1, 1])),
+            "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(magnitude),
+            "posterize": lambda img, magnitude:
+            ImageOps.posterize(img, magnitude),
+            "solarize": lambda img, magnitude:
+            ImageOps.solarize(img, magnitude),
+            "solarize_add": lambda img, magnitude:
+            solarize_add(img, magnitude),
+            "contrast": lambda img, magnitude:
+            ImageEnhance.Contrast(img).enhance(magnitude),
+            "sharpness": lambda img, magnitude:
+            ImageEnhance.Sharpness(img).enhance(magnitude),
+            "brightness": lambda img, magnitude:
+            ImageEnhance.Brightness(img).enhance(magnitude),
+            "autocontrast": lambda img, _:
+            ImageOps.autocontrast(img),
+            "equalize": lambda img, _: ImageOps.equalize(img),
+            "invert": lambda img, _: ImageOps.invert(img),
+            "cutout": lambda img, magnitude: cutout(img, magnitude, replace=fillcolor[0])
+        }
+
+
+class RandAugmentV3(RandAugment):
+    """Customized RandAugment for MobileViTV2"""
+
+    def __init__(self,
+                 num_layers=2,
+                 magnitude=3,
+                 fillcolor=(0, 0, 0),
+                 interpolation="bicubic"):
+        self.num_layers = num_layers
+        self.magnitude = magnitude
+        self.max_level = 10
+        interpolation = _pil_interp(interpolation)
+
+        abso_level = self.magnitude / self.max_level
+        self.level_map = {
+            "shearX": 0.3 * abso_level,
+            "shearY": 0.3 * abso_level,
+            "translateX": 150.0 / 331.0 * abso_level,
+            "translateY": 150.0 / 331.0 * abso_level,
+            "rotate": 30 * abso_level,
+            "color": 0.9 * abso_level,
+            "posterize": 8 - int(4.0 * abso_level),
+            "solarize": 255.0 * (1 - abso_level),
+            "contrast": 0.9 * abso_level,
+            "sharpness": 0.9 * abso_level,
+            "brightness": 0.9 * abso_level,
+            "autocontrast": 0,
+            "equalize": 0,
+            "invert": 0
+        }
+
+        rnd_ch_op = random.choice
+
+        self.func = {
+            "shearX": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, magnitude * rnd_ch_op([-1, 1]), 0, 0, 1, 0),
+                interpolation,
+                fillcolor=fillcolor),
+            "shearY": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, 0, 0, magnitude * rnd_ch_op([-1, 1]), 1, 0),
+                interpolation,
+                fillcolor=fillcolor),
+            "translateX": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, 0, magnitude * img.size[0] * rnd_ch_op([-1, 1]), 0, 1, 0),
+                interpolation,
+                fillcolor=fillcolor),
+            "translateY": lambda img, magnitude: img.transform(
+                img.size,
+                Image.AFFINE,
+                (1, 0, 0, 0, 1, magnitude * img.size[1] * rnd_ch_op([-1, 1])),
+                interpolation,
+                fillcolor=fillcolor),
+            "rotate": lambda img, magnitude: img.rotate(
+                magnitude * rnd_ch_op([-1, 1]),
+                interpolation,
+                fillcolor=fillcolor),
+            "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(
+                1 + magnitude * rnd_ch_op([-1, 1])),
+            "posterize": lambda img, magnitude:
+            ImageOps.posterize(img, magnitude),
+            "solarize": lambda img, magnitude:
+            ImageOps.solarize(img, magnitude),
+            "contrast": lambda img, magnitude:
+            ImageEnhance.Contrast(img).enhance(
+                1 + magnitude * rnd_ch_op([-1, 1])),
+            "sharpness": lambda img, magnitude:
+            ImageEnhance.Sharpness(img).enhance(
+                1 + magnitude * rnd_ch_op([-1, 1])),
+            "brightness": lambda img, magnitude:
+            ImageEnhance.Brightness(img).enhance(
+                1 + magnitude * rnd_ch_op([-1, 1])),
+            "autocontrast": lambda img, _:
+            ImageOps.autocontrast(img),
+            "equalize": lambda img, _: ImageOps.equalize(img),
+
"invert": lambda img, _: ImageOps.invert(img) + } + + +class SubPolicyV2(object): + """Custom SubPolicy for ML-Decoder""" + + def __init__(self, + p1, + operation1, + magnitude_idx1, + p2, + operation2, + magnitude_idx2, + fillcolor=(128, 128, 128)): + ranges = { + "shearX": np.linspace(0, 0.3, 10), + "shearY": np.linspace(0, 0.3, 10), + "translateX": np.linspace(0, 150 / 331, 10), + "translateY": np.linspace(0, 150 / 331, 10), + "rotate": np.linspace(0, 30, 10), + "color": np.linspace(0.0, 0.9, 10), + "posterize": np.round(np.linspace(8, 4, 10), 0).astype(np.int_), + "solarize": np.linspace(256, 0, 10), + "contrast": np.linspace(0.0, 0.9, 10), + "sharpness": np.linspace(0.0, 0.9, 10), + "brightness": np.linspace(0.0, 0.9, 10), + "autocontrast": [0] * 10, + "equalize": [0] * 10, + "invert": [0] * 10, + "cutout": np.round(np.linspace(0, 20, 10), 0).astype(np.int_), + } + + # from https://stackoverflow.com/questions/5252170/specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand + def rotate_with_fill(img, magnitude): + rot = img.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new("RGBA", rot.size, (128,) * 4), + rot).convert(img.mode) + + func = { + "shearX": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0), + Image.BICUBIC, fillcolor=fillcolor), + "shearY": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0), + Image.BICUBIC, fillcolor=fillcolor), + "translateX": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, magnitude * img.size[0] * random.choice([-1, 1]), 0, 1, 0), + fillcolor=fillcolor), + "translateY": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude * img.size[1] * random.choice([-1, 1])), + fillcolor=fillcolor), + "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude), + # "rotate": lambda img, magnitude: img.rotate(magnitude * random.choice([-1, 1])), + "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(1 + magnitude * random.choice([-1, 1])), + "posterize": lambda img, magnitude: ImageOps.posterize(img, magnitude), + "solarize": lambda img, magnitude: ImageOps.solarize(img, magnitude), + "contrast": lambda img, magnitude: ImageEnhance.Contrast(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "sharpness": lambda img, magnitude: ImageEnhance.Sharpness(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "brightness": lambda img, magnitude: ImageEnhance.Brightness(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "autocontrast": lambda img, magnitude: ImageOps.autocontrast(img), + "equalize": lambda img, magnitude: ImageOps.equalize(img), + "invert": lambda img, magnitude: ImageOps.invert(img), + "cutout": lambda img, magnitude: cutout(img, magnitude), + } + + self.p1 = p1 + self.operation1 = func[operation1] + self.magnitude1 = ranges[operation1][magnitude_idx1] + self.p2 = p2 + self.operation2 = func[operation2] + self.magnitude2 = ranges[operation2][magnitude_idx2] + + def __call__(self, img): + if random.random() < self.p1: + img = self.operation1(img, self.magnitude1) + if random.random() < self.p2: + img = self.operation2(img, self.magnitude2) + return img + + +class RandAugmentV4(object): + """Custom RandAugment for ML-Decoder""" + + def __init__(self) -> None: + super().__init__() + self._policies = self.get_rand_policies() + + @classmethod + def get_trans_list(cls): + trans_list = [ + 
"shearX", + "shearY", + "translateX", + "translateY", + "rotate", + "color", + "posterize", + "solarize", + "contrast", + "sharpness", + "brightness", + "autocontrast", + "equalize", + "invert", + "cutout", + ] + return trans_list + + @classmethod + def get_rand_policies(cls): + op_list = [] + for trans in cls.get_trans_list(): + for magnitude in range(1, 10): + op_list += [(0.5, trans, magnitude)] + policies = [] + for op_1 in op_list: + for op_2 in op_list: + policies += [[op_1, op_2]] + return policies + + def __call__(self, img): + randomly_chosen_policy = self._policies[ + random.randint(0, len(self._policies) - 1)] + policy = SubPolicyV2(*randomly_chosen_policy[0], + *randomly_chosen_policy[1]) + img = policy(img) + return img diff --git a/example/low_precision_training/ppcls/data/preprocess/ops/random_erasing.py b/example/low_precision_training/ppcls/data/preprocess/ops/random_erasing.py new file mode 100755 index 000000000..e687283c7 --- /dev/null +++ b/example/low_precision_training/ppcls/data/preprocess/ops/random_erasing.py @@ -0,0 +1,113 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is adapted from https://github.com/zhunzhong07/Random-Erasing, and refer to Timm(https://github.com/rwightman/pytorch-image-models). +# reference: https://arxiv.org/abs/1708.04896 + +from functools import partial + +import math +import random + +import numpy as np + +from .operators import format_data + + +class Pixels(object): + def __init__(self, mode="const", mean=[0., 0., 0.]): + self._mode = mode + self._mean = np.array(mean) + + def __call__(self, h=224, w=224, c=3, channel_first=False): + if self._mode == "rand": + return np.random.normal(size=( + 1, 1, 3)) if not channel_first else np.random.normal(size=( + 3, 1, 1)) + elif self._mode == "pixel": + return np.random.normal(size=( + h, w, c)) if not channel_first else np.random.normal(size=( + c, h, w)) + elif self._mode == "const": + return np.reshape(self._mean, ( + 1, 1, c)) if not channel_first else np.reshape(self._mean, + (c, 1, 1)) + else: + raise Exception( + "Invalid mode in RandomErasing, only support \"const\", \"rand\", \"pixel\"" + ) + + +class RandomErasing(object): + """RandomErasing. 
+ """ + + def __init__(self, + EPSILON=0.5, + sl=0.02, + sh=0.4, + r1=0.3, + mean=[0., 0., 0.], + attempt=100, + use_log_aspect=False, + mode='const'): + self.EPSILON = eval(EPSILON) if isinstance(EPSILON, str) else EPSILON + self.sl = eval(sl) if isinstance(sl, str) else sl + self.sh = eval(sh) if isinstance(sh, str) else sh + r1 = eval(r1) if isinstance(r1, str) else r1 + self.r1 = (math.log(r1), math.log(1 / r1)) if use_log_aspect else ( + r1, 1 / r1) + self.use_log_aspect = use_log_aspect + self.attempt = attempt + self.get_pixels = Pixels(mode, mean) + + @format_data + def __call__(self, img): + if random.random() > self.EPSILON: + return img + + for _ in range(self.attempt): + if isinstance(img, np.ndarray): + img_h, img_w, img_c = img.shape + channel_first = False + else: + img_c, img_h, img_w = img.shape + channel_first = True + area = img_h * img_w + + target_area = random.uniform(self.sl, self.sh) * area + aspect_ratio = random.uniform(*self.r1) + if self.use_log_aspect: + aspect_ratio = math.exp(aspect_ratio) + + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + + if w < img_w and h < img_h: + pixels = self.get_pixels(h, w, img_c, channel_first) + x1 = random.randint(0, img_h - h) + y1 = random.randint(0, img_w - w) + if img_c == 3: + if channel_first: + img[:, x1:x1 + h, y1:y1 + w] = pixels + else: + img[x1:x1 + h, y1:y1 + w, :] = pixels + else: + if channel_first: + img[0, x1:x1 + h, y1:y1 + w] = pixels[0] + else: + img[x1:x1 + h, y1:y1 + w, 0] = pixels[:, :, 0] + return img + + return img diff --git a/example/low_precision_training/ppcls/data/preprocess/ops/timm_autoaugment.py b/example/low_precision_training/ppcls/data/preprocess/ops/timm_autoaugment.py new file mode 100755 index 000000000..30f1f505a --- /dev/null +++ b/example/low_precision_training/ppcls/data/preprocess/ops/timm_autoaugment.py @@ -0,0 +1,878 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is heavily based on https://github.com/rwightman/pytorch-image-models +# reference: https://arxiv.org/abs/1805.09501 + +import random +import math +import re +from PIL import Image, ImageOps, ImageEnhance, ImageChops +import PIL +import numpy as np + +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) + +_PIL_VER = tuple([int(x) for x in PIL.__version__.split('.')[:2]]) + +_FILL = (128, 128, 128) + +# This signifies the max integer that the controller RNN could predict for the +# augmentation scheme. +_MAX_LEVEL = 10. + +_HPARAMS_DEFAULT = dict( + translate_const=250, + img_mean=_FILL, ) + +_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC) + + +def _pil_interp(method): + if method == 'bicubic': + return Image.BICUBIC + elif method == 'lanczos': + return Image.LANCZOS + elif method == 'hamming': + return Image.HAMMING + else: + # default bilinear, do we want to allow nearest? 
+ return Image.BILINEAR + + +def _interpolation(kwargs): + interpolation = kwargs.pop('resample', Image.BILINEAR) + if isinstance(interpolation, (list, tuple)): + return random.choice(interpolation) + else: + return interpolation + + +def _check_args_tf(kwargs): + if 'fillcolor' in kwargs and _PIL_VER < (5, 0): + kwargs.pop('fillcolor') + kwargs['resample'] = _interpolation(kwargs) + + +def shear_x(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), + **kwargs) + + +def shear_y(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), + **kwargs) + + +def translate_x_rel(img, pct, **kwargs): + pixels = pct * img.size[0] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), + **kwargs) + + +def translate_y_rel(img, pct, **kwargs): + pixels = pct * img.size[1] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), + **kwargs) + + +def translate_x_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), + **kwargs) + + +def translate_y_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), + **kwargs) + + +def rotate(img, degrees, **kwargs): + _check_args_tf(kwargs) + if _PIL_VER >= (5, 2): + return img.rotate(degrees, **kwargs) + elif _PIL_VER >= (5, 0): + w, h = img.size + post_trans = (0, 0) + rotn_center = (w / 2.0, h / 2.0) + angle = -math.radians(degrees) + matrix = [ + round(math.cos(angle), 15), + round(math.sin(angle), 15), + 0.0, + round(-math.sin(angle), 15), + round(math.cos(angle), 15), + 0.0, + ] + + def transform(x, y, matrix): + (a, b, c, d, e, f) = matrix + return a * x + b * y + c, d * x + e * y + f + + matrix[2], matrix[5] = transform(-rotn_center[0] - post_trans[0], + -rotn_center[1] - post_trans[1], + matrix) + matrix[2] += rotn_center[0] + matrix[5] += rotn_center[1] + return img.transform(img.size, Image.AFFINE, matrix, **kwargs) + else: + return img.rotate(degrees, resample=kwargs['resample']) + + +def auto_contrast(img, **__): + return ImageOps.autocontrast(img) + + +def invert(img, **__): + return ImageOps.invert(img) + + +def equalize(img, **__): + return ImageOps.equalize(img) + + +def solarize(img, thresh, **__): + return ImageOps.solarize(img, thresh) + + +def solarize_add(img, add, thresh=128, **__): + lut = [] + for i in range(256): + if i < thresh: + lut.append(min(255, i + add)) + else: + lut.append(i) + if img.mode in ("L", "RGB"): + if img.mode == "RGB" and len(lut) == 256: + lut = lut + lut + lut + return img.point(lut) + else: + return img + + +def posterize(img, bits_to_keep, **__): + if bits_to_keep >= 8: + return img + return ImageOps.posterize(img, bits_to_keep) + + +def contrast(img, factor, **__): + return ImageEnhance.Contrast(img).enhance(factor) + + +def color(img, factor, **__): + return ImageEnhance.Color(img).enhance(factor) + + +def brightness(img, factor, **__): + return ImageEnhance.Brightness(img).enhance(factor) + + +def sharpness(img, factor, **__): + return ImageEnhance.Sharpness(img).enhance(factor) + + +def _randomly_negate(v): + """With 50% prob, negate the value""" + return -v if random.random() > 0.5 else v + + +def _rotate_level_to_arg(level, _hparams): + # range [-30, 30] + level = (level / _MAX_LEVEL) * 30. 
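+    # e.g. magnitude 9 maps to 27 degrees; the random negation below makes the
+    # effective range [-27, 27].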
+ level = _randomly_negate(level) + return level, + + +def _enhance_level_to_arg(level, _hparams): + # range [0.1, 1.9] + return (level / _MAX_LEVEL) * 1.8 + 0.1, + + +def _enhance_increasing_level_to_arg(level, _hparams): + # the 'no change' level is 1.0, moving away from that towards 0. or 2.0 increases the enhancement blend + # range [0.1, 1.9] + level = (level / _MAX_LEVEL) * .9 + level = 1.0 + _randomly_negate(level) + return level, + + +def _shear_level_to_arg(level, _hparams): + # range [-0.3, 0.3] + level = (level / _MAX_LEVEL) * 0.3 + level = _randomly_negate(level) + return level, + + +def _translate_abs_level_to_arg(level, hparams): + translate_const = hparams['translate_const'] + level = (level / _MAX_LEVEL) * float(translate_const) + level = _randomly_negate(level) + return level, + + +def _translate_rel_level_to_arg(level, hparams): + # default range [-0.45, 0.45] + translate_pct = hparams.get('translate_pct', 0.45) + level = (level / _MAX_LEVEL) * translate_pct + level = _randomly_negate(level) + return level, + + +def _posterize_level_to_arg(level, _hparams): + # As per Tensorflow TPU EfficientNet impl + # range [0, 4], 'keep 0 up to 4 MSB of original image' + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 4), + + +def _posterize_increasing_level_to_arg(level, hparams): + # As per Tensorflow models research and UDA impl + # range [4, 0], 'keep 4 down to 0 MSB of original image', + # intensity/severity of augmentation increases with level + return 4 - _posterize_level_to_arg(level, hparams)[0], + + +def _posterize_original_level_to_arg(level, _hparams): + # As per original AutoAugment paper description + # range [4, 8], 'keep 4 up to 8 MSB of image' + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 4) + 4, + + +def _solarize_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 256), + + +def _solarize_increasing_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation increases with level + return 256 - _solarize_level_to_arg(level, _hparams)[0], + + +def _solarize_add_level_to_arg(level, _hparams): + # range [0, 110] + return int((level / _MAX_LEVEL) * 110), + + +LEVEL_TO_ARG = { + 'AutoContrast': None, + 'Equalize': None, + 'Invert': None, + 'Rotate': _rotate_level_to_arg, + # There are several variations of the posterize level scaling in various Tensorflow/Google repositories/papers + 'Posterize': _posterize_level_to_arg, + 'PosterizeIncreasing': _posterize_increasing_level_to_arg, + 'PosterizeOriginal': _posterize_original_level_to_arg, + 'Solarize': _solarize_level_to_arg, + 'SolarizeIncreasing': _solarize_increasing_level_to_arg, + 'SolarizeAdd': _solarize_add_level_to_arg, + 'Color': _enhance_level_to_arg, + 'ColorIncreasing': _enhance_increasing_level_to_arg, + 'Contrast': _enhance_level_to_arg, + 'ContrastIncreasing': _enhance_increasing_level_to_arg, + 'Brightness': _enhance_level_to_arg, + 'BrightnessIncreasing': _enhance_increasing_level_to_arg, + 'Sharpness': _enhance_level_to_arg, + 'SharpnessIncreasing': _enhance_increasing_level_to_arg, + 'ShearX': _shear_level_to_arg, + 'ShearY': _shear_level_to_arg, + 'TranslateX': _translate_abs_level_to_arg, + 'TranslateY': _translate_abs_level_to_arg, + 'TranslateXRel': _translate_rel_level_to_arg, + 'TranslateYRel': _translate_rel_level_to_arg, +} + +NAME_TO_OP = { + 'AutoContrast': auto_contrast, + 
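+    # The "*Increasing" entries reuse the same PIL op as their base transform;
+    # only the level-to-argument mapping in LEVEL_TO_ARG above differs.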
'Equalize': equalize, + 'Invert': invert, + 'Rotate': rotate, + 'Posterize': posterize, + 'PosterizeIncreasing': posterize, + 'PosterizeOriginal': posterize, + 'Solarize': solarize, + 'SolarizeIncreasing': solarize, + 'SolarizeAdd': solarize_add, + 'Color': color, + 'ColorIncreasing': color, + 'Contrast': contrast, + 'ContrastIncreasing': contrast, + 'Brightness': brightness, + 'BrightnessIncreasing': brightness, + 'Sharpness': sharpness, + 'SharpnessIncreasing': sharpness, + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_abs, + 'TranslateY': translate_y_abs, + 'TranslateXRel': translate_x_rel, + 'TranslateYRel': translate_y_rel, +} + + +class AugmentOp(object): + def __init__(self, name, prob=0.5, magnitude=10, hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + self.aug_fn = NAME_TO_OP[name] + self.level_fn = LEVEL_TO_ARG[name] + self.prob = prob + self.magnitude = magnitude + self.hparams = hparams.copy() + self.kwargs = dict( + fillcolor=hparams['img_mean'] if 'img_mean' in hparams else _FILL, + resample=hparams['interpolation'] + if 'interpolation' in hparams else _RANDOM_INTERPOLATION, ) + + # If magnitude_std is > 0, we introduce some randomness + # in the usually fixed policy and sample magnitude from a normal distribution + # with mean `magnitude` and std-dev of `magnitude_std`. + # NOTE This is my own hack, being tested, not in papers or reference impls. + self.magnitude_std = self.hparams.get('magnitude_std', 0) + + def __call__(self, img): + if self.prob < 1.0 and random.random() > self.prob: + return img + magnitude = self.magnitude + if self.magnitude_std and self.magnitude_std > 0: + magnitude = random.gauss(magnitude, self.magnitude_std) + magnitude = min(_MAX_LEVEL, max(0, magnitude)) # clip to valid range + level_args = self.level_fn( + magnitude, self.hparams) if self.level_fn is not None else tuple() + return self.aug_fn(img, *level_args, **self.kwargs) + + +def auto_augment_policy_v0(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, cannot find a paper reference. 
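+    # Each sub-policy is a pair of (op_name, probability, magnitude) tuples;
+    # AutoAugment.__call__ picks one sub-policy per image and applies both ops
+    # in order, each firing with its own probability.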
+ policy = [ + [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Color', 0.4, 1), ('Rotate', 0.6, 8)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], + [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 9)], + [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], + [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)], + [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], + [('ShearY', 0.8, 0), ('Color', 0.6, 4)], + [('Color', 1.0, 0), ('Rotate', 0.6, 2)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], + [('Posterize', 0.8, 2), ('Solarize', 0.6, 10) + ], # This results in black image with Tpu posterize + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + [('Color', 0.8, 6), ('Rotate', 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_v0r(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, with variation of Posterize used + # in Google research implementation (number of bits discarded increases with magnitude) + policy = [ + [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Color', 0.4, 1), ('Rotate', 0.6, 8)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], + [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('PosterizeIncreasing', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 9)], + [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], + [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)], + [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], + [('ShearY', 0.8, 0), ('Color', 0.6, 4)], + [('Color', 1.0, 0), ('Rotate', 0.6, 2)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], + [('PosterizeIncreasing', 0.8, 2), ('Solarize', 0.6, 10)], + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + [('Color', 0.8, 6), ('Rotate', 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_original(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 + policy = [ + [('PosterizeOriginal', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('PosterizeOriginal', 0.6, 7), ('PosterizeOriginal', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('PosterizeOriginal', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('PosterizeOriginal', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], 
+ [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_originalr(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 with research posterize variation + policy = [ + [('PosterizeIncreasing', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('PosterizeIncreasing', 0.6, 7), ('PosterizeIncreasing', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('PosterizeIncreasing', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('PosterizeIncreasing', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy(name='v0', hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + if name == 'original': + return auto_augment_policy_original(hparams) + elif name == 'originalr': + return auto_augment_policy_originalr(hparams) + elif name == 'v0': + return auto_augment_policy_v0(hparams) + elif name == 'v0r': + return auto_augment_policy_v0r(hparams) + else: + assert False, 'Unknown AA policy (%s)' % name + + +class AutoAugment(object): + def __init__(self, policy): + self.policy = policy + + def __call__(self, img): + sub_policy = random.choice(self.policy) + for op in sub_policy: + img = op(img) + return img + + +def auto_augment_transform(config_str, hparams): + """ + Create a AutoAugment transform + + :param config_str: String defining configuration of auto augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the AutoAugment policy (one of 'v0', 'v0r', 'original', 'originalr'). 
+ The remaining sections, not order sepecific determine + 'mstd' - float std deviation of magnitude noise applied + Ex 'original-mstd0.5' results in AutoAugment with original policy, magnitude_std 0.5 + + :param hparams: Other hparams (kwargs) for the AutoAugmentation scheme + + :return: A callable Transform Op + """ + config = config_str.split('-') + policy_name = config[0] + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + else: + assert False, 'Unknown AutoAugment config section' + aa_policy = auto_augment_policy(policy_name, hparams=hparams) + return AutoAugment(aa_policy) + + +_RAND_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + 'Rotate', + 'Posterize', + 'Solarize', + 'SolarizeAdd', + 'Color', + 'Contrast', + 'Brightness', + 'Sharpness', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', + #'Cutout' # NOTE I've implement this as random erasing separately +] + +_RAND_INCREASING_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + 'Rotate', + 'PosterizeIncreasing', + 'SolarizeIncreasing', + 'SolarizeAdd', + 'ColorIncreasing', + 'ContrastIncreasing', + 'BrightnessIncreasing', + 'SharpnessIncreasing', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', + #'Cutout' # NOTE I've implement this as random erasing separately +] + +# These experimental weights are based loosely on the relative improvements mentioned in paper. +# They may not result in increased performance, but could likely be tuned to so. +_RAND_CHOICE_WEIGHTS_0 = { + 'Rotate': 0.3, + 'ShearX': 0.2, + 'ShearY': 0.2, + 'TranslateXRel': 0.1, + 'TranslateYRel': 0.1, + 'Color': .025, + 'Sharpness': 0.025, + 'AutoContrast': 0.025, + 'Solarize': .005, + 'SolarizeAdd': .005, + 'Contrast': .005, + 'Brightness': .005, + 'Equalize': .005, + 'Posterize': 0, + 'Invert': 0, +} + + +def _select_rand_weights(weight_idx=0, transforms=None): + transforms = transforms or _RAND_TRANSFORMS + assert weight_idx == 0 # only one set of weights currently + rand_weights = _RAND_CHOICE_WEIGHTS_0 + probs = [rand_weights[k] for k in transforms] + probs /= np.sum(probs) + return probs + + +def rand_augment_ops(magnitude=10, hparams=None, transforms=None): + hparams = hparams or _HPARAMS_DEFAULT + transforms = transforms or _RAND_TRANSFORMS + return [ + AugmentOp( + name, prob=0.5, magnitude=magnitude, hparams=hparams) + for name in transforms + ] + + +class RandAugment(object): + def __init__(self, ops, num_layers=2, choice_weights=None): + self.ops = ops + self.num_layers = num_layers + self.choice_weights = choice_weights + + def __call__(self, img): + # no replacement when using weighted choice + ops = np.random.choice( + self.ops, + self.num_layers, + replace=self.choice_weights is None, + p=self.choice_weights) + for op in ops: + img = op(img) + return img + + +def rand_augment_transform(config_str, hparams): + """ + Create a RandAugment transform + + :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). 
The remaining + sections, not order sepecific determine + 'm' - integer magnitude of rand augment + 'n' - integer num layers (number of transform ops selected per image) + 'w' - integer probabiliy weight index (index of a set of weights to influence choice of op) + 'mstd' - float std deviation of magnitude noise applied + 'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0) + Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5 + 'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2 + + :param hparams: Other hparams (kwargs) for the RandAugmentation scheme + + :return: A callable Transform Op + """ + magnitude = _MAX_LEVEL # default to _MAX_LEVEL for magnitude (currently 10) + num_layers = 2 # default to 2 ops per image + weight_idx = None # default to no probability weights for op choice + transforms = _RAND_TRANSFORMS + config = config_str.split('-') + assert config[0] == 'rand' + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + elif key == 'inc': + if bool(val): + transforms = _RAND_INCREASING_TRANSFORMS + elif key == 'm': + magnitude = int(val) + elif key == 'n': + num_layers = int(val) + elif key == 'w': + weight_idx = int(val) + else: + assert False, 'Unknown RandAugment config section' + ra_ops = rand_augment_ops( + magnitude=magnitude, hparams=hparams, transforms=transforms) + choice_weights = None if weight_idx is None else _select_rand_weights( + weight_idx) + return RandAugment(ra_ops, num_layers, choice_weights=choice_weights) + + +_AUGMIX_TRANSFORMS = [ + 'AutoContrast', + 'ColorIncreasing', # not in paper + 'ContrastIncreasing', # not in paper + 'BrightnessIncreasing', # not in paper + 'SharpnessIncreasing', # not in paper + 'Equalize', + 'Rotate', + 'PosterizeIncreasing', + 'SolarizeIncreasing', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', +] + + +def augmix_ops(magnitude=10, hparams=None, transforms=None): + hparams = hparams or _HPARAMS_DEFAULT + transforms = transforms or _AUGMIX_TRANSFORMS + return [ + AugmentOp( + name, prob=1.0, magnitude=magnitude, hparams=hparams) + for name in transforms + ] + + +class AugMixAugment(object): + """ AugMix Transform + Adapted and improved from impl here: https://github.com/google-research/augmix/blob/master/imagenet.py + From paper: 'AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty - + https://arxiv.org/abs/1912.02781 + """ + + def __init__(self, ops, alpha=1., width=3, depth=-1, blended=False): + self.ops = ops + self.alpha = alpha + self.width = width + self.depth = depth + self.blended = blended # blended mode is faster but not well tested + + def _calc_blended_weights(self, ws, m): + ws = ws * m + cump = 1. + rws = [] + for w in ws[::-1]: + alpha = w / cump + cump *= (1 - alpha) + rws.append(alpha) + return np.array(rws[::-1], dtype=np.float32) + + def _apply_blended(self, img, mixing_weights, m): + # This is my first crack and implementing a slightly faster mixed augmentation. Instead + # of accumulating the mix for each chain in a Numpy array and then blending with original, + # it recomputes the blending coefficients and applies one PIL image blend per chain. + # TODO the results appear in the right ballpark but they differ by more than rounding. 
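+        # _calc_blended_weights converts the Dirichlet mixing weights into a
+        # sequence of per-chain blend alphas, so that the repeated Image.blend
+        # calls below approximate the single weighted sum used in _apply_basic.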
+        img_orig = img.copy()
+        ws = self._calc_blended_weights(mixing_weights, m)
+        for w in ws:
+            depth = self.depth if self.depth > 0 else np.random.randint(1, 4)
+            ops = np.random.choice(self.ops, depth, replace=True)
+            img_aug = img_orig  # no ops are in-place, deep copy not necessary
+            for op in ops:
+                img_aug = op(img_aug)
+            img = Image.blend(img, img_aug, w)
+        return img
+
+    def _apply_basic(self, img, mixing_weights, m):
+        # This is a literal adaptation of the paper/official implementation without normalizations and
+        # PIL <-> Numpy conversions between every op. It is still quite CPU compute heavy compared to the
+        # typical augmentation transforms, could use a GPU / Kornia implementation.
+        img_shape = img.size[0], img.size[1], len(img.getbands())
+        mixed = np.zeros(img_shape, dtype=np.float32)
+        for mw in mixing_weights:
+            depth = self.depth if self.depth > 0 else np.random.randint(1, 4)
+            ops = np.random.choice(self.ops, depth, replace=True)
+            img_aug = img  # no ops are in-place, deep copy not necessary
+            for op in ops:
+                img_aug = op(img_aug)
+            mixed += mw * np.asarray(img_aug, dtype=np.float32)
+        np.clip(mixed, 0, 255., out=mixed)
+        mixed = Image.fromarray(mixed.astype(np.uint8))
+        return Image.blend(img, mixed, m)
+
+    def __call__(self, img):
+        mixing_weights = np.float32(
+            np.random.dirichlet([self.alpha] * self.width))
+        m = np.float32(np.random.beta(self.alpha, self.alpha))
+        if self.blended:
+            mixed = self._apply_blended(img, mixing_weights, m)
+        else:
+            mixed = self._apply_basic(img, mixing_weights, m)
+        return mixed
+
+
+def augment_and_mix_transform(config_str, hparams):
+    """ Create AugMix transform
+
+    :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by
+        dashes ('-'). The first section defines the specific variant (currently only 'augmix'). The remaining
+        sections, not order specific, determine:
+        'm' - integer magnitude (severity) of augmentation mix (default: 3)
+        'w' - integer width of augmentation chain (default: 3)
+        'd' - integer depth of augmentation chain (-1 is random [1, 3], default: -1)
+        'b' - integer (bool), blend each branch of chain into end result without a final blend, less CPU (default: 0)
+        'mstd' - float std deviation of magnitude noise applied (default: 0)
+        Ex 'augmix-m5-w4-d2' results in AugMix with severity 5, chain width 4, chain depth 2
+
+    :param hparams: Other hparams (kwargs) for the Augmentation transforms
+
+    :return: A callable Transform Op
+    """
+    magnitude = 3
+    width = 3
+    depth = -1
+    alpha = 1.
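+    # Defaults match the docstring above (m=3, w=3, d=-1, alpha=1); the
+    # config_str sections parsed below override them.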
+ blended = False + config = config_str.split('-') + assert config[0] == 'augmix' + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + elif key == 'm': + magnitude = int(val) + elif key == 'w': + width = int(val) + elif key == 'd': + depth = int(val) + elif key == 'a': + alpha = float(val) + elif key == 'b': + blended = bool(val) + else: + assert False, 'Unknown AugMix config section' + ops = augmix_ops(magnitude=magnitude, hparams=hparams) + return AugMixAugment( + ops, alpha=alpha, width=width, depth=depth, blended=blended) + + +class RawTimmAutoAugment(object): + """TimmAutoAugment API for PaddleClas.""" + + def __init__(self, + config_str="rand-m9-mstd0.5-inc1", + interpolation="bicubic", + img_size=224, + mean=IMAGENET_DEFAULT_MEAN): + if isinstance(img_size, (tuple, list)): + img_size_min = min(img_size) + else: + img_size_min = img_size + + aa_params = dict( + translate_const=int(img_size_min * 0.45), + img_mean=tuple([min(255, round(255 * x)) for x in mean]), ) + if interpolation and interpolation != 'random': + aa_params['interpolation'] = _pil_interp(interpolation) + if config_str.startswith('rand'): + self.augment_func = rand_augment_transform(config_str, aa_params) + elif config_str.startswith('augmix'): + aa_params['translate_pct'] = 0.3 + self.augment_func = augment_and_mix_transform(config_str, + aa_params) + elif config_str.startswith('auto'): + self.augment_func = auto_augment_transform(config_str, aa_params) + else: + raise Exception( + "ConfigError: The TimmAutoAugment Op only support RandAugment, AutoAugment, AugMix, and the config_str only starts with \"rand\", \"augmix\", \"auto\"." + ) + + def __call__(self, img): + return self.augment_func(img) diff --git a/example/low_precision_training/ppcls/data/utils/__init__.py b/example/low_precision_training/ppcls/data/utils/__init__.py new file mode 100755 index 000000000..61d5aa213 --- /dev/null +++ b/example/low_precision_training/ppcls/data/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/example/low_precision_training/ppcls/data/utils/get_image_list.py b/example/low_precision_training/ppcls/data/utils/get_image_list.py new file mode 100755 index 000000000..9b6de0690 --- /dev/null +++ b/example/low_precision_training/ppcls/data/utils/get_image_list.py @@ -0,0 +1,59 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import argparse +import base64 +import numpy as np + + +def get_image_list(img_file, infer_list=None): + imgs_lists = [] + if infer_list and not os.path.exists(infer_list): + raise Exception("not found infer list {}".format(infer_list)) + if infer_list: + with open(infer_list, "r") as f: + lines = f.readlines() + for line in lines: + image_path = line.strip(" ").split()[0] + image_path = os.path.join(img_file, image_path) + imgs_lists.append(image_path) + else: + if img_file is None or not os.path.exists(img_file): + raise Exception("not found any img file in {}".format(img_file)) + img_end = ['jpg', 'png', 'jpeg', 'JPEG', 'JPG', 'bmp'] + if os.path.isfile(img_file) and img_file.split('.')[-1] in img_end: + imgs_lists.append(img_file) + elif os.path.isdir(img_file): + for root, dirs, files in os.walk(img_file): + for single_file in files: + if single_file.split('.')[-1] in img_end: + imgs_lists.append(os.path.join(root, single_file)) + if len(imgs_lists) == 0: + raise Exception("not found any img file in {}".format(img_file)) + imgs_lists = sorted(imgs_lists) + return imgs_lists + + +def get_image_list_from_label_file(image_path, label_file_path): + imgs_lists = [] + gt_labels = [] + with open(label_file_path, "r") as fin: + lines = fin.readlines() + for line in lines: + image_name, label = line.strip("\n").split() + label = int(label) + imgs_lists.append(os.path.join(image_path, image_name)) + gt_labels.append(int(label)) + return imgs_lists, gt_labels diff --git a/example/low_precision_training/ppcls/engine/__init__.py b/example/low_precision_training/ppcls/engine/__init__.py new file mode 100755 index 000000000..e69de29bb diff --git a/example/low_precision_training/ppcls/engine/engine.py b/example/low_precision_training/ppcls/engine/engine.py new file mode 100755 index 000000000..893401d9a --- /dev/null +++ b/example/low_precision_training/ppcls/engine/engine.py @@ -0,0 +1,2013 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
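+
+# This module defines two trainer classes: Engine, the standard PaddleClas
+# trainer, and Engine_qt, a variant that additionally wraps the model with a
+# quantization manager (build_quantization_manager) for low-precision /
+# quantization-aware training.
+# NOTE: Global.use_visualdl relies on the visualdl LogWriter import below,
+# which is currently commented out.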
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import shutil +import copy +import platform +import paddle +import paddle.distributed as dist +# from visualdl import LogWriter +from paddle import nn +import numpy as np +import random + +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger +from ppcls.utils.logger import init_logger +from ppcls.utils.config import print_config, dump_infer_config +from ppcls.data import build_dataloader +from ppcls.arch import build_model, RecModel, DistillationModel, TheseusLayer +from ppcls.arch import apply_to_static +from ppcls.loss import build_loss +from ppcls.metric import build_metrics +from ppcls.optimizer import build_optimizer +from ppcls.utils.amp import AutoCast, build_scaler +from ppcls.utils.ema import ExponentialMovingAverage +from ppcls.utils.save_load import load_dygraph_pretrain +from ppcls.utils.save_load import init_model +from ppcls.utils.save_result import update_train_results +from ppcls.utils import save_load, save_predict_result + +from ppcls.data.utils.get_image_list import get_image_list +from ppcls.data.postprocess import build_postprocess +from ppcls.data import create_operators +from ppcls.engine import train as train_method +from ppcls.engine.train.utils import type_name +from ppcls.engine import evaluation +from ppcls.arch.gears.identity_head import IdentityHead + +from quantization import build_quantization_manager + +class Engine(object): + def __init__(self, config, mode="train"): + assert mode in ["train", "eval", "infer", "export"] + self.mode = mode + self.config = config + self.eval_mode = self.config["Global"].get("eval_mode", + "classification") + self.train_mode = self.config["Global"].get("train_mode", None) + if "Head" in self.config["Arch"] or self.config["Arch"].get("is_rec", + False): + self.is_rec = True + else: + self.is_rec = False + + # set seed + seed = self.config["Global"].get("seed", False) + if seed or seed == 0: + assert isinstance(seed, int), "The 'seed' must be a integer!" 
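+            # Seed Paddle, NumPy and Python's RNGs together so initialization
+            # and data order are reproducible.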
+ paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + + # init logger + self.output_dir = self.config['Global']['output_dir'] + log_file = os.path.join(self.output_dir, f"{mode}.log") + log_ranks = self.config['Global'].get("log_ranks", "0") + init_logger(log_file=log_file, log_ranks=log_ranks) + print_config(config) + + # init train_func and eval_func + assert self.eval_mode in [ + "classification", "retrieval", "adaface" + ], logger.error("Invalid eval mode: {}".format(self.eval_mode)) + if self.train_mode is None: + self.train_epoch_func = train_method.train_epoch + else: + self.train_epoch_func = getattr(train_method, + "train_epoch_" + self.train_mode) + self.eval_func = getattr(evaluation, self.eval_mode + "_eval") + + self.use_dali = self.config['Global'].get("use_dali", False) + + # for visualdl + self.vdl_writer = None + if self.config['Global'][ + 'use_visualdl'] and mode == "train" and dist.get_rank() == 0: + vdl_writer_path = self.output_dir + if not os.path.exists(vdl_writer_path): + os.makedirs(vdl_writer_path) + self.vdl_writer = LogWriter(logdir=vdl_writer_path) + + # set device + assert self.config["Global"]["device"] in [ + "cpu", "gpu", "xpu", "npu", "mlu", "ascend", "intel_gpu", "mps" + ] + self.device = paddle.set_device(self.config["Global"]["device"]) + logger.info('train with paddle {} and device {}'.format( + paddle.__version__, self.device)) + + # gradient accumulation + self.update_freq = self.config["Global"].get("update_freq", 1) + + if "class_num" in config["Global"]: + global_class_num = config["Global"]["class_num"] + if "class_num" not in config["Arch"]: + config["Arch"]["class_num"] = global_class_num + msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}." + else: + msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored." + logger.warning(msg) + #TODO(gaotingquan): support rec + class_num = config["Arch"].get("class_num", None) + self.config["DataLoader"].update({"class_num": class_num}) + self.config["DataLoader"].update({ + "epochs": self.config["Global"]["epochs"] + }) + + # build dataloader + #################### + if self.mode == 'train' or self.mode == "eval": + #################### + self.train_dataloader = build_dataloader( + self.config["DataLoader"], "Train", self.device, self.use_dali) + if self.config["DataLoader"].get('UnLabelTrain', None) is not None: + self.unlabel_train_dataloader = build_dataloader( + self.config["DataLoader"], "UnLabelTrain", self.device, + self.use_dali) + else: + self.unlabel_train_dataloader = None + + self.iter_per_epoch = len( + self.train_dataloader) - 1 if platform.system( + ) == "Windows" else len(self.train_dataloader) + if self.config["Global"].get("iter_per_epoch", None): + # set max iteration per epoch mannualy, when training by iteration(s), such as XBM, FixMatch. + self.iter_per_epoch = self.config["Global"].get( + "iter_per_epoch") + if self.iter_per_epoch < self.update_freq: + logger.warning( + "The arg Global.update_freq greater than iter_per_epoch and has been set to 1. This may be caused by too few of batches." 
+ ) + self.update_freq = 1 + self.iter_per_epoch = self.iter_per_epoch // self.update_freq * self.update_freq + + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + if self.eval_mode in ["classification", "adaface"]: + self.eval_dataloader = build_dataloader( + self.config["DataLoader"], "Eval", self.device, + self.use_dali) + elif self.eval_mode == "retrieval": + self.gallery_query_dataloader = None + if len(self.config["DataLoader"]["Eval"].keys()) == 1: + key = list(self.config["DataLoader"]["Eval"].keys())[0] + self.gallery_query_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], key, self.device, + self.use_dali) + else: + self.gallery_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], "Gallery", + self.device, self.use_dali) + self.query_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], "Query", self.device, + self.use_dali) + + # build loss + if self.mode == "train": + label_loss_info = self.config["Loss"]["Train"] + self.train_loss_func = build_loss(label_loss_info) + unlabel_loss_info = self.config.get("UnLabelLoss", {}).get("Train", + None) + self.unlabel_train_loss_func = build_loss(unlabel_loss_info) + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + loss_config = self.config.get("Loss", None) + if loss_config is not None: + loss_config = loss_config.get("Eval") + if loss_config is not None: + self.eval_loss_func = build_loss(loss_config) + else: + self.eval_loss_func = None + else: + self.eval_loss_func = None + + # build metric + if self.mode == 'train' and "Metric" in self.config and "Train" in self.config[ + "Metric"] and self.config["Metric"]["Train"]: + metric_config = self.config["Metric"]["Train"] + if hasattr(self.train_dataloader, "collate_fn" + ) and self.train_dataloader.collate_fn is not None: + for m_idx, m in enumerate(metric_config): + if "TopkAcc" in m: + msg = f"Unable to calculate accuracy when using \"batch_transform_ops\". The metric \"{m}\" has been removed." + logger.warning(msg) + metric_config.pop(m_idx) + self.train_metric_func = build_metrics(metric_config) + else: + self.train_metric_func = None + + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + if self.eval_mode == "classification": + if "Metric" in self.config and "Eval" in self.config["Metric"]: + self.eval_metric_func = build_metrics(self.config["Metric"][ + "Eval"]) + else: + self.eval_metric_func = None + elif self.eval_mode == "retrieval": + if "Metric" in self.config and "Eval" in self.config["Metric"]: + metric_config = self.config["Metric"]["Eval"] + else: + metric_config = [{"name": "Recallk", "topk": (1, 5)}] + self.eval_metric_func = build_metrics(metric_config) + else: + self.eval_metric_func = None + + # build model + self.model = build_model(self.config, self.mode) + + # set @to_static for benchmark, skip this by default. 
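+        # Per the note above, apply_to_static is a no-op unless dynamic-to-
+        # static conversion is enabled in the config.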
+ apply_to_static(self.config, self.model, is_rec=self.is_rec) + + # load_pretrain + if self.config["Global"]["pretrained_model"] is not None: + load_dygraph_pretrain( + [self.model, getattr(self, 'train_loss_func', None)], + self.config["Global"]["pretrained_model"]) + + # build optimizer + if self.mode == 'train': + self.optimizer, self.lr_sch = build_optimizer( + self.config["Optimizer"], self.config["Global"]["epochs"], + self.iter_per_epoch // self.update_freq, + [self.model, self.train_loss_func]) + # amp + self._init_amp() + + # build EMA model + self.ema = "EMA" in self.config and self.mode == "train" + if self.ema: + self.model_ema = ExponentialMovingAverage( + self.model, self.config['EMA'].get("decay", 0.9999)) + + # check the gpu num + world_size = dist.get_world_size() + self.config["Global"]["distributed"] = world_size != 1 + if self.mode == "train": + std_gpu_num = 8 if isinstance( + self.config["Optimizer"], + dict) and self.config["Optimizer"]["name"] == "AdamW" else 4 + if world_size != std_gpu_num: + msg = f"The training strategy provided by PaddleClas is based on {std_gpu_num} gpus. But the number of gpu is {world_size} in current training. Please modify the stategy (learning rate, batch size and so on) if use this config to train." + logger.warning(msg) + + # for distributed + if self.config["Global"]["distributed"]: + dist.init_parallel_env() + self.model = paddle.DataParallel(self.model) + if self.mode == 'train' and len(self.train_loss_func.parameters( + )) > 0: + self.train_loss_func = paddle.DataParallel(self.train_loss_func) + + # set different seed in different GPU manually in distributed environment + if seed is None: + logger.warning( + "The random seed cannot be None in a distributed environment. Global.seed has been set to 42 by default" + ) + self.config["Global"]["seed"] = seed = 42 + logger.info( + f"Set random seed to ({int(seed)} + $PADDLE_TRAINER_ID) for different trainer" + ) + paddle.seed(int(seed) + dist.get_rank()) + np.random.seed(int(seed) + dist.get_rank()) + random.seed(int(seed) + dist.get_rank()) + + # build postprocess for infer + if self.mode == 'infer': + self.preprocess_func = create_operators(self.config["Infer"][ + "transforms"]) + self.postprocess_func = build_postprocess(self.config["Infer"][ + "PostProcess"]) + + def train(self): + assert self.mode == "train" + print_batch_step = self.config['Global']['print_batch_step'] + save_interval = self.config["Global"]["save_interval"] + best_metric = { + "metric": -1.0, + "epoch": 0, + } + acc_ema = -1.0 + best_metric_ema = -1.0 + ema_module = None + if self.ema: + ema_module = self.model_ema.module + # key: + # val: metrics list word + self.output_info = dict() + self.time_info = { + "batch_cost": AverageMeter( + "batch_cost", '.5f', postfix=" s,"), + "reader_cost": AverageMeter( + "reader_cost", ".5f", postfix=" s,"), + } + # global iter counter + self.global_step = 0 + uniform_output_enabled = self.config['Global'].get( + "uniform_output_enabled", False) + + if self.config.Global.checkpoints is not None: + metric_info = init_model(self.config.Global, self.model, + self.optimizer, self.train_loss_func, + ema_module) + if metric_info is not None: + best_metric.update(metric_info) + if hasattr(self.train_dataloader.batch_sampler, "set_epoch"): + self.train_dataloader.batch_sampler.set_epoch(best_metric[ + "epoch"]) + + for epoch_id in range(best_metric["epoch"] + 1, + self.config["Global"]["epochs"] + 1): + acc = 0.0 + # for one epoch train + self.train_epoch_func(self, epoch_id, 
print_batch_step) + + if self.use_dali: + self.train_dataloader.reset() + metric_msg = ", ".join( + [self.output_info[key].avg_info for key in self.output_info]) + logger.info("[Train][Epoch {}/{}][Avg]{}".format( + epoch_id, self.config["Global"]["epochs"], metric_msg)) + self.output_info.clear() + + # eval model and save model if possible + start_eval_epoch = self.config["Global"].get("start_eval_epoch", + 0) - 1 + if self.config["Global"][ + "eval_during_train"] and epoch_id % self.config["Global"][ + "eval_interval"] == 0 and epoch_id > start_eval_epoch: + acc = self.eval(epoch_id) + + # step lr (by epoch) according to given metric, such as acc + for i in range(len(self.lr_sch)): + if getattr(self.lr_sch[i], "by_epoch", False) and \ + type_name(self.lr_sch[i]) == "ReduceOnPlateau": + self.lr_sch[i].step(acc) + + # update best_metric + if acc >= best_metric["metric"]: + best_metric["metric"] = acc + best_metric["epoch"] = epoch_id + logger.info("[Eval][Epoch {}][best metric: {}]".format( + epoch_id, best_metric["metric"])) + logger.scaler( + name="eval_acc", + value=acc, + step=epoch_id, + writer=self.vdl_writer) + + if self.ema: + ori_model, self.model = self.model, ema_module + acc_ema = self.eval(epoch_id) + self.model = ori_model + ema_module.eval() + + # update best_ema + if acc_ema > best_metric_ema: + best_metric_ema = acc_ema + logger.info("[Eval][Epoch {}][best metric ema: {}]".format( + epoch_id, best_metric_ema)) + logger.scaler( + name="eval_acc_ema", + value=acc_ema, + step=epoch_id, + writer=self.vdl_writer) + + # save best model from best_acc or best_ema_acc + if max(acc, acc_ema) >= max(best_metric["metric"], + best_metric_ema): + metric_info = { + "metric": max(acc, acc_ema), + "epoch": epoch_id + } + prefix = "best_model" + save_load.save_model( + self.model, + self.optimizer, + metric_info, + os.path.join(self.output_dir, prefix) + if uniform_output_enabled else self.output_dir, + ema=ema_module, + model_name=self.config["Arch"]["name"], + prefix=prefix, + loss=self.train_loss_func, + save_student_model=True) + if uniform_output_enabled: + save_path = os.path.join(self.output_dir, prefix, + "inference") + self.export(save_path, uniform_output_enabled) + if self.ema: + ema_save_path = os.path.join( + self.output_dir, prefix, "inference_ema") + self.export(ema_save_path, uniform_output_enabled) + update_train_results( + self.config, prefix, metric_info, ema=self.ema) + save_load.save_model_info(metric_info, self.output_dir, + prefix) + + self.model.train() + + # save model + if save_interval > 0 and epoch_id % save_interval == 0: + metric_info = {"metric": acc, "epoch": epoch_id} + prefix = "epoch_{}".format(epoch_id) + save_load.save_model( + self.model, + self.optimizer, + metric_info, + os.path.join(self.output_dir, prefix) + if uniform_output_enabled else self.output_dir, + ema=ema_module, + model_name=self.config["Arch"]["name"], + prefix=prefix, + loss=self.train_loss_func) + if uniform_output_enabled: + save_path = os.path.join(self.output_dir, prefix, + "inference") + self.export(save_path, uniform_output_enabled) + if self.ema: + ema_save_path = os.path.join(self.output_dir, prefix, + "inference_ema") + self.export(ema_save_path, uniform_output_enabled) + update_train_results( + self.config, + prefix, + metric_info, + done_flag=epoch_id == self.config["Global"]["epochs"], + ema=self.ema) + save_load.save_model_info(metric_info, self.output_dir, + prefix) + # save the latest model + metric_info = {"metric": acc, "epoch": epoch_id} + prefix = "latest" + 
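+            # Always refresh the rolling "latest" checkpoint at the end of the
+            # epoch, independent of save_interval.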
save_load.save_model( + self.model, + self.optimizer, + metric_info, + os.path.join(self.output_dir, prefix) + if uniform_output_enabled else self.output_dir, + ema=ema_module, + model_name=self.config["Arch"]["name"], + prefix=prefix, + loss=self.train_loss_func) + if uniform_output_enabled: + save_path = os.path.join(self.output_dir, prefix, "inference") + self.export(save_path, uniform_output_enabled) + if self.ema: + ema_save_path = os.path.join(self.output_dir, prefix, + "inference_ema") + self.export(ema_save_path, uniform_output_enabled) + save_load.save_model_info(metric_info, self.output_dir, prefix) + self.model.train() + + if self.vdl_writer is not None: + self.vdl_writer.close() + + def upbn(self, max_iter = 100): + # switch to train mode + self.model.train() + for i, (images, target) in enumerate(self.train_dataloader): + # compute output + output = self.model(images) + if i >= max_iter: + break + + @paddle.no_grad() + def eval(self, epoch_id=0): + assert self.mode in ["train", "eval"] + + self.model.eval() + eval_result = self.eval_func(self, epoch_id) + self.model.train() + return eval_result + + @paddle.no_grad() + def infer(self): + assert self.mode == "infer" and self.eval_mode == "classification" + results = [] + total_trainer = dist.get_world_size() + local_rank = dist.get_rank() + infer_imgs = self.config["Infer"]["infer_imgs"] + infer_list = self.config["Infer"].get("infer_list", None) + image_list = get_image_list(infer_imgs, infer_list=infer_list) + # data split + image_list = image_list[local_rank::total_trainer] + + batch_size = self.config["Infer"]["batch_size"] + self.model.eval() + batch_data = [] + image_file_list = [] + save_path = self.config["Infer"].get("save_dir", None) + for idx, image_file in enumerate(image_list): + with open(image_file, 'rb') as f: + x = f.read() + try: + for process in self.preprocess_func: + x = process(x) + batch_data.append(x) + image_file_list.append(image_file) + if len(batch_data) >= batch_size or idx == len(image_list) - 1: + batch_tensor = paddle.to_tensor(batch_data) + + with self.auto_cast(is_eval=True): + out = self.model(batch_tensor) + + if isinstance(out, list): + out = out[0] + if isinstance(out, dict) and "Student" in out: + out = out["Student"] + if isinstance(out, dict) and "logits" in out: + out = out["logits"] + if isinstance(out, dict) and "output" in out: + out = out["output"] + + result = self.postprocess_func(out, image_file_list) + if not save_path: + logger.info(result) + results.extend(result) + batch_data.clear() + image_file_list.clear() + except Exception as ex: + logger.error( + "Exception occured when parse line: {} with msg: {}".format( + image_file, ex)) + continue + if save_path: + save_predict_result(save_path, results) + return results + + def export(self, + save_path=None, + uniform_output_enabled=False, + ema_module=None): + assert self.mode == "export" or uniform_output_enabled + if paddle.distributed.get_rank() != 0: + return + use_multilabel = self.config["Global"].get( + "use_multilabel", + False) or "ATTRMetric" in self.config["Metric"]["Eval"][0] + model = self.model_ema.module if self.ema else self.model + if hasattr(model, '_layers'): + model = copy.deepcopy(model._layers) + else: + model = copy.deepcopy(model) + model = ExportModel(self.config["Arch"], model + if not ema_module else ema_module, use_multilabel) + if self.config["Global"][ + "pretrained_model"] is not None and not uniform_output_enabled: + load_dygraph_pretrain(model.base_model, + 
self.config["Global"]["pretrained_model"]) + model.eval() + # for re-parameterization nets + for layer in model.sublayers(): + if hasattr(layer, "re_parameterize") and not getattr(layer, + "is_repped"): + layer.re_parameterize() + if not save_path: + save_path = os.path.join( + self.config["Global"]["save_inference_dir"], "inference") + else: + save_path = os.path.join(save_path, "inference") + + model = paddle.jit.to_static( + model, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + self.config["Global"]["image_shape"], + dtype='float32') + ]) + if hasattr(model.base_model, + "quanter") and model.base_model.quanter is not None: + model.base_model.quanter.save_quantized_model(model, + save_path + "_int8") + else: + paddle.jit.save(model, save_path) + if self.config["Global"].get("export_for_fd", + False) or uniform_output_enabled: + dst_path = os.path.join(os.path.dirname(save_path), 'inference.yml') + dump_infer_config(self.config, dst_path) + logger.info( + f"Export succeeded! The inference model exported has been saved in \"{save_path}\"." + ) + + def _init_amp(self): + if self.mode == "export": + return + + amp_config = self.config.get("AMP", None) + use_amp = True if amp_config and amp_config.get("use_amp", + True) else False + + if not use_amp: + self.auto_cast = AutoCast(use_amp) + self.scaler = build_scaler(use_amp) + else: + AMP_RELATED_FLAGS_SETTING = {'FLAGS_max_inplace_grad_add': 8, } + if paddle.is_compiled_with_cuda(): + AMP_RELATED_FLAGS_SETTING.update({ + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1 + }) + paddle.set_flags(AMP_RELATED_FLAGS_SETTING) + + use_promote = amp_config.get("use_promote", False) + amp_level = amp_config.get("level", "O1") + if amp_level not in ["O1", "O2"]: + msg = "[Parameter Error]: The optimize level of AMP only support 'O1' and 'O2'. The level has been set 'O1'." + logger.warning(msg) + amp_level = amp_config["level"] = "O1" + + amp_eval = self.config["AMP"].get("use_fp16_test", False) + # TODO(gaotingquan): Paddle not yet support FP32 evaluation when training with AMPO2 + if self.mode == "train" and self.config["Global"].get( + "eval_during_train", + True) and amp_level == "O2" and amp_eval == False: + msg = "PaddlePaddle only support FP16 evaluation when training with AMP O2 now. 
" + logger.warning(msg) + self.config["AMP"]["use_fp16_test"] = True + amp_eval = True + + self.auto_cast = AutoCast( + use_amp, + amp_level=amp_level, + use_promote=use_promote, + amp_eval=amp_eval) + + scale_loss = amp_config.get("scale_loss", 1.0) + use_dynamic_loss_scaling = amp_config.get( + "use_dynamic_loss_scaling", False) + self.scaler = build_scaler( + use_amp, + scale_loss=scale_loss, + use_dynamic_loss_scaling=use_dynamic_loss_scaling) + + if self.mode == "train": + self.model, self.optimizer = paddle.amp.decorate( + models=self.model, + optimizers=self.optimizer, + level=amp_level, + save_dtype='float32') + elif amp_eval: + self.model = paddle.amp.decorate( + models=self.model, level=amp_level, save_dtype='float32') + + if self.mode == "train" and len(self.train_loss_func.parameters( + )) > 0: + self.train_loss_func = paddle.amp.decorate( + models=self.train_loss_func, + level=self.amp_level, + save_dtype='float32') + + +class Engine_qt(object): + def __init__(self, config, mode="train"): + assert mode in ["train", "eval", "infer", "export"] + self.mode = mode + self.config = config + self.eval_mode = self.config["Global"].get("eval_mode", + "classification") + self.train_mode = self.config["Global"].get("train_mode", None) + if "Head" in self.config["Arch"] or self.config["Arch"].get("is_rec", + False): + self.is_rec = True + else: + self.is_rec = False + + # set seed + seed = self.config["Global"].get("seed", False) + if seed or seed == 0: + assert isinstance(seed, int), "The 'seed' must be a integer!" + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + + # init logger + self.output_dir = self.config['Global']['output_dir'] + log_file = os.path.join(self.output_dir, f"{mode}.log") + log_ranks = self.config['Global'].get("log_ranks", "0") + init_logger(log_file=log_file, log_ranks=log_ranks) + print_config(config) + + # init train_func and eval_func + assert self.eval_mode in [ + "classification", "retrieval", "adaface" + ], logger.error("Invalid eval mode: {}".format(self.eval_mode)) + if self.train_mode is None: + self.train_epoch_func = train_method.train_epoch + else: + self.train_epoch_func = getattr(train_method, + "train_epoch_" + self.train_mode) + self.eval_func = getattr(evaluation, self.eval_mode + "_eval") + + self.use_dali = self.config['Global'].get("use_dali", False) + + # for visualdl + self.vdl_writer = None + if self.config['Global'][ + 'use_visualdl'] and mode == "train" and dist.get_rank() == 0: + vdl_writer_path = self.output_dir + if not os.path.exists(vdl_writer_path): + os.makedirs(vdl_writer_path) + self.vdl_writer = LogWriter(logdir=vdl_writer_path) + + # set device + assert self.config["Global"]["device"] in [ + "cpu", "gpu", "xpu", "npu", "mlu", "ascend", "intel_gpu", "mps" + ] + self.device = paddle.set_device(self.config["Global"]["device"]) + logger.info('train with paddle {} and device {}'.format( + paddle.__version__, self.device)) + + # gradient accumulation + self.update_freq = self.config["Global"].get("update_freq", 1) + + if "class_num" in config["Global"]: + global_class_num = config["Global"]["class_num"] + if "class_num" not in config["Arch"]: + config["Arch"]["class_num"] = global_class_num + msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}." + else: + msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored." 
+ logger.warning(msg) + #TODO(gaotingquan): support rec + class_num = config["Arch"].get("class_num", None) + self.config["DataLoader"].update({"class_num": class_num}) + self.config["DataLoader"].update({ + "epochs": self.config["Global"]["epochs"] + }) + + # build dataloader + #################### + if self.mode == 'train' or self.mode == "eval": + #################### + self.train_dataloader = build_dataloader( + self.config["DataLoader"], "Train", self.device, self.use_dali) + if self.config["DataLoader"].get('UnLabelTrain', None) is not None: + self.unlabel_train_dataloader = build_dataloader( + self.config["DataLoader"], "UnLabelTrain", self.device, + self.use_dali) + else: + self.unlabel_train_dataloader = None + + self.iter_per_epoch = len( + self.train_dataloader) - 1 if platform.system( + ) == "Windows" else len(self.train_dataloader) + if self.config["Global"].get("iter_per_epoch", None): + # set max iteration per epoch mannualy, when training by iteration(s), such as XBM, FixMatch. + self.iter_per_epoch = self.config["Global"].get( + "iter_per_epoch") + if self.iter_per_epoch < self.update_freq: + logger.warning( + "The arg Global.update_freq greater than iter_per_epoch and has been set to 1. This may be caused by too few of batches." + ) + self.update_freq = 1 + self.iter_per_epoch = self.iter_per_epoch // self.update_freq * self.update_freq + + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + if self.eval_mode in ["classification", "adaface"]: + self.eval_dataloader = build_dataloader( + self.config["DataLoader"], "Eval", self.device, + self.use_dali) + elif self.eval_mode == "retrieval": + self.gallery_query_dataloader = None + if len(self.config["DataLoader"]["Eval"].keys()) == 1: + key = list(self.config["DataLoader"]["Eval"].keys())[0] + self.gallery_query_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], key, self.device, + self.use_dali) + else: + self.gallery_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], "Gallery", + self.device, self.use_dali) + self.query_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], "Query", self.device, + self.use_dali) + + # build loss + if self.mode == "train": + label_loss_info = self.config["Loss"]["Train"] + self.train_loss_func = build_loss(label_loss_info) + unlabel_loss_info = self.config.get("UnLabelLoss", {}).get("Train", + None) + self.unlabel_train_loss_func = build_loss(unlabel_loss_info) + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + loss_config = self.config.get("Loss", None) + if loss_config is not None: + loss_config = loss_config.get("Eval") + if loss_config is not None: + self.eval_loss_func = build_loss(loss_config) + else: + self.eval_loss_func = None + else: + self.eval_loss_func = None + + # build metric + if self.mode == 'train' and "Metric" in self.config and "Train" in self.config[ + "Metric"] and self.config["Metric"]["Train"]: + metric_config = self.config["Metric"]["Train"] + if hasattr(self.train_dataloader, "collate_fn" + ) and self.train_dataloader.collate_fn is not None: + for m_idx, m in enumerate(metric_config): + if "TopkAcc" in m: + msg = f"Unable to calculate accuracy when using \"batch_transform_ops\". The metric \"{m}\" has been removed." 
+                        logger.warning(msg)
+                        metric_config.pop(m_idx)
+            self.train_metric_func = build_metrics(metric_config)
+        else:
+            self.train_metric_func = None
+
+        if self.mode == "eval" or (self.mode == "train" and
+                                   self.config["Global"]["eval_during_train"]):
+            if self.eval_mode == "classification":
+                if "Metric" in self.config and "Eval" in self.config["Metric"]:
+                    self.eval_metric_func = build_metrics(self.config["Metric"][
+                        "Eval"])
+                else:
+                    self.eval_metric_func = None
+            elif self.eval_mode == "retrieval":
+                if "Metric" in self.config and "Eval" in self.config["Metric"]:
+                    metric_config = self.config["Metric"]["Eval"]
+                else:
+                    metric_config = [{"name": "Recallk", "topk": (1, 5)}]
+                self.eval_metric_func = build_metrics(metric_config)
+        else:
+            self.eval_metric_func = None
+
+        # build model
+        self.model = build_model(self.config, self.mode)
+
+        # set @to_static for benchmark, skip this by default.
+        apply_to_static(self.config, self.model, is_rec=self.is_rec)
+
+        # load_pretrain
+        if self.config["Global"]["pretrained_model"] is not None:
+            load_dygraph_pretrain(
+                [self.model, getattr(self, 'train_loss_func', None)],
+                self.config["Global"]["pretrained_model"])
+
+        # build optimizer
+        if self.mode == 'train':
+            self.optimizer, self.lr_sch = build_optimizer(
+                self.config["Optimizer"], self.config["Global"]["epochs"],
+                self.iter_per_epoch // self.update_freq,
+                [self.model, self.train_loss_func])
+        # amp
+        self._init_amp()
+
+        # build EMA model
+        self.ema = "EMA" in self.config and self.mode == "train"
+        if self.ema:
+            self.model_ema = ExponentialMovingAverage(
+                self.model, self.config['EMA'].get("decay", 0.9999))
+
+        # check the gpu num
+        world_size = dist.get_world_size()
+        self.config["Global"]["distributed"] = world_size != 1
+        if self.mode == "train":
+            std_gpu_num = 8 if isinstance(
+                self.config["Optimizer"],
+                dict) and self.config["Optimizer"]["name"] == "AdamW" else 4
+            if world_size != std_gpu_num:
+                msg = f"The training strategy provided by PaddleClas is based on {std_gpu_num} gpus, but the current training uses {world_size} gpus. Please adjust the strategy (learning rate, batch size and so on) if you use this config to train."
+                logger.warning(msg)
+
+        # for distributed
+        if self.config["Global"]["distributed"]:
+            dist.init_parallel_env()
+            self.model = paddle.DataParallel(self.model)
+            if self.mode == 'train' and len(self.train_loss_func.parameters(
+            )) > 0:
+                self.train_loss_func = paddle.DataParallel(self.train_loss_func)
+
+            # set different seed in different GPU manually in distributed environment
+            if seed is None:
+                logger.warning(
+                    "The random seed cannot be None in a distributed environment. 
Global.seed has been set to 42 by default" + ) + self.config["Global"]["seed"] = seed = 42 + logger.info( + f"Set random seed to ({int(seed)} + $PADDLE_TRAINER_ID) for different trainer" + ) + paddle.seed(int(seed) + dist.get_rank()) + np.random.seed(int(seed) + dist.get_rank()) + random.seed(int(seed) + dist.get_rank()) + + # build postprocess for infer + if self.mode == 'infer': + self.preprocess_func = create_operators(self.config["Infer"][ + "transforms"]) + self.postprocess_func = build_postprocess(self.config["Infer"][ + "PostProcess"]) + + ############ quantization + self.qm = build_quantization_manager(self.config["qconfig"]) + self.model = self.qm.prepare(self.model) + + def train(self): + assert self.mode == "train" + print_batch_step = self.config['Global']['print_batch_step'] + save_interval = self.config["Global"]["save_interval"] + best_metric = { + "metric": -1.0, + "epoch": 0, + } + acc_ema = -1.0 + best_metric_ema = -1.0 + ema_module = None + if self.ema: + ema_module = self.model_ema.module + # key: + # val: metrics list word + self.output_info = dict() + self.time_info = { + "batch_cost": AverageMeter( + "batch_cost", '.5f', postfix=" s,"), + "reader_cost": AverageMeter( + "reader_cost", ".5f", postfix=" s,"), + } + # global iter counter + self.global_step = 0 + uniform_output_enabled = self.config['Global'].get( + "uniform_output_enabled", False) + + if self.config.Global.checkpoints is not None: + metric_info = init_model(self.config.Global, self.model, + self.optimizer, self.train_loss_func, + ema_module) + if metric_info is not None: + best_metric.update(metric_info) + if hasattr(self.train_dataloader.batch_sampler, "set_epoch"): + self.train_dataloader.batch_sampler.set_epoch(best_metric[ + "epoch"]) + + for epoch_id in range(best_metric["epoch"] + 1, + self.config["Global"]["epochs"] + 1): + acc = 0.0 + # for one epoch train + self.train_epoch_func(self, epoch_id, print_batch_step) + + if self.use_dali: + self.train_dataloader.reset() + metric_msg = ", ".join( + [self.output_info[key].avg_info for key in self.output_info]) + logger.info("[Train][Epoch {}/{}][Avg]{}".format( + epoch_id, self.config["Global"]["epochs"], metric_msg)) + self.output_info.clear() + + # eval model and save model if possible + start_eval_epoch = self.config["Global"].get("start_eval_epoch", + 0) - 1 + if self.config["Global"][ + "eval_during_train"] and epoch_id % self.config["Global"][ + "eval_interval"] == 0 and epoch_id > start_eval_epoch: + acc = self.eval(epoch_id) + + # step lr (by epoch) according to given metric, such as acc + for i in range(len(self.lr_sch)): + if getattr(self.lr_sch[i], "by_epoch", False) and \ + type_name(self.lr_sch[i]) == "ReduceOnPlateau": + self.lr_sch[i].step(acc) + + # update best_metric + if acc >= best_metric["metric"]: + best_metric["metric"] = acc + best_metric["epoch"] = epoch_id + logger.info("[Eval][Epoch {}][best metric: {}]".format( + epoch_id, best_metric["metric"])) + logger.scaler( + name="eval_acc", + value=acc, + step=epoch_id, + writer=self.vdl_writer) + + if self.ema: + ori_model, self.model = self.model, ema_module + acc_ema = self.eval(epoch_id) + self.model = ori_model + ema_module.eval() + + # update best_ema + if acc_ema > best_metric_ema: + best_metric_ema = acc_ema + logger.info("[Eval][Epoch {}][best metric ema: {}]".format( + epoch_id, best_metric_ema)) + logger.scaler( + name="eval_acc_ema", + value=acc_ema, + step=epoch_id, + writer=self.vdl_writer) + + # save best model from best_acc or best_ema_acc + if max(acc, acc_ema) >= 
max(best_metric["metric"], + best_metric_ema): + metric_info = { + "metric": max(acc, acc_ema), + "epoch": epoch_id + } + prefix = "best_model" + save_load.save_model( + self.model, + self.optimizer, + metric_info, + os.path.join(self.output_dir, prefix) + if uniform_output_enabled else self.output_dir, + ema=ema_module, + model_name=self.config["Arch"]["name"], + prefix=prefix, + loss=self.train_loss_func, + save_student_model=True) + if uniform_output_enabled: + save_path = os.path.join(self.output_dir, prefix, + "inference") + self.export(save_path, uniform_output_enabled) + if self.ema: + ema_save_path = os.path.join( + self.output_dir, prefix, "inference_ema") + self.export(ema_save_path, uniform_output_enabled) + update_train_results( + self.config, prefix, metric_info, ema=self.ema) + save_load.save_model_info(metric_info, self.output_dir, + prefix) + + self.model.train() + + # save model + if save_interval > 0 and epoch_id % save_interval == 0: + metric_info = {"metric": acc, "epoch": epoch_id} + prefix = "epoch_{}".format(epoch_id) + save_load.save_model( + self.model, + self.optimizer, + metric_info, + os.path.join(self.output_dir, prefix) + if uniform_output_enabled else self.output_dir, + ema=ema_module, + model_name=self.config["Arch"]["name"], + prefix=prefix, + loss=self.train_loss_func) + if uniform_output_enabled: + save_path = os.path.join(self.output_dir, prefix, + "inference") + self.export(save_path, uniform_output_enabled) + if self.ema: + ema_save_path = os.path.join(self.output_dir, prefix, + "inference_ema") + self.export(ema_save_path, uniform_output_enabled) + update_train_results( + self.config, + prefix, + metric_info, + done_flag=epoch_id == self.config["Global"]["epochs"], + ema=self.ema) + save_load.save_model_info(metric_info, self.output_dir, + prefix) + # save the latest model + metric_info = {"metric": acc, "epoch": epoch_id} + prefix = "latest" + save_load.save_model( + self.model, + self.optimizer, + metric_info, + os.path.join(self.output_dir, prefix) + if uniform_output_enabled else self.output_dir, + ema=ema_module, + model_name=self.config["Arch"]["name"], + prefix=prefix, + loss=self.train_loss_func) + if uniform_output_enabled: + save_path = os.path.join(self.output_dir, prefix, "inference") + self.export(save_path, uniform_output_enabled) + if self.ema: + ema_save_path = os.path.join(self.output_dir, prefix, + "inference_ema") + self.export(ema_save_path, uniform_output_enabled) + save_load.save_model_info(metric_info, self.output_dir, prefix) + self.model.train() + + if self.vdl_writer is not None: + self.vdl_writer.close() + + def upbn(self, max_iter = 50): + # switch to train mode + self.model.train() + if self.config["eval_quant"]: + self.qm.enable_quantization(self.model) + else: + self.qm.disable_quantization(self.model) + for i, (images, target) in enumerate(self.train_dataloader): + # compute output + output = self.model(images) + if i >= max_iter: + break + self.qm.enable_quantization(self.model) + + @paddle.no_grad() + def eval(self, epoch_id=0): + assert self.mode in ["train", "eval"] + ########## + if self.config["upbn"]: + self.upbn() + ########## + + self.model.eval() + ########## + if self.config["eval_quant"]: + self.qm.enable_quantization(self.model) + else: + self.qm.disable_quantization(self.model) + ########## + + eval_result = self.eval_func(self, epoch_id) + self.model.train() + + ########## + self.qm.enable_quantization(self.model) + ########## + + return eval_result + + @paddle.no_grad() + def infer(self): + assert 
self.mode == "infer" and self.eval_mode == "classification" + results = [] + total_trainer = dist.get_world_size() + local_rank = dist.get_rank() + infer_imgs = self.config["Infer"]["infer_imgs"] + infer_list = self.config["Infer"].get("infer_list", None) + image_list = get_image_list(infer_imgs, infer_list=infer_list) + # data split + image_list = image_list[local_rank::total_trainer] + + batch_size = self.config["Infer"]["batch_size"] + self.model.eval() + batch_data = [] + image_file_list = [] + save_path = self.config["Infer"].get("save_dir", None) + for idx, image_file in enumerate(image_list): + with open(image_file, 'rb') as f: + x = f.read() + try: + for process in self.preprocess_func: + x = process(x) + batch_data.append(x) + image_file_list.append(image_file) + if len(batch_data) >= batch_size or idx == len(image_list) - 1: + batch_tensor = paddle.to_tensor(batch_data) + + with self.auto_cast(is_eval=True): + out = self.model(batch_tensor) + + if isinstance(out, list): + out = out[0] + if isinstance(out, dict) and "Student" in out: + out = out["Student"] + if isinstance(out, dict) and "logits" in out: + out = out["logits"] + if isinstance(out, dict) and "output" in out: + out = out["output"] + + result = self.postprocess_func(out, image_file_list) + if not save_path: + logger.info(result) + results.extend(result) + batch_data.clear() + image_file_list.clear() + except Exception as ex: + logger.error( + "Exception occured when parse line: {} with msg: {}".format( + image_file, ex)) + continue + if save_path: + save_predict_result(save_path, results) + return results + + def export(self, + save_path=None, + uniform_output_enabled=False, + ema_module=None): + assert self.mode == "export" or uniform_output_enabled + if paddle.distributed.get_rank() != 0: + return + use_multilabel = self.config["Global"].get( + "use_multilabel", + False) or "ATTRMetric" in self.config["Metric"]["Eval"][0] + model = self.model_ema.module if self.ema else self.model + if hasattr(model, '_layers'): + model = copy.deepcopy(model._layers) + else: + model = copy.deepcopy(model) + model = ExportModel(self.config["Arch"], model + if not ema_module else ema_module, use_multilabel) + if self.config["Global"][ + "pretrained_model"] is not None and not uniform_output_enabled: + load_dygraph_pretrain(model.base_model, + self.config["Global"]["pretrained_model"]) + model.eval() + # for re-parameterization nets + for layer in model.sublayers(): + if hasattr(layer, "re_parameterize") and not getattr(layer, + "is_repped"): + layer.re_parameterize() + if not save_path: + save_path = os.path.join( + self.config["Global"]["save_inference_dir"], "inference") + else: + save_path = os.path.join(save_path, "inference") + + model = paddle.jit.to_static( + model, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + self.config["Global"]["image_shape"], + dtype='float32') + ]) + if hasattr(model.base_model, + "quanter") and model.base_model.quanter is not None: + model.base_model.quanter.save_quantized_model(model, + save_path + "_int8") + else: + paddle.jit.save(model, save_path) + if self.config["Global"].get("export_for_fd", + False) or uniform_output_enabled: + dst_path = os.path.join(os.path.dirname(save_path), 'inference.yml') + dump_infer_config(self.config, dst_path) + logger.info( + f"Export succeeded! The inference model exported has been saved in \"{save_path}\"." 
+        )
+
+    def _init_amp(self):
+        if self.mode == "export":
+            return
+
+        amp_config = self.config.get("AMP", None)
+        use_amp = True if amp_config and amp_config.get("use_amp",
+                                                        True) else False
+
+        if not use_amp:
+            self.auto_cast = AutoCast(use_amp)
+            self.scaler = build_scaler(use_amp)
+        else:
+            AMP_RELATED_FLAGS_SETTING = {'FLAGS_max_inplace_grad_add': 8, }
+            if paddle.is_compiled_with_cuda():
+                AMP_RELATED_FLAGS_SETTING.update({
+                    'FLAGS_cudnn_batchnorm_spatial_persistent': 1
+                })
+            paddle.set_flags(AMP_RELATED_FLAGS_SETTING)
+
+            use_promote = amp_config.get("use_promote", False)
+            amp_level = amp_config.get("level", "O1")
+            if amp_level not in ["O1", "O2"]:
+                msg = "[Parameter Error]: The optimize level of AMP only supports 'O1' and 'O2'. The level has been set to 'O1'."
+                logger.warning(msg)
+                amp_level = amp_config["level"] = "O1"
+
+            amp_eval = self.config["AMP"].get("use_fp16_test", False)
+            # TODO(gaotingquan): Paddle does not yet support FP32 evaluation when training with AMP O2
+            if self.mode == "train" and self.config["Global"].get(
+                    "eval_during_train",
+                    True) and amp_level == "O2" and amp_eval == False:
+                msg = "PaddlePaddle only supports FP16 evaluation when training with AMP O2 now. "
+                logger.warning(msg)
+                self.config["AMP"]["use_fp16_test"] = True
+                amp_eval = True
+
+            self.auto_cast = AutoCast(
+                use_amp,
+                amp_level=amp_level,
+                use_promote=use_promote,
+                amp_eval=amp_eval)
+
+            scale_loss = amp_config.get("scale_loss", 1.0)
+            use_dynamic_loss_scaling = amp_config.get(
+                "use_dynamic_loss_scaling", False)
+            self.scaler = build_scaler(
+                use_amp,
+                scale_loss=scale_loss,
+                use_dynamic_loss_scaling=use_dynamic_loss_scaling)
+
+            if self.mode == "train":
+                self.model, self.optimizer = paddle.amp.decorate(
+                    models=self.model,
+                    optimizers=self.optimizer,
+                    level=amp_level,
+                    save_dtype='float32')
+            elif amp_eval:
+                self.model = paddle.amp.decorate(
+                    models=self.model, level=amp_level, save_dtype='float32')
+
+        if self.mode == "train" and len(self.train_loss_func.parameters(
+        )) > 0:
+            self.train_loss_func = paddle.amp.decorate(
+                models=self.train_loss_func,
+                level=amp_level,
+                save_dtype='float32')
+
+
+class ExportModel(TheseusLayer):
+    """
+    ExportModel: add softmax onto the model
+    """
+
+    def __init__(self, config, model, use_multilabel):
+        super().__init__()
+        self.base_model = model
+        # we should choose a final model to export
+        if isinstance(self.base_model, DistillationModel):
+            self.infer_model_name = config["infer_model_name"]
+        else:
+            self.infer_model_name = None
+
+        self.infer_output_key = config.get("infer_output_key", None)
+        if self.infer_output_key == "features" and isinstance(self.base_model,
+                                                              RecModel):
+            self.base_model.head = IdentityHead()
+        if use_multilabel:
+            self.out_act = nn.Sigmoid()
+        else:
+            if config.get("infer_add_softmax", True):
+                self.out_act = nn.Softmax(axis=-1)
+            else:
+                self.out_act = None
+
+    def eval(self):
+        self.training = False
+        for layer in self.sublayers():
+            layer.training = False
+            layer.eval()
+
+    def forward(self, x):
+        x = self.base_model(x)
+        if isinstance(x, list):
+            x = x[0]
+        if self.infer_model_name is not None:
+            x = x[self.infer_model_name]
+        if self.infer_output_key is not None:
+            x = x[self.infer_output_key]
+        if self.out_act is not None:
+            if isinstance(x, dict):
+                x = x["logits"]
+            x = self.out_act(x)
+        return x
+
+
+class Engine_qat(object):
+    def __init__(self, config, mode="train"):
+        assert mode in ["train", "eval", "infer", "export"]
+        self.mode = mode
+        self.config = config
+        self.eval_mode = self.config["Global"].get("eval_mode",
self.config["Global"].get("eval_mode", + "classification") + self.train_mode = self.config["Global"].get("train_mode", None) + if "Head" in self.config["Arch"] or self.config["Arch"].get("is_rec", + False): + self.is_rec = True + else: + self.is_rec = False + + # set seed + seed = self.config["Global"].get("seed", False) + if seed or seed == 0: + assert isinstance(seed, int), "The 'seed' must be a integer!" + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + + # init logger + self.output_dir = self.config['Global']['output_dir'] + log_file = os.path.join(self.output_dir, f"{mode}.log") + log_ranks = self.config['Global'].get("log_ranks", "0") + init_logger(log_file=log_file, log_ranks=log_ranks) + print_config(config) + + # init train_func and eval_func + assert self.eval_mode in [ + "classification", "retrieval", "adaface" + ], logger.error("Invalid eval mode: {}".format(self.eval_mode)) + if self.train_mode is None: + self.train_epoch_func = train_method.train_epoch + else: + self.train_epoch_func = getattr(train_method, + "train_epoch_" + self.train_mode) + self.eval_func = getattr(evaluation, self.eval_mode + "_eval") + + self.use_dali = self.config['Global'].get("use_dali", False) + + # for visualdl + self.vdl_writer = None + if self.config['Global'][ + 'use_visualdl'] and mode == "train" and dist.get_rank() == 0: + vdl_writer_path = self.output_dir + if not os.path.exists(vdl_writer_path): + os.makedirs(vdl_writer_path) + self.vdl_writer = LogWriter(logdir=vdl_writer_path) + + # set device + assert self.config["Global"]["device"] in [ + "cpu", "gpu", "xpu", "npu", "mlu", "ascend", "intel_gpu", "mps" + ] + self.device = paddle.set_device(self.config["Global"]["device"]) + logger.info('train with paddle {} and device {}'.format( + paddle.__version__, self.device)) + + # gradient accumulation + self.update_freq = self.config["Global"].get("update_freq", 1) + + if "class_num" in config["Global"]: + global_class_num = config["Global"]["class_num"] + if "class_num" not in config["Arch"]: + config["Arch"]["class_num"] = global_class_num + msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}." + else: + msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored." + logger.warning(msg) + #TODO(gaotingquan): support rec + class_num = config["Arch"].get("class_num", None) + self.config["DataLoader"].update({"class_num": class_num}) + self.config["DataLoader"].update({ + "epochs": self.config["Global"]["epochs"] + }) + + # build dataloader + #################### + if self.mode == 'train' or self.mode == "eval": + #################### + self.train_dataloader = build_dataloader( + self.config["DataLoader"], "Train", self.device, self.use_dali) + if self.config["DataLoader"].get('UnLabelTrain', None) is not None: + self.unlabel_train_dataloader = build_dataloader( + self.config["DataLoader"], "UnLabelTrain", self.device, + self.use_dali) + else: + self.unlabel_train_dataloader = None + + self.iter_per_epoch = len( + self.train_dataloader) - 1 if platform.system( + ) == "Windows" else len(self.train_dataloader) + if self.config["Global"].get("iter_per_epoch", None): + # set max iteration per epoch mannualy, when training by iteration(s), such as XBM, FixMatch. 
+                self.iter_per_epoch = self.config["Global"].get(
+                    "iter_per_epoch")
+            if self.iter_per_epoch < self.update_freq:
+                logger.warning(
+                    "The argument Global.update_freq is greater than iter_per_epoch, so it has been reset to 1. This may be caused by having too few batches."
+                )
+                self.update_freq = 1
+            self.iter_per_epoch = self.iter_per_epoch // self.update_freq * self.update_freq
+
+        if self.mode == "eval" or (self.mode == "train" and
+                                   self.config["Global"]["eval_during_train"]):
+            if self.eval_mode in ["classification", "adaface"]:
+                self.eval_dataloader = build_dataloader(
+                    self.config["DataLoader"], "Eval", self.device,
+                    self.use_dali)
+            elif self.eval_mode == "retrieval":
+                self.gallery_query_dataloader = None
+                if len(self.config["DataLoader"]["Eval"].keys()) == 1:
+                    key = list(self.config["DataLoader"]["Eval"].keys())[0]
+                    self.gallery_query_dataloader = build_dataloader(
+                        self.config["DataLoader"]["Eval"], key, self.device,
+                        self.use_dali)
+                else:
+                    self.gallery_dataloader = build_dataloader(
+                        self.config["DataLoader"]["Eval"], "Gallery",
+                        self.device, self.use_dali)
+                    self.query_dataloader = build_dataloader(
+                        self.config["DataLoader"]["Eval"], "Query", self.device,
+                        self.use_dali)
+
+        # build loss
+        if self.mode == "train":
+            label_loss_info = self.config["Loss"]["Train"]
+            self.train_loss_func = build_loss(label_loss_info)
+            unlabel_loss_info = self.config.get("UnLabelLoss", {}).get("Train",
+                                                                       None)
+            self.unlabel_train_loss_func = build_loss(unlabel_loss_info)
+        if self.mode == "eval" or (self.mode == "train" and
+                                   self.config["Global"]["eval_during_train"]):
+            loss_config = self.config.get("Loss", None)
+            if loss_config is not None:
+                loss_config = loss_config.get("Eval")
+                if loss_config is not None:
+                    self.eval_loss_func = build_loss(loss_config)
+                else:
+                    self.eval_loss_func = None
+            else:
+                self.eval_loss_func = None
+
+        # build metric
+        if self.mode == 'train' and "Metric" in self.config and "Train" in self.config[
+                "Metric"] and self.config["Metric"]["Train"]:
+            metric_config = self.config["Metric"]["Train"]
+            if hasattr(self.train_dataloader, "collate_fn"
+                       ) and self.train_dataloader.collate_fn is not None:
+                for m_idx, m in enumerate(metric_config):
+                    if "TopkAcc" in m:
+                        msg = f"Unable to calculate accuracy when using \"batch_transform_ops\". The metric \"{m}\" has been removed."
+                        logger.warning(msg)
+                        metric_config.pop(m_idx)
+            self.train_metric_func = build_metrics(metric_config)
+        else:
+            self.train_metric_func = None
+
+        if self.mode == "eval" or (self.mode == "train" and
+                                   self.config["Global"]["eval_during_train"]):
+            if self.eval_mode == "classification":
+                if "Metric" in self.config and "Eval" in self.config["Metric"]:
+                    self.eval_metric_func = build_metrics(self.config["Metric"][
+                        "Eval"])
+                else:
+                    self.eval_metric_func = None
+            elif self.eval_mode == "retrieval":
+                if "Metric" in self.config and "Eval" in self.config["Metric"]:
+                    metric_config = self.config["Metric"]["Eval"]
+                else:
+                    metric_config = [{"name": "Recallk", "topk": (1, 5)}]
+                self.eval_metric_func = build_metrics(metric_config)
+        else:
+            self.eval_metric_func = None
+
+        # build model
+        self.model = build_model(self.config, self.mode)
+
+        # set @to_static for benchmark, skip this by default.
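+        # a minimal sketch of the config switch this expects (assumption:
+        # apply_to_static reads the Global-level key used elsewhere in
+        # PaddleClas configs):
+        #   Global:
+        #     to_static: True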
+        apply_to_static(self.config, self.model, is_rec=self.is_rec)
+
+        # load_pretrain
+        if self.config["Global"]["pretrained_model"] is not None:
+            load_dygraph_pretrain(
+                [self.model, getattr(self, 'train_loss_func', None)],
+                self.config["Global"]["pretrained_model"])
+
+        # build optimizer
+        if self.mode == 'train':
+            self.optimizer, self.lr_sch = build_optimizer(
+                self.config["Optimizer"], self.config["Global"]["epochs"],
+                self.iter_per_epoch // self.update_freq,
+                [self.model, self.train_loss_func])
+        # amp
+        self._init_amp()
+
+        # build EMA model
+        self.ema = "EMA" in self.config and self.mode == "train"
+        if self.ema:
+            self.model_ema = ExponentialMovingAverage(
+                self.model, self.config['EMA'].get("decay", 0.9999))
+
+        # check the gpu num
+        world_size = dist.get_world_size()
+        self.config["Global"]["distributed"] = world_size != 1
+        if self.mode == "train":
+            std_gpu_num = 8 if isinstance(
+                self.config["Optimizer"],
+                dict) and self.config["Optimizer"]["name"] == "AdamW" else 4
+            if world_size != std_gpu_num:
+                msg = f"The training strategy provided by PaddleClas is based on {std_gpu_num} gpus, but the current training uses {world_size} gpus. Please adjust the strategy (learning rate, batch size and so on) if you use this config to train."
+                logger.warning(msg)
+
+        # for distributed
+        if self.config["Global"]["distributed"]:
+            dist.init_parallel_env()
+            self.model = paddle.DataParallel(self.model)
+            if self.mode == 'train' and len(self.train_loss_func.parameters(
+            )) > 0:
+                self.train_loss_func = paddle.DataParallel(self.train_loss_func)
+
+            # set different seed in different GPU manually in distributed environment
+            if seed is None:
+                logger.warning(
+                    "The random seed cannot be None in a distributed environment. Global.seed has been set to 42 by default"
+                )
+                self.config["Global"]["seed"] = seed = 42
+            logger.info(
+                f"Set random seed to ({int(seed)} + $PADDLE_TRAINER_ID) for different trainer"
+            )
+            paddle.seed(int(seed) + dist.get_rank())
+            np.random.seed(int(seed) + dist.get_rank())
+            random.seed(int(seed) + dist.get_rank())
+
+        # build postprocess for infer
+        if self.mode == 'infer':
+            self.preprocess_func = create_operators(self.config["Infer"][
+                "transforms"])
+            self.postprocess_func = build_postprocess(self.config["Infer"][
+                "PostProcess"])
+
+        ############ quantization
+        self.qm = build_quantization_manager(self.config["qconfig"])
+        self.model = self.qm.prepare(self.model)
+        # calibration: run a few forward passes in calibration mode so that
+        # activation statistics can be collected before the quantizers are
+        # initialized
+        self.model.train()
+        self.model.calibrating(True)
+        for i, (images, target) in enumerate(self.train_dataloader):
+            # compute output
+            output = self.model(images)
+            if i >= self.config["qconfig"]["calibration_config"]["steps"]:
+                break
+
+        self.model.calibrating(False)
+
+        self.qm.initialize_quantizer(self.model)
+
+    def train(self):
+        assert self.mode == "train"
+        print_batch_step = self.config['Global']['print_batch_step']
+        save_interval = self.config["Global"]["save_interval"]
+        best_metric = {
+            "metric": -1.0,
+            "epoch": 0,
+        }
+        acc_ema = -1.0
+        best_metric_ema = -1.0
+        ema_module = None
+        if self.ema:
+            ema_module = self.model_ema.module
+        # key:
+        # val: metrics list word
+        self.output_info = dict()
+        self.time_info = {
+            "batch_cost": AverageMeter(
+                "batch_cost", '.5f', postfix=" s,"),
+            "reader_cost": AverageMeter(
+                "reader_cost", ".5f", postfix=" s,"),
+        }
+        # global iter counter
+        self.global_step = 0
+        uniform_output_enabled = self.config['Global'].get(
+            "uniform_output_enabled", False)
+
+        if self.config.Global.checkpoints is not None:
+            metric_info = 
init_model(self.config.Global, self.model, + self.optimizer, self.train_loss_func, + ema_module) + if metric_info is not None: + best_metric.update(metric_info) + if hasattr(self.train_dataloader.batch_sampler, "set_epoch"): + self.train_dataloader.batch_sampler.set_epoch(best_metric[ + "epoch"]) + + for epoch_id in range(best_metric["epoch"] + 1, + self.config["Global"]["epochs"] + 1): + acc = 0.0 + # for one epoch train + self.train_epoch_func(self, epoch_id, print_batch_step) + + if self.use_dali: + self.train_dataloader.reset() + metric_msg = ", ".join( + [self.output_info[key].avg_info for key in self.output_info]) + logger.info("[Train][Epoch {}/{}][Avg]{}".format( + epoch_id, self.config["Global"]["epochs"], metric_msg)) + self.output_info.clear() + + # eval model and save model if possible + start_eval_epoch = self.config["Global"].get("start_eval_epoch", + 0) - 1 + if self.config["Global"][ + "eval_during_train"] and epoch_id % self.config["Global"][ + "eval_interval"] == 0 and epoch_id > start_eval_epoch: + acc = self.eval(epoch_id) + + # step lr (by epoch) according to given metric, such as acc + for i in range(len(self.lr_sch)): + if getattr(self.lr_sch[i], "by_epoch", False) and \ + type_name(self.lr_sch[i]) == "ReduceOnPlateau": + self.lr_sch[i].step(acc) + + # update best_metric + if acc >= best_metric["metric"]: + best_metric["metric"] = acc + best_metric["epoch"] = epoch_id + logger.info("[Eval][Epoch {}][best metric: {}]".format( + epoch_id, best_metric["metric"])) + logger.scaler( + name="eval_acc", + value=acc, + step=epoch_id, + writer=self.vdl_writer) + + if self.ema: + ori_model, self.model = self.model, ema_module + acc_ema = self.eval(epoch_id) + self.model = ori_model + ema_module.eval() + + # update best_ema + if acc_ema > best_metric_ema: + best_metric_ema = acc_ema + logger.info("[Eval][Epoch {}][best metric ema: {}]".format( + epoch_id, best_metric_ema)) + logger.scaler( + name="eval_acc_ema", + value=acc_ema, + step=epoch_id, + writer=self.vdl_writer) + + # save best model from best_acc or best_ema_acc + if max(acc, acc_ema) >= max(best_metric["metric"], + best_metric_ema): + metric_info = { + "metric": max(acc, acc_ema), + "epoch": epoch_id + } + prefix = "best_model" + save_load.save_model( + self.model, + self.optimizer, + metric_info, + os.path.join(self.output_dir, prefix) + if uniform_output_enabled else self.output_dir, + ema=ema_module, + model_name=self.config["Arch"]["name"], + prefix=prefix, + loss=self.train_loss_func, + save_student_model=True) + if uniform_output_enabled: + save_path = os.path.join(self.output_dir, prefix, + "inference") + self.export(save_path, uniform_output_enabled) + if self.ema: + ema_save_path = os.path.join( + self.output_dir, prefix, "inference_ema") + self.export(ema_save_path, uniform_output_enabled) + update_train_results( + self.config, prefix, metric_info, ema=self.ema) + save_load.save_model_info(metric_info, self.output_dir, + prefix) + + self.model.train() + + # save model + if save_interval > 0 and epoch_id % save_interval == 0: + metric_info = {"metric": acc, "epoch": epoch_id} + prefix = "epoch_{}".format(epoch_id) + save_load.save_model( + self.model, + self.optimizer, + metric_info, + os.path.join(self.output_dir, prefix) + if uniform_output_enabled else self.output_dir, + ema=ema_module, + model_name=self.config["Arch"]["name"], + prefix=prefix, + loss=self.train_loss_func) + if uniform_output_enabled: + save_path = os.path.join(self.output_dir, prefix, + "inference") + self.export(save_path, 
uniform_output_enabled) + if self.ema: + ema_save_path = os.path.join(self.output_dir, prefix, + "inference_ema") + self.export(ema_save_path, uniform_output_enabled) + update_train_results( + self.config, + prefix, + metric_info, + done_flag=epoch_id == self.config["Global"]["epochs"], + ema=self.ema) + save_load.save_model_info(metric_info, self.output_dir, + prefix) + # save the latest model + metric_info = {"metric": acc, "epoch": epoch_id} + prefix = "latest" + save_load.save_model( + self.model, + self.optimizer, + metric_info, + os.path.join(self.output_dir, prefix) + if uniform_output_enabled else self.output_dir, + ema=ema_module, + model_name=self.config["Arch"]["name"], + prefix=prefix, + loss=self.train_loss_func) + if uniform_output_enabled: + save_path = os.path.join(self.output_dir, prefix, "inference") + self.export(save_path, uniform_output_enabled) + if self.ema: + ema_save_path = os.path.join(self.output_dir, prefix, + "inference_ema") + self.export(ema_save_path, uniform_output_enabled) + save_load.save_model_info(metric_info, self.output_dir, prefix) + self.model.train() + + if self.vdl_writer is not None: + self.vdl_writer.close() + + @paddle.no_grad() + def eval(self, epoch_id=0): + assert self.mode in ["train", "eval"] + self.model.eval() + eval_result = self.eval_func(self, epoch_id) + self.model.train() + + return eval_result + + @paddle.no_grad() + def infer(self): + assert self.mode == "infer" and self.eval_mode == "classification" + results = [] + total_trainer = dist.get_world_size() + local_rank = dist.get_rank() + infer_imgs = self.config["Infer"]["infer_imgs"] + infer_list = self.config["Infer"].get("infer_list", None) + image_list = get_image_list(infer_imgs, infer_list=infer_list) + # data split + image_list = image_list[local_rank::total_trainer] + + batch_size = self.config["Infer"]["batch_size"] + self.model.eval() + batch_data = [] + image_file_list = [] + save_path = self.config["Infer"].get("save_dir", None) + for idx, image_file in enumerate(image_list): + with open(image_file, 'rb') as f: + x = f.read() + try: + for process in self.preprocess_func: + x = process(x) + batch_data.append(x) + image_file_list.append(image_file) + if len(batch_data) >= batch_size or idx == len(image_list) - 1: + batch_tensor = paddle.to_tensor(batch_data) + + with self.auto_cast(is_eval=True): + out = self.model(batch_tensor) + + if isinstance(out, list): + out = out[0] + if isinstance(out, dict) and "Student" in out: + out = out["Student"] + if isinstance(out, dict) and "logits" in out: + out = out["logits"] + if isinstance(out, dict) and "output" in out: + out = out["output"] + + result = self.postprocess_func(out, image_file_list) + if not save_path: + logger.info(result) + results.extend(result) + batch_data.clear() + image_file_list.clear() + except Exception as ex: + logger.error( + "Exception occured when parse line: {} with msg: {}".format( + image_file, ex)) + continue + if save_path: + save_predict_result(save_path, results) + return results + + def export(self, + save_path=None, + uniform_output_enabled=False, + ema_module=None): + assert self.mode == "export" or uniform_output_enabled + if paddle.distributed.get_rank() != 0: + return + use_multilabel = self.config["Global"].get( + "use_multilabel", + False) or "ATTRMetric" in self.config["Metric"]["Eval"][0] + model = self.model_ema.module if self.ema else self.model + if hasattr(model, '_layers'): + model = copy.deepcopy(model._layers) + else: + model = copy.deepcopy(model) + model = 
ExportModel(self.config["Arch"], model + if not ema_module else ema_module, use_multilabel) + if self.config["Global"][ + "pretrained_model"] is not None and not uniform_output_enabled: + load_dygraph_pretrain(model.base_model, + self.config["Global"]["pretrained_model"]) + model.eval() + # for re-parameterization nets + for layer in model.sublayers(): + if hasattr(layer, "re_parameterize") and not getattr(layer, + "is_repped"): + layer.re_parameterize() + if not save_path: + save_path = os.path.join( + self.config["Global"]["save_inference_dir"], "inference") + else: + save_path = os.path.join(save_path, "inference") + + model = paddle.jit.to_static( + model, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + self.config["Global"]["image_shape"], + dtype='float32') + ]) + if hasattr(model.base_model, + "quanter") and model.base_model.quanter is not None: + model.base_model.quanter.save_quantized_model(model, + save_path + "_int8") + else: + paddle.jit.save(model, save_path) + if self.config["Global"].get("export_for_fd", + False) or uniform_output_enabled: + dst_path = os.path.join(os.path.dirname(save_path), 'inference.yml') + dump_infer_config(self.config, dst_path) + logger.info( + f"Export succeeded! The inference model exported has been saved in \"{save_path}\"." + ) + + def _init_amp(self): + if self.mode == "export": + return + + amp_config = self.config.get("AMP", None) + use_amp = True if amp_config and amp_config.get("use_amp", + True) else False + + if not use_amp: + self.auto_cast = AutoCast(use_amp) + self.scaler = build_scaler(use_amp) + else: + AMP_RELATED_FLAGS_SETTING = {'FLAGS_max_inplace_grad_add': 8, } + if paddle.is_compiled_with_cuda(): + AMP_RELATED_FLAGS_SETTING.update({ + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1 + }) + paddle.set_flags(AMP_RELATED_FLAGS_SETTING) + + use_promote = amp_config.get("use_promote", False) + amp_level = amp_config.get("level", "O1") + if amp_level not in ["O1", "O2"]: + msg = "[Parameter Error]: The optimize level of AMP only support 'O1' and 'O2'. The level has been set 'O1'." + logger.warning(msg) + amp_level = amp_config["level"] = "O1" + + amp_eval = self.config["AMP"].get("use_fp16_test", False) + # TODO(gaotingquan): Paddle not yet support FP32 evaluation when training with AMPO2 + if self.mode == "train" and self.config["Global"].get( + "eval_during_train", + True) and amp_level == "O2" and amp_eval == False: + msg = "PaddlePaddle only support FP16 evaluation when training with AMP O2 now. 
" + logger.warning(msg) + self.config["AMP"]["use_fp16_test"] = True + amp_eval = True + + self.auto_cast = AutoCast( + use_amp, + amp_level=amp_level, + use_promote=use_promote, + amp_eval=amp_eval) + + scale_loss = amp_config.get("scale_loss", 1.0) + use_dynamic_loss_scaling = amp_config.get( + "use_dynamic_loss_scaling", False) + self.scaler = build_scaler( + use_amp, + scale_loss=scale_loss, + use_dynamic_loss_scaling=use_dynamic_loss_scaling) + + if self.mode == "train": + self.model, self.optimizer = paddle.amp.decorate( + models=self.model, + optimizers=self.optimizer, + level=amp_level, + save_dtype='float32') + elif amp_eval: + self.model = paddle.amp.decorate( + models=self.model, level=amp_level, save_dtype='float32') + + if self.mode == "train" and len(self.train_loss_func.parameters( + )) > 0: + self.train_loss_func = paddle.amp.decorate( + models=self.train_loss_func, + level=self.amp_level, + save_dtype='float32') + diff --git a/example/low_precision_training/ppcls/engine/evaluation/__init__.py b/example/low_precision_training/ppcls/engine/evaluation/__init__.py new file mode 100755 index 000000000..a301ad7fd --- /dev/null +++ b/example/low_precision_training/ppcls/engine/evaluation/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ppcls.engine.evaluation.classification import classification_eval +from ppcls.engine.evaluation.retrieval import retrieval_eval +from ppcls.engine.evaluation.adaface import adaface_eval \ No newline at end of file diff --git a/example/low_precision_training/ppcls/engine/evaluation/adaface.py b/example/low_precision_training/ppcls/engine/evaluation/adaface.py new file mode 100755 index 000000000..e62144b5c --- /dev/null +++ b/example/low_precision_training/ppcls/engine/evaluation/adaface.py @@ -0,0 +1,260 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import time +import numpy as np +import platform +import paddle +import sklearn +from sklearn.model_selection import KFold +from sklearn.decomposition import PCA + +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger + + +def fuse_features_with_norm(stacked_embeddings, stacked_norms): + assert stacked_embeddings.ndim == 3 # (n_features_to_fuse, batch_size, channel) + assert stacked_norms.ndim == 3 # (n_features_to_fuse, batch_size, 1) + pre_norm_embeddings = stacked_embeddings * stacked_norms + fused = pre_norm_embeddings.sum(axis=0) + norm = paddle.norm(fused, 2, 1, True) + fused = paddle.divide(fused, norm) + return fused, norm + + +def adaface_eval(engine, epoch_id=0): + output_info = dict() + time_info = { + "batch_cost": AverageMeter( + "batch_cost", '.5f', postfix=" s,"), + "reader_cost": AverageMeter( + "reader_cost", ".5f", postfix=" s,"), + } + print_batch_step = engine.config["Global"]["print_batch_step"] + + metric_key = None + tic = time.time() + unique_dict = {} + for iter_id, batch in enumerate(engine.eval_dataloader): + images, labels, dataname, image_index = batch + if iter_id == 5: + for key in time_info: + time_info[key].reset() + time_info["reader_cost"].update(time.time() - tic) + batch_size = images.shape[0] + batch[0] = paddle.to_tensor(images) + embeddings = engine.model(images, labels)['features'] + norms = paddle.divide(embeddings, paddle.norm(embeddings, 2, 1, True)) + embeddings = paddle.divide(embeddings, norms) + fliped_images = paddle.flip(images, axis=[3]) + flipped_embeddings = engine.model(fliped_images, labels)['features'] + flipped_norms = paddle.divide( + flipped_embeddings, paddle.norm(flipped_embeddings, 2, 1, True)) + flipped_embeddings = paddle.divide(flipped_embeddings, flipped_norms) + stacked_embeddings = paddle.stack( + [embeddings, flipped_embeddings], axis=0) + stacked_norms = paddle.stack([norms, flipped_norms], axis=0) + embeddings, norms = fuse_features_with_norm(stacked_embeddings, + stacked_norms) + + for out, nor, label, data, idx in zip(embeddings, norms, labels, + dataname, image_index): + unique_dict[int(idx.numpy())] = { + 'output': out, + 'norm': nor, + 'target': label, + 'dataname': data + } + # calc metric + time_info["batch_cost"].update(time.time() - tic) + if iter_id % print_batch_step == 0: + time_msg = "s, ".join([ + "{}: {:.5f}".format(key, time_info[key].avg) + for key in time_info + ]) + + ips_msg = "ips: {:.5f} images/sec".format( + batch_size / time_info["batch_cost"].avg) + + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].val) + for key in output_info + ]) + logger.info("[Eval][Epoch {}][Iter: {}/{}]{}, {}, {}".format( + epoch_id, iter_id, + len(engine.eval_dataloader), metric_msg, time_msg, ips_msg)) + + tic = time.time() + + unique_keys = sorted(unique_dict.keys()) + all_output_tensor = paddle.stack( + [unique_dict[key]['output'] for key in unique_keys], axis=0) + all_norm_tensor = paddle.stack( + [unique_dict[key]['norm'] for key in unique_keys], axis=0) + all_target_tensor = paddle.stack( + [unique_dict[key]['target'] for key in unique_keys], axis=0) + all_dataname_tensor = paddle.stack( + [unique_dict[key]['dataname'] for key in unique_keys], axis=0) + + eval_result = cal_metric(all_output_tensor, all_norm_tensor, + all_target_tensor, all_dataname_tensor) + + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].avg) for key in output_info + ]) + 
face_msg = ", ".join([ + "{}: {:.5f}".format(key, eval_result[key]) + for key in eval_result.keys() + ]) + logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg + ", " + + face_msg)) + + # return 1st metric in the dict + return eval_result['all_test_acc'] + + +def cal_metric(all_output_tensor, all_norm_tensor, all_target_tensor, + all_dataname_tensor): + all_target_tensor = all_target_tensor.reshape([-1]) + all_dataname_tensor = all_dataname_tensor.reshape([-1]) + dataname_to_idx = { + "agedb_30": 0, + "cfp_fp": 1, + "lfw": 2, + "cplfw": 3, + "calfw": 4 + } + idx_to_dataname = {val: key for key, val in dataname_to_idx.items()} + test_logs = {} + # _, indices = paddle.unique(all_dataname_tensor, return_index=True, return_inverse=False, return_counts=False) + for dataname_idx in all_dataname_tensor.unique(): + dataname = idx_to_dataname[dataname_idx.item()] + # per dataset evaluation + embeddings = all_output_tensor[all_dataname_tensor == + dataname_idx].numpy() + labels = all_target_tensor[all_dataname_tensor == dataname_idx].numpy() + issame = labels[0::2] + tpr, fpr, accuracy, best_thresholds = evaluate_face( + embeddings, issame, nrof_folds=10) + acc, best_threshold = accuracy.mean(), best_thresholds.mean() + + num_test_samples = len(embeddings) + test_logs[f'{dataname}_test_acc'] = acc + test_logs[f'{dataname}_test_best_threshold'] = best_threshold + test_logs[f'{dataname}_num_test_samples'] = num_test_samples + + test_acc = np.mean([ + test_logs[f'{dataname}_test_acc'] + for dataname in dataname_to_idx.keys() + if f'{dataname}_test_acc' in test_logs + ]) + + test_logs['all_test_acc'] = test_acc + return test_logs + + +def evaluate_face(embeddings, actual_issame, nrof_folds=10, pca=0): + # Calculate evaluation metrics + thresholds = np.arange(0, 4, 0.01) + embeddings1 = embeddings[0::2] + embeddings2 = embeddings[1::2] + tpr, fpr, accuracy, best_thresholds = calculate_roc( + thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + nrof_folds=nrof_folds, + pca=pca) + return tpr, fpr, accuracy, best_thresholds + + +def calculate_roc(thresholds, + embeddings1, + embeddings2, + actual_issame, + nrof_folds=10, + pca=0): + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = KFold(n_splits=nrof_folds, shuffle=False) + + tprs = np.zeros((nrof_folds, nrof_thresholds)) + fprs = np.zeros((nrof_folds, nrof_thresholds)) + accuracy = np.zeros((nrof_folds)) + best_thresholds = np.zeros((nrof_folds)) + indices = np.arange(nrof_pairs) + # print('pca', pca) + dist = None + + if pca == 0: + diff = np.subtract(embeddings1, embeddings2) + dist = np.sum(np.square(diff), 1) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + # print('train_set', train_set) + # print('test_set', test_set) + if pca > 0: + print('doing pca on', fold_idx) + embed1_train = embeddings1[train_set] + embed2_train = embeddings2[train_set] + _embed_train = np.concatenate((embed1_train, embed2_train), axis=0) + # print(_embed_train.shape) + pca_model = PCA(n_components=pca) + pca_model.fit(_embed_train) + embed1 = pca_model.transform(embeddings1) + embed2 = pca_model.transform(embeddings2) + embed1 = sklearn.preprocessing.normalize(embed1) + embed2 = sklearn.preprocessing.normalize(embed2) + # print(embed1.shape, embed2.shape) + diff = np.subtract(embed1, embed2) + dist = np.sum(np.square(diff), 1) + + # Find the best 
threshold for the fold + acc_train = np.zeros((nrof_thresholds)) + for threshold_idx, threshold in enumerate(thresholds): + _, _, acc_train[threshold_idx] = calculate_accuracy( + threshold, dist[train_set], actual_issame[train_set]) + best_threshold_index = np.argmax(acc_train) + best_thresholds[fold_idx] = thresholds[best_threshold_index] + for threshold_idx, threshold in enumerate(thresholds): + tprs[fold_idx, threshold_idx], fprs[ + fold_idx, threshold_idx], _ = calculate_accuracy( + threshold, dist[test_set], actual_issame[test_set]) + _, _, accuracy[fold_idx] = calculate_accuracy( + thresholds[best_threshold_index], dist[test_set], + actual_issame[test_set]) + + tpr = np.mean(tprs, 0) + fpr = np.mean(fprs, 0) + return tpr, fpr, accuracy, best_thresholds + + +def calculate_accuracy(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + tp = np.sum(np.logical_and(predict_issame, actual_issame)) + fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) + tn = np.sum( + np.logical_and( + np.logical_not(predict_issame), np.logical_not(actual_issame))) + fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame)) + + tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn) + fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn) + acc = float(tp + tn) / dist.size + return tpr, fpr, acc diff --git a/example/low_precision_training/ppcls/engine/evaluation/classification.py b/example/low_precision_training/ppcls/engine/evaluation/classification.py new file mode 100755 index 000000000..d802eeeda --- /dev/null +++ b/example/low_precision_training/ppcls/engine/evaluation/classification.py @@ -0,0 +1,175 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import time +import platform +import paddle + +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger + + +def classification_eval(engine, epoch_id=0): + if hasattr(engine.eval_metric_func, "reset"): + engine.eval_metric_func.reset() + output_info = dict() + time_info = { + "batch_cost": AverageMeter( + "batch_cost", '.5f', postfix=" s,"), + "reader_cost": AverageMeter( + "reader_cost", ".5f", postfix=" s,"), + } + print_batch_step = engine.config["Global"]["print_batch_step"] + + tic = time.time() + accum_samples = 0 + total_samples = len( + engine.eval_dataloader. 
+ dataset) if not engine.use_dali else engine.eval_dataloader.size + max_iter = len(engine.eval_dataloader) - 1 if platform.system( + ) == "Windows" else len(engine.eval_dataloader) + for iter_id, batch in enumerate(engine.eval_dataloader): + if iter_id >= max_iter: + break + if iter_id == 5: + for key in time_info: + time_info[key].reset() + + time_info["reader_cost"].update(time.time() - tic) + batch_size = batch[0].shape[0] + batch[0] = paddle.to_tensor(batch[0]) + if not engine.config["Global"].get("use_multilabel", False): + batch[1] = batch[1].reshape([-1, 1]).astype("int64") + + # image input + with engine.auto_cast(is_eval=True): + if engine.is_rec: + out = engine.model(batch[0], batch[1]) + else: + out = engine.model(batch[0]) + + # just for DistributedBatchSampler issue: repeat sampling + current_samples = batch_size * paddle.distributed.get_world_size() + accum_samples += current_samples + + if isinstance(out, dict) and "Student" in out: + out = out["Student"] + if isinstance(out, dict) and "logits" in out: + out = out["logits"] + + # gather Tensor when distributed + if paddle.distributed.get_world_size() > 1: + label_list = [] + device_id = paddle.distributed.ParallelEnv().device_id + label = batch[1].cuda(device_id) if engine.config["Global"][ + "device"] == "gpu" else batch[1] + paddle.distributed.all_gather(label_list, label) + labels = paddle.concat(label_list, 0) + + if isinstance(out, list): + preds = [] + for x in out: + pred_list = [] + paddle.distributed.all_gather(pred_list, x) + pred_x = paddle.concat(pred_list, 0) + preds.append(pred_x) + else: + pred_list = [] + paddle.distributed.all_gather(pred_list, out) + preds = paddle.concat(pred_list, 0) + + if accum_samples > total_samples and not engine.use_dali: + if isinstance(preds, list): + preds = [ + pred[:total_samples + current_samples - accum_samples] + for pred in preds + ] + else: + preds = preds[:total_samples + current_samples - + accum_samples] + labels = labels[:total_samples + current_samples - + accum_samples] + current_samples = total_samples + current_samples - accum_samples + else: + labels = batch[1] + preds = out + + # calc loss + if engine.eval_loss_func is not None: + with engine.auto_cast(is_eval=True): + loss_dict = engine.eval_loss_func(preds, labels) + + for key in loss_dict: + if key not in output_info: + output_info[key] = AverageMeter(key, '7.5f') + output_info[key].update(float(loss_dict[key]), current_samples) + + # calc metric + if engine.eval_metric_func is not None: + engine.eval_metric_func(preds, labels) + time_info["batch_cost"].update(time.time() - tic) + + if iter_id % print_batch_step == 0: + time_msg = "s, ".join([ + "{}: {:.5f}".format(key, time_info[key].avg) + for key in time_info + ]) + + ips_msg = "ips: {:.5f} images/sec".format( + batch_size / time_info["batch_cost"].avg) + + if "ATTRMetric" in engine.config["Metric"]["Eval"][0]: + metric_msg = "" + else: + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].val) + for key in output_info + ]) + if "MultiLabelMAP" not in engine.config["Metric"]["Eval"][0]: + metric_msg += ", {}".format(engine.eval_metric_func.avg_info) + logger.info("[Eval][Epoch {}][Iter: {}/{}]{}, {}, {}".format( + epoch_id, iter_id, + len(engine.eval_dataloader), metric_msg, time_msg, ips_msg)) + + tic = time.time() + if engine.use_dali: + engine.eval_dataloader.reset() + + if "ATTRMetric" in engine.config["Metric"]["Eval"][0]: + metric_msg = ", ".join([ + "evalres: ma: {:.5f} label_f1: {:.5f} label_pos_recall: {:.5f} label_neg_recall: 
{:.5f} instance_f1: {:.5f} instance_acc: {:.5f} instance_prec: {:.5f} instance_recall: {:.5f}". + format(*engine.eval_metric_func.attr_res()) + ]) + logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg)) + + # do not try to save best eval.model + if engine.eval_metric_func is None: + return -1 + # return 1st metric in the dict + return engine.eval_metric_func.attr_res()[0] + else: + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].avg) + for key in output_info + ]) + metric_msg += ", {}".format(engine.eval_metric_func.avg_info) + logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg)) + + # do not try to save best eval.model + if engine.eval_metric_func is None: + return -1 + # return 1st metric in the dict + return engine.eval_metric_func.avg diff --git a/example/low_precision_training/ppcls/engine/evaluation/retrieval.py b/example/low_precision_training/ppcls/engine/evaluation/retrieval.py new file mode 100755 index 000000000..53f744f1a --- /dev/null +++ b/example/low_precision_training/ppcls/engine/evaluation/retrieval.py @@ -0,0 +1,327 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import defaultdict + +import numpy as np +import paddle +import scipy + +from ppcls.utils import all_gather, logger + + +def retrieval_eval(engine, epoch_id=0): + engine.model.eval() + # step1. prepare query and gallery features + if engine.gallery_query_dataloader is not None: + gallery_feat, gallery_label, gallery_camera = compute_feature( + engine, "gallery_query") + query_feat, query_label, query_camera = gallery_feat, gallery_label, gallery_camera + else: + gallery_feat, gallery_label, gallery_camera = compute_feature( + engine, "gallery") + query_feat, query_label, query_camera = compute_feature(engine, + "query") + + # step2. split features into feature blocks for saving memory + num_query = len(query_feat) + block_size = engine.config["Global"].get("sim_block_size", 64) + sections = [block_size] * (num_query // block_size) + if num_query % block_size > 0: + sections.append(num_query % block_size) + + query_feat_blocks = paddle.split(query_feat, sections) + query_label_blocks = paddle.split(query_label, sections) + query_camera_blocks = paddle.split( + query_camera, sections) if query_camera is not None else None + metric_key = None + + # step3. 
compute metric + if engine.eval_loss_func is None: + metric_dict = {metric_key: 0.0} + else: + use_reranking = engine.config["Global"].get("re_ranking", False) + logger.info(f"re_ranking={use_reranking}") + if use_reranking: + # compute distance matrix + distmat = compute_re_ranking_dist( + query_feat, gallery_feat, engine.config["Global"].get( + "feature_normalize", True), 20, 6, 0.3) + # exclude illegal distance + if query_camera is not None: + camera_mask = query_camera != gallery_camera.t() + label_mask = query_label != gallery_label.t() + keep_mask = label_mask | camera_mask + distmat = keep_mask.astype(query_feat.dtype) * distmat + ( + ~keep_mask).astype(query_feat.dtype) * (distmat.max() + 1) + else: + keep_mask = None + # compute metric with all samples + metric_dict = engine.eval_metric_func(-distmat, query_label, + gallery_label, keep_mask) + else: + metric_dict = defaultdict(float) + for block_idx, block_feat in enumerate(query_feat_blocks): + # compute distance matrix + distmat = paddle.matmul( + block_feat, gallery_feat, transpose_y=True) + # exclude illegal distance + if query_camera is not None: + camera_mask = query_camera_blocks[ + block_idx] != gallery_camera.t() + label_mask = query_label_blocks[ + block_idx] != gallery_label.t() + keep_mask = label_mask | camera_mask + distmat = keep_mask.astype(query_feat.dtype) * distmat + else: + keep_mask = None + # compute metric by block + metric_block = engine.eval_metric_func( + distmat, query_label_blocks[block_idx], gallery_label, + keep_mask) + # accumulate metric + for key in metric_block: + metric_dict[key] += metric_block[key] * block_feat.shape[ + 0] / num_query + + metric_info_list = [] + for key, value in metric_dict.items(): + metric_info_list.append(f"{key}: {value:.5f}") + if metric_key is None: + metric_key = key + metric_msg = ", ".join(metric_info_list) + logger.info(f"[Eval][Epoch {epoch_id}][Avg]{metric_msg}") + + return metric_dict[metric_key] + + +def compute_feature(engine, name="gallery"): + if name == "gallery": + dataloader = engine.gallery_dataloader + elif name == "query": + dataloader = engine.query_dataloader + elif name == "gallery_query": + dataloader = engine.gallery_query_dataloader + else: + raise ValueError( + f"Only support gallery or query or gallery_query dataset, but got {name}" + ) + + all_feat = [] + all_label = [] + all_camera = [] + has_camera = False + for idx, batch in enumerate(dataloader): # load is very time-consuming + if idx % engine.config["Global"]["print_batch_step"] == 0: + logger.info( + f"{name} feature calculation process: [{idx}/{len(dataloader)}]" + ) + + batch = [paddle.to_tensor(x) for x in batch] + batch[1] = batch[1].reshape([-1, 1]).astype("int64") + if len(batch) >= 3: + has_camera = True + batch[2] = batch[2].reshape([-1, 1]).astype("int64") + with engine.auto_cast(is_eval=True): + if engine.is_rec: + out = engine.model(batch[0], batch[1]) + else: + out = engine.model(batch[0]) + if "Student" in out: + out = out["Student"] + + # get features + if engine.config["Global"].get("retrieval_feature_from", + "features") == "features": + # use output from neck as feature + batch_feat = out["features"] + else: + # use output from backbone as feature + batch_feat = out["backbone"] + + # do norm(optional) + if engine.config["Global"].get("feature_normalize", True): + batch_feat = paddle.nn.functional.normalize(batch_feat, p=2) + + # do binarize(optional) + if engine.config["Global"].get("feature_binarize") == "round": + batch_feat = paddle.round(batch_feat).astype("float32") * 
2.0 - 1.0
+        elif engine.config["Global"].get("feature_binarize") == "sign":
+            batch_feat = paddle.sign(batch_feat).astype("float32")
+
+        if paddle.distributed.get_world_size() > 1:
+            all_feat.append(all_gather(batch_feat))
+            all_label.append(all_gather(batch[1]))
+            if has_camera:
+                all_camera.append(all_gather(batch[2]))
+        else:
+            all_feat.append(batch_feat)
+            all_label.append(batch[1])
+            if has_camera:
+                all_camera.append(batch[2])
+
+    if engine.use_dali:
+        dataloader.reset()
+
+    all_feat = paddle.concat(all_feat)
+    all_label = paddle.concat(all_label)
+    if has_camera:
+        all_camera = paddle.concat(all_camera)
+    else:
+        all_camera = None
+    # discard redundant padding sample(s) at the end
+    total_samples = dataloader.size if engine.use_dali else len(
+        dataloader.dataset)
+    all_feat = all_feat[:total_samples]
+    all_label = all_label[:total_samples]
+    if has_camera:
+        all_camera = all_camera[:total_samples]
+
+    logger.info(f"Build {name} done, all feat shape: {all_feat.shape}")
+    return all_feat, all_label, all_camera
+
+
+def k_reciprocal_neighbor(rank: np.ndarray, p: int, k: int) -> np.ndarray:
+    """Implementation of k-reciprocal nearest neighbors, i.e. R(p, k)
+
+    Args:
+        rank (np.ndarray): Rank mat with shape of [N, N].
+        p (int): Probe index.
+        k (int): Parameter k for k-reciprocal nearest neighbors algorithm.
+
+    Returns:
+        np.ndarray: K-reciprocal nearest neighbors of probe p with shape of [M, ].
+    """
+    # use k+1 for excluding probe index itself
+    forward_k_neigh_index = rank[p, :k + 1]
+    backward_k_neigh_index = rank[forward_k_neigh_index, :k + 1]
+    candidate = np.where(backward_k_neigh_index == p)[0]
+    return forward_k_neigh_index[candidate]
+
+
+def compute_re_ranking_dist(query_feat: paddle.Tensor,
+                            gallery_feat: paddle.Tensor,
+                            feature_normed: bool=True,
+                            k1: int=20,
+                            k2: int=6,
+                            lamb: float=0.5) -> paddle.Tensor:
+    """
+    Re-ranking Person Re-identification with k-reciprocal Encoding
+    Reference: https://arxiv.org/abs/1701.08398
+    Code reference: https://github.com/michuanhaohao/reid-strong-baseline/blob/master/utils/re_ranking.py
+
+    Args:
+        query_feat (paddle.Tensor): Query features with shape of [num_query, feature_dim].
+        gallery_feat (paddle.Tensor): Gallery features with shape of [num_gallery, feature_dim].
+        feature_normed (bool, optional): Whether input features are normalized. Defaults to True.
+        k1 (int, optional): Parameter for K-reciprocal nearest neighbors. Defaults to 20.
+        k2 (int, optional): Parameter for K-nearest neighbors. Defaults to 6.
+        lamb (float, optional): Penalty factor. Defaults to 0.5.
+
+    Returns:
+        paddle.Tensor: (1 - lamb) x Dj + lamb x D, with shape of [num_query, num_gallery].
+ """ + num_query = query_feat.shape[0] + num_gallery = gallery_feat.shape[0] + num_all = num_query + num_gallery + feat = paddle.concat([query_feat, gallery_feat], 0) + logger.info("Using GPU to compute original distance matrix") + # use L2 distance + if feature_normed: + original_dist = 2 - 2 * paddle.matmul(feat, feat, transpose_y=True) + else: + original_dist = paddle.pow(feat, 2).sum(axis=1, keepdim=True).expand([num_all, num_all]) + \ + paddle.pow(feat, 2).sum(axis=1, keepdim=True).expand([num_all, num_all]).t() + original_dist = original_dist.addmm(feat, feat.t(), -2.0, 1.0) + original_dist = original_dist.numpy() + del feat + + original_dist = np.transpose(original_dist / np.max(original_dist, axis=0)) + V = np.zeros_like(original_dist).astype(np.float16) + initial_rank = np.argpartition(original_dist, range(1, k1 + 1)) + logger.info("Start re-ranking...") + + for p in range(num_all): + # compute R(p,k1) + p_k_reciprocal_ind = k_reciprocal_neighbor(initial_rank, p, k1) + + # compute R*(p,k1)=R(p,k1)∪R(q,k1/2) + # s.t. |R(p,k1)∩R(q,k1/2)|>=2/3|R(q,k1/2)|, ∀q∈R(p,k1) + p_k_reciprocal_exp_ind = p_k_reciprocal_ind + for _, q in enumerate(p_k_reciprocal_ind): + q_k_reciprocal_ind = k_reciprocal_neighbor(initial_rank, q, + int(np.around(k1 / 2))) + if len( + np.intersect1d( + p_k_reciprocal_ind, + q_k_reciprocal_ind, + assume_unique=True)) > 2 / 3 * len(q_k_reciprocal_ind): + p_k_reciprocal_exp_ind = np.append(p_k_reciprocal_exp_ind, + q_k_reciprocal_ind) + p_k_reciprocal_exp_ind = np.unique(p_k_reciprocal_exp_ind) + # reweight distance using gaussian kernel + weight = np.exp(-original_dist[p, p_k_reciprocal_exp_ind]) + V[p, p_k_reciprocal_exp_ind] = weight / np.sum(weight) + + # local query expansion + original_dist = original_dist[:num_query, ] + if k2 > 1: + try: + # use sparse tensor to speed up query expansion + indices = (np.repeat(np.arange(num_all), k2), + initial_rank[:, :k2].reshape([-1, ])) + values = np.array( + [1 / k2 for _ in range(num_all * k2)], dtype="float16") + V = scipy.sparse.coo_matrix( + (values, indices), V.shape, + dtype="float16") @V.astype("float16") + except Exception as e: + logger.info( + f"Failed to do local query expansion with sparse tensor for reason: \n{e}\n" + f"now use for-loop instead") + # use vanilla for-loop + V_qe = np.zeros_like(V, dtype=np.float16) + for i in range(num_all): + V_qe[i, :] = np.mean(V[initial_rank[i, :k2], :], axis=0) + V = V_qe + del V_qe + del initial_rank + + # cache k-reciprocal sets which contains gj + invIndex = [] + for gj in range(num_all): + invIndex.append(np.nonzero(V[:, gj])[0]) + + # compute jaccard distance + jaccard_dist = np.zeros_like(original_dist, dtype=np.float16) + for p in range(num_query): + sum_min = np.zeros(shape=[1, num_all], dtype=np.float16) + gj_ind = np.nonzero(V[p, :])[0] + gj_ind_inv = [invIndex[gj] for gj in gj_ind] + for j, gj in enumerate(gj_ind): + gi = gj_ind_inv[j] + sum_min[0, gi] += np.minimum(V[p, gj], V[gi, gj]) + jaccard_dist[p] = 1 - sum_min / (2 - sum_min) + + # fuse jaccard distance with original distance + final_dist = (1 - lamb) * jaccard_dist + lamb * original_dist + del original_dist + del V + del jaccard_dist + final_dist = final_dist[:num_query, num_query:] + final_dist = paddle.to_tensor(final_dist) + return final_dist diff --git a/example/low_precision_training/ppcls/engine/train/__init__.py b/example/low_precision_training/ppcls/engine/train/__init__.py new file mode 100755 index 000000000..50bf9037f --- /dev/null +++ 
b/example/low_precision_training/ppcls/engine/train/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ppcls.engine.train.train import train_epoch +from ppcls.engine.train.train_fixmatch import train_epoch_fixmatch +from ppcls.engine.train.train_fixmatch_ccssl import train_epoch_fixmatch_ccssl +from ppcls.engine.train.train_progressive import train_epoch_progressive +from ppcls.engine.train.train_metabin import train_epoch_metabin diff --git a/example/low_precision_training/ppcls/engine/train/train.py b/example/low_precision_training/ppcls/engine/train/train.py new file mode 100755 index 000000000..d21bef9ee --- /dev/null +++ b/example/low_precision_training/ppcls/engine/train/train.py @@ -0,0 +1,103 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
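+
+# The loop below couples AMP loss scaling with gradient accumulation: each
+# loss is divided by update_freq, and the optimizer/lr/EMA updates only fire
+# every update_freq iterations. A minimal sketch of the same pattern
+# (model/opt/loader names are illustrative, not part of this file):
+#
+#     scaler = paddle.amp.GradScaler()
+#     for it, (img, label) in enumerate(loader):
+#         with paddle.amp.auto_cast():
+#             loss = loss_fn(model(img), label) / update_freq
+#         scaler.scale(loss).backward()      # gradients accumulate
+#         if (it + 1) % update_freq == 0:
+#             scaler.step(opt)               # unscales, then opt.step()
+#             scaler.update()
+#             opt.clear_grad()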
+from __future__ import absolute_import, division, print_function
+
+import time
+import paddle
+from ppcls.engine.train.utils import update_loss, update_metric, log_info, type_name
+from ppcls.utils import profiler
+
+
+def train_epoch(engine, epoch_id, print_batch_step):
+    tic = time.time()
+
+    if not hasattr(engine, "train_dataloader_iter"):
+        engine.train_dataloader_iter = iter(engine.train_dataloader)
+
+    for iter_id in range(engine.iter_per_epoch):
+        # fetch data batch from dataloader
+        try:
+            batch = next(engine.train_dataloader_iter)
+        except Exception:
+            # NOTE: reset DALI dataloader manually
+            if engine.use_dali:
+                engine.train_dataloader.reset()
+            engine.train_dataloader_iter = iter(engine.train_dataloader)
+            batch = next(engine.train_dataloader_iter)
+
+        profiler.add_profiler_step(engine.config["profiler_options"])
+        if iter_id == 5:
+            for key in engine.time_info:
+                engine.time_info[key].reset()
+        engine.time_info["reader_cost"].update(time.time() - tic)
+
+        batch_size = batch[0].shape[0]
+        if not engine.config["Global"].get("use_multilabel", False):
+            batch[1] = batch[1].reshape([batch_size, -1])
+        engine.global_step += 1
+
+        # image input
+        with engine.auto_cast(is_eval=False):
+            out = forward(engine, batch)
+            loss_dict = engine.train_loss_func(out, batch[1])
+
+        # loss
+        loss = loss_dict["loss"] / engine.update_freq
+
+        # backward & step opt
+        scaled = engine.scaler.scale(loss)
+        scaled.backward()
+        if (iter_id + 1) % engine.update_freq == 0:
+            for i in range(len(engine.optimizer)):
+                # optimizer.step() with auto amp
+                engine.scaler.step(engine.optimizer[i])
+                engine.scaler.update()
+
+        if (iter_id + 1) % engine.update_freq == 0:
+            # clear grad
+            for i in range(len(engine.optimizer)):
+                engine.optimizer[i].clear_grad()
+            # step lr(by step)
+            for i in range(len(engine.lr_sch)):
+                if not getattr(engine.lr_sch[i], "by_epoch", False):
+                    engine.lr_sch[i].step()
+            # update ema
+            if engine.ema:
+                engine.model_ema.update(engine.model)
+
+        # the code below is just for logging
+        # update metric_for_logger
+        update_metric(engine, out, batch, batch_size)
+        # update_loss_for_logger
+        update_loss(engine, loss_dict, batch_size)
+        engine.time_info["batch_cost"].update(time.time() - tic)
+        if iter_id % print_batch_step == 0:
+            log_info(engine, batch_size, epoch_id, iter_id)
+        tic = time.time()
+
+    # step lr(by epoch)
+    for i in range(len(engine.lr_sch)):
+        if getattr(engine.lr_sch[i], "by_epoch", False) and \
+                type_name(engine.lr_sch[i]) != "ReduceOnPlateau":
+            engine.lr_sch[i].step()
+
+
+def forward(engine, batch):
+    if not engine.is_rec:
+        return engine.model(batch[0])
+    else:
+        return engine.model(batch[0], batch[1])
diff --git a/example/low_precision_training/ppcls/engine/train/train_fixmatch.py b/example/low_precision_training/ppcls/engine/train/train_fixmatch.py
new file mode 100755
index 000000000..26a3daa74
--- /dev/null
+++ b/example/low_precision_training/ppcls/engine/train/train_fixmatch.py
@@ -0,0 +1,152 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function
+
+import time
+import paddle
+from ppcls.engine.train.utils import update_loss, update_metric, log_info
+from ppcls.utils import profiler
+from paddle.nn import functional as F
+import numpy as np
+
+
+def train_epoch_fixmatch(engine, epoch_id, print_batch_step):
+    tic = time.time()
+    if not hasattr(engine, "train_dataloader_iter"):
+        engine.train_dataloader_iter = iter(engine.train_dataloader)
+        engine.unlabel_train_dataloader_iter = iter(
+            engine.unlabel_train_dataloader)
+    temperture = engine.config["SSL"].get("temperture", 1)
+    threshold = engine.config["SSL"].get("threshold", 0.95)
+    assert engine.iter_per_epoch is not None, "Global.iter_per_epoch needs to be set."
+    threshold = paddle.to_tensor(threshold)
+    for iter_id in range(engine.iter_per_epoch):
+        if iter_id == 5:
+            for key in engine.time_info:
+                engine.time_info[key].reset()
+        try:
+            label_data_batch = next(engine.train_dataloader_iter)
+        except Exception:
+            engine.train_dataloader_iter = iter(engine.train_dataloader)
+            label_data_batch = next(engine.train_dataloader_iter)
+        try:
+            unlabel_data_batch = next(engine.unlabel_train_dataloader_iter)
+        except Exception:
+            engine.unlabel_train_dataloader_iter = iter(
+                engine.unlabel_train_dataloader)
+            unlabel_data_batch = next(engine.unlabel_train_dataloader_iter)
+        assert len(unlabel_data_batch) == 3
+        assert unlabel_data_batch[0].shape == unlabel_data_batch[1].shape
+        engine.time_info["reader_cost"].update(time.time() - tic)
+        batch_size = label_data_batch[0].shape[0] + unlabel_data_batch[0].shape[0] \
+            + unlabel_data_batch[1].shape[0]
+        engine.global_step += 1
+
+        # make inputs
+        inputs_x, targets_x = label_data_batch
+        inputs_u_w, inputs_u_s, targets_u = unlabel_data_batch
+        batch_size_label = inputs_x.shape[0]
+        inputs = paddle.concat([inputs_x, inputs_u_w, inputs_u_s], axis=0)
+
+        # image input
+        with engine.auto_cast(is_eval=False):
+            loss_dict, logits_label = get_loss(engine, inputs,
+                                               batch_size_label, temperture,
+                                               threshold, targets_x)
+
+        # loss
+        loss = loss_dict["loss"]
+
+        # backward & step opt
+        scaled = engine.scaler.scale(loss)
+        scaled.backward()
+
+        for i in range(len(engine.optimizer)):
+            # optimizer.step() with auto amp
+            engine.scaler.step(engine.optimizer[i])
+            engine.scaler.update()
+
+        # step lr(by step)
+        for i in range(len(engine.lr_sch)):
+            if not getattr(engine.lr_sch[i], "by_epoch", False):
+                engine.lr_sch[i].step()
+        # clear grad
+        for i in range(len(engine.optimizer)):
+            engine.optimizer[i].clear_grad()
+
+        # update ema
+        if engine.ema:
+            engine.model_ema.update(engine.model)
+
+        # the code below is just for logging
+        # update metric_for_logger
+        update_metric(engine, logits_label, label_data_batch, batch_size)
+        # update_loss_for_logger
+        update_loss(engine, loss_dict, batch_size)
+        engine.time_info["batch_cost"].update(time.time() - tic)
+        if iter_id % print_batch_step == 0:
+            log_info(engine, batch_size, epoch_id, iter_id)
+        tic = time.time()
+
+    # step lr(by epoch)
+    for i in range(len(engine.lr_sch)):
+        if getattr(engine.lr_sch[i], "by_epoch", False):
+            engine.lr_sch[i].step()
+
+
+def get_loss(engine, inputs, batch_size_label, temperture, threshold,
+             targets_x):
+    # For the PyTorch version, inputs need interleave and de_interleave to
+    # reshape and transpose the inputs and logits, but that doesn't affect
+    # the result, so this Paddle version does not use the two transpose
+    # functions.
+    # inputs = interleave(inputs, inputs.shape[0] // batch_size_label)
+    logits = engine.model(inputs)
+    # logits = de_interleave(logits, inputs.shape[0] // batch_size_label)
+    logits_x = logits[:batch_size_label]
+    logits_u_w, logits_u_s = logits[batch_size_label:].chunk(2)
+    loss_dict_label = engine.train_loss_func(logits_x, targets_x)
+    probs_u_w = F.softmax(logits_u_w.detach() / temperture, axis=-1)
+    p_targets_u, mask = get_pseudo_label_and_mask(probs_u_w, threshold)
+    unlabel_celoss = engine.unlabel_train_loss_func(logits_u_s,
+                                                    p_targets_u)["CELoss"]
+    unlabel_celoss = (unlabel_celoss * mask).mean()
+    loss_dict = dict()
+    for k, v in loss_dict_label.items():
+        if k != "loss":
+            loss_dict[k + "_label"] = v
+    loss_dict["CELoss_unlabel"] = unlabel_celoss
+    loss_dict["loss"] = loss_dict_label['loss'] + unlabel_celoss
+    return loss_dict, logits_x
+
+
+def get_pseudo_label_and_mask(probs_u_w, threshold):
+    max_probs = paddle.max(probs_u_w, axis=-1)
+    p_targets_u = paddle.argmax(probs_u_w, axis=-1)
+    mask = paddle.greater_equal(max_probs, threshold).astype('float32')
+    return p_targets_u, mask
+
+
+def interleave(x, size):
+    s = list(x.shape)
+    return x.reshape([-1, size] + s[1:]).transpose(
+        [1, 0, 2, 3, 4]).reshape([-1] + s[1:])
+
+
+def de_interleave(x, size):
+    s = list(x.shape)
+    return x.reshape([size, -1] + s[1:]).transpose(
+        [1, 0, 2]).reshape([-1] + s[1:])
diff --git a/example/low_precision_training/ppcls/engine/train/train_fixmatch_ccssl.py b/example/low_precision_training/ppcls/engine/train/train_fixmatch_ccssl.py
new file mode 100755
index 000000000..43d20519f
--- /dev/null
+++ b/example/low_precision_training/ppcls/engine/train/train_fixmatch_ccssl.py
@@ -0,0 +1,125 @@
+from __future__ import absolute_import, division, print_function
+import time
+import paddle
+from ppcls.engine.train.utils import update_loss, update_metric, log_info
+from ppcls.utils import profiler
+from paddle.nn import functional as F
+import numpy as np
+
+
+def train_epoch_fixmatch_ccssl(engine, epoch_id, print_batch_step):
+    tic = time.time()
+    if not hasattr(engine, 'train_dataloader_iter'):
+        engine.train_dataloader_iter = iter(engine.train_dataloader)
+        engine.unlabel_train_dataloader_iter = iter(engine.unlabel_train_dataloader)
+
+    temperture = engine.config['SSL'].get("T", 1)
+    threshold = engine.config['SSL'].get("threshold", 0.95)
+    assert engine.iter_per_epoch is not None, "Global.iter_per_epoch needs to be set"
+    threshold = paddle.to_tensor(threshold)
+
+    for iter_id in range(engine.iter_per_epoch):
+        if iter_id == 5:
+            for key in engine.time_info:
+                engine.time_info[key].reset()
+
+        try:
+            label_data_batch = next(engine.train_dataloader_iter)
+        except Exception:
+            engine.train_dataloader_iter = iter(engine.train_dataloader)
+            label_data_batch = next(engine.train_dataloader_iter)
+
+        try:
+            unlabel_data_batch = next(engine.unlabel_train_dataloader_iter)
+        except Exception:
+            engine.unlabel_train_dataloader_iter = iter(engine.unlabel_train_dataloader)
+            unlabel_data_batch = next(engine.unlabel_train_dataloader_iter)
+
+        assert len(unlabel_data_batch) in [3, 4]
+        assert unlabel_data_batch[0].shape == unlabel_data_batch[1].shape == unlabel_data_batch[2].shape
+
+        engine.time_info['reader_cost'].update(time.time() - tic)
+        batch_size = label_data_batch[0].shape[0] \
+            + unlabel_data_batch[0].shape[0] \
+            + 
unlabel_data_batch[1].shape[0] \ + + unlabel_data_batch[2].shape[0] + engine.global_step += 1 + + inputs_x, targets_x = label_data_batch + inputs_w, inputs_s1, inputs_s2 = unlabel_data_batch[:3] + batch_size_label = inputs_x.shape[0] + inputs = paddle.concat([inputs_x, inputs_w, inputs_s1, inputs_s2], axis=0) + + loss_dict, logits_label = get_loss(engine, inputs, batch_size_label, + temperture, threshold, targets_x, + ) + loss = loss_dict['loss'] + loss.backward() + + for i in range(len(engine.optimizer)): + engine.optimizer[i].step() + + for i in range(len(engine.lr_sch)): + if not getattr(engine.lr_sch[i], 'by_epoch', False): + engine.lr_sch[i].step() + + for i in range(len(engine.optimizer)): + engine.optimizer[i].clear_grad() + + if engine.ema: + engine.model_ema.update(engine.model) + update_metric(engine, logits_label, label_data_batch, batch_size) + update_loss(engine, loss_dict, batch_size) + engine.time_info['batch_cost'].update(time.time() - tic) + if iter_id % print_batch_step == 0: + log_info(engine, batch_size, epoch_id, iter_id) + + tic = time.time() + + for i in range(len(engine.lr_sch)): + if getattr(engine.lr_sch[i], 'by_epoch', False): + engine.lr_sch[i].step() + +def get_loss(engine, + inputs, + batch_size_label, + temperture, + threshold, + targets_x, + **kwargs + ): + out = engine.model(inputs) + + logits, feats = out['logits'], out['features'] + feat_w, feat_s1, feat_s2 = feats[batch_size_label:].chunk(3) + feat_x = feats[:batch_size_label] + logits_x = logits[:batch_size_label] + logits_w, logits_s1, logits_s2 = logits[batch_size_label:].chunk(3) + loss_dict_label = engine.train_loss_func(logits_x, targets_x) + probs_u_w = F.softmax(logits_w.detach() / temperture, axis=-1) + max_probs, p_targets_u_w = probs_u_w.max(axis=-1), probs_u_w.argmax(axis=-1) + mask = paddle.greater_equal(max_probs, threshold).astype('float') + + feats = paddle.concat([feat_s1.unsqueeze(1), feat_s2.unsqueeze(1)], axis=1) + batch = {'logits_w': logits_w, + 'logits_s1': logits_s1, + 'p_targets_u_w': p_targets_u_w, + 'mask': mask, + 'max_probs': max_probs, + } + unlabel_loss = engine.unlabel_train_loss_func(feats, batch) + loss_dict = {} + for k, v in loss_dict_label.items(): + if k != 'loss': + loss_dict[k] = v + for k, v in unlabel_loss.items(): + if k != 'loss': + loss_dict[k] = v + loss_dict['loss'] = loss_dict_label['loss'] + unlabel_loss['loss'] + + return loss_dict, logits_x diff --git a/example/low_precision_training/ppcls/engine/train/train_metabin.py b/example/low_precision_training/ppcls/engine/train/train_metabin.py new file mode 100755 index 000000000..d1d50f1d4 --- /dev/null +++ b/example/low_precision_training/ppcls/engine/train/train_metabin.py @@ -0,0 +1,251 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
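+
+# train_epoch_metabin (defined below) alternates three stages per iteration:
+# a basic update of everything except the BIN gates (optimizer[0]), a
+# meta-train pass that only accumulates gate gradients, and a meta-test pass
+# that steps the gates (optimizer[1]). Roughly, per iteration:
+#
+#     out, basic_loss_dict = basic_update(engine, train_batch)
+#     mtrain_loss_dict, mtest_loss_dict = metalearning_update(
+#         engine, mtrain_batch, mtest_batch)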
+
+# reference: https://arxiv.org/abs/2011.14670v2
+
+from __future__ import absolute_import, division, print_function
+
+import time
+import paddle
+import numpy as np
+from collections import defaultdict
+
+from ppcls.engine.train.utils import update_loss, update_metric, log_info, type_name
+from ppcls.utils import profiler
+from ppcls.data import build_dataloader
+from ppcls.loss import build_loss
+
+
+def train_epoch_metabin(engine, epoch_id, print_batch_step):
+    tic = time.time()
+
+    if not hasattr(engine, "train_dataloader_iter"):
+        engine.train_dataloader_iter = iter(engine.train_dataloader)
+
+    if not hasattr(engine, "meta_dataloader"):
+        engine.meta_dataloader = build_dataloader(
+            config=engine.config['DataLoader']['Metalearning'],
+            mode='Train',
+            device=engine.device)
+        engine.meta_dataloader_iter = iter(engine.meta_dataloader)
+
+    num_domain = engine.train_dataloader.dataset.num_cams
+    for iter_id in range(engine.iter_per_epoch):
+        # fetch data batch from dataloader
+        try:
+            train_batch = next(engine.train_dataloader_iter)
+        except Exception:
+            engine.train_dataloader_iter = iter(engine.train_dataloader)
+            train_batch = next(engine.train_dataloader_iter)
+
+        try:
+            mtrain_batch, mtest_batch = get_meta_data(
+                engine.meta_dataloader_iter, num_domain)
+        except Exception:
+            engine.meta_dataloader_iter = iter(engine.meta_dataloader)
+            mtrain_batch, mtest_batch = get_meta_data(
+                engine.meta_dataloader_iter, num_domain)
+
+        profiler.add_profiler_step(engine.config["profiler_options"])
+        if iter_id == 5:
+            for key in engine.time_info:
+                engine.time_info[key].reset()
+        engine.time_info["reader_cost"].update(time.time() - tic)
+
+        train_batch_size = train_batch[0].shape[0]
+        mtrain_batch_size = mtrain_batch[0].shape[0]
+        mtest_batch_size = mtest_batch[0].shape[0]
+        if not engine.config["Global"].get("use_multilabel", False):
+            train_batch[1] = train_batch[1].reshape([train_batch_size, -1])
+            mtrain_batch[1] = mtrain_batch[1].reshape([mtrain_batch_size, -1])
+            mtest_batch[1] = mtest_batch[1].reshape([mtest_batch_size, -1])
+
+        engine.global_step += 1
+
+        if engine.global_step == 1:  # update model (except gate) to warm up
+            for i in range(engine.config["Global"]["warmup_iter"] - 1):
+                out, basic_loss_dict = basic_update(engine, train_batch)
+                loss_dict = basic_loss_dict
+                try:
+                    train_batch = next(engine.train_dataloader_iter)
+                except Exception:
+                    engine.train_dataloader_iter = iter(
+                        engine.train_dataloader)
+                    train_batch = next(engine.train_dataloader_iter)
+
+        out, basic_loss_dict = basic_update(engine=engine, batch=train_batch)
+        mtrain_loss_dict, mtest_loss_dict = metalearning_update(
+            engine=engine, mtrain_batch=mtrain_batch, mtest_batch=mtest_batch)
+        loss_dict = {
+            **{"train_" + key: value
+               for key, value in basic_loss_dict.items()},
+            **{"mtrain_" + key: value
+               for key, value in mtrain_loss_dict.items()},
+            **{"mtest_" + key: value
+               for key, value in mtest_loss_dict.items()}
+        }
+        # step lr (by iter)
+        for i in range(len(engine.lr_sch)):
+            if not getattr(engine.lr_sch[i], "by_epoch", False):
+                engine.lr_sch[i].step()
+        # update ema
+        if engine.ema:
+            engine.model_ema.update(engine.model)
+
+        # the code below is just for logging
+        # update metric_for_logger
+        update_metric(engine, out, train_batch, train_batch_size)
+        # update_loss_for_logger
+        update_loss(engine, loss_dict, train_batch_size)
+        engine.time_info["batch_cost"].update(time.time() - tic)
+        if iter_id % print_batch_step == 0:
+            log_info(engine, train_batch_size, epoch_id, iter_id)
+        tic = time.time()
+
+    # step lr(by epoch)
+    for i in range(len(engine.lr_sch)):
+        if getattr(engine.lr_sch[i], "by_epoch", False) and \
+                type_name(engine.lr_sch[i]) != "ReduceOnPlateau":
+            engine.lr_sch[i].step()
+
+
+def setup_opt(engine, stage):
+    assert stage in ["train", "mtrain", "mtest"]
+    opt = defaultdict()
+    if stage == "train":
+        opt["bn_mode"] = "general"
+        opt["enable_inside_update"] = False
+        opt["lr_gate"] = 0.0
+    elif stage == "mtrain":
+        opt["bn_mode"] = "hold"
+        opt["enable_inside_update"] = False
+        opt["lr_gate"] = 0.0
+    elif stage == "mtest":
+        norm_lr = engine.lr_sch[1].last_lr
+        cyclic_lr = engine.lr_sch[2].get_lr()
+        opt["bn_mode"] = "hold"
+        opt["enable_inside_update"] = True
+        opt["lr_gate"] = norm_lr * cyclic_lr
+    for layer in engine.model.backbone.sublayers():
+        if type_name(layer) == "MetaBIN":
+            layer.setup_opt(opt)
+    engine.model.neck.setup_opt(opt)
+
+
+def reset_opt(model):
+    for layer in model.backbone.sublayers():
+        if type_name(layer) == "MetaBIN":
+            layer.reset_opt()
+    model.neck.reset_opt()
+
+
+def get_meta_data(meta_dataloader_iter, num_domain):
+    """
+    fetch a data batch from the dataloader, then divide the batch by domains
+    """
+    list_all = np.random.permutation(num_domain)
+    list_mtrain = list(list_all[:num_domain // 2])
+    batch = next(meta_dataloader_iter)
+    domain_idx = batch[2]
+    cnt = 0
+    for sample in list_mtrain:
+        if cnt == 0:
+            is_mtrain_domain = domain_idx == sample
+        else:
+            is_mtrain_domain = paddle.logical_or(is_mtrain_domain,
+                                                 domain_idx == sample)
+        cnt += 1
+
+    # mtrain_batch
+    if not any(is_mtrain_domain):
+        raise RuntimeError("no metalearning-train sample in the batch")
+    mtrain_batch = [batch[i][is_mtrain_domain] for i in range(len(batch))]
+
+    # mtest_batch
+    is_mtest_domains = paddle.logical_not(is_mtrain_domain)
+    if not any(is_mtest_domains):
+        raise RuntimeError("no metalearning-test sample in the batch")
+    mtest_batch = [batch[i][is_mtest_domains] for i in range(len(batch))]
+    return mtrain_batch, mtest_batch
+
+
+def forward(engine, batch, loss_func):
+    batch_info = {"label": batch[1], "domain": batch[2]}
+
+    with engine.auto_cast(is_eval=False):
+        out = engine.model(batch[0], batch[1])
+        loss_dict = loss_func(out, batch_info)
+
+    return out, loss_dict
+
+
+def backward(engine, loss, optimizer):
+    optimizer.clear_grad()
+    scaled = engine.scaler.scale(loss)
+    scaled.backward()
+
+    # optimizer.step() with auto amp
+    engine.scaler.step(optimizer)
+    engine.scaler.update()
+
+    for name, layer in engine.model.backbone.named_sublayers():
+        if "gate" == name.split('.')[-1]:
+            layer.clip_gate()
+
+
+def basic_update(engine, batch):
+    setup_opt(engine, "train")
+    train_loss_func = build_loss(engine.config["Loss"]["Basic"])
+    out, train_loss_dict = forward(engine, batch, train_loss_func)
+    train_loss = train_loss_dict["loss"]
+    backward(engine, train_loss, engine.optimizer[0])
+    engine.optimizer[0].clear_grad()
+    reset_opt(engine.model)
+    return out, train_loss_dict
+
+
+def metalearning_update(engine, mtrain_batch, mtest_batch):
+    # meta train
+    mtrain_loss_func = build_loss(engine.config["Loss"]["MetaTrain"])
+    setup_opt(engine, "mtrain")
+
+    mtrain_batch_info = {"label": mtrain_batch[1], "domain": mtrain_batch[2]}
+    out = engine.model(mtrain_batch[0], mtrain_batch[1])
+    mtrain_loss_dict = mtrain_loss_func(out, mtrain_batch_info)
+    mtrain_loss = mtrain_loss_dict["loss"]
+    engine.optimizer[1].clear_grad()
+    mtrain_loss.backward()
+
+    # meta test
+    mtest_loss_func = build_loss(engine.config["Loss"]["MetaTest"])
+    setup_opt(engine, "mtest")
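+
+    # In the "mtest" stage, setup_opt (above) enables inside-update and sets
+    # lr_gate = norm_lr * cyclic_lr, so the MetaBIN gates are virtually
+    # updated during the forward pass below; backward() then steps (and
+    # clips) the gate parameters through engine.optimizer[1].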
+
+    out, mtest_loss_dict = forward(engine, mtest_batch, mtest_loss_func)
+    engine.optimizer[1].clear_grad()
+    mtest_loss = mtest_loss_dict["loss"]
+    backward(engine, mtest_loss, engine.optimizer[1])
+
+    engine.optimizer[0].clear_grad()
+    engine.optimizer[1].clear_grad()
+    reset_opt(engine.model)
+
+    return mtrain_loss_dict, mtest_loss_dict
diff --git a/example/low_precision_training/ppcls/engine/train/train_progressive.py b/example/low_precision_training/ppcls/engine/train/train_progressive.py
new file mode 100755
index 000000000..9999024b9
--- /dev/null
+++ b/example/low_precision_training/ppcls/engine/train/train_progressive.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function
+
+from ppcls.data import build_dataloader
+from ppcls.engine.train.utils import type_name
+from ppcls.utils import logger
+
+from .train import train_epoch
+
+
+def train_epoch_progressive(engine, epoch_id, print_batch_step):
+    # 1. Build training hyper-parameters for different training stage
+    num_stage = 4
+    ratio_list = [(i + 1) / num_stage for i in range(num_stage)]
+    stones = [
+        int(engine.config["Global"]["epochs"] * ratio_list[i])
+        for i in range(num_stage)
+    ]
+    stage_id = 0
+    for i in range(num_stage):
+        if epoch_id > stones[i]:
+            stage_id = i + 1
+
+    # 2. Adjust training hyper-parameters for different training stage
+    if not hasattr(engine, 'last_stage') or engine.last_stage < stage_id:
+        cur_dropout_rate = 0.0
+
+        def _change_dp_func(m):
+            # nonlocal, so the dropout rate set on the Head layer is visible
+            # to the logging below
+            nonlocal cur_dropout_rate
+            if type_name(m) == "Head" and hasattr(m, "_dropout"):
+                m._dropout.p = m.dropout_rate[stage_id]
+                cur_dropout_rate = m.dropout_rate[stage_id]
+
+        engine.model.apply(_change_dp_func)
+
+        cur_image_size = engine.config["DataLoader"]["Train"]["dataset"][
+            "transform_ops"][1]["RandCropImage"]["progress_size"][stage_id]
+        cur_magnitude = engine.config["DataLoader"]["Train"]["dataset"][
+            "transform_ops"][3]["RandAugmentV2"]["progress_magnitude"][
+                stage_id]
+        engine.config["DataLoader"]["Train"]["dataset"]["transform_ops"][1][
+            "RandCropImage"]["size"] = cur_image_size
+        engine.config["DataLoader"]["Train"]["dataset"]["transform_ops"][3][
+            "RandAugmentV2"]["magnitude"] = cur_magnitude
+        engine.train_dataloader = build_dataloader(
+            engine.config["DataLoader"],
+            "Train",
+            engine.device,
+            engine.use_dali,
+            seed=epoch_id)
+        engine.train_dataloader_iter = iter(engine.train_dataloader)
+        engine.last_stage = stage_id
+        logger.info(f"Training stage: [{stage_id+1}/{num_stage}]("
+                    f"random_aug_magnitude={cur_magnitude}, "
+                    f"train_image_size={cur_image_size}, "
+                    f"dropout_rate={cur_dropout_rate}"
+                    f")")
+
+    # 3.
Train one epoch as usual at current stage + train_epoch(engine, epoch_id, print_batch_step) diff --git a/example/low_precision_training/ppcls/engine/train/utils.py b/example/low_precision_training/ppcls/engine/train/utils.py new file mode 100755 index 000000000..1d5c70b2e --- /dev/null +++ b/example/low_precision_training/ppcls/engine/train/utils.py @@ -0,0 +1,94 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function + +import paddle +import datetime +from ppcls.utils import logger +from ppcls.utils.misc import AverageMeter + + +def update_metric(trainer, out, batch, batch_size): + # calc metric + if trainer.train_metric_func is not None: + metric_dict = trainer.train_metric_func(out, batch[-1]) + for key in metric_dict: + if key not in trainer.output_info: + trainer.output_info[key] = AverageMeter(key, '7.5f') + trainer.output_info[key].update( + float(metric_dict[key]), batch_size) + + +def update_loss(trainer, loss_dict, batch_size): + # update_output_info + for key in loss_dict: + if key not in trainer.output_info: + trainer.output_info[key] = AverageMeter(key, '7.5f') + trainer.output_info[key].update(float(loss_dict[key]), batch_size) + + +def log_info(trainer, batch_size, epoch_id, iter_id): + lr_msg = ", ".join([ + "lr({}): {:.8f}".format(type_name(lr), lr.get_lr()) + for i, lr in enumerate(trainer.lr_sch) + ]) + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, trainer.output_info[key].avg) + for key in trainer.output_info + ]) + time_msg = "s, ".join([ + "{}: {:.5f}".format(key, trainer.time_info[key].avg) + for key in trainer.time_info + ]) + + ips_msg = "ips: {:.5f} samples/s".format( + batch_size / trainer.time_info["batch_cost"].avg) + + global_epochs = trainer.config["Global"]["epochs"] + eta_sec = ( + (trainer.config["Global"]["epochs"] - epoch_id + 1) * + trainer.iter_per_epoch - iter_id) * trainer.time_info["batch_cost"].avg + eta_msg = "eta: {:s}".format(str(datetime.timedelta(seconds=int(eta_sec)))) + max_mem_reserved_msg = "" + max_mem_allocated_msg = "" + max_mem_msg = "" + print_mem_info = trainer.config["Global"].get("print_mem_info", False) + if print_mem_info: + if paddle.device.is_compiled_with_cuda(): + max_mem_reserved_msg = f"max_mem_reserved: {format(paddle.device.cuda.max_memory_reserved() / (1024 ** 2), '.2f')} MB" + max_mem_allocated_msg = f"max_mem_allocated: {format(paddle.device.cuda.max_memory_allocated() / (1024 ** 2), '.2f')} MB" + max_mem_msg = f", {max_mem_reserved_msg}, {max_mem_allocated_msg}" + logger.info( + f"[Train][Epoch {epoch_id}/{global_epochs}][Iter: {iter_id}/{trainer.iter_per_epoch}]{lr_msg}, {metric_msg}, {time_msg}, {ips_msg}, {eta_msg}{max_mem_msg}" + ) + for key in trainer.time_info: + trainer.time_info[key].reset() + + for i, lr in enumerate(trainer.lr_sch): + logger.scaler( + name="lr({})".format(type_name(lr)), + value=lr.get_lr(), + step=trainer.global_step, + writer=trainer.vdl_writer) + for key in 
trainer.output_info:
+        logger.scaler(
+            name="train_{}".format(key),
+            value=trainer.output_info[key].avg,
+            step=trainer.global_step,
+            writer=trainer.vdl_writer)
+
+
+def type_name(object: object) -> str:
+    """get class name of an object"""
+    return object.__class__.__name__
diff --git a/example/low_precision_training/ppcls/loss/__init__.py b/example/low_precision_training/ppcls/loss/__init__.py
new file mode 100755
index 000000000..7ab8be4fa
--- /dev/null
+++ b/example/low_precision_training/ppcls/loss/__init__.py
@@ -0,0 +1,91 @@
+import copy
+
+import paddle
+import paddle.nn as nn
+from ppcls.utils import logger
+
+from .celoss import CELoss, MixCELoss
+from .googlenetloss import GoogLeNetLoss
+from .centerloss import CenterLoss
+from .contrasiveloss import ContrastiveLoss
+from .contrasiveloss import ContrastiveLoss_XBM
+from .emlloss import EmlLoss
+from .msmloss import MSMLoss
+from .npairsloss import NpairsLoss
+from .trihardloss import TriHardLoss
+from .triplet import TripletLoss, TripletLossV2
+from .tripletangularmarginloss import TripletAngularMarginLoss, TripletAngularMarginLoss_XBM
+from .supconloss import SupConLoss
+from .softsuploss import SoftSupConLoss
+from .ccssl_loss import CCSSLCELoss
+from .pairwisecosface import PairwiseCosface
+from .dmlloss import DMLLoss
+from .distanceloss import DistanceLoss
+from .softtargetceloss import SoftTargetCrossEntropy
+from .distillationloss import DistillationCELoss
+from .distillationloss import DistillationGTCELoss
+from .distillationloss import DistillationDMLLoss
+from .distillationloss import DistillationDistanceLoss
+from .distillationloss import DistillationRKDLoss
+from .distillationloss import DistillationKLDivLoss
+from .distillationloss import DistillationDKDLoss
+from .distillationloss import DistillationWSLLoss
+from .distillationloss import DistillationSKDLoss
+from .distillationloss import DistillationMultiLabelLoss
+from .distillationloss import DistillationDISTLoss
+from .distillationloss import DistillationPairLoss
+
+from .multilabelloss import MultiLabelLoss, MultiLabelAsymmetricLoss
+from .afdloss import AFDLoss
+
+from .deephashloss import DSHSDLoss
+from .deephashloss import LCDSHLoss
+from .deephashloss import DCHLoss
+
+from .metabinloss import CELossForMetaBIN
+from .metabinloss import TripletLossForMetaBIN
+from .metabinloss import InterDomainShuffleLoss
+from .metabinloss import IntraDomainScatterLoss
+
+
+class CombinedLoss(nn.Layer):
+    def __init__(self, config_list):
+        super().__init__()
+        self.loss_func = []
+        self.loss_weight = []
+        assert isinstance(config_list, list), (
+            'operator config should be a list')
+        for config in config_list:
+            assert isinstance(config,
+                              dict) and len(config) == 1, "yaml format error"
+            name = list(config)[0]
+            param = config[name]
+            assert "weight" in param, "weight must be in param, but param only contains {}".format(
+                param.keys())
+            self.loss_weight.append(param.pop("weight"))
+            self.loss_func.append(eval(name)(**param))
+        self.loss_func = nn.LayerList(self.loss_func)
+
+    def __call__(self, input, batch):
+        loss_dict = {}
+        # just to accelerate classification training speed
+        if len(self.loss_func) == 1:
+            loss = self.loss_func[0](input, batch)
+            loss_dict.update(loss)
+            loss_dict["loss"] = list(loss.values())[0]
+        else:
+            for idx, loss_func in enumerate(self.loss_func):
+                loss = loss_func(input, batch)
+                weight = self.loss_weight[idx]
+                loss = {key: loss[key] * weight for key in loss}
+                loss_dict.update(loss)
+            loss_dict["loss"] = paddle.add_n(list(loss_dict.values()))
+        return loss_dict
+
+
+def build_loss(config):
+    if config is None:
+        return None
+    module_class = CombinedLoss(copy.deepcopy(config))
+    logger.debug("build loss {} success.".format(module_class))
+    return module_class
diff --git a/example/low_precision_training/ppcls/loss/afdloss.py b/example/low_precision_training/ppcls/loss/afdloss.py
new file mode 100755
index 000000000..e2f457451
--- /dev/null
+++ b/example/low_precision_training/ppcls/loss/afdloss.py
@@ -0,0 +1,130 @@
+#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import paddle.nn as nn
+import paddle.nn.functional as F
+import paddle
+import numpy as np
+
+
+class LinearBNReLU(nn.Layer):
+    def __init__(self, nin, nout):
+        super().__init__()
+        self.linear = nn.Linear(nin, nout)
+        self.bn = nn.BatchNorm1D(nout)
+        self.relu = nn.ReLU()
+
+    def forward(self, x, relu=True):
+        if relu:
+            return self.relu(self.bn(self.linear(x)))
+        return self.bn(self.linear(x))
+
+
+def unique_shape(s_shapes):
+    n_s = []
+    unique_shapes = []
+    n = -1
+    for s_shape in s_shapes:
+        if s_shape not in unique_shapes:
+            unique_shapes.append(s_shape)
+            n += 1
+        n_s.append(n)
+    return n_s, unique_shapes
+
+
+class AFDLoss(nn.Layer):
+    """
+    AFDLoss
+    https://www.aaai.org/AAAI21Papers/AAAI-9785.JiM.pdf
+    https://github.com/clovaai/attention-feature-distillation
+    """
+
+    def __init__(self,
+                 model_name_pair=["Student", "Teacher"],
+                 student_keys=["bilinear_key", "value"],
+                 teacher_keys=["query", "value"],
+                 s_shapes=[[64, 16, 160], [128, 8, 160], [256, 4, 160],
+                           [512, 2, 160]],
+                 t_shapes=[[640, 48], [320, 96], [160, 192]],
+                 qk_dim=128,
+                 name="loss_afd"):
+        super().__init__()
+        assert isinstance(model_name_pair, list)
+        self.model_name_pair = model_name_pair
+        self.student_keys = student_keys
+        self.teacher_keys = teacher_keys
+        self.s_shapes = [[1] + s_i for s_i in s_shapes]
+        self.t_shapes = [[1] + t_i for t_i in t_shapes]
+        self.qk_dim = qk_dim
+        self.n_t, self.unique_t_shapes = unique_shape(self.t_shapes)
+        self.attention = Attention(self.qk_dim, self.t_shapes, self.s_shapes,
+                                   self.n_t, self.unique_t_shapes)
+        self.name = name
+
+    def forward(self, predicts, batch):
+        s_features_dict = predicts[self.model_name_pair[0]]
+        t_features_dict = predicts[self.model_name_pair[1]]
+
+        g_s = [s_features_dict[key] for key in self.student_keys]
+        g_t = [t_features_dict[key] for key in self.teacher_keys]
+
+        loss = self.attention(g_s, g_t)
+        sum_loss = sum(loss)
+
+        loss_dict = dict()
+        loss_dict[self.name] = sum_loss
+
+        return loss_dict
+
+
+class Attention(nn.Layer):
+    def __init__(self, qk_dim, t_shapes, s_shapes, n_t, unique_t_shapes):
+        super().__init__()
+        self.qk_dim = qk_dim
+        self.n_t = n_t
+
+        self.p_t = self.create_parameter(
+            shape=[len(t_shapes), qk_dim],
+            default_initializer=nn.initializer.XavierNormal())
+        self.p_s = self.create_parameter(
+            shape=[len(s_shapes), qk_dim],
+            default_initializer=nn.initializer.XavierNormal())
+
+    def forward(self, g_s, g_t):
+        bilinear_key, h_hat_s_all = g_s
+        query, h_t_all = g_t
+
+        p_logit = paddle.matmul(self.p_t, self.p_s.t())
+
+        logit = paddle.add(
+            paddle.einsum('bstq,btq->bts', bilinear_key, query),
+            p_logit) / np.sqrt(self.qk_dim)
+        atts = F.softmax(logit, axis=2)  # b x t x s
+
+        loss = []
+
+        for i, (n, h_t) in enumerate(zip(self.n_t, h_t_all)):
+            h_hat_s = h_hat_s_all[n]
+            diff = self.cal_diff(h_hat_s, h_t, atts[:, i])
+            loss.append(diff)
+        return loss
+
+    def cal_diff(self, v_s, v_t, att):
+        diff = (v_s - v_t.unsqueeze(1)).pow(2).mean(2)
+        diff = paddle.multiply(diff, att).sum(1).mean()
+        return diff
diff --git a/example/low_precision_training/ppcls/loss/ccssl_loss.py b/example/low_precision_training/ppcls/loss/ccssl_loss.py
new file mode 100755
index 000000000..1b3b71d56
--- /dev/null
+++ b/example/low_precision_training/ppcls/loss/ccssl_loss.py
@@ -0,0 +1,19 @@
+import paddle.nn as nn
+
+
+class CCSSLCELoss(nn.Layer):
+    def __init__(self, **kwargs):
+        super(CCSSLCELoss, self).__init__()
+        self.celoss = nn.CrossEntropyLoss(reduction='none')
+
+    def forward(self, inputs, batch, **kwargs):
+        p_targets_u_w = batch['p_targets_u_w']
+        logits_s1 = batch['logits_s1']
+        mask = batch['mask']
+        loss_u = self.celoss(logits_s1, p_targets_u_w) * mask
+        loss_u = loss_u.mean()
+
+        return {'CCSSLCELoss': loss_u}
diff --git a/example/low_precision_training/ppcls/loss/celoss.py b/example/low_precision_training/ppcls/loss/celoss.py
new file mode 100755
index 000000000..2715dee19
--- /dev/null
+++ b/example/low_precision_training/ppcls/loss/celoss.py
@@ -0,0 +1,76 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
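+
+# Usage sketch for the CELoss defined below (shapes and the epsilon value are
+# illustrative; the "CELoss" dict key matches what forward() returns):
+#
+#     loss_fn = CELoss(epsilon=0.1)              # label smoothing enabled
+#     logits = paddle.randn([8, 10])
+#     labels = paddle.randint(0, 10, [8])
+#     print(loss_fn(logits, labels)["CELoss"])   # scalar mean loss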
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from ppcls.utils import logger
+
+
+class CELoss(nn.Layer):
+    """
+    Cross entropy loss
+    """
+
+    def __init__(self, reduction="mean", epsilon=None):
+        super().__init__()
+        if epsilon is not None and (epsilon <= 0 or epsilon >= 1):
+            epsilon = None
+        self.epsilon = epsilon
+        assert reduction in ["mean", "sum", "none"]
+        self.reduction = reduction
+
+    def _labelsmoothing(self, target, class_num):
+        if len(target.shape) == 1 or target.shape[-1] != class_num:
+            one_hot_target = F.one_hot(target, class_num)
+        else:
+            one_hot_target = target
+        soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon)
+        soft_target = paddle.reshape(soft_target, shape=[-1, class_num])
+        return soft_target
+
+    def forward(self, x, label):
+        if isinstance(x, dict):
+            x = x["logits"]
+        if self.epsilon is not None:
+            class_num = x.shape[-1]
+            label = self._labelsmoothing(label, class_num)
+            x = -F.log_softmax(x, axis=-1)
+            loss = paddle.sum(x * label, axis=-1)
+            if self.reduction == 'mean':
+                loss = loss.mean()
+            elif self.reduction == 'sum':
+                loss = loss.sum()
+        else:
+            if label.shape[-1] == x.shape[-1]:
+                label = F.softmax(label, axis=-1)
+                soft_label = True
+            else:
+                soft_label = False
+            loss = F.cross_entropy(
+                x,
+                label=label,
+                soft_label=soft_label,
+                reduction=self.reduction)
+        return {"CELoss": loss}
+
+
+class MixCELoss(object):
+    def __init__(self, *args, **kwargs):
+        msg = "\"MixCELoss\" is deprecated, please use \"CELoss\" instead."
+        logger.error(DeprecationWarning(msg))
+        raise DeprecationWarning(msg)
diff --git a/example/low_precision_training/ppcls/loss/centerloss.py b/example/low_precision_training/ppcls/loss/centerloss.py
new file mode 100755
index 000000000..23a86ee88
--- /dev/null
+++ b/example/low_precision_training/ppcls/loss/centerloss.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from typing import Dict
+
+import paddle
+import paddle.nn as nn
+
+
+class CenterLoss(nn.Layer):
+    """Center loss
+    paper : [A Discriminative Feature Learning Approach for Deep Face Recognition](https://link.springer.com/content/pdf/10.1007%2F978-3-319-46478-7_31.pdf)
+    code reference: https://github.com/michuanhaohao/reid-strong-baseline/blob/master/layers/center_loss.py#L7
+    Args:
+        num_classes (int): number of classes.
+        feat_dim (int): number of feature dimensions.
+ feature_from (str): feature from "backbone" or "features" + """ + + def __init__(self, + num_classes: int, + feat_dim: int, + feature_from: str="features"): + super(CenterLoss, self).__init__() + self.num_classes = num_classes + self.feat_dim = feat_dim + self.feature_from = feature_from + random_init_centers = paddle.randn( + shape=[self.num_classes, self.feat_dim]) + self.centers = self.create_parameter( + shape=(self.num_classes, self.feat_dim), + default_initializer=nn.initializer.Assign(random_init_centers)) + self.add_parameter("centers", self.centers) + + def __call__(self, input: Dict[str, paddle.Tensor], + target: paddle.Tensor) -> Dict[str, paddle.Tensor]: + """compute center loss. + + Args: + input (Dict[str, paddle.Tensor]): {'features': (batch_size, feature_dim), ...}. + target (paddle.Tensor): ground truth label with shape (batch_size, ). + + Returns: + Dict[str, paddle.Tensor]: {'CenterLoss': loss}. + """ + feats = input[self.feature_from] + labels = target + + # squeeze labels to shape (batch_size, ) + if labels.ndim >= 2 and labels.shape[-1] == 1: + labels = paddle.squeeze(labels, axis=[-1]) + + batch_size = feats.shape[0] + distmat = paddle.pow(feats, 2).sum(axis=1, keepdim=True).expand([batch_size, self.num_classes]) + \ + paddle.pow(self.centers, 2).sum(axis=1, keepdim=True).expand([self.num_classes, batch_size]).t() + distmat = distmat.addmm(x=feats, y=self.centers.t(), beta=1, alpha=-2) + + classes = paddle.arange(self.num_classes).astype(labels.dtype) + labels = labels.unsqueeze(1).expand([batch_size, self.num_classes]) + mask = labels.equal(classes.expand([batch_size, self.num_classes])) + + dist = distmat * mask.astype(feats.dtype) + loss = dist.clip(min=1e-12, max=1e+12).sum() / batch_size + # return loss + return {'CenterLoss': loss} diff --git a/example/low_precision_training/ppcls/loss/comfunc.py b/example/low_precision_training/ppcls/loss/comfunc.py new file mode 100755 index 000000000..277bdd6b5 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/comfunc.py @@ -0,0 +1,45 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
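+
+# rerange_index (defined below) precomputes, for each anchor i in a batch
+# grouped by class (samples_each_class consecutive samples per class), an
+# index order over the flattened batch_size x batch_size similarity matrix:
+# the anchor's own pair first, then the remaining positives, then all
+# negatives. A hand-worked example for this sketch (pipes mark the four
+# anchors' segments):
+#
+#     rerange_index(batch_size=4, samples_each_class=2)
+#     # -> [0 1 2 3 | 5 4 6 7 | 10 11 8 9 | 15 14 12 13]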
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + + +def rerange_index(batch_size, samples_each_class): + tmp = np.arange(0, batch_size * batch_size) + tmp = tmp.reshape(-1, batch_size) + rerange_index = [] + + for i in range(batch_size): + step = i // samples_each_class + start = step * samples_each_class + end = (step + 1) * samples_each_class + + pos_idx = [] + neg_idx = [] + for j, k in enumerate(tmp[i]): + if j >= start and j < end: + if j == i: + pos_idx.insert(0, k) + else: + pos_idx.append(k) + else: + neg_idx.append(k) + rerange_index += (pos_idx + neg_idx) + + rerange_index = np.array(rerange_index).astype(np.int32) + return rerange_index diff --git a/example/low_precision_training/ppcls/loss/contrasiveloss.py b/example/low_precision_training/ppcls/loss/contrasiveloss.py new file mode 100755 index 000000000..d27dbe22e --- /dev/null +++ b/example/low_precision_training/ppcls/loss/contrasiveloss.py @@ -0,0 +1,152 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from typing import Dict + +import paddle +import paddle.nn as nn +from ppcls.loss.xbm import CrossBatchMemory + + +class ContrastiveLoss(nn.Layer): + """ContrastiveLoss + + Args: + margin (float): margin + embedding_size (int): number of embedding's dimension + normalize_feature (bool, optional): whether to normalize embedding. Defaults to True. + epsilon (float, optional): epsilon. Defaults to 1e-5. + feature_from (str, optional): which key embedding from input dict. Defaults to "features". 
+ """ + + def __init__(self, + margin: float, + embedding_size: int, + normalize_feature=True, + epsilon: float=1e-5, + feature_from: str="features"): + super(ContrastiveLoss, self).__init__() + self.margin = margin + self.embedding_size = embedding_size + self.normalize_feature = normalize_feature + self.epsilon = epsilon + self.feature_from = feature_from + + def forward(self, input: Dict[str, paddle.Tensor], + target: paddle.Tensor) -> Dict[str, paddle.Tensor]: + feats = input[self.feature_from] + labels = target + + # normalize along feature dim + if self.normalize_feature: + feats = nn.functional.normalize(feats, p=2, axis=1) + + # squeeze labels to shape (batch_size, ) + if labels.ndim >= 2 and labels.shape[-1] == 1: + labels = paddle.squeeze(labels, axis=[-1]) + + loss = self._compute_loss(feats, target, feats, target) + + return {'ContrastiveLoss': loss} + + def _compute_loss(self, + inputs_q: paddle.Tensor, + targets_q: paddle.Tensor, + inputs_k: paddle.Tensor, + targets_k: paddle.Tensor) -> paddle.Tensor: + batch_size = inputs_q.shape[0] + # Compute similarity matrix + sim_mat = paddle.matmul(inputs_q, inputs_k.t()) + + loss = [] + for i in range(batch_size): + pos_pair_ = paddle.masked_select(sim_mat[i], + targets_q[i] == targets_k) + pos_pair_ = paddle.masked_select(pos_pair_, + pos_pair_ < 1 - self.epsilon) + + neg_pair_ = paddle.masked_select(sim_mat[i], + targets_q[i] != targets_k) + neg_pair = paddle.masked_select(neg_pair_, neg_pair_ > self.margin) + + pos_loss = paddle.sum(-pos_pair_ + 1) + + if len(neg_pair) > 0: + neg_loss = paddle.sum(neg_pair) + else: + neg_loss = 0 + loss.append(pos_loss + neg_loss) + + loss = sum(loss) / batch_size + return loss + + +class ContrastiveLoss_XBM(ContrastiveLoss): + """ContrastiveLoss with CrossBatchMemory + + Args: + xbm_size (int): size of memory bank + xbm_weight (int): weight of CrossBatchMemory's loss + start_iter (int): store embeddings after start_iter + margin (float): margin + embedding_size (int): number of embedding's dimension + epsilon (float, optional): epsilon. Defaults to 1e-5. + normalize_feature (bool, optional): whether to normalize embedding. Defaults to True. + feature_from (str, optional): which key embedding from input dict. Defaults to "features". 
+ """ + + def __init__(self, + xbm_size: int, + xbm_weight: int, + start_iter: int, + margin: float, + embedding_size: int, + epsilon: float=1e-5, + normalize_feature=True, + feature_from: str="features"): + super(ContrastiveLoss_XBM, self).__init__( + margin, embedding_size, normalize_feature, epsilon, feature_from) + self.xbm = CrossBatchMemory(xbm_size, embedding_size) + self.xbm_weight = xbm_weight + self.start_iter = start_iter + self.iter = 0 + + def __call__(self, input: Dict[str, paddle.Tensor], + target: paddle.Tensor) -> Dict[str, paddle.Tensor]: + feats = input[self.feature_from] + labels = target + + # normalize along feature dim + if self.normalize_feature: + feats = nn.functional.normalize(feats, p=2, axis=1) + + # squeeze labels to shape (batch_size, ) + if labels.ndim >= 2 and labels.shape[-1] == 1: + labels = paddle.squeeze(labels, axis=[-1]) + + loss = self._compute_loss(feats, labels, feats, labels) + + # compute contrastive loss from memory bank + self.iter += 1 + if self.iter > self.start_iter: + self.xbm.enqueue_dequeue(feats.detach(), labels.detach()) + xbm_feats, xbm_labels = self.xbm.get() + xbm_loss = self._compute_loss(feats, labels, xbm_feats, xbm_labels) + loss = loss + self.xbm_weight * xbm_loss + + return {'ContrastiveLoss_XBM': loss} diff --git a/example/low_precision_training/ppcls/loss/deephashloss.py b/example/low_precision_training/ppcls/loss/deephashloss.py new file mode 100755 index 000000000..7dda519a8 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/deephashloss.py @@ -0,0 +1,149 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+ +import paddle +import paddle.nn as nn + + +class DSHSDLoss(nn.Layer): + """ + # DSHSD(IEEE ACCESS 2019) + # paper [Deep Supervised Hashing Based on Stable Distribution](https://ieeexplore.ieee.org/document/8648432/) + # code reference: https://github.com/swuxyj/DeepHash-pytorch/blob/master/DSHSD.py + """ + + def __init__(self, alpha, multi_label=False): + super(DSHSDLoss, self).__init__() + self.alpha = alpha + self.multi_label = multi_label + + def forward(self, input, label): + features = input["features"] + logits = input["logits"] + + features_temp1 = paddle.unsqueeze(features, 1) + features_temp2 = paddle.unsqueeze(features, 0) + dist = features_temp1 - features_temp2 + dist = paddle.square(dist) + dist = paddle.sum(dist, axis=2) + + n_class = logits.shape[1] + labels = paddle.nn.functional.one_hot(label, n_class) + labels = labels.squeeze().astype("float32") + + s = paddle.matmul(labels, labels, transpose_y=True) + s = (s == 0).astype("float32") + margin = 2 * features.shape[1] + Ld = (1 - s) / 2 * dist + s / 2 * (margin - dist).clip(min=0) + Ld = Ld.mean() + + if self.multi_label: + Lc_temp = (1 + (-logits).exp()).log() + Lc = (logits - labels * logits + Lc_temp).sum(axis=1) + else: + probs = paddle.nn.functional.softmax(logits) + Lc = (-probs.log() * labels).sum(axis=1) + Lc = Lc.mean() + + loss = Lc + Ld * self.alpha + return {"dshsdloss": loss} + + +class LCDSHLoss(nn.Layer): + """ + # paper [Locality-Constrained Deep Supervised Hashing for Image Retrieval](https://www.ijcai.org/Proceedings/2017/0499.pdf) + # code reference: https://github.com/swuxyj/DeepHash-pytorch/blob/master/LCDSH.py + """ + + def __init__(self, n_class, _lambda): + super(LCDSHLoss, self).__init__() + self._lambda = _lambda + self.n_class = n_class + + def forward(self, input, label): + features = input["features"] + labels = paddle.nn.functional.one_hot(label, self.n_class) + labels = labels.squeeze().astype("float32") + + s = paddle.matmul(labels, labels, transpose_y=True) + s = 2 * (s > 0).astype("float32") - 1 + + inner_product = paddle.matmul(features, features, transpose_y=True) + inner_product = inner_product * 0.5 + inner_product = inner_product.clip(min=-50, max=50) + L1 = paddle.log(1 + paddle.exp(-s * inner_product)) + L1 = L1.mean() + + binary_features = features.sign() + + inner_product_ = paddle.matmul( + binary_features, binary_features, transpose_y=True) + inner_product_ = inner_product_ * 0.5 + sigmoid = paddle.nn.Sigmoid() + L2 = (sigmoid(inner_product) - sigmoid(inner_product_)).pow(2) + L2 = L2.mean() + + loss = L1 + self._lambda * L2 + return {"lcdshloss": loss} + + +class DCHLoss(paddle.nn.Layer): + """ + # paper [Deep Cauchy Hashing for Hamming Space Retrieval] + URL:(http://ise.thss.tsinghua.edu.cn/~mlong/doc/deep-cauchy-hashing-cvpr18.pdf) + # code reference: https://github.com/swuxyj/DeepHash-pytorch/blob/master/DCH.py + """ + + def __init__(self, gamma, _lambda, n_class): + super(DCHLoss, self).__init__() + self.gamma = gamma + self._lambda = _lambda + self.n_class = n_class + + def distance(self, feature_i, feature_j): + assert feature_i.shape[1] == feature_j.shape[ + 1], "feature len of feature_i and feature_j is different, please check whether the featurs are right" + K = feature_i.shape[1] + inner_product = paddle.matmul(feature_i, feature_j, transpose_y=True) + + len_i = feature_i.pow(2).sum(axis=1, keepdim=True).pow(0.5) + len_j = feature_j.pow(2).sum(axis=1, keepdim=True).pow(0.5) + norm = paddle.matmul(len_i, len_j, transpose_y=True) + cos = inner_product / 
norm.clip(min=0.0001) + dist = (1 - cos.clip(max=0.99)) * K / 2 + return dist + + def forward(self, input, label): + features = input["features"] + labels = paddle.nn.functional.one_hot(label, self.n_class) + labels = labels.squeeze().astype("float32") + + s = paddle.matmul(labels, labels, transpose_y=True).astype("float32") + if (1 - s).sum() != 0 and s.sum() != 0: + positive_w = s * s.numel() / s.sum() + negative_w = (1 - s) * s.numel() / (1 - s).sum() + w = positive_w + negative_w + else: + w = 1 + + dist_matric = self.distance(features, features) + cauchy_loss = w * (s * paddle.log(dist_matric / self.gamma) + + paddle.log(1 + self.gamma / dist_matric)) + + all_one = paddle.ones_like(features, dtype="float32") + dist_to_one = self.distance(features.abs(), all_one) + quantization_loss = paddle.log(1 + dist_to_one / self.gamma) + + loss = cauchy_loss.mean() + self._lambda * quantization_loss.mean() + return {"dchloss": loss} diff --git a/example/low_precision_training/ppcls/loss/dist_loss.py b/example/low_precision_training/ppcls/loss/dist_loss.py new file mode 100755 index 000000000..78c8e12ff --- /dev/null +++ b/example/low_precision_training/ppcls/loss/dist_loss.py @@ -0,0 +1,52 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def cosine_similarity(a, b, eps=1e-8): + return (a * b).sum(1) / (a.norm(axis=1) * b.norm(axis=1) + eps) + + +def pearson_correlation(a, b, eps=1e-8): + return cosine_similarity(a - a.mean(1).unsqueeze(1), + b - b.mean(1).unsqueeze(1), eps) + + +def inter_class_relation(y_s, y_t): + return 1 - pearson_correlation(y_s, y_t).mean() + + +def intra_class_relation(y_s, y_t): + return inter_class_relation(y_s.transpose([1, 0]), y_t.transpose([1, 0])) + + +class DISTLoss(nn.Layer): + # DISTLoss + # paper [Knowledge Distillation from A Stronger Teacher](https://arxiv.org/pdf/2205.10536v1.pdf) + # code reference: https://github.com/hunto/image_classification_sota/blob/d4f15a0494/lib/models/losses/dist_kd.py + def __init__(self, beta=1.0, gamma=1.0): + super().__init__() + self.beta = beta + self.gamma = gamma + + def forward(self, z_s, z_t): + y_s = F.softmax(z_s, axis=-1) + y_t = F.softmax(z_t, axis=-1) + inter_loss = inter_class_relation(y_s, y_t) + intra_loss = intra_class_relation(y_s, y_t) + kd_loss = self.beta * inter_loss + self.gamma * intra_loss + return kd_loss diff --git a/example/low_precision_training/ppcls/loss/distanceloss.py b/example/low_precision_training/ppcls/loss/distanceloss.py new file mode 100755 index 000000000..0a09f0cb2 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/distanceloss.py @@ -0,0 +1,43 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddle.nn import L1Loss +from paddle.nn import MSELoss as L2Loss +from paddle.nn import SmoothL1Loss + + +class DistanceLoss(nn.Layer): + """ + DistanceLoss: + mode: loss mode + """ + + def __init__(self, mode="l2", **kargs): + super().__init__() + assert mode in ["l1", "l2", "smooth_l1"] + if mode == "l1": + self.loss_func = nn.L1Loss(**kargs) + elif mode == "l2": + self.loss_func = nn.MSELoss(**kargs) + elif mode == "smooth_l1": + self.loss_func = nn.SmoothL1Loss(**kargs) + self.mode = mode + + def forward(self, x, y): + loss = self.loss_func(x, y) + return {"loss_{}".format(self.mode): loss} diff --git a/example/low_precision_training/ppcls/loss/distillationloss.py b/example/low_precision_training/ppcls/loss/distillationloss.py new file mode 100755 index 000000000..6ccbbb840 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/distillationloss.py @@ -0,0 +1,426 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
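+
+# Usage sketch for the distillation wrappers below (illustrative; the
+# model name "Student", the "logits" key and the resulting loss-dict key
+# are assumptions, since the exact key comes from the wrapped CELoss):
+#
+#   >>> import paddle
+#   >>> loss_fn = DistillationGTCELoss(model_names=["Student"], key="logits")
+#   >>> predicts = {"Student": {"logits": paddle.rand([4, 10])}}
+#   >>> label = paddle.randint(0, 10, [4, 1])
+#   >>> loss_dict = loss_fn(predicts, label)   # e.g. {"CELoss_Student": ...}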
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .celoss import CELoss +from .dmlloss import DMLLoss +from .distanceloss import DistanceLoss +from .rkdloss import RKdAngle, RkdDistance +from .kldivloss import KLDivLoss +from .dkdloss import DKDLoss +from .wslloss import WSLLoss +from .dist_loss import DISTLoss +from .multilabelloss import MultiLabelLoss +from .mgd_loss import MGDLoss +from .skdloss import SKDLoss +from .pefdloss import PEFDLoss + + +class DistillationCELoss(CELoss): + """ + DistillationCELoss + """ + + def __init__(self, + model_name_pairs=[], + epsilon=None, + key=None, + name="loss_ce"): + super().__init__(epsilon=epsilon) + assert isinstance(model_name_pairs, list) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2) + for key in loss: + loss_dict["{}_{}_{}".format(key, pair[0], pair[1])] = loss[key] + return loss_dict + + +class DistillationGTCELoss(CELoss): + """ + DistillationGTCELoss + """ + + def __init__(self, + model_names=[], + epsilon=None, + key=None, + name="loss_gt_ce"): + super().__init__(epsilon=epsilon) + assert isinstance(model_names, list) + self.key = key + self.model_names = model_names + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for name in self.model_names: + out = predicts[name] + if self.key is not None: + out = out[self.key] + loss = super().forward(out, batch) + for key in loss: + loss_dict["{}_{}".format(key, name)] = loss[key] + return loss_dict + + +class DistillationDMLLoss(DMLLoss): + """ + """ + + def __init__(self, + model_name_pairs=[], + act="softmax", + weight_ratio=False, + sum_across_class_dim=False, + key=None, + name="loss_dml"): + super().__init__(act=act, sum_across_class_dim=sum_across_class_dim) + assert isinstance(model_name_pairs, list) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + self.weight_ratio = weight_ratio + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + if self.weight_ratio is True: + loss = super().forward(out1, out2, batch) + else: + loss = super().forward(out1, out2) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}".format(key, pair[0], pair[1], + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, idx)] = loss + return loss_dict + + +class DistillationDistanceLoss(DistanceLoss): + """ + """ + + def __init__(self, + mode="l2", + model_name_pairs=[], + act=None, + key=None, + name="loss_", + **kargs): + super().__init__(mode=mode, **kargs) + assert isinstance(model_name_pairs, list) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + mode + assert act in [None, "sigmoid", "softmax"] + if act == "sigmoid": + self.act = nn.Sigmoid() + elif act == "softmax": + self.act = nn.Softmax(axis=-1) + else: + self.act = None + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + 
if self.act is not None: + out1 = self.act(out1) + out2 = self.act(out2) + loss = super().forward(out1, out2) + for key in loss: + loss_dict["{}_{}_{}".format(self.name, key, idx)] = loss[key] + return loss_dict + + +class DistillationRKDLoss(nn.Layer): + def __init__(self, + target_size=None, + model_name_pairs=(["Student", "Teacher"], ), + student_keepkeys=[], + teacher_keepkeys=[]): + super().__init__() + self.student_keepkeys = student_keepkeys + self.teacher_keepkeys = teacher_keepkeys + self.model_name_pairs = model_name_pairs + assert len(self.student_keepkeys) == len(self.teacher_keepkeys) + + self.rkd_angle_loss = RKdAngle(target_size=target_size) + self.rkd_dist_loss = RkdDistance(target_size=target_size) + + def __call__(self, predicts, batch): + loss_dict = {} + for m1, m2 in self.model_name_pairs: + for idx, ( + student_name, teacher_name + ) in enumerate(zip(self.student_keepkeys, self.teacher_keepkeys)): + student_out = predicts[m1][student_name] + teacher_out = predicts[m2][teacher_name] + + loss_dict[f"loss_angle_{idx}_{m1}_{m2}"] = self.rkd_angle_loss( + student_out, teacher_out) + loss_dict[f"loss_dist_{idx}_{m1}_{m2}"] = self.rkd_dist_loss( + student_out, teacher_out) + + return loss_dict + + +class DistillationKLDivLoss(KLDivLoss): + """ + DistillationKLDivLoss + """ + + def __init__(self, + model_name_pairs=[], + temperature=4, + key=None, + name="loss_kl"): + super().__init__(temperature=temperature) + assert isinstance(model_name_pairs, list) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2) + for key in loss: + loss_dict["{}_{}_{}".format(key, pair[0], pair[1])] = loss[key] + return loss_dict + + +class DistillationDKDLoss(DKDLoss): + """ + DistillationDKDLoss + """ + + def __init__(self, + model_name_pairs=[], + key=None, + temperature=1.0, + alpha=1.0, + beta=1.0, + use_target_as_gt=False, + name="loss_dkd"): + super().__init__( + temperature=temperature, + alpha=alpha, + beta=beta, + use_target_as_gt=use_target_as_gt) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2, batch) + loss_dict[f"{self.name}_{pair[0]}_{pair[1]}"] = loss + return loss_dict + + +class DistillationWSLLoss(WSLLoss): + """ + DistillationWSLLoss + """ + + def __init__(self, + model_name_pairs=[], + key=None, + temperature=2.0, + name="wsl_loss"): + super().__init__(temperature) + self.model_name_pairs = model_name_pairs + self.key = key + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2, batch) + loss_dict[f"{self.name}_{pair[0]}_{pair[1]}"] = loss + return loss_dict + + +class DistillationSKDLoss(SKDLoss): + """ + DistillationSKDLoss + """ + + def __init__(self, + model_name_pairs=[], + key=None, + temperature=1.0, + multiplier=2.0, + 
alpha=0.9, + use_target_as_gt=False, + name="skd_loss"): + super().__init__(temperature, multiplier, alpha, use_target_as_gt) + self.model_name_pairs = model_name_pairs + self.key = key + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2, batch) + loss_dict[f"{self.name}_{pair[0]}_{pair[1]}"] = loss + return loss_dict + + +class DistillationMultiLabelLoss(MultiLabelLoss): + """ + DistillationMultiLabelLoss + """ + + def __init__(self, + model_names=[], + epsilon=None, + size_sum=False, + weight_ratio=False, + key=None, + name="loss_mll"): + super().__init__( + epsilon=epsilon, size_sum=size_sum, weight_ratio=weight_ratio) + assert isinstance(model_names, list) + self.key = key + self.model_names = model_names + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for name in self.model_names: + out = predicts[name] + if self.key is not None: + out = out[self.key] + loss = super().forward(out, batch) + for key in loss: + loss_dict["{}_{}".format(key, name)] = loss[key] + return loss_dict + + +class DistillationDISTLoss(DISTLoss): + """ + DistillationDISTLoss + """ + + def __init__(self, + model_name_pairs=[], + key=None, + beta=1.0, + gamma=1.0, + name="loss_dist"): + super().__init__(beta=beta, gamma=gamma) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2) + loss_dict[f"{self.name}_{pair[0]}_{pair[1]}"] = loss + return loss_dict + + +class DistillationPairLoss(nn.Layer): + """ + DistillationPairLoss + """ + + def __init__(self, + base_loss_name, + model_name_pairs=[], + s_key=None, + t_key=None, + name="loss", + **kwargs): + super().__init__() + self.loss_func = eval(base_loss_name)(**kwargs) + assert type(s_key) == type(t_key) + self.s_key = s_key + self.t_key = t_key + self.model_name_pairs = model_name_pairs + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if isinstance(self.s_key, str): + out1 = out1[self.s_key] + out2 = out2[self.t_key] + else: + out1 = [out1[k] if k is not None else out1 for k in self.s_key] + out2 = [out2[k] if k is not None else out2 for k in self.t_key] + + loss = self.loss_func.forward(out1, out2) + if isinstance(loss, dict): + for k in loss: + loss_dict[ + f"{self.name}_{idx}_{pair[0]}_{pair[1]}_{k}"] = loss[k] + else: + loss_dict[f"{self.name}_{idx}_{pair[0]}_{pair[1]}"] = loss + return loss_dict diff --git a/example/low_precision_training/ppcls/loss/dkdloss.py b/example/low_precision_training/ppcls/loss/dkdloss.py new file mode 100755 index 000000000..bf9224e31 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/dkdloss.py @@ -0,0 +1,68 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class DKDLoss(nn.Layer): + """ + DKDLoss + Reference: https://arxiv.org/abs/2203.08679 + Code was heavily based on https://github.com/megvii-research/mdistiller + """ + + def __init__(self, + temperature=1.0, + alpha=1.0, + 
beta=1.0, + use_target_as_gt=False): + super().__init__() + self.temperature = temperature + self.alpha = alpha + self.beta = beta + self.use_target_as_gt = use_target_as_gt + + def forward(self, logits_student, logits_teacher, target=None): + if target is None or self.use_target_as_gt: + target = logits_teacher.argmax(axis=-1) + gt_mask = _get_gt_mask(logits_student, target) + other_mask = 1 - gt_mask + pred_student = F.softmax(logits_student / self.temperature, axis=1) + pred_teacher = F.softmax(logits_teacher / self.temperature, axis=1) + pred_student = cat_mask(pred_student, gt_mask, other_mask) + pred_teacher = cat_mask(pred_teacher, gt_mask, other_mask) + log_pred_student = paddle.log(pred_student) + tckd_loss = (F.kl_div( + log_pred_student, pred_teacher, + reduction='sum') * (self.temperature**2) / target.shape[0]) + pred_teacher_part2 = F.softmax( + logits_teacher / self.temperature - 1000.0 * gt_mask, axis=1) + log_pred_student_part2 = F.log_softmax( + logits_student / self.temperature - 1000.0 * gt_mask, axis=1) + nckd_loss = (F.kl_div( + log_pred_student_part2, pred_teacher_part2, + reduction='sum') * (self.temperature**2) / target.shape[0]) + return self.alpha * tckd_loss + self.beta * nckd_loss + + +def _get_gt_mask(logits, target): + target = target.reshape([-1]).unsqueeze(1) + updates = paddle.ones_like(target) + mask = scatter( + paddle.zeros_like(logits), target, updates.astype('float32')) + return mask + + +def cat_mask(t, mask1, mask2): + t1 = (t * mask1).sum(axis=1, keepdim=True) + t2 = (t * mask2).sum(axis=1, keepdim=True) + rt = paddle.concat([t1, t2], axis=1) + return rt + + +def scatter(x, index, updates): + i, j = index.shape + grid_x, grid_y = paddle.meshgrid(paddle.arange(i), paddle.arange(j)) + index = paddle.stack([grid_x.flatten(), index.flatten()], axis=1) + updates_index = paddle.stack([grid_x.flatten(), grid_y.flatten()], axis=1) + updates = paddle.gather_nd(updates, index=updates_index) + return paddle.scatter_nd_add(x, index, updates) diff --git a/example/low_precision_training/ppcls/loss/dmlloss.py b/example/low_precision_training/ppcls/loss/dmlloss.py new file mode 100755 index 000000000..e8983ed08 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/dmlloss.py @@ -0,0 +1,62 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
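+
+# Usage sketch for DMLLoss below (illustrative; the logit shapes are
+# assumed). The loss is the symmetric KL divergence between the two
+# softmax distributions, so swapping the inputs gives the same value:
+#
+#   >>> import paddle
+#   >>> loss_fn = DMLLoss(act="softmax")
+#   >>> x, y = paddle.rand([8, 10]), paddle.rand([8, 10])
+#   >>> loss = loss_fn(x, y)["DMLLoss"]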
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppcls.loss.multilabelloss import ratio2weight + + +class DMLLoss(nn.Layer): + """ + DMLLoss + """ + + def __init__(self, act="softmax", sum_across_class_dim=False, eps=1e-12): + super().__init__() + if act is not None: + assert act in ["softmax", "sigmoid"] + if act == "softmax": + self.act = nn.Softmax(axis=-1) + elif act == "sigmoid": + self.act = nn.Sigmoid() + else: + self.act = None + self.eps = eps + self.sum_across_class_dim = sum_across_class_dim + + def _kldiv(self, x, target): + class_num = x.shape[-1] + cost = target * paddle.log( + (target + self.eps) / (x + self.eps)) * class_num + return cost + + def forward(self, x, target, gt_label=None): + if self.act is not None: + x = self.act(x) + target = self.act(target) + loss = self._kldiv(x, target) + self._kldiv(target, x) + loss = loss / 2 + + # for multi-label dml loss + if gt_label is not None: + gt_label, label_ratio = gt_label[:, 0, :], gt_label[:, 1, :] + targets_mask = paddle.cast(gt_label > 0.5, 'float32') + weight = ratio2weight(targets_mask, paddle.to_tensor(label_ratio)) + weight = weight * (gt_label > -1) + loss = loss * weight + + loss = loss.sum(1).mean() if self.sum_across_class_dim else loss.mean() + return {"DMLLoss": loss} diff --git a/example/low_precision_training/ppcls/loss/emlloss.py b/example/low_precision_training/ppcls/loss/emlloss.py new file mode 100755 index 000000000..38b707fe1 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/emlloss.py @@ -0,0 +1,102 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
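+
+# Usage sketch for EmlLoss below (illustrative; the feature dimension is
+# assumed, and the batch is expected to be laid out as groups of
+# samples_each_class same-class samples, which rerange_index relies on):
+#
+#   >>> import paddle
+#   >>> loss_fn = EmlLoss(batch_size=40, samples_each_class=2)
+#   >>> feats = paddle.rand([40, 128])
+#   >>> loss = loss_fn({"features": feats})["emlloss"]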
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import numpy as np +from .comfunc import rerange_index + + +class EmlLoss(paddle.nn.Layer): + """Ensemble Metric Learning Loss + paper: [Large Scale Strongly Supervised Ensemble Metric Learning, with Applications to Face Verification and Retrieval](https://arxiv.org/pdf/1212.6094.pdf) + code reference: https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/metric_learning/losses/emlloss.py + """ + + def __init__(self, batch_size=40, samples_each_class=2): + super(EmlLoss, self).__init__() + assert (batch_size % samples_each_class == 0) + self.samples_each_class = samples_each_class + self.batch_size = batch_size + self.rerange_index = rerange_index(batch_size, samples_each_class) + self.thresh = 20.0 + self.beta = 100000 + + def surrogate_function(self, beta, theta, bias): + x = theta * paddle.exp(bias) + output = paddle.log(1 + beta * x) / math.log(1 + beta) + return output + + def surrogate_function_approximate(self, beta, theta, bias): + output = ( + paddle.log(theta) + bias + math.log(beta)) / math.log(1 + beta) + return output + + def surrogate_function_stable(self, beta, theta, target, thresh): + max_gap = paddle.to_tensor(thresh, dtype='float32') + max_gap.stop_gradient = True + + target_max = paddle.maximum(target, max_gap) + target_min = paddle.minimum(target, max_gap) + + loss1 = self.surrogate_function(beta, theta, target_min) + loss2 = self.surrogate_function_approximate(beta, theta, target_max) + bias = self.surrogate_function(beta, theta, max_gap) + loss = loss1 + loss2 - bias + return loss + + def forward(self, input, target=None): + features = input["features"] + samples_each_class = self.samples_each_class + batch_size = self.batch_size + rerange_index = self.rerange_index + + #calc distance + diffs = paddle.unsqueeze( + features, axis=1) - paddle.unsqueeze( + features, axis=0) + similary_matrix = paddle.sum(paddle.square(diffs), axis=-1) + + tmp = paddle.reshape(similary_matrix, shape=[-1, 1]) + rerange_index = paddle.to_tensor(rerange_index) + tmp = paddle.gather(tmp, index=rerange_index) + similary_matrix = paddle.reshape(tmp, shape=[-1, batch_size]) + + ignore, pos, neg = paddle.split( + similary_matrix, + num_or_sections=[ + 1, samples_each_class - 1, batch_size - samples_each_class + ], + axis=1) + ignore.stop_gradient = True + + pos_max = paddle.max(pos, axis=1, keepdim=True) + pos = paddle.exp(pos - pos_max) + pos_mean = paddle.mean(pos, axis=1, keepdim=True) + + neg_min = paddle.min(neg, axis=1, keepdim=True) + neg = paddle.exp(neg_min - neg) + neg_mean = paddle.mean(neg, axis=1, keepdim=True) + + bias = pos_max - neg_min + theta = paddle.multiply(neg_mean, pos_mean) + + loss = self.surrogate_function_stable(self.beta, theta, bias, + self.thresh) + loss = paddle.mean(loss) + return {"emlloss": loss} diff --git a/example/low_precision_training/ppcls/loss/googlenetloss.py b/example/low_precision_training/ppcls/loss/googlenetloss.py new file mode 100755 index 000000000..491311831 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/googlenetloss.py @@ -0,0 +1,43 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+class GoogLeNetLoss(nn.Layer):
+    """
+    Cross entropy loss used after GoogLeNet
+    reference paper: [Going Deeper with Convolutions](https://arxiv.org/pdf/1409.4842v1.pdf)
+    """
+
+    def __init__(self, epsilon=None):
+        super().__init__()
+        assert (epsilon is None or epsilon <= 0 or
+                epsilon >= 1), "GoogLeNetLoss does not support label_smooth"
+
+    def forward(self, inputs, label):
+        input0, input1, input2 = inputs
+        if isinstance(input0, dict):
+            input0 = input0["logits"]
+        if isinstance(input1, dict):
+            input1 = input1["logits"]
+        if isinstance(input2, dict):
+            input2 = input2["logits"]
+
+        loss0 = F.cross_entropy(input0, label=label, soft_label=False)
+        loss1 = F.cross_entropy(input1, label=label, soft_label=False)
+        loss2 = F.cross_entropy(input2, label=label, soft_label=False)
+        loss = loss0 + 0.3 * loss1 + 0.3 * loss2
+        loss = loss.mean()
+        return {"GoogLeNetLoss": loss}
diff --git a/example/low_precision_training/ppcls/loss/kldivloss.py b/example/low_precision_training/ppcls/loss/kldivloss.py
new file mode 100755
index 000000000..da6ab02fb
--- /dev/null
+++ b/example/low_precision_training/ppcls/loss/kldivloss.py
@@ -0,0 +1,33 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+class KLDivLoss(nn.Layer):
+    """
+    Distilling the Knowledge in a Neural Network
+    """
+
+    def __init__(self, temperature=4):
+        super(KLDivLoss, self).__init__()
+        self.T = temperature
+
+    def forward(self, y_s, y_t):
+        p_s = F.log_softmax(y_s / self.T, axis=1)
+        p_t = F.softmax(y_t / self.T, axis=1)
+        loss = F.kl_div(p_s, p_t, reduction='sum') * (self.T**2) / y_s.shape[0]
+        return {"loss_kldiv": loss}
diff --git a/example/low_precision_training/ppcls/loss/metabinloss.py b/example/low_precision_training/ppcls/loss/metabinloss.py
new file mode 100755
index 000000000..34159bdcd
--- /dev/null
+++ b/example/low_precision_training/ppcls/loss/metabinloss.py
@@ -0,0 +1,206 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/2011.14670 + +import copy +import numpy as np +import paddle +from paddle import nn +from paddle.nn import functional as F + +from .dist_loss import cosine_similarity +from .celoss import CELoss + + +def euclidean_dist(x, y): + m, n = x.shape[0], y.shape[0] + xx = paddle.pow(x, 2).sum(1, keepdim=True).expand([m, n]) + yy = paddle.pow(y, 2).sum(1, keepdim=True).expand([n, m]).t() + dist = xx + yy - 2 * paddle.matmul(x, y.t()) + dist = dist.clip(min=1e-12).sqrt() # for numerical stability + return dist + + +def hard_example_mining(dist_mat, is_pos, is_neg): + """For each anchor, find the hardest positive and negative sample. + Args: + dist_mat: pairwise distance between samples, shape [N, M] + is_pos: positive index with shape [N, M] + is_neg: negative index with shape [N, M] + Returns: + dist_ap: distance(anchor, positive); shape [N, 1] + dist_an: distance(anchor, negative); shape [N, 1] + """ + + inf = float("inf") + + def _masked_max(tensor, mask, axis): + masked = paddle.multiply(tensor, mask.astype(tensor.dtype)) + neg_inf = paddle.zeros_like(tensor) + neg_inf.stop_gradient = True + neg_inf[paddle.logical_not(mask)] = -inf + return paddle.max(masked + neg_inf, axis=axis, keepdim=True) + + def _masked_min(tensor, mask, axis): + masked = paddle.multiply(tensor, mask.astype(tensor.dtype)) + pos_inf = paddle.zeros_like(tensor) + pos_inf.stop_gradient = True + pos_inf[paddle.logical_not(mask)] = inf + return paddle.min(masked + pos_inf, axis=axis, keepdim=True) + + assert len(dist_mat.shape) == 2 + dist_ap = _masked_max(dist_mat, is_pos, axis=1) + dist_an = _masked_min(dist_mat, is_neg, axis=1) + return dist_ap, dist_an + + +class IntraDomainScatterLoss(nn.Layer): + """ + IntraDomainScatterLoss + + enhance intra-domain diversity and disarrange inter-domain distributions like confusing multiple styles. + + reference: https://arxiv.org/abs/2011.14670 + """ + + def __init__(self, normalize_feature, feature_from): + super(IntraDomainScatterLoss, self).__init__() + self.normalize_feature = normalize_feature + self.feature_from = feature_from + + def forward(self, input, batch): + domains = batch["domain"] + inputs = input[self.feature_from] + + if self.normalize_feature: + inputs = 1. * inputs / (paddle.expand_as( + paddle.norm( + inputs, p=2, axis=-1, keepdim=True), inputs) + 1e-12) + + unique_label = paddle.unique(domains) + features_per_domain = list() + for i, x in enumerate(unique_label): + features_per_domain.append(inputs[x == domains]) + num_domain = len(features_per_domain) + losses = [] + for i in range(num_domain): + features_in_same_domain = features_per_domain[i] + center = paddle.mean(features_in_same_domain, 0) + cos_sim = cosine_similarity( + center.unsqueeze(0), features_in_same_domain) + losses.append(paddle.mean(cos_sim)) + loss = paddle.mean(paddle.stack(losses)) + return {"IntraDomainScatterLoss": loss} + + +class InterDomainShuffleLoss(nn.Layer): + """ + InterDomainShuffleLoss + + pull the negative sample of the interdomain and push the negative sample of the intra-domain, + so that the inter-domain distributions are shuffled. 
+ + reference: https://arxiv.org/abs/2011.14670 + """ + + def __init__(self, normalize_feature=True, feature_from="features"): + super(InterDomainShuffleLoss, self).__init__() + self.feature_from = feature_from + self.normalize_feature = normalize_feature + + def forward(self, input, batch): + target = batch["label"] + domains = batch["domain"] + inputs = input[self.feature_from] + bs = inputs.shape[0] + + if self.normalize_feature: + inputs = 1. * inputs / (paddle.expand_as( + paddle.norm( + inputs, p=2, axis=-1, keepdim=True), inputs) + 1e-12) + + # compute distance + dist_mat = euclidean_dist(inputs, inputs) + + is_same_img = np.zeros(shape=[bs, bs], dtype=bool) + np.fill_diagonal(is_same_img, True) + is_same_img = paddle.to_tensor(is_same_img) + is_diff_instance = target.reshape([bs, 1]).expand([bs, bs])\ + .not_equal(target.reshape([bs, 1]).expand([bs, bs]).t()) + is_same_domain = domains.reshape([bs, 1]).expand([bs, bs])\ + .equal(domains.reshape([bs, 1]).expand([bs, bs]).t()) + is_diff_domain = is_same_domain == False + + is_pos = paddle.logical_or(is_same_img, is_diff_domain) + is_neg = paddle.logical_and(is_diff_instance, is_same_domain) + + dist_ap, dist_an = hard_example_mining(dist_mat, is_pos, is_neg) + + y = paddle.ones_like(dist_an) + loss = F.soft_margin_loss(dist_an - dist_ap, y) + if loss == float('Inf'): + loss = F.margin_ranking_loss(dist_an, dist_ap, y, margin=0.3) + return {"InterDomainShuffleLoss": loss} + + +class CELossForMetaBIN(CELoss): + def _labelsmoothing(self, target, class_num): + if len(target.shape) == 1 or target.shape[-1] != class_num: + one_hot_target = F.one_hot(target, class_num) + else: + one_hot_target = target + # epsilon is different from the one in original CELoss + epsilon = class_num / (class_num - 1) * self.epsilon + soft_target = F.label_smooth(one_hot_target, epsilon=epsilon) + soft_target = paddle.reshape(soft_target, shape=[-1, class_num]) + return soft_target + + def forward(self, x, batch): + label = batch["label"] + return super().forward(x, label) + + +class TripletLossForMetaBIN(nn.Layer): + def __init__(self, + margin=1, + normalize_feature=False, + feature_from="feature"): + super(TripletLossForMetaBIN, self).__init__() + self.margin = margin + self.feature_from = feature_from + self.normalize_feature = normalize_feature + + def forward(self, input, batch): + inputs = input[self.feature_from] + targets = batch["label"] + bs = inputs.shape[0] + all_targets = targets + + if self.normalize_feature: + inputs = 1. * inputs / (paddle.expand_as( + paddle.norm( + inputs, p=2, axis=-1, keepdim=True), inputs) + 1e-12) + + dist_mat = euclidean_dist(inputs, inputs) + + is_pos = all_targets.reshape([bs, 1]).expand([bs, bs]).equal( + all_targets.reshape([bs, 1]).expand([bs, bs]).t()) + is_neg = all_targets.reshape([bs, 1]).expand([bs, bs]).not_equal( + all_targets.reshape([bs, 1]).expand([bs, bs]).t()) + dist_ap, dist_an = hard_example_mining(dist_mat, is_pos, is_neg) + + y = paddle.ones_like(dist_an) + loss = F.margin_ranking_loss(dist_an, dist_ap, y, margin=self.margin) + return {"TripletLoss": loss} diff --git a/example/low_precision_training/ppcls/loss/mgd_loss.py b/example/low_precision_training/ppcls/loss/mgd_loss.py new file mode 100755 index 000000000..799a91431 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/mgd_loss.py @@ -0,0 +1,84 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppcls.utils.initializer import kaiming_normal_ + + +class MGDLoss(nn.Layer): + """Paddle version of `Masked Generative Distillation` + MGDLoss + Reference: https://arxiv.org/abs/2205.01529 + Code was heavily based on https://github.com/yzd-v/MGD + """ + + def __init__( + self, + student_channels, + teacher_channels, + alpha_mgd=1.756, + lambda_mgd=0.15, ): + super().__init__() + self.alpha_mgd = alpha_mgd + self.lambda_mgd = lambda_mgd + + if student_channels != teacher_channels: + self.align = nn.Conv2D( + student_channels, + teacher_channels, + kernel_size=1, + stride=1, + padding=0) + else: + self.align = None + + self.generation = nn.Sequential( + nn.Conv2D( + teacher_channels, teacher_channels, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2D( + teacher_channels, teacher_channels, kernel_size=3, padding=1)) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + + def forward(self, pred_s, pred_t): + """Forward function. + Args: + pred_s(Tensor): Bs*C*H*W, student's feature map + pred_t(Tensor): Bs*C*H*W, teacher's feature map + """ + assert pred_s.shape[-2:] == pred_t.shape[-2:] + + if self.align is not None: + pred_s = self.align(pred_s) + + loss = self.get_dis_loss(pred_s, pred_t) * self.alpha_mgd + + return loss + + def get_dis_loss(self, pred_s, pred_t): + loss_mse = nn.MSELoss(reduction='mean') + N, C, _, _ = pred_t.shape + mat = paddle.rand([N, C, 1, 1]) + mat = paddle.where(mat < self.lambda_mgd, 0, 1).astype("float32") + masked_fea = paddle.multiply(pred_s, mat) + new_fea = self.generation(masked_fea) + dis_loss = loss_mse(new_fea, pred_t) + return dis_loss diff --git a/example/low_precision_training/ppcls/loss/msmloss.py b/example/low_precision_training/ppcls/loss/msmloss.py new file mode 100755 index 000000000..adf03ef8e --- /dev/null +++ b/example/low_precision_training/ppcls/loss/msmloss.py @@ -0,0 +1,80 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
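+
+# Usage sketch for MSMLoss below (illustrative; the feature dimension is
+# assumed, and the batch must follow the P * K same-class grouping
+# described in the class docstring):
+#
+#   >>> import paddle
+#   >>> loss_fn = MSMLoss(batch_size=120, samples_each_class=2, margin=0.1)
+#   >>> feats = paddle.rand([120, 128])
+#   >>> loss = loss_fn({"features": feats})["msmloss"]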
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle
+from .comfunc import rerange_index
+
+
+class MSMLoss(paddle.nn.Layer):
+    """
+    paper: [Margin Sample Mining Loss: A Deep Learning Based Method for Person Re-identification](https://arxiv.org/pdf/1710.00478.pdf)
+    code reference: https://github.com/michuanhaohao/keras_reid/blob/master/reid_tripletcls.py
+    Margin Sample Mining Loss, based on triplet loss. It uses P * K samples:
+    the batch size is fixed at batch_size = P * K, but K may vary between batches.
+    Samples with the same label are grouped together.
+
+    supported_metrics = [
+        'euclidean',
+        'sqeuclidean',
+        'cityblock',
+    ]
+    only samples_each_class = 2 is considered
+    """
+
+    def __init__(self, batch_size=120, samples_each_class=2, margin=0.1):
+        super(MSMLoss, self).__init__()
+        self.margin = margin
+        self.samples_each_class = samples_each_class
+        self.batch_size = batch_size
+        self.rerange_index = rerange_index(batch_size, samples_each_class)
+
+    def forward(self, input, target=None):
+        # normalization
+        features = input["features"]
+        features = self._normalize(features)
+        samples_each_class = self.samples_each_class
+        rerange_index = paddle.to_tensor(self.rerange_index)
+
+        # calc similarity matrix
+        diffs = paddle.unsqueeze(
+            features, axis=1) - paddle.unsqueeze(
+                features, axis=0)
+        similarity_matrix = paddle.sum(paddle.square(diffs), axis=-1)
+
+        # rerange
+        tmp = paddle.reshape(similarity_matrix, shape=[-1, 1])
+        tmp = paddle.gather(tmp, index=rerange_index)
+        similarity_matrix = paddle.reshape(tmp, shape=[-1, self.batch_size])
+
+        # split
+        ignore, pos, neg = paddle.split(
+            similarity_matrix,
+            num_or_sections=[1, samples_each_class - 1, -1],
+            axis=1)
+        ignore.stop_gradient = True
+
+        hard_pos = paddle.max(pos)
+        hard_neg = paddle.min(neg)
+
+        loss = hard_pos + self.margin - hard_neg
+        loss = paddle.nn.ReLU()(loss)
+        return {"msmloss": loss}
+
+    def _normalize(self, input):
+        input_norm = paddle.sqrt(
+            paddle.sum(paddle.square(input), axis=1, keepdim=True))
+        return paddle.divide(input, input_norm)
diff --git a/example/low_precision_training/ppcls/loss/multilabelloss.py b/example/low_precision_training/ppcls/loss/multilabelloss.py
new file mode 100755
index 000000000..51ae97838
--- /dev/null
+++ b/example/low_precision_training/ppcls/loss/multilabelloss.py
@@ -0,0 +1,119 @@
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+def ratio2weight(targets, ratio):
+    pos_weights = targets * (1. - ratio)
+    neg_weights = (1. - targets) * ratio
+    weights = paddle.exp(neg_weights + pos_weights)
+
+    # for the RAP dataloader, a target element may be 2; with or without smoothing, some elements can be greater than 1
+    weights = weights - weights * (targets > 1).astype(weights.dtype)
+
+    return weights
+
+
+class MultiLabelLoss(nn.Layer):
+    """
+    Multi-label loss
+    """
+
+    def __init__(self, epsilon=None, size_sum=False, weight_ratio=False):
+        super().__init__()
+        if epsilon is not None and (epsilon <= 0 or epsilon >= 1):
+            epsilon = None
+        self.epsilon = epsilon
+        self.weight_ratio = weight_ratio
+        self.size_sum = size_sum
+
+    def _labelsmoothing(self, target, class_num):
+        if target.ndim == 1 or target.shape[-1] != class_num:
+            one_hot_target = F.one_hot(target, class_num)
+        else:
+            one_hot_target = target
+        soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon)
+        soft_target = paddle.reshape(soft_target, shape=[-1, class_num])
+        return soft_target
+
+    def _binary_crossentropy(self, input, target, class_num):
+        if self.weight_ratio:
+            target, label_ratio = target[:, 0, :], target[:, 1, :]
+        elif target.ndim == 3:
+            target = target[:, 0, :]
+        if self.epsilon is not None:
+            target = self._labelsmoothing(target, class_num)
+        cost = F.binary_cross_entropy_with_logits(
+            logit=input, label=target, reduction='none')
+
+        if self.weight_ratio:
+            targets_mask = paddle.cast(target > 0.5, 'float32')
+            weight = ratio2weight(targets_mask, paddle.to_tensor(label_ratio))
+            weight = weight * (target > -1).astype(weight.dtype)
+            cost = cost * weight
+
+        if self.size_sum:
+            cost = cost.sum(1).mean()
+
+        return cost
+
+    def forward(self, x, target):
+        if isinstance(x, dict):
+            x = x["logits"]
+        class_num = x.shape[-1]
+        loss = self._binary_crossentropy(x, target, class_num)
+        loss = loss.mean()
+        return {"MultiLabelLoss": loss}
+
+
+class MultiLabelAsymmetricLoss(nn.Layer):
+    """
+    Multi-label asymmetric loss, introduced by
+    Emanuel Ben-Baruch et al. in https://arxiv.org/pdf/2009.14119v4.pdf.
+ """ + + def __init__(self, + gamma_pos=1, + gamma_neg=4, + clip=0.05, + epsilon=1e-8, + disable_focal_loss_grad=True, + reduction="sum"): + super().__init__() + self.gamma_pos = gamma_pos + self.gamma_neg = gamma_neg + self.clip = clip + self.epsilon = epsilon + self.disable_focal_loss_grad = disable_focal_loss_grad + assert reduction in ["mean", "sum", "none"] + self.reduction = reduction + + def forward(self, x, target): + if isinstance(x, dict): + x = x["logits"] + pred_sigmoid = F.sigmoid(x) + target = target.astype(pred_sigmoid.dtype) + + # Asymmetric Clipping and Basic CE calculation + if self.clip and self.clip > 0: + pt = (1 - pred_sigmoid + self.clip).clip(max=1) \ + * (1 - target) + pred_sigmoid * target + else: + pt = (1 - pred_sigmoid) * (1 - target) + pred_sigmoid * target + + # Asymmetric Focusing + if self.disable_focal_loss_grad: + paddle.set_grad_enabled(False) + asymmetric_weight = ( + 1 - pt + ).pow(self.gamma_pos * target + self.gamma_neg * (1 - target)) + if self.disable_focal_loss_grad: + paddle.set_grad_enabled(True) + + loss = -paddle.log(pt.clip(min=self.epsilon)) * asymmetric_weight + + if self.reduction == 'mean': + loss = loss.mean() + elif self.reduction == 'sum': + loss = loss.sum() + return {"MultiLabelAsymmetricLoss": loss} diff --git a/example/low_precision_training/ppcls/loss/npairsloss.py b/example/low_precision_training/ppcls/loss/npairsloss.py new file mode 100755 index 000000000..131c799a4 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/npairsloss.py @@ -0,0 +1,43 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import paddle + + +class NpairsLoss(paddle.nn.Layer): + """Npair_loss_ + paper [Improved deep metric learning with multi-class N-pair loss objective](https://dl.acm.org/doi/10.5555/3157096.3157304) + code reference: https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/contrib/losses/metric_learning/npairs_loss + """ + + def __init__(self, reg_lambda=0.01): + super(NpairsLoss, self).__init__() + self.reg_lambda = reg_lambda + + def forward(self, input, target=None): + """ + anchor and positive(should include label) + """ + features = input["features"] + reg_lambda = self.reg_lambda + batch_size = features.shape[0] + fea_dim = features.shape[1] + num_class = batch_size // 2 + + #reshape + out_feas = paddle.reshape(features, shape=[-1, 2, fea_dim]) + anc_feas, pos_feas = paddle.split(out_feas, num_or_sections=2, axis=1) + anc_feas = paddle.squeeze(anc_feas, axis=1) + pos_feas = paddle.squeeze(pos_feas, axis=1) + + #get simi matrix + similarity_matrix = paddle.matmul( + anc_feas, pos_feas, transpose_y=True) #get similarity matrix + sparse_labels = paddle.arange(0, num_class, dtype='int64') + xentloss = paddle.nn.CrossEntropyLoss()( + similarity_matrix, sparse_labels) #by default: mean + + #l2 norm + reg = paddle.mean(paddle.sum(paddle.square(features), axis=1)) + l2loss = 0.5 * reg_lambda * reg + return {"npairsloss": xentloss + l2loss} diff --git a/example/low_precision_training/ppcls/loss/pairwisecosface.py b/example/low_precision_training/ppcls/loss/pairwisecosface.py new file mode 100755 index 000000000..f1fc73024 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/pairwisecosface.py @@ -0,0 +1,64 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class PairwiseCosface(nn.Layer): + """ + paper: Circle Loss: A Unified Perspective of Pair Similarity Optimization + code reference: https://github.com/leoluopy/circle-loss-demonstration/blob/main/circle_loss.py + """ + + def __init__(self, margin, gamma): + super(PairwiseCosface, self).__init__() + self.margin = margin + self.gamma = gamma + + def forward(self, embedding, targets): + if isinstance(embedding, dict): + embedding = embedding['features'] + # Normalize embedding features + embedding = F.normalize(embedding, axis=1) + dist_mat = paddle.matmul(embedding, embedding, transpose_y=True) + + N = dist_mat.shape[0] + is_pos = targets.reshape([N, 1]).expand([N, N]).equal( + paddle.t(targets.reshape([N, 1]).expand([N, N]))).astype('float32') + is_neg = targets.reshape([N, 1]).expand([N, N]).not_equal( + paddle.t(targets.reshape([N, 1]).expand([N, N]))).astype('float32') + + # Mask scores related to itself + is_pos = is_pos - paddle.eye(N, N) + + s_p = dist_mat * is_pos + s_n = dist_mat * is_neg + + logit_p = -self.gamma * s_p + (-99999999.) * (1 - is_pos) + logit_n = self.gamma * (s_n + self.margin) + (-99999999.) * (1 - is_neg + ) + + loss = F.softplus( + paddle.logsumexp( + logit_p, axis=1) + paddle.logsumexp( + logit_n, axis=1)).mean() + + return {"PairwiseCosface": loss} diff --git a/example/low_precision_training/ppcls/loss/pefdloss.py b/example/low_precision_training/ppcls/loss/pefdloss.py new file mode 100755 index 000000000..f16a8d5dc --- /dev/null +++ b/example/low_precision_training/ppcls/loss/pefdloss.py @@ -0,0 +1,83 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
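+
+# Usage sketch for PEFDLoss below (illustrative; the 512/1024 channel
+# sizes are assumed). With the default mode="flatten" the inputs are
+# flattened from axis 1, so pre-pooled 2-D features work directly:
+#
+#   >>> import paddle
+#   >>> loss_fn = PEFDLoss(student_channel=512, teacher_channel=1024)
+#   >>> f_s, f_t = paddle.rand([8, 512]), paddle.rand([8, 1024])
+#   >>> loss = loss_fn(f_s, f_t)   # scalar cosine-distance loss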
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppcls.utils.initializer import kaiming_normal_, kaiming_uniform_ + + +class Regressor(nn.Layer): + """Linear regressor""" + + def __init__(self, dim_in=1024, dim_out=1024): + super(Regressor, self).__init__() + self.conv = nn.Linear(dim_in, dim_out) + + def forward(self, x): + x = self.conv(x) + x = F.relu(x) + return x + + +class PEFDLoss(nn.Layer): + """Improved Feature Distillation via Projector Ensemble + Reference: https://arxiv.org/pdf/2210.15274.pdf + Code reference: https://github.com/chenyd7/PEFD + """ + + def __init__(self, + student_channel, + teacher_channel, + num_projectors=3, + mode="flatten"): + super().__init__() + + if num_projectors <= 0: + raise ValueError("Number of projectors must be greater than 0.") + + if mode not in ["flatten", "gap"]: + raise ValueError("Mode must be \"flatten\" or \"gap\".") + + self.mode = mode + self.projectors = nn.LayerList() + + for _ in range(num_projectors): + self.projectors.append(Regressor(student_channel, teacher_channel)) + + def forward(self, student_feature, teacher_feature): + if self.mode == "gap": + student_feature = F.adaptive_avg_pool2d(student_feature, (1, 1)) + teacher_feature = F.adaptive_avg_pool2d(teacher_feature, (1, 1)) + + student_feature = student_feature.flatten(1) + f_t = teacher_feature.flatten(1) + + q = len(self.projectors) + f_s = 0.0 + for i in range(q): + f_s += self.projectors[i](student_feature) + f_s = f_s / q + + # inner product (normalize first and inner product) + normft = f_t.pow(2).sum(1, keepdim=True).pow(1. / 2) + outft = f_t / normft + normfs = f_s.pow(2).sum(1, keepdim=True).pow(1. / 2) + outfs = f_s / normfs + + cos_theta = (outft * outfs).sum(1, keepdim=True) + loss = paddle.mean(1 - cos_theta) + + return loss diff --git a/example/low_precision_training/ppcls/loss/rkdloss.py b/example/low_precision_training/ppcls/loss/rkdloss.py new file mode 100755 index 000000000..aa6ae2324 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/rkdloss.py @@ -0,0 +1,99 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
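+
+# Usage sketch for the two relational losses below (illustrative; the
+# feature-map shapes are assumed). target_size=1 pools each map to 1x1
+# before pairwise relations are compared, so student and teacher may have
+# different channel counts and spatial sizes:
+#
+#   >>> import paddle
+#   >>> dist_loss = RkdDistance(target_size=1)
+#   >>> angle_loss = RKdAngle(target_size=1)
+#   >>> s = paddle.rand([8, 64, 7, 7])      # student feature map
+#   >>> t = paddle.rand([8, 128, 14, 14])   # teacher feature map
+#   >>> loss = dist_loss(s, t) + angle_loss(s, t)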
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def pdist(e, squared=False, eps=1e-12): + e_square = e.pow(2).sum(axis=1) + prod = paddle.mm(e, e.t()) + res = (e_square.unsqueeze(1) + e_square.unsqueeze(0) - 2 * prod).clip( + min=eps) + + if not squared: + res = res.sqrt() + return res + + +class RKdAngle(nn.Layer): + # paper : [Relational Knowledge Distillation](https://arxiv.org/abs/1904.05068?context=cs.LG) + # reference: https://github.com/lenscloth/RKD/blob/master/metric/loss.py + def __init__(self, target_size=None): + super().__init__() + if target_size is not None: + self.avgpool = paddle.nn.AdaptiveAvgPool2D(target_size) + else: + self.avgpool = None + + def forward(self, student, teacher): + # GAP to reduce memory + if self.avgpool is not None: + # NxC1xH1xW1 -> NxC1x1x1 + student = self.avgpool(student) + # NxC2xH2xW2 -> NxC2x1x1 + teacher = self.avgpool(teacher) + + # reshape for feature map distillation + bs = student.shape[0] + student = student.reshape([bs, -1]) + teacher = teacher.reshape([bs, -1]) + + td = (teacher.unsqueeze(0) - teacher.unsqueeze(1)) + norm_td = F.normalize(td, p=2, axis=2) + t_angle = paddle.bmm(norm_td, norm_td.transpose([0, 2, 1])).reshape( + [-1, 1]) + + sd = (student.unsqueeze(0) - student.unsqueeze(1)) + norm_sd = F.normalize(sd, p=2, axis=2) + s_angle = paddle.bmm(norm_sd, norm_sd.transpose([0, 2, 1])).reshape( + [-1, 1]) + loss = F.smooth_l1_loss(s_angle, t_angle, reduction='mean') + return loss + + +class RkdDistance(nn.Layer): + # paper : [Relational Knowledge Distillation](https://arxiv.org/abs/1904.05068?context=cs.LG) + # reference: https://github.com/lenscloth/RKD/blob/master/metric/loss.py + def __init__(self, eps=1e-12, target_size=1): + super().__init__() + self.eps = eps + if target_size is not None: + self.avgpool = paddle.nn.AdaptiveAvgPool2D(target_size) + else: + self.avgpool = None + + def forward(self, student, teacher): + # GAP to reduce memory + if self.avgpool is not None: + # NxC1xH1xW1 -> NxC1x1x1 + student = self.avgpool(student) + # NxC2xH2xW2 -> NxC2x1x1 + teacher = self.avgpool(teacher) + + bs = student.shape[0] + student = student.reshape([bs, -1]) + teacher = teacher.reshape([bs, -1]) + + t_d = pdist(teacher, squared=False) + mean_td = t_d.mean() + t_d = t_d / (mean_td + self.eps) + + d = pdist(student, squared=False) + mean_d = d.mean() + d = d / (mean_d + self.eps) + + loss = F.smooth_l1_loss(d, t_d, reduction="mean") + return loss diff --git a/example/low_precision_training/ppcls/loss/skdloss.py b/example/low_precision_training/ppcls/loss/skdloss.py new file mode 100755 index 000000000..fe8e8e14d --- /dev/null +++ b/example/low_precision_training/ppcls/loss/skdloss.py @@ -0,0 +1,72 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
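+
+# Usage sketch for SKDLoss below (illustrative; the logit shapes and the
+# hyper-parameters are assumed). When no target is given, the teacher's
+# argmax is used as the ground truth for the cross-entropy term:
+#
+#   >>> import paddle
+#   >>> loss_fn = SKDLoss(temperature=4.0, multiplier=2.0, alpha=0.9)
+#   >>> z_s, z_t = paddle.rand([8, 100]), paddle.rand([8, 100])
+#   >>> loss = loss_fn(z_s, z_t)   # scalar tensor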
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class SKDLoss(nn.Layer): + """ + Spherical Knowledge Distillation + paper: https://arxiv.org/pdf/2010.07485.pdf + code reference: https://github.com/forjiuzhou/Spherical-Knowledge-Distillation + """ + + def __init__(self, + temperature, + multiplier=2.0, + alpha=0.9, + use_target_as_gt=False): + super().__init__() + self.temperature = temperature + self.multiplier = multiplier + self.alpha = alpha + self.use_target_as_gt = use_target_as_gt + + def forward(self, logits_student, logits_teacher, target=None): + """Compute Spherical Knowledge Distillation loss. + Args: + logits_student: student's logits with shape (batch_size, num_classes) + logits_teacher: teacher's logits with shape (batch_size, num_classes) + """ + if target is None or self.use_target_as_gt: + target = logits_teacher.argmax(axis=-1) + + target = F.one_hot( + target.reshape([-1]), num_classes=logits_student[0].shape[0]) + + logits_student = F.layer_norm( + logits_student, + logits_student.shape[1:], + weight=None, + bias=None, + epsilon=1e-7) * self.multiplier + logits_teacher = F.layer_norm( + logits_teacher, + logits_teacher.shape[1:], + weight=None, + bias=None, + epsilon=1e-7) * self.multiplier + + kd_loss = -paddle.sum(F.softmax(logits_teacher / self.temperature) * + F.log_softmax(logits_student / self.temperature), + axis=1) + + kd_loss = paddle.mean(kd_loss) * self.temperature**2 + + ce_loss = paddle.mean(-paddle.sum( + target * F.log_softmax(logits_student), axis=1)) + + return kd_loss * self.alpha + ce_loss * (1 - self.alpha) diff --git a/example/low_precision_training/ppcls/loss/softsuploss.py b/example/low_precision_training/ppcls/loss/softsuploss.py new file mode 100755 index 000000000..b1389a0ba --- /dev/null +++ b/example/low_precision_training/ppcls/loss/softsuploss.py @@ -0,0 +1,75 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn + +class SoftSupConLoss(nn.Layer): + """ + Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf. + It also supports the unsupervised contrastive loss in SimCLR + """ + def __init__(self, temperature=0.07, contrast_mode='all', base_temperature=0.07): + super(SoftSupConLoss, self).__init__() + self.temperature = temperature + self.contrast_mode = contrast_mode + self.base_temperature = base_temperature + + def __call__(self, feat, batch, max_probs=None, labels=None, mask=None, reduction="mean", select_matrix=None): + """Compute loss for model. If both `labels` and `mask` are None, + it degenerates to SimCLR unsupervised loss: + https://arxiv.org/pdf/2002.05709.pdf + + Args: + feat: hidden vector of shape [batch_size, n_views, ...]. + labels: ground truth of shape [batch_size]. + mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j + has the same class as sample i. Can be asymmetric. + Returns: + A loss scalar. 
+ """ + max_probs = batch['max_probs'] + labels = batch['p_targets_u_w'] + # reduction = batch['reduction'] + batch_size = feat.shape[0] + if labels is not None: + labels = labels.reshape((-1, 1)) + mask = paddle.equal(labels, labels.T).astype('float32') + max_probs = max_probs.reshape((-1, 1)) + score_mask = paddle.matmul(max_probs, max_probs.T) + mask = paddle.multiply(mask, score_mask) + + contrast_count = feat.shape[1] + contrast_feat = paddle.concat(paddle.unbind(feat, axis=1), axis=0) # (2n, d) + if self.contrast_mode == 'all': + anchor_feat = contrast_feat + anchor_count = contrast_count + anchor_dot_contrast = paddle.matmul(anchor_feat, contrast_feat.T) / self.temperature + logits_max = anchor_dot_contrast.max(axis=1, keepdim=True) + logits = anchor_dot_contrast - logits_max.detach() + mask = paddle.concat([mask, mask], axis=0) + mask = paddle.concat([mask, mask], axis=1) + + logits_mask = 1 - paddle.eye(batch_size * contrast_count, dtype=paddle.float64) + mask = mask * logits_mask + exp_logits = paddle.exp(logits) * logits_mask + log_prob = logits - paddle.log(exp_logits.sum(axis=1, keepdim=True)) + + mean_log_prob_pos = (mask * log_prob).sum(axis=1) / mask.sum(axis=1) + loss = -(self.temperature / self.base_temperature) * mean_log_prob_pos + loss = loss.reshape((anchor_count, batch_size)) + if reduction == 'mean': + loss = loss.mean() + + return {"SoftSupConLoss": loss} \ No newline at end of file diff --git a/example/low_precision_training/ppcls/loss/softtargetceloss.py b/example/low_precision_training/ppcls/loss/softtargetceloss.py new file mode 100755 index 000000000..351db50e3 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/softtargetceloss.py @@ -0,0 +1,16 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class SoftTargetCrossEntropy(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x, target): + loss = paddle.sum(-target * F.log_softmax(x, axis=-1), axis=-1) + loss = loss.mean() + return {"SoftTargetCELoss": loss} + + def __str__(self, ): + return type(self).__name__ diff --git a/example/low_precision_training/ppcls/loss/supconloss.py b/example/low_precision_training/ppcls/loss/supconloss.py new file mode 100755 index 000000000..753ceaf41 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/supconloss.py @@ -0,0 +1,109 @@ +import paddle +from paddle import nn + + +class SupConLoss(nn.Layer): + """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf. + code reference: https://github.com/HobbitLong/SupContrast/blob/master/losses.py + It also supports the unsupervised contrastive loss in SimCLR""" + + def __init__(self, + views=16, + temperature=0.07, + contrast_mode='all', + base_temperature=0.07, + normalize_feature=True): + super(SupConLoss, self).__init__() + self.temperature = paddle.to_tensor(temperature) + self.contrast_mode = contrast_mode + self.base_temperature = paddle.to_tensor(base_temperature) + self.num_ids = None + self.views = views + self.normalize_feature = normalize_feature + + def forward(self, features, labels, mask=None): + """Compute loss for model. If both `labels` and `mask` are None, + it degenerates to SimCLR unsupervised loss: + https://arxiv.org/pdf/2002.05709.pdf + Args: + features: hidden vector of shape [bsz, n_views, ...]. + labels: ground truth of shape [bsz]. + mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j + has the same class as sample i. Can be asymmetric. + Returns: + A loss scalar. 
+ """ + features = features["features"] + if self.num_ids is None: + self.num_ids = int(features.shape[0] / self.views) + + if self.normalize_feature: + features = 1. * features / (paddle.expand_as( + paddle.norm( + features, p=2, axis=-1, keepdim=True), features) + 1e-12) + features = features.reshape([self.num_ids, self.views, -1]) + labels = labels.reshape([self.num_ids, self.views])[:, 0] + + if len(features.shape) < 3: + raise ValueError('`features` needs to be [bsz, n_views, ...],' + 'at least 3 dimensions are required') + if len(features.shape) > 3: + features = features.reshape( + [features.shape[0], features.shape[1], -1]) + + batch_size = features.shape[0] + if labels is not None and mask is not None: + raise ValueError('Cannot define both `labels` and `mask`') + elif labels is None and mask is None: + mask = paddle.eye(batch_size, dtype='float32') + elif labels is not None: + labels = labels.reshape([-1, 1]) + if labels.shape[0] != batch_size: + raise ValueError( + 'Num of labels does not match num of features') + mask = paddle.cast( + paddle.equal(labels, paddle.t(labels)), 'float32') + else: + mask = paddle.cast(mask, 'float32') + + contrast_count = features.shape[1] + contrast_feature = paddle.concat( + paddle.unbind( + features, axis=1), axis=0) + if self.contrast_mode == 'one': + anchor_feature = features[:, 0] + anchor_count = 1 + elif self.contrast_mode == 'all': + anchor_feature = contrast_feature + anchor_count = contrast_count + else: + raise ValueError('Unknown mode: {}'.format(self.contrast_mode)) + + # compute logits + anchor_dot_contrast = paddle.divide( + paddle.matmul(anchor_feature, paddle.t(contrast_feature)), + self.temperature) + # for numerical stability + logits_max = paddle.max(anchor_dot_contrast, axis=1, keepdim=True) + logits = anchor_dot_contrast - logits_max.detach() + + # tile mask + mask = paddle.tile(mask, [anchor_count, contrast_count]) + + logits_mask = 1 - paddle.eye(batch_size * anchor_count) + mask = mask * logits_mask + + # compute log_prob + exp_logits = paddle.exp(logits) * logits_mask + log_prob = logits - paddle.log( + paddle.sum(exp_logits, axis=1, keepdim=True)) + + # compute mean of log-likelihood over positive + mean_log_prob_pos = paddle.sum((mask * log_prob), + axis=1) / paddle.sum(mask, axis=1) + + # loss + loss = -(self.temperature / self.base_temperature) * mean_log_prob_pos + loss = paddle.mean(loss.reshape([anchor_count, batch_size])) + + return {"SupConLoss": loss} diff --git a/example/low_precision_training/ppcls/loss/trihardloss.py b/example/low_precision_training/ppcls/loss/trihardloss.py new file mode 100755 index 000000000..96cb42cb4 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/trihardloss.py @@ -0,0 +1,84 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
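+
+# NOTE: Illustrative usage sketch for the TriHardLoss defined below. The
+# batch layout (batch_size = P * K with samples_each_class = 2 and same-label
+# samples grouped together) and the feature dimension are assumptions for
+# demonstration only.
+#
+#     import paddle
+#
+#     loss_fn = TriHardLoss(batch_size=8, samples_each_class=2, margin=0.1)
+#     feats = paddle.randn([8, 128])   # 4 classes x 2 samples, grouped by class
+#     out = loss_fn({"features": feats})   # {"trihardloss": ...}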
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from .comfunc import rerange_index
+
+
+class TriHardLoss(paddle.nn.Layer):
+    """
+    paper: In Defense of the Triplet Loss for Person Re-Identification
+    code reference: https://github.com/VisualComputingInstitute/triplet-reid/blob/master/loss.py
+    TriHard loss, built on triplet loss. It uses P * K samples per batch, so
+    the batch size is fixed to batch_size = P * K, and samples with the same
+    label are gathered together. Only samples_each_class = 2 is considered.
+
+    supported_metrics = [
+        'euclidean',
+        'sqeuclidean',
+        'cityblock',
+    ]
+    """
+
+    def __init__(self, batch_size=120, samples_each_class=2, margin=0.1):
+        super(TriHardLoss, self).__init__()
+        self.margin = margin
+        self.samples_each_class = samples_each_class
+        self.batch_size = batch_size
+        self.rerange_index = rerange_index(batch_size, samples_each_class)
+
+    def forward(self, input, target=None):
+        features = input["features"]
+        assert (self.batch_size == features.shape[0])
+
+        # L2-normalize the features
+        features = self._normalize(features)
+        samples_each_class = self.samples_each_class
+        rerange_index = paddle.to_tensor(self.rerange_index)
+
+        # compute the pairwise squared-Euclidean distance matrix
+        diffs = paddle.unsqueeze(
+            features, axis=1) - paddle.unsqueeze(
+                features, axis=0)
+        similary_matrix = paddle.sum(paddle.square(diffs), axis=-1)
+
+        # rearrange the distance matrix with the precomputed rerange index
+        tmp = paddle.reshape(similary_matrix, shape=[-1, 1])
+        tmp = paddle.gather(tmp, index=rerange_index)
+        similary_matrix = paddle.reshape(tmp, shape=[-1, self.batch_size])
+
+        # split each row into [self, positives, negatives]
+        ignore, pos, neg = paddle.split(
+            similary_matrix,
+            num_or_sections=[1, samples_each_class - 1, -1],
+            axis=1)
+
+        ignore.stop_gradient = True
+        hard_pos = paddle.max(pos, axis=1)
+        hard_neg = paddle.min(neg, axis=1)
+
+        loss = hard_pos + self.margin - hard_neg
+        loss = paddle.nn.ReLU()(loss)
+        loss = paddle.mean(loss)
+        return {"trihardloss": loss}
+
+    def _normalize(self, input):
+        input_norm = paddle.sqrt(
+            paddle.sum(paddle.square(input), axis=1, keepdim=True))
+        return paddle.divide(input, input_norm)
diff --git a/example/low_precision_training/ppcls/loss/triplet.py b/example/low_precision_training/ppcls/loss/triplet.py
new file mode 100755
index 000000000..0da7cc5df
--- /dev/null
+++ b/example/low_precision_training/ppcls/loss/triplet.py
@@ -0,0 +1,157 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+
+
+class TripletLossV2(nn.Layer):
+    """Triplet loss with hard positive/negative mining.
+    paper : [Facenet: A unified embedding for face recognition and clustering](https://arxiv.org/pdf/1503.03832.pdf)
+    code reference: https://github.com/okzhili/Cartoon-face-recognition/blob/master/loss/triplet_loss.py
+    Args:
+        margin (float): margin for triplet.
+    """
+
+    def __init__(self,
+                 margin=0.5,
+                 normalize_feature=True,
+                 feature_from="features"):
+        super(TripletLossV2, self).__init__()
+        self.margin = margin
+        self.feature_from = feature_from
+        self.ranking_loss = paddle.nn.loss.MarginRankingLoss(margin=margin)
+        self.normalize_feature = normalize_feature
+
+    def forward(self, input, target):
+        """
+        Args:
+            input: dict containing a feature matrix with shape (batch_size, feat_dim)
+            target: ground truth labels with shape (batch_size)
+        """
+        inputs = input[self.feature_from]
+
+        if self.normalize_feature:
+            inputs = 1. * inputs / (paddle.expand_as(
+                paddle.norm(
+                    inputs, p=2, axis=-1, keepdim=True), inputs) + 1e-12)
+
+        bs = inputs.shape[0]
+
+        # compute distance
+        dist = paddle.pow(inputs, 2).sum(axis=1, keepdim=True).expand([bs, bs])
+        dist = dist + dist.t()
+        dist = paddle.addmm(
+            input=dist, x=inputs, y=inputs.t(), alpha=-2.0, beta=1.0)
+        dist = paddle.clip(dist, min=1e-12).sqrt()
+
+        # hard example mining
+        is_pos = paddle.expand(target, (
+            bs, bs)).equal(paddle.expand(target, (bs, bs)).t())
+        is_neg = paddle.expand(target, (
+            bs, bs)).not_equal(paddle.expand(target, (bs, bs)).t())
+
+        # `dist_ap` means distance(anchor, positive), with shape [N, 1]
+        dist_ap = paddle.max(paddle.reshape(
+            paddle.masked_select(dist, is_pos), (bs, -1)),
+            axis=1,
+            keepdim=True)
+        # `dist_an` means distance(anchor, negative), with shape [N, 1]
+        dist_an = paddle.min(paddle.reshape(
+            paddle.masked_select(dist, is_neg), (bs, -1)),
+            axis=1,
+            keepdim=True)
+        # shape [N]
+        dist_ap = paddle.squeeze(dist_ap, axis=1)
+        dist_an = paddle.squeeze(dist_an, axis=1)
+
+        # Compute ranking hinge loss
+        y = paddle.ones_like(dist_an)
+        loss = self.ranking_loss(dist_an, dist_ap, y)
+        return {"TripletLossV2": loss}
+
+
+class TripletLoss(nn.Layer):
+    """Triplet loss with hard positive/negative mining.
+    Reference:
+    Hermans et al. In Defense of the Triplet Loss for Person Re-Identification. arXiv:1703.07737.
+    Code imported from https://github.com/Cysu/open-reid/blob/master/reid/loss/triplet.py.
+    Args:
+        margin (float): margin for triplet.
+    """
+
+    def __init__(self, margin=1.0):
+        super(TripletLoss, self).__init__()
+        self.margin = margin
+        self.ranking_loss = paddle.nn.loss.MarginRankingLoss(margin=margin)
+
+    def forward(self, input, target):
+        """
+        Args:
+            input: dict containing a feature matrix with shape (batch_size, feat_dim)
+            target: ground truth labels with shape (batch_size)
+        """
+        inputs = input["features"]
+
+        bs = inputs.shape[0]
+        # Compute pairwise distance, replace by the official when merged
+        dist = paddle.pow(inputs, 2).sum(axis=1, keepdim=True).expand([bs, bs])
+        dist = dist + dist.t()
+        dist = paddle.addmm(
+            input=dist, x=inputs, y=inputs.t(), alpha=-2.0, beta=1.0)
+        dist = paddle.clip(dist, min=1e-12).sqrt()
+
+        mask = paddle.equal(
+            target.expand([bs, bs]), target.expand([bs, bs]).t())
+        mask_numpy_idx = mask.numpy()
+        dist_ap, dist_an = [], []
+        for i in range(bs):
+            # hardest positive: largest distance among same-label samples
+            dist_ap.append(
+                max([
+                    dist[i][j] if mask_numpy_idx[i][j] == True else float(
+                        "-inf") for j in range(bs)
+                ]).unsqueeze(0))
+            # hardest negative: smallest distance among different-label samples
+            dist_an.append(
+                min([
+                    dist[i][k] if mask_numpy_idx[i][k] == False else float(
+                        "inf") for k in range(bs)
+                ]).unsqueeze(0))
+
+        dist_ap = paddle.concat(dist_ap, axis=0)
+        dist_an = paddle.concat(dist_an, axis=0)
+
+        # Compute ranking hinge loss
+        y = paddle.ones_like(dist_an)
+        loss = self.ranking_loss(dist_an, dist_ap, y)
+        return {"TripletLoss": loss}
diff --git a/example/low_precision_training/ppcls/loss/tripletangularmarginloss.py b/example/low_precision_training/ppcls/loss/tripletangularmarginloss.py
new file mode 100755
index 000000000..df03489f2
--- /dev/null
+++ b/example/low_precision_training/ppcls/loss/tripletangularmarginloss.py
@@ -0,0 +1,241 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+from ppcls.loss.xbm import CrossBatchMemory
+
+
+class TripletAngularMarginLoss(nn.Layer):
+    """A more robust triplet loss with hard positive/negative mining on angular margin instead of relative distance between d(a,p) and d(a,n).
+
+    Args:
+        margin (float, optional): angular margin. Defaults to 0.5.
+        normalize_feature (bool, optional): whether to apply L2-norm to the features before computing distance (cos-similarity). Defaults to True.
+        reduction (str, optional): reduction option within a batch. Defaults to "mean".
+        add_absolute (bool, optional): whether to add absolute terms on d(a,p) and d(a,n). Defaults to False.
+        absolute_loss_weight (float, optional): weight for the absolute terms. Defaults to 1.0.
+        ap_value (float, optional): target value for d(a, p).
Defaults to 0.9.
+        an_value (float, optional): target value for d(a, n). Defaults to 0.5.
+        feature_from (str, optional): which key feature from. Defaults to "features".
+    """
+
+    def __init__(self,
+                 margin=0.5,
+                 normalize_feature=True,
+                 reduction="mean",
+                 add_absolute=False,
+                 absolute_loss_weight=1.0,
+                 ap_value=0.9,
+                 an_value=0.5,
+                 feature_from="features"):
+        super(TripletAngularMarginLoss, self).__init__()
+        self.margin = margin
+        self.feature_from = feature_from
+        self.ranking_loss = paddle.nn.loss.MarginRankingLoss(
+            margin=margin, reduction=reduction)
+        self.normalize_feature = normalize_feature
+        self.add_absolute = add_absolute
+        self.ap_value = ap_value
+        self.an_value = an_value
+        self.absolute_loss_weight = absolute_loss_weight
+
+    def forward(self, input, target):
+        """
+        Args:
+            input: dict containing a feature matrix with shape (batch_size, feat_dim)
+            target: ground truth labels with shape (batch_size)
+        """
+        inputs = input[self.feature_from]
+
+        if self.normalize_feature:
+            inputs = paddle.divide(
+                inputs, paddle.norm(
+                    inputs, p=2, axis=-1, keepdim=True))
+
+        bs = inputs.shape[0]
+
+        # compute distance(cos-similarity)
+        dist = paddle.matmul(inputs, inputs.t())
+
+        # hard example mining
+        is_pos = paddle.expand(target, (
+            bs, bs)).equal(paddle.expand(target, (bs, bs)).t())
+        is_neg = paddle.expand(target, (
+            bs, bs)).not_equal(paddle.expand(target, (bs, bs)).t())
+
+        # `dist_ap` means distance(anchor, positive), with shape [N, 1]
+        dist_ap = paddle.min(paddle.reshape(
+            paddle.masked_select(dist, is_pos), (bs, -1)),
+            axis=1,
+            keepdim=True)
+        # `dist_an` means distance(anchor, negative), with shape [N, 1]
+        dist_an = paddle.max(paddle.reshape(
+            paddle.masked_select(dist, is_neg), (bs, -1)),
+            axis=1,
+            keepdim=True)
+        # shape [N]
+        dist_ap = paddle.squeeze(dist_ap, axis=1)
+        dist_an = paddle.squeeze(dist_an, axis=1)
+
+        # Compute ranking hinge loss
+        y = paddle.ones_like(dist_an)
+        loss = self.ranking_loss(dist_ap, dist_an, y)
+
+        if self.add_absolute:
+            absolut_loss_ap = self.ap_value - dist_ap
+            absolut_loss_ap = paddle.where(absolut_loss_ap > 0,
+                                           absolut_loss_ap,
+                                           paddle.zeros_like(absolut_loss_ap))
+
+            absolut_loss_an = dist_an - self.an_value
+            absolut_loss_an = paddle.where(absolut_loss_an > 0,
+                                           absolut_loss_an,
+                                           paddle.ones_like(absolut_loss_an))
+
+            loss = (absolut_loss_an.mean() + absolut_loss_ap.mean()
+                    ) * self.absolute_loss_weight + loss.mean()
+
+        return {"TripletAngularMarginLoss": loss}
+
+
+class TripletAngularMarginLoss_XBM(TripletAngularMarginLoss):
+    """TripletAngularMarginLoss combined with CrossBatchMemory
+
+    Args:
+        start_iter (int): from which step CrossBatchMemory is enabled
+        xbm_size (int): size of CrossBatchMemory
+        xbm_weight (float): weight of the CrossBatchMemory loss term
+        feat_dim (int): channels of features in CrossBatchMemory
+        margin (float, optional): angular margin. Defaults to 0.5.
+        normalize_feature (bool, optional): whether to apply L2-norm to the features before computing distance (cos-similarity). Defaults to True.
+        reduction (str, optional): reduction option within a batch. Defaults to "mean".
+        add_absolute (bool, optional): whether to add absolute terms on d(a,p) and d(a,n). Defaults to False.
+        absolute_loss_weight (float, optional): weight for the absolute terms. Defaults to 1.0.
+        ap_value (float, optional): target value for d(a, p). Defaults to 0.9.
+        an_value (float, optional): target value for d(a, n). Defaults to 0.5.
+ feature_from (str, optional): which key feature from. Defaults to "features". + """ + + def __init__(self, + start_iter: int, + xbm_size: int, + xbm_weight: float, + feat_dim: int, + margin=0.5, + normalize_feature=True, + reduction="mean", + add_absolute=False, + absolute_loss_weight=1.0, + ap_value=0.9, + an_value=0.5, + feature_from="features"): + super(TripletAngularMarginLoss_XBM, self).__init__( + margin, normalize_feature, reduction, add_absolute, + absolute_loss_weight, ap_value, an_value, feature_from) + self.start_iter = start_iter + self.xbm = CrossBatchMemory(xbm_size, feat_dim) + self.xbm_weight = xbm_weight + self.inf = 10 # 10 is big enough as inf for cos-similarity + self.register_buffer("iter", paddle.to_tensor(0, dtype="int64")) + + def forward(self, input, target): + """ + Args: + inputs: feature matrix with shape (batch_size, feat_dim) + target: ground truth labels with shape (batch_size) + """ + feats = input[self.feature_from] + if self.normalize_feature: + feats = nn.functional.normalize(feats, p=2, axis=1) + + labels = target + if labels.ndim >= 2 and labels.shape[-1] == 1: + labels = paddle.squeeze(labels, axis=[-1]) + + loss = self._compute_loss(feats, labels, feats, labels) + + # XBM loss below + self.iter += 1 + if self.iter.item() > self.start_iter: + self.xbm.enqueue_dequeue(feats.detach(), labels.detach()) + xbm_feats, xbm_labels = self.xbm.get() + xbm_loss = self._compute_loss(feats, labels, xbm_feats, xbm_labels) + loss = loss + self.xbm_weight * xbm_loss + + return {"TripletAngularMarginLoss_XBM": loss} + + def _masked_max(self, tensor, mask, axis): + masked = paddle.multiply(tensor, mask.astype(tensor.dtype)) + neg_inf = paddle.zeros_like(tensor) + neg_inf.stop_gradient = True + neg_inf[paddle.logical_not(mask)] = -self.inf + return paddle.max(masked + neg_inf, axis=axis, keepdim=True) + + def _masked_min(self, tensor, mask, axis): + masked = paddle.multiply(tensor, mask.astype(tensor.dtype)) + pos_inf = paddle.zeros_like(tensor) + pos_inf.stop_gradient = True + pos_inf[paddle.logical_not(mask)] = self.inf + return paddle.min(masked + pos_inf, axis=axis, keepdim=True) + + def _compute_loss(self, + inputs_q: paddle.Tensor, + targets_q: paddle.Tensor, + inputs_k: paddle.Tensor, + targets_k: paddle.Tensor) -> paddle.Tensor: + Q = inputs_q.shape[0] + K = inputs_k.shape[0] + + # compute distance(cos-similarity) + dist = paddle.matmul(inputs_q, inputs_k.t()) # [Q, K] + + # hard negative mining + is_pos = paddle.expand(paddle.unsqueeze(targets_q, 1), (Q, K)).equal( + paddle.expand(paddle.unsqueeze(targets_k, 1), + (K, Q)).t()) # [Q, K] + is_neg = paddle.expand(paddle.unsqueeze(targets_q, 1), + (Q, K)).not_equal( + paddle.expand( + paddle.unsqueeze(targets_k, 1), + (K, Q)).t()) # [Q, K] + + dist_ap = self._masked_min(dist, is_pos, axis=1) # [Q, ] + dist_an = self._masked_max(dist, is_neg, axis=1) # [Q, ] + + # Compute ranking hinge loss + y = paddle.ones_like(dist_an) + loss = self.ranking_loss(dist_ap, dist_an, y) + + if self.add_absolute: + absolut_loss_ap = self.ap_value - dist_ap + absolut_loss_ap = paddle.where(absolut_loss_ap > 0, + absolut_loss_ap, + paddle.zeros_like(absolut_loss_ap)) + + absolut_loss_an = dist_an - self.an_value + absolut_loss_an = paddle.where(absolut_loss_an > 0, + absolut_loss_an, + paddle.ones_like(absolut_loss_an)) + + loss = (absolut_loss_an.mean() + absolut_loss_ap.mean() + ) * self.absolute_loss_weight + loss.mean() + + return loss diff --git a/example/low_precision_training/ppcls/loss/wslloss.py 
b/example/low_precision_training/ppcls/loss/wslloss.py new file mode 100755 index 000000000..8bdfaf8cc --- /dev/null +++ b/example/low_precision_training/ppcls/loss/wslloss.py @@ -0,0 +1,66 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class WSLLoss(nn.Layer): + """ + Weighted Soft Labels Loss + paper: https://arxiv.org/pdf/2102.00650.pdf + code reference: https://github.com/bellymonster/Weighted-Soft-Label-Distillation + """ + + def __init__(self, temperature=2.0, use_target_as_gt=False): + super().__init__() + self.temperature = temperature + self.use_target_as_gt = use_target_as_gt + + def forward(self, logits_student, logits_teacher, target=None): + """Compute weighted soft labels loss. + Args: + logits_student: student's logits with shape (batch_size, num_classes) + logits_teacher: teacher's logits with shape (batch_size, num_classes) + target: ground truth labels with shape (batch_size) + """ + if target is None or self.use_target_as_gt: + target = logits_teacher.argmax(axis=-1) + + target = F.one_hot( + target.reshape([-1]), num_classes=logits_student[0].shape[0]) + + s_input_for_softmax = logits_student / self.temperature + t_input_for_softmax = logits_teacher / self.temperature + + ce_loss_s = -paddle.sum(target * + F.log_softmax(logits_student.detach()), + axis=1) + ce_loss_t = -paddle.sum(target * + F.log_softmax(logits_teacher.detach()), + axis=1) + + ratio = ce_loss_s / (ce_loss_t + 1e-7) + ratio = paddle.maximum(ratio, paddle.zeros_like(ratio)) + + kd_loss = -paddle.sum(F.softmax(t_input_for_softmax) * + F.log_softmax(s_input_for_softmax), + axis=1) + weight = 1 - paddle.exp(-ratio) + + weighted_kd_loss = (self.temperature**2) * paddle.mean(kd_loss * + weight) + + return weighted_kd_loss diff --git a/example/low_precision_training/ppcls/loss/xbm.py b/example/low_precision_training/ppcls/loss/xbm.py new file mode 100755 index 000000000..d63583e44 --- /dev/null +++ b/example/low_precision_training/ppcls/loss/xbm.py @@ -0,0 +1,89 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from typing import Tuple + +import paddle + + +class CrossBatchMemory(paddle.nn.Layer): + """ + CrossBatchMemory Implementation. refer to "Cross-Batch Memory for Embedding Learning". 
+
+    code heavily based on https://github.com/msight-tech/research-xbm/blob/master/ret_benchmark/modeling/xbm.py
+
+    Args:
+        size (int): Size of memory bank
+        embedding_size (int): number of embedding dimension for memory bank
+    """
+
+    def __init__(self, size: int, embedding_size: int):
+        super().__init__()
+        self.size = size
+        self.embedding_size = embedding_size
+
+        # initialize and register feature queue for resume training
+        feats = paddle.zeros([self.size, self.embedding_size])
+        self.register_buffer("feats", feats)
+
+        # initialize and register label queue for resume training
+        targets = paddle.zeros([self.size, ], dtype="int64")
+        self.register_buffer("targets", targets)
+
+        self.ptr = 0
+        # self.accumulated_size = 0
+
+    @property
+    def _is_full(self) -> bool:
+        # return self.accumulated_size >= self.size
+        return self.targets[-1].item() != 0  # author's usage
+
+    def get(self) -> Tuple[paddle.Tensor, paddle.Tensor]:
+        """return features and targets in memory bank
+
+        Returns:
+            Tuple[paddle.Tensor, paddle.Tensor]: [features, targets]
+        """
+        if self._is_full:
+            return self.feats, self.targets
+        else:
+            return self.feats[:self.ptr], self.targets[:self.ptr]
+
+    def enqueue_dequeue(self, feats: paddle.Tensor,
+                        targets: paddle.Tensor) -> None:
+        """put the newest feats and targets into the memory bank and pop the oldest feats and targets from the memory bank
+
+        Args:
+            feats (paddle.Tensor): features to enqueue
+            targets (paddle.Tensor): targets to enqueue
+        """
+        input_size = len(targets)
+        if self.ptr + input_size > self.size:
+            self.feats[-input_size:] = feats
+            self.targets[-input_size:] = targets
+            self.ptr = 0
+        else:
+            self.feats[self.ptr:self.ptr + input_size] = feats
+            self.targets[self.ptr:self.ptr + input_size] = targets
+            self.ptr += input_size
+            # self.accumulated_size += input_size
+
+    def forward(self, *kargs, **kwargs):
+        raise NotImplementedError(
+            "CrossBatchMemory module is for memory-bank, forward method is not needed"
+        )
diff --git a/example/low_precision_training/ppcls/metric/__init__.py b/example/low_precision_training/ppcls/metric/__init__.py
new file mode 100755
index 000000000..ee35eb467
--- /dev/null
+++ b/example/low_precision_training/ppcls/metric/__init__.py
@@ -0,0 +1,71 @@
+#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
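+
+# NOTE: Illustrative sketch of the config consumed by build_metrics() below:
+# a list of single-key dicts, each key naming a metric class from
+# ppcls/metric/metrics.py and each value holding its keyword arguments
+# (or None to use the defaults).
+#
+#     metric_config = [
+#         {"TopkAcc": {"topk": [1, 5]}},   # metric with explicit parameters
+#     ]                                    # {"TopkAcc": None} uses defaults
+#     metric_func = build_metrics(metric_config)
+#     # metric_func(logits, labels) returns an OrderedDict of metric values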
+ +import copy +from collections import OrderedDict + +from .avg_metrics import AvgMetrics +from .metrics import TopkAcc, mAP, mINP, Recallk, Precisionk +from .metrics import DistillationTopkAcc +from .metrics import GoogLeNetTopkAcc +from .metrics import HammingDistance, AccuracyScore +from .metrics import ATTRMetric +from .metrics import TprAtFpr, MultilabelMeanAccuracy +from .metrics import MultiLabelMAP + + +class CombinedMetrics(AvgMetrics): + def __init__(self, config_list): + super().__init__() + self.metric_func_list = [] + assert isinstance(config_list, list), ( + 'operator config should be a list') + for config in config_list: + assert isinstance(config, + dict) and len(config) == 1, "yaml format error" + metric_name = list(config)[0] + metric_params = config[metric_name] + if metric_params is not None: + self.metric_func_list.append( + eval(metric_name)(**metric_params)) + else: + self.metric_func_list.append(eval(metric_name)()) + self.reset() + + def forward(self, *args, **kwargs): + metric_dict = OrderedDict() + for idx, metric_func in enumerate(self.metric_func_list): + metric_dict.update(metric_func(*args, **kwargs)) + return metric_dict + + @property + def avg_info(self): + return ", ".join([metric.avg_info for metric in self.metric_func_list]) + + @property + def avg(self): + return self.metric_func_list[0].avg + + def attr_res(self): + return self.metric_func_list[0].attrmeter.res() + + def reset(self): + for metric in self.metric_func_list: + if hasattr(metric, "reset"): + metric.reset() + + +def build_metrics(config): + metrics_list = CombinedMetrics(copy.deepcopy(config)) + return metrics_list diff --git a/example/low_precision_training/ppcls/metric/avg_metrics.py b/example/low_precision_training/ppcls/metric/avg_metrics.py new file mode 100755 index 000000000..6f4b62290 --- /dev/null +++ b/example/low_precision_training/ppcls/metric/avg_metrics.py @@ -0,0 +1,20 @@ +from paddle import nn + + +class AvgMetrics(nn.Layer): + def __init__(self): + super().__init__() + self.avg_meters = {} + + def reset(self): + self.avg_meters = {} + + @property + def avg(self): + if self.avg_meters: + for metric_key in self.avg_meters: + return self.avg_meters[metric_key].avg + + @property + def avg_info(self): + return ", ".join([self.avg_meters[key].avg_info for key in self.avg_meters]) diff --git a/example/low_precision_training/ppcls/metric/metrics.py b/example/low_precision_training/ppcls/metric/metrics.py new file mode 100755 index 000000000..e39aafbe1 --- /dev/null +++ b/example/low_precision_training/ppcls/metric/metrics.py @@ -0,0 +1,661 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
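+
+# NOTE: Illustrative usage sketch for the TopkAcc metric defined below. The
+# batch size and class count are assumptions for demonstration only.
+#
+#     import paddle
+#
+#     metric = TopkAcc(topk=(1, 5))
+#     logits = paddle.randn([32, 1000])
+#     labels = paddle.randint(0, 1000, [32, 1])   # paddle.metric.accuracy
+#     print(metric(logits, labels))               # expects rank-2 labels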
+ +from cmath import nan +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from sklearn.metrics import hamming_loss +from sklearn.metrics import accuracy_score as accuracy_metric +from sklearn.metrics import multilabel_confusion_matrix +from sklearn.preprocessing import binarize + +from easydict import EasyDict + +from ppcls.metric.avg_metrics import AvgMetrics +from ppcls.utils.misc import AverageMeter, AttrMeter +from ppcls.utils import logger + + +class TopkAcc(AvgMetrics): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + self.reset() + self.warned = False + + def reset(self): + self.avg_meters = { + f"top{k}": AverageMeter(f"top{k}") + for k in self.topk + } + + def forward(self, x, label): + if isinstance(x, dict): + x = x["logits"] + + output_dims = x.shape[-1] + + metric_dict = dict() + for idx, k in enumerate(self.topk): + if output_dims < k: + if not self.warned: + msg = f"The output dims({output_dims}) is less than k({k}), so the Top-{k} metric is meaningless." + logger.warning(msg) + self.warned = True + metric_dict[f"top{k}"] = 1 + else: + metric_dict[f"top{k}"] = paddle.metric.accuracy(x, label, k=k) + self.avg_meters[f"top{k}"].update(metric_dict[f"top{k}"], + x.shape[0]) + return metric_dict + + +class mAP(nn.Layer): + def __init__(self, descending=True): + super().__init__() + self.descending = descending + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=self.descending) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + + num_rel = paddle.sum(equal_flag, axis=1) + num_rel = paddle.greater_than(num_rel, paddle.to_tensor(0.)) + num_rel_index = paddle.nonzero(num_rel.astype("int")) + num_rel_index = paddle.reshape(num_rel_index, [num_rel_index.shape[0]]) + + if paddle.numel(num_rel_index).item() == 0: + metric_dict["mAP"] = np.nan + return metric_dict + + equal_flag = paddle.index_select(equal_flag, num_rel_index, axis=0) + + acc_sum = paddle.cumsum(equal_flag, axis=1) + div = paddle.arange(acc_sum.shape[1]).astype("float32") + 1 + precision = paddle.divide(acc_sum, div) + + #calc map + precision_mask = paddle.multiply(equal_flag, precision) + ap = paddle.sum(precision_mask, axis=1) / paddle.sum(equal_flag, + axis=1) + metric_dict["mAP"] = float(paddle.mean(ap)) + return metric_dict + + +class mINP(nn.Layer): + def __init__(self, descending=True): + super().__init__() + self.descending = descending + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=self.descending) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + 
gallery_labels_transpose,
+            shape=[
+                choosen_indices.shape[0], gallery_labels_transpose.shape[1]
+            ])
+        choosen_label = paddle.index_sample(gallery_labels_transpose,
+                                            choosen_indices)
+        equal_flag = paddle.equal(choosen_label, query_img_id)
+        if keep_mask is not None:
+            keep_mask = paddle.index_sample(
+                keep_mask.astype('float32'), choosen_indices)
+            equal_flag = paddle.logical_and(equal_flag,
+                                            keep_mask.astype('bool'))
+        equal_flag = paddle.cast(equal_flag, 'float32')
+
+        num_rel = paddle.sum(equal_flag, axis=1)
+        num_rel = paddle.greater_than(num_rel, paddle.to_tensor(0.))
+        num_rel_index = paddle.nonzero(num_rel.astype("int"))
+        num_rel_index = paddle.reshape(num_rel_index, [num_rel_index.shape[0]])
+        equal_flag = paddle.index_select(equal_flag, num_rel_index, axis=0)
+
+        # do accumulative sum to locate the hardest correct match
+        div = paddle.arange(equal_flag.shape[1]).astype("float32") + 2
+        minus = paddle.divide(equal_flag, div)
+        auxilary = paddle.subtract(equal_flag, minus)
+        hard_index = paddle.argmax(auxilary, axis=1).astype("float32")
+        all_INP = paddle.divide(paddle.sum(equal_flag, axis=1), hard_index)
+        mINP = paddle.mean(all_INP)
+        metric_dict["mINP"] = float(mINP)
+        return metric_dict
+
+
+class TprAtFpr(nn.Layer):
+    def __init__(self, max_fpr=1 / 1000.):
+        super().__init__()
+        self.gt_pos_score_list = []
+        self.gt_neg_score_list = []
+        self.softmax = nn.Softmax(axis=-1)
+        self.max_fpr = max_fpr
+        self.max_tpr = 0.
+
+    def forward(self, x, label):
+        if isinstance(x, dict):
+            x = x["logits"]
+        x = self.softmax(x)
+        for i, label_i in enumerate(label):
+            if label_i[0] == 0:
+                self.gt_neg_score_list.append(x[i][1].numpy())
+            else:
+                self.gt_pos_score_list.append(x[i][1].numpy())
+        return {}
+
+    def reset(self):
+        self.gt_pos_score_list = []
+        self.gt_neg_score_list = []
+        self.max_tpr = 0.
+
+    @property
+    def avg(self):
+        return self.max_tpr
+
+    @property
+    def avg_info(self):
+        max_tpr = 0.
+        result = ""
+        gt_pos_score_list = np.array(self.gt_pos_score_list)
+        gt_neg_score_list = np.array(self.gt_neg_score_list)
+        for i in range(0, 10000):
+            threshold = i / 10000.
+            if len(gt_pos_score_list) == 0:
+                continue
+            tpr = np.sum(
+                gt_pos_score_list > threshold) / len(gt_pos_score_list)
+            if len(gt_neg_score_list) == 0:
+                if tpr > max_tpr:
+                    max_tpr = tpr
+                    result = "threshold: {}, fpr: 0.0, tpr: {:.5f}".format(
+                        threshold, tpr)
+                    msg = "The number of negative samples is 0, please add negative samples."
+                    logger.warning(msg)
+                # without negative samples the fpr is undefined, so skip it
+                continue
+            fpr = np.sum(
+                gt_neg_score_list > threshold) / len(gt_neg_score_list)
+            if fpr <= self.max_fpr and tpr > max_tpr:
+                max_tpr = tpr
+                result = "threshold: {}, fpr: {}, tpr: {:.5f}".format(
+                    threshold, fpr, tpr)
+        self.max_tpr = max_tpr
+        return result
+
+
+class MultilabelMeanAccuracy(nn.Layer):
+    def __init__(self,
+                 start_threshold=0.4,
+                 num_iterations=10,
+                 end_threshold=0.9):
+        super().__init__()
+        self.start_threshold = start_threshold
+        self.num_iterations = num_iterations
+        self.end_threshold = end_threshold
+        self.gt_all_score_list = []
+        self.gt_label_score_list = []
+        self.max_acc = 0.
+
+    def forward(self, x, label):
+        if isinstance(x, dict):
+            x = x["logits"]
+        x = F.sigmoid(x)
+        label = label[:, 0, :]
+        for i in range(len(x)):
+            self.gt_all_score_list.append(x[i].numpy())
+            self.gt_label_score_list.append(label[i].numpy())
+        return {}
+
+    def reset(self):
+        self.gt_all_score_list = []
+        self.gt_label_score_list = []
+        self.max_acc = 0.
+
+    @property
+    def avg(self):
+        return self.max_acc
+
+    @property
+    def avg_info(self):
+        max_acc = 0.
+ result = "" + gt_all_score_list = np.array(self.gt_all_score_list) + gt_label_score_list = np.array(self.gt_label_score_list) + for i in range(self.num_iterations): + threshold = self.start_threshold + i * (self.end_threshold - + self.start_threshold + ) / self.num_iterations + pred_label = (gt_all_score_list > threshold).astype(int) + TP = np.sum( + (gt_label_score_list == 1) * (pred_label == 1)).astype(float) + TN = np.sum( + (gt_label_score_list == 0) * (pred_label == 0)).astype(float) + acc = (TP + TN) / len(gt_all_score_list) + if max_acc <= acc: + max_acc = acc + result = "threshold: {}, mean_acc: {}".format( + threshold, max_acc / len(gt_label_score_list[0])) + self.max_acc = max_acc / len(gt_label_score_list[0]) + return result + + +class Recallk(nn.Layer): + def __init__(self, topk=(1, 5), descending=True): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + self.descending = descending + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + # get cmc + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=self.descending) + gallery_labels_transpose = gallery_img_id.t() + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype("float32"), choosen_indices) + equal_flag = equal_flag & keep_mask.astype("bool") + equal_flag = paddle.cast(equal_flag, "float32") + real_query_num = paddle.sum(equal_flag, axis=1) + real_query_num = paddle.sum((real_query_num > 0.0).astype("float32")) + + acc_sum = paddle.cumsum(equal_flag, axis=1) + mask = (acc_sum > 0.0).astype("float32") + all_cmc = (paddle.sum(mask, axis=0) / real_query_num).numpy() + + for k in self.topk: + metric_dict["recall{}".format(k)] = all_cmc[k - 1] + return metric_dict + + +class Precisionk(nn.Layer): + def __init__(self, topk=(1, 5), descending=True): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + self.descending = descending + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + #get cmc + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=self.descending) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + + Ns = paddle.arange(gallery_img_id.shape[0]) + 1 + equal_flag_cumsum = paddle.cumsum(equal_flag, axis=1) + Precision_at_k = (paddle.mean(equal_flag_cumsum, axis=0) / Ns).numpy() + + for k in self.topk: + metric_dict["precision@{}".format(k)] = Precision_at_k[k - 1] + + return metric_dict + + +class DistillationTopkAcc(TopkAcc): + def 
__init__(self, model_key, feature_key=None, topk=(1, 5)): + super().__init__(topk=topk) + self.model_key = model_key + self.feature_key = feature_key + + def forward(self, x, label): + if isinstance(x, dict): + x = x[self.model_key] + if self.feature_key is not None: + x = x[self.feature_key] + return super().forward(x, label) + + +class GoogLeNetTopkAcc(TopkAcc): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + + def forward(self, x, label): + return super().forward(x[0], label) + + +class MultiLabelMetric(AvgMetrics): + def __init__(self, bi_threshold=0.5): + super().__init__() + self.bi_threshold = bi_threshold + + def _multi_hot_encode(self, output): + logits = F.sigmoid(output).numpy() + return binarize(logits, threshold=self.bi_threshold) + + +class HammingDistance(MultiLabelMetric): + """ + Soft metric based label for multilabel classification + Returns: + The smaller the return value is, the better model is. + """ + + def __init__(self): + super().__init__() + self.reset() + + def reset(self): + self.avg_meters = {"HammingDistance": AverageMeter("HammingDistance")} + + def forward(self, output, target): + preds = super()._multi_hot_encode(output) + metric_dict = dict() + metric_dict["HammingDistance"] = paddle.to_tensor( + hamming_loss(target, preds)) + self.avg_meters["HammingDistance"].update( + float(metric_dict["HammingDistance"]), output.shape[0]) + return metric_dict + + +class AccuracyScore(MultiLabelMetric): + """ + Hard metric for multilabel classification + Args: + base: ["sample", "label"], default="sample" + if "sample", return metric score based sample, + if "label", return metric score based label. + Returns: + accuracy: + """ + + def __init__(self, base="label"): + super().__init__() + assert base in ["sample", "label" + ], 'must be one of ["sample", "label"]' + self.base = base + self.reset() + + def reset(self): + self.avg_meters = {"AccuracyScore": AverageMeter("AccuracyScore")} + + def forward(self, output, target): + preds = super()._multi_hot_encode(output) + metric_dict = dict() + if self.base == "sample": + accuracy = accuracy_metric(target, preds) + elif self.base == "label": + mcm = multilabel_confusion_matrix(target, preds) + tns = mcm[:, 0, 0] + fns = mcm[:, 1, 0] + tps = mcm[:, 1, 1] + fps = mcm[:, 0, 1] + accuracy = (sum(tps) + sum(tns)) / ( + sum(tps) + sum(tns) + sum(fns) + sum(fps)) + metric_dict["AccuracyScore"] = paddle.to_tensor(accuracy) + self.avg_meters["AccuracyScore"].update( + float(metric_dict["AccuracyScore"]), output.shape[0]) + return metric_dict + + +def get_attr_metrics(gt_label, preds_probs, threshold): + """ + index: evaluated label index + adapted from "https://github.com/valencebond/Rethinking_of_PAR/blob/master/metrics/pedestrian_metrics.py" + """ + pred_label = (preds_probs > threshold).astype(int) + + eps = 1e-20 + result = EasyDict() + + has_fuyi = gt_label == -1 + pred_label[has_fuyi] = -1 + + ############################### + # label metrics + # TP + FN + result.gt_pos = np.sum((gt_label == 1), axis=0).astype(float) + # TN + FP + result.gt_neg = np.sum((gt_label == 0), axis=0).astype(float) + # TP + result.true_pos = np.sum((gt_label == 1) * (pred_label == 1), + axis=0).astype(float) + # TN + result.true_neg = np.sum((gt_label == 0) * (pred_label == 0), + axis=0).astype(float) + # FP + result.false_pos = np.sum(((gt_label == 0) * (pred_label == 1)), + axis=0).astype(float) + # FN + result.false_neg = 
np.sum(((gt_label == 1) * (pred_label == 0)), + axis=0).astype(float) + + ################ + # instance metrics + result.gt_pos_ins = np.sum((gt_label == 1), axis=1).astype(float) + result.true_pos_ins = np.sum((pred_label == 1), axis=1).astype(float) + # true positive + result.intersect_pos = np.sum((gt_label == 1) * (pred_label == 1), + axis=1).astype(float) + # IOU + result.union_pos = np.sum(((gt_label == 1) + (pred_label == 1)), + axis=1).astype(float) + + return result + + +class ATTRMetric(nn.Layer): + def __init__(self, threshold=0.5): + super().__init__() + self.threshold = threshold + + def reset(self): + self.attrmeter = AttrMeter(threshold=0.5) + + def forward(self, output, target): + metric_dict = get_attr_metrics(target[:, 0, :].numpy(), + output.numpy(), self.threshold) + self.attrmeter.update(metric_dict) + return metric_dict + + +class MultiLabelMAP(nn.Layer): + """ + Calculate multi-label classification mean average precision. + Currently, support two types: 11point and integral + + The code base on: + https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/metrics/map_utils.py + + Args: + map_type (str): Calculation method of mean average. + """ + + def __init__(self, map_type='integral'): + super().__init__() + assert map_type in ['11point', 'integral'], \ + "map_type currently only support '11point' and 'integral'" + self.map_type = map_type + + self.reset() + + def reset(self): + self.is_latest = True + self.class_score_poss = None + self.class_gt_counts = None + self.mAP = 0.0 + + def one_class_update(self, score, gt_label, class_idx): + topk_idx = np.argsort(score)[::-1] + topk_score = score[topk_idx] + topk_gt_label = gt_label[topk_idx] + for s, l in zip(topk_score, topk_gt_label): + if int(l) == 1: + self.class_score_poss[class_idx].append([s, 1.]) + self.class_gt_counts[class_idx] += 1 + else: + self.class_score_poss[class_idx].append([s, 0.]) + + @staticmethod + def get_tp_fp_accum(score_pos_list): + """ + Calculate accumulating true/false positive results from + [score, pos] records + """ + sorted_list = sorted(score_pos_list, key=lambda s: s[0], reverse=True) + + accum_tp = 0 + accum_fp = 0 + accum_tp_list = [] + accum_fp_list = [] + for (score, pos) in sorted_list: + accum_tp += int(pos) + accum_tp_list.append(accum_tp) + accum_fp += 1 - int(pos) + accum_fp_list.append(accum_fp) + + return accum_tp_list, accum_fp_list + + def compute_mAP(self): + if not self.is_latest: + mAP = 0. + valid_cnt = 0 + for score_pos, count in zip(self.class_score_poss, + self.class_gt_counts): + if count == 0: + continue + + if len(score_pos) == 0: + valid_cnt += 1 + continue + + accum_tp_list, accum_fp_list = \ + self.get_tp_fp_accum(score_pos) + precision = [] + recall = [] + for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list): + precision.append(float(ac_tp) / (ac_tp + ac_fp)) + recall.append(float(ac_tp) / count) + + one_class_ap = 0.0 + if self.map_type == '11point': + max_precisions = [0.] * 11 + start_idx = len(precision) - 1 + for j in range(10, -1, -1): + for i in range(start_idx, -1, -1): + if recall[i] < float(j) / 10.: + start_idx = i + if j > 0: + max_precisions[j - 1] = max_precisions[j] + break + else: + if max_precisions[j] < precision[i]: + max_precisions[j] = precision[i] + one_class_ap = sum(max_precisions) / 11. + mAP += one_class_ap + valid_cnt += 1 + elif self.map_type == 'integral': + import math + prev_recall = 0. 
+ for i in range(len(precision)): + recall_gap = math.fabs(recall[i] - prev_recall) + if recall_gap > 1e-6: + one_class_ap += precision[i] * recall_gap + prev_recall = recall[i] + mAP += one_class_ap + valid_cnt += 1 + else: + raise NotImplementedError( + f"Unsupported mAP type {self.map_type}") + + self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP + + self.is_latest = True + + def forward(self, output, target): + scores = F.sigmoid(output).numpy() + gt_labels = target.numpy() + + if self.class_score_poss is None: + self.class_score_poss = [[] for _ in range(scores.shape[-1])] + if self.class_gt_counts is None: + self.class_gt_counts = [0] * scores.shape[-1] + + for class_idx in range(scores.shape[-1]): + score = scores[:, class_idx] + gt_label = gt_labels[:, class_idx] + self.one_class_update(score, gt_label, class_idx) + + self.is_latest = False + + return {} + + @property + def avg_info(self): + self.compute_mAP() + return f"MultiLabelMAP({self.map_type}): {self.mAP:.3f}" + + @property + def avg(self): + self.compute_mAP() + return self.mAP diff --git a/example/low_precision_training/ppcls/optimizer/__init__.py b/example/low_precision_training/ppcls/optimizer/__init__.py new file mode 100755 index 000000000..992bc57a1 --- /dev/null +++ b/example/low_precision_training/ppcls/optimizer/__init__.py @@ -0,0 +1,137 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import copy +import paddle +from typing import Dict, List + +from ppcls.engine.train.utils import type_name +from ppcls.utils import logger + +from . import optimizer + +__all__ = ['build_optimizer'] + + +def build_lr_scheduler(lr_config, epochs, step_each_epoch): + from . import learning_rate + lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch}) + if 'name' in lr_config: + lr_name = lr_config.pop('name') + lr = getattr(learning_rate, lr_name)(**lr_config) + if isinstance(lr, paddle.optimizer.lr.LRScheduler): + return lr + else: + return lr() + else: + lr = lr_config['learning_rate'] + return lr + + +# model_list is None in static graph +def build_optimizer(config, epochs, step_each_epoch, model_list=None): + optim_config = copy.deepcopy(config) + if isinstance(optim_config, dict): + # convert {'name': xxx, **optim_cfg} to [{name: {scope: xxx, **optim_cfg}}] + optim_name = optim_config.pop("name") + optim_config: List[Dict[str, Dict]] = [{ + optim_name: { + 'scope': "all", + ** + optim_config + } + }] + optim_list = [] + lr_list = [] + """NOTE: + Currently only support optim objets below. + 1. single optimizer config. + 2. next level uner Arch, such as Arch.backbone, Arch.neck, Arch.head. + 3. loss which has parameters, such as CenterLoss. 
+ """ + for optim_item in optim_config: + # optim_cfg = {optim_name: {scope: xxx, **optim_cfg}} + # step1 build lr + optim_name = list(optim_item.keys())[0] # get optim_name + optim_scope = optim_item[optim_name].pop('scope') # get optim_scope + optim_cfg = optim_item[optim_name] # get optim_cfg + + lr = build_lr_scheduler(optim_cfg.pop('lr'), epochs, step_each_epoch) + logger.debug("build lr ({}) for scope ({}) success..".format( + lr, optim_scope)) + # step2 build regularization + if 'regularizer' in optim_cfg and optim_cfg['regularizer'] is not None: + if 'weight_decay' in optim_cfg: + logger.warning( + "ConfigError: Only one of regularizer and weight_decay can be set in Optimizer Config. \"weight_decay\" has been ignored." + ) + reg_config = optim_cfg.pop('regularizer') + reg_name = reg_config.pop('name') + 'Decay' + reg = getattr(paddle.regularizer, reg_name)(**reg_config) + optim_cfg["weight_decay"] = reg + logger.debug("build regularizer ({}) for scope ({}) success..". + format(reg, optim_scope)) + # step3 build optimizer + if 'clip_norm' in optim_cfg: + clip_norm = optim_cfg.pop('clip_norm') + grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm) + else: + grad_clip = None + optim_model = [] + + # for static graph + if model_list is None: + optim = getattr(optimizer, optim_name)( + learning_rate=lr, grad_clip=grad_clip, + **optim_cfg)(model_list=optim_model) + return optim, lr + + # for dynamic graph + for i in range(len(model_list)): + if len(model_list[i].parameters()) == 0: + continue + if optim_scope == "all": + # optimizer for all + optim_model.append(model_list[i]) + else: + if optim_scope.endswith("Loss"): + # optimizer for loss + for m in model_list[i].sublayers(True): + if type_name(m) == optim_scope: + optim_model.append(m) + else: + # opmizer for module in model, such as backbone, neck, head... + if optim_scope == type_name(model_list[i]): + optim_model.append(model_list[i]) + elif hasattr(model_list[i], optim_scope): + optim_model.append(getattr(model_list[i], optim_scope)) + else: + for name, layer in model_list[i].named_sublayers(): + if len(layer.parameters()) != 0 \ + and re.fullmatch(optim_scope, name): + optim_model.append(layer) + + optim = getattr(optimizer, optim_name)( + learning_rate=lr, grad_clip=grad_clip, + **optim_cfg)(model_list=optim_model) + logger.debug("build optimizer ({}) for scope ({}) success..".format( + optim, optim_scope)) + optim_list.append(optim) + lr_list.append(lr) + return optim_list, lr_list diff --git a/example/low_precision_training/ppcls/optimizer/learning_rate.py b/example/low_precision_training/ppcls/optimizer/learning_rate.py new file mode 100755 index 000000000..168bf5e39 --- /dev/null +++ b/example/low_precision_training/ppcls/optimizer/learning_rate.py @@ -0,0 +1,687 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+import math
+import types
+from abc import abstractmethod
+from typing import Union
+from paddle.optimizer import lr
+from ppcls.utils import logger
+
+
+class LRBase(object):
+    """Base class for custom learning rates
+
+    Args:
+        epochs (int): total epoch(s)
+        step_each_epoch (int): number of iterations within an epoch
+        learning_rate (float): learning rate
+        warmup_epoch (int): number of warmup epoch(s)
+        warmup_start_lr (float): start learning rate within warmup
+        last_epoch (int): last epoch
+        by_epoch (bool): learning rate decays by epoch when by_epoch is True, else by iter
+        verbose (bool): If True, prints a message to stdout for each update. Defaults to False
+    """
+
+    def __init__(self,
+                 epochs: int,
+                 step_each_epoch: int,
+                 learning_rate: float,
+                 warmup_epoch: int,
+                 warmup_start_lr: float,
+                 last_epoch: int,
+                 by_epoch: bool,
+                 verbose: bool=False) -> None:
+        """Initialize and record the necessary parameters
+        """
+        super(LRBase, self).__init__()
+        if warmup_epoch >= epochs:
+            msg = f"When using warm up, the value of \"Global.epochs\" must be greater than the value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}."
+            logger.warning(msg)
+            warmup_epoch = epochs
+        self.epochs = epochs
+        self.step_each_epoch = step_each_epoch
+        self.learning_rate = learning_rate
+        self.warmup_epoch = warmup_epoch
+        self.warmup_steps = self.warmup_epoch if by_epoch else round(
+            self.warmup_epoch * self.step_each_epoch)
+        self.warmup_start_lr = warmup_start_lr
+        self.last_epoch = last_epoch
+        self.by_epoch = by_epoch
+        self.verbose = verbose
+
+    @abstractmethod
+    def __call__(self, *args, **kwargs) -> lr.LRScheduler:
+        """generate a learning rate scheduler
+
+        Returns:
+            lr.LRScheduler: learning rate scheduler
+        """
+        pass
+
+    def linear_warmup(
+            self,
+            learning_rate: Union[float, lr.LRScheduler]) -> lr.LinearWarmup:
+        """Add a linear warmup before learning_rate
+
+        Args:
+            learning_rate (Union[float, lr.LRScheduler]): original learning rate without warmup
+
+        Returns:
+            lr.LinearWarmup: learning rate scheduler with warmup
+        """
+        warmup_lr = lr.LinearWarmup(
+            learning_rate=learning_rate,
+            warmup_steps=self.warmup_steps,
+            start_lr=self.warmup_start_lr,
+            end_lr=self.learning_rate,
+            last_epoch=self.last_epoch,
+            verbose=self.verbose)
+        return warmup_lr
+
+
+class Constant(lr.LRScheduler):
+    """Constant learning rate class implementation
+
+    Args:
+        learning_rate (float): The initial learning rate
+        last_epoch (int, optional): The index of last epoch. Default: -1.
+ """ + + def __init__(self, learning_rate, last_epoch=-1, **kwargs): + self.learning_rate = learning_rate + self.last_epoch = last_epoch + super(Constant, self).__init__() + + def get_lr(self) -> float: + """always return the same learning rate + """ + return self.learning_rate + + +class ConstLR(LRBase): + """Constant learning rate + + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + learning_rate (float): learning rate + warmup_epoch (int): number of warmup epoch(s) + warmup_start_lr (float): start learning rate within warmup + last_epoch (int): last epoch + by_epoch (bool): learning rate decays by epoch when by_epoch is True, else by iter + """ + + def __init__(self, + epochs, + step_each_epoch, + learning_rate, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + by_epoch=False, + **kwargs): + super(ConstLR, self).__init__(epochs, step_each_epoch, learning_rate, + warmup_epoch, warmup_start_lr, + last_epoch, by_epoch) + + def __call__(self): + learning_rate = Constant( + learning_rate=self.learning_rate, last_epoch=self.last_epoch) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class Linear(LRBase): + """Linear learning rate decay + + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + learning_rate (float): learning rate + end_lr (float, optional): The minimum final learning rate. Defaults to 0.0. + power (float, optional): Power of polynomial. Defaults to 1.0. + warmup_epoch (int): number of warmup epoch(s) + warmup_start_lr (float): start learning rate within warmup + last_epoch (int): last epoch + by_epoch (bool): learning rate decays by epoch when by_epoch is True, else by iter + """ + + def __init__(self, + epochs, + step_each_epoch, + learning_rate, + end_lr=0.0, + power=1.0, + cycle=False, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + by_epoch=False, + **kwargs): + super(Linear, self).__init__(epochs, step_each_epoch, learning_rate, + warmup_epoch, warmup_start_lr, last_epoch, + by_epoch) + self.decay_steps = (epochs - self.warmup_epoch) * step_each_epoch + self.end_lr = end_lr + self.power = power + self.cycle = cycle + self.warmup_steps = round(self.warmup_epoch * step_each_epoch) + if self.by_epoch: + self.decay_steps = self.epochs - self.warmup_epoch + + def __call__(self): + learning_rate = lr.PolynomialDecay( + learning_rate=self.learning_rate, + decay_steps=self.decay_steps, + end_lr=self.end_lr, + power=self.power, + cycle=self.cycle, + last_epoch=self.last_epoch) if self.decay_steps > 0 else Constant( + self.learning_rate) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class Cosine(LRBase): + """Cosine learning rate decay + + ``lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1)`` + + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + learning_rate (float): learning rate + eta_min (float, optional): Minimum learning rate. Defaults to 0.0. + warmup_epoch (int, optional): The epoch numbers for LinearWarmup. Defaults to 0. + warmup_start_lr (float, optional): start learning rate within warmup. Defaults to 0.0. + last_epoch (int, optional): last epoch. Defaults to -1. + by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. 
Defaults to False. + """ + + def __init__(self, + epochs, + step_each_epoch, + learning_rate, + eta_min=0.0, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + by_epoch=False, + **kwargs): + super(Cosine, self).__init__(epochs, step_each_epoch, learning_rate, + warmup_epoch, warmup_start_lr, last_epoch, + by_epoch) + self.T_max = (self.epochs - self.warmup_epoch) * self.step_each_epoch + self.eta_min = eta_min + if self.by_epoch: + self.T_max = self.epochs - self.warmup_epoch + + def __call__(self): + learning_rate = lr.CosineAnnealingDecay( + learning_rate=self.learning_rate, + T_max=self.T_max, + eta_min=self.eta_min, + last_epoch=self.last_epoch) if self.T_max > 0 else Constant( + self.learning_rate) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class Cyclic(LRBase): + """Cyclic learning rate decay + + Args: + epochs (int): Total epoch(s). + step_each_epoch (int): Number of iterations within an epoch. + base_learning_rate (float): Initial learning rate, which is the lower boundary in the cycle. The paper recommends + that set the base_learning_rate to 1/3 or 1/4 of max_learning_rate. + max_learning_rate (float): Maximum learning rate in the cycle. It defines the cycle amplitude as above. + Since there is some scaling operation during process of learning rate adjustment, + max_learning_rate may not actually be reached. + warmup_epoch (int): Number of warmup epoch(s). + warmup_start_lr (float): Start learning rate within warmup. + step_size_up (int): Number of training steps, which is used to increase learning rate in a cycle. + The step size of one cycle will be defined by step_size_up + step_size_down. According to the paper, step + size should be set as at least 3 or 4 times steps in one epoch. + step_size_down (int, optional): Number of training steps, which is used to decrease learning rate in a cycle. + If not specified, it's value will initialize to `` step_size_up `` . Default: None. + mode (str, optional): One of 'triangular', 'triangular2' or 'exp_range'. + If scale_fn is specified, this argument will be ignored. Default: 'triangular'. + exp_gamma (float): Constant in 'exp_range' scaling function: exp_gamma**iterations. Used only when mode = 'exp_range'. Default: 1.0. + scale_fn (function, optional): A custom scaling function, which is used to replace three build-in methods. + It should only have one argument. For all x >= 0, 0 <= scale_fn(x) <= 1. + If specified, then 'mode' will be ignored. Default: None. + scale_mode (str, optional): One of 'cycle' or 'iterations'. Defines whether scale_fn is evaluated on cycle + number or cycle iterations (total iterations since start of training). Default: 'cycle'. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + by_epoch (bool): Learning rate decays by epoch when by_epoch is True, else by iter. + verbose: (bool, optional): If True, prints a message to stdout for each update. Defaults to False. 
+ """ + + def __init__(self, + epochs, + step_each_epoch, + base_learning_rate, + max_learning_rate, + warmup_epoch, + warmup_start_lr, + step_size_up, + step_size_down=None, + mode='triangular', + exp_gamma=1.0, + scale_fn=None, + scale_mode='cycle', + by_epoch=False, + last_epoch=-1, + verbose=False): + super(Cyclic, self).__init__( + epochs, step_each_epoch, base_learning_rate, warmup_epoch, + warmup_start_lr, last_epoch, by_epoch, verbose) + self.base_learning_rate = base_learning_rate + self.max_learning_rate = max_learning_rate + self.step_size_up = step_size_up + self.step_size_down = step_size_down + self.mode = mode + self.exp_gamma = exp_gamma + self.scale_fn = scale_fn + self.scale_mode = scale_mode + + def __call__(self): + learning_rate = lr.CyclicLR( + base_learning_rate=self.base_learning_rate, + max_learning_rate=self.max_learning_rate, + step_size_up=self.step_size_up, + step_size_down=self.step_size_down, + mode=self.mode, + exp_gamma=self.exp_gamma, + scale_fn=self.scale_fn, + scale_mode=self.scale_mode, + last_epoch=self.last_epoch, + verbose=self.verbose) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class Step(LRBase): + """Step learning rate decay + + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + learning_rate (float): learning rate + step_size (int|float): the interval to update. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma``. It should be less than 1.0. Default: 0.1. + warmup_epoch (int, optional): The epoch numbers for LinearWarmup. Defaults to 0. + warmup_start_lr (float, optional): start learning rate within warmup. Defaults to 0.0. + last_epoch (int, optional): last epoch. Defaults to -1. + by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. Defaults to False. + """ + + def __init__(self, + epochs, + step_each_epoch, + learning_rate, + step_size, + gamma, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + by_epoch=False, + **kwargs): + super(Step, self).__init__(epochs, step_each_epoch, learning_rate, + warmup_epoch, warmup_start_lr, last_epoch, + by_epoch) + self.step_size = int(step_size * step_each_epoch) + self.gamma = gamma + if self.by_epoch: + self.step_size = step_size + + def __call__(self): + learning_rate = lr.StepDecay( + learning_rate=self.learning_rate, + step_size=self.step_size, + gamma=self.gamma, + last_epoch=self.last_epoch) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class Piecewise(LRBase): + """Piecewise learning rate decay + + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + decay_epochs (List[int]): A list of steps numbers. The type of element in the list is python int. + values (List[float]): A list of learning rate values that will be picked during different epoch boundaries. + warmup_epoch (int, optional): The epoch numbers for LinearWarmup. Defaults to 0. + warmup_start_lr (float, optional): start learning rate within warmup. Defaults to 0.0. + last_epoch (int, optional): last epoch. Defaults to -1. + by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. Defaults to False. 
+ """ + + def __init__(self, + epochs, + step_each_epoch, + decay_epochs, + values, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + by_epoch=False, + learning_rate=None, + **kwargs): + if learning_rate: + decay_epochs = list(range(0, epochs, 30)) + values = [ + learning_rate * (0.1**i) for i in range(len(decay_epochs)) + ] + # when total epochs < 30, decay_epochs and values should be + # [] and [lr] respectively, but paddle dont support. + if len(decay_epochs) == 1: + decay_epochs = [epochs] + values = [values[0], values[0]] + else: + decay_epochs = decay_epochs[1:] + logger.warning( + "When 'learning_rate' of Piecewise has beed set, " + "the learning rate scheduler would be set by the rule that lr decay 10 times every 30 epochs. " + f"So, the 'decay_epochs' and 'values' have been set to {decay_epochs} and {values} respectively." + ) + super(Piecewise, + self).__init__(epochs, step_each_epoch, values[0], warmup_epoch, + warmup_start_lr, last_epoch, by_epoch) + + self.values = values + self.boundaries_steps = [e * step_each_epoch for e in decay_epochs] + if self.by_epoch is True: + self.boundaries_steps = decay_epochs + + def __call__(self): + learning_rate = lr.PiecewiseDecay( + boundaries=self.boundaries_steps, + values=self.values, + last_epoch=self.last_epoch) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class MultiStepDecay(LRBase): + """MultiStepDecay learning rate decay + + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + learning_rate (float): learning rate + milestones (List[int]): List of each boundaries. Must be increasing. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma``. It should be less than 1.0. Defaults to 0.1. + warmup_epoch (int, optional): The epoch numbers for LinearWarmup. Defaults to 0. + warmup_start_lr (float, optional): start learning rate within warmup. Defaults to 0.0. + last_epoch (int, optional): last epoch. Defaults to -1. + by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. Defaults to False. + """ + + def __init__(self, + epochs, + step_each_epoch, + learning_rate, + milestones, + gamma=0.1, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + by_epoch=False, + **kwargs): + super(MultiStepDecay, self).__init__( + epochs, step_each_epoch, learning_rate, warmup_epoch, + warmup_start_lr, last_epoch, by_epoch) + self.milestones = [x * step_each_epoch for x in milestones] + self.gamma = gamma + if self.by_epoch: + self.milestones = milestones + + def __call__(self): + learning_rate = lr.MultiStepDecay( + learning_rate=self.learning_rate, + milestones=self.milestones, + gamma=self.gamma, + last_epoch=self.last_epoch) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class ReduceOnPlateau(LRBase): + """ReduceOnPlateau learning rate decay + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + learning_rate (float): learning rate + mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the + learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'``, the learning + rate will reduce when ``loss`` stops ascending. Defaults to ``'min'``. 
+        factor (float, optional): The ratio by which the learning rate will be reduced. ``new_lr = origin_lr * factor``.
+            It should be less than 1.0. Defaults to 0.1.
+        patience (int, optional): When ``loss`` doesn't improve for this number of epochs, the learning rate will be reduced.
+            Defaults to 10.
+        threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss``.
+            This makes tiny changes of ``loss`` be ignored. Defaults to 1e-4.
+        threshold_mode (str, optional): ``'rel'`` or ``'abs'`` can be selected. In ``'rel'`` mode, the minimum change of ``loss``
+            is ``last_loss * threshold``, where ``last_loss`` is ``loss`` in last epoch. In ``'abs'`` mode, the minimum
+            change of ``loss`` is ``threshold``. Defaults to ``'rel'``.
+        cooldown (int, optional): The number of epochs to wait before resuming normal operation. Defaults to 0.
+        min_lr (float, optional): The lower bound of the learning rate after reduction. Defaults to 0.
+        epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon,
+            the update is ignored. Defaults to 1e-8.
+        warmup_epoch (int, optional): The epoch numbers for LinearWarmup. Defaults to 0.
+        warmup_start_lr (float, optional): start learning rate within warmup. Defaults to 0.0.
+        last_epoch (int, optional): last epoch. Defaults to -1.
+        by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. Defaults to False.
+    """
+
+    def __init__(self,
+                 epochs,
+                 step_each_epoch,
+                 learning_rate,
+                 mode='min',
+                 factor=0.1,
+                 patience=10,
+                 threshold=1e-4,
+                 threshold_mode='rel',
+                 cooldown=0,
+                 min_lr=0,
+                 epsilon=1e-8,
+                 warmup_epoch=0,
+                 warmup_start_lr=0.0,
+                 last_epoch=-1,
+                 by_epoch=False,
+                 **kwargs):
+        super(ReduceOnPlateau, self).__init__(
+            epochs, step_each_epoch, learning_rate, warmup_epoch,
+            warmup_start_lr, last_epoch, by_epoch)
+        self.mode = mode
+        self.factor = factor
+        self.patience = patience
+        self.threshold = threshold
+        self.threshold_mode = threshold_mode
+        self.cooldown = cooldown
+        self.min_lr = min_lr
+        self.epsilon = epsilon
+
+    def __call__(self):
+        learning_rate = lr.ReduceOnPlateau(
+            learning_rate=self.learning_rate,
+            mode=self.mode,
+            factor=self.factor,
+            patience=self.patience,
+            threshold=self.threshold,
+            threshold_mode=self.threshold_mode,
+            cooldown=self.cooldown,
+            min_lr=self.min_lr,
+            epsilon=self.epsilon)
+
+        if self.warmup_steps > 0:
+            learning_rate = self.linear_warmup(learning_rate)
+
+        # NOTE: Implement get_lr() method for class `ReduceOnPlateau`,
+        # which is called in `log_info` function
+        def get_lr(self):
+            return self.last_lr
+
+        learning_rate.get_lr = types.MethodType(get_lr, learning_rate)
+
+        setattr(learning_rate, "by_epoch", self.by_epoch)
+        return learning_rate
+
+
+class CosineFixmatch(LRBase):
+    """Cosine decay in FixMatch style
+
+    Args:
+        epochs (int): total epoch(s)
+        step_each_epoch (int): number of iterations within an epoch
+        learning_rate (float): learning rate
+        num_warmup_steps (int): the number of warmup steps.
+        num_cycles (float, optional): the factor for the cosine in the FixMatch learning rate. Defaults to 7 / 16.
+        last_epoch (int, optional): last epoch. Defaults to -1.
+        by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. Defaults to False.
+ """ + + def __init__(self, + epochs, + step_each_epoch, + learning_rate, + num_warmup_steps, + num_cycles=7 / 16, + last_epoch=-1, + by_epoch=False): + self.epochs = epochs + self.step_each_epoch = step_each_epoch + self.learning_rate = learning_rate + self.num_warmup_steps = num_warmup_steps + self.num_cycles = num_cycles + self.last_epoch = last_epoch + self.by_epoch = by_epoch + + def __call__(self): + def _lr_lambda(current_step): + if current_step < self.num_warmup_steps: + return float(current_step) / float( + max(1, self.num_warmup_steps)) + no_progress = float(current_step - self.num_warmup_steps) / \ + float(max(1, self.epochs * self.step_each_epoch - self.num_warmup_steps)) + return max(0., math.cos(math.pi * self.num_cycles * no_progress)) + + learning_rate = lr.LambdaDecay( + learning_rate=self.learning_rate, + lr_lambda=_lr_lambda, + last_epoch=self.last_epoch) + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class OneCycleLR(LRBase): + """OneCycleLR learning rate decay + + Args: + epochs (int): Total epoch(s). + step_each_epoch (int): Number of iterations within an epoch. + max_learning_rate (float): Maximum learning rate in the cycle. It defines the cycle amplitude as above. + Since there is some scaling operation during process of learning rate adjustment, + max_learning_rate may not actually be reached. + warmup_epoch (int): Number of warmup epoch(s). + warmup_start_lr (float): Start learning rate within warmup. + divide_factor (float, optional): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25. + phase_pct (float): The percentage of total steps which used to increasing learning rate. Default: 0.3. + end_learning_rate (float, optional): The minimum learning rate during training, it should be much less than initial learning rate. + anneal_strategy (str, optional): Strategy of adjusting learning rate.'cos' for cosine annealing, 'linear' for linear annealing. Default: 'cos'. + three_phase (bool, optional): Whether to use three-phase. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + by_epoch (bool): Learning rate decays by epoch when by_epoch is True, else by iter. + verbose: (bool, optional): If True, prints a message to stdout for each update. Defaults to False. 
+ """ + + def __init__(self, + epochs, + step_each_epoch, + max_learning_rate, + warmup_epoch=0, + warmup_start_lr=0.0, + divide_factor=25.0, + end_learning_rate=1e-10, + phase_pct=0.3, + anneal_strategy='cos', + three_phase=False, + by_epoch=False, + last_epoch=-1, + verbose=False): + super().__init__( + epochs, step_each_epoch, max_learning_rate, + warmup_epoch, warmup_start_lr, last_epoch, by_epoch, verbose) + self.max_learning_rate = max_learning_rate + self.total_steps = epochs * step_each_epoch + self.divide_factor = divide_factor + self.end_learning_rate = end_learning_rate + self.phase_pct = phase_pct + self.anneal_strategy = anneal_strategy + self.three_phase = three_phase + self.last_epoch = last_epoch + + def __call__(self): + learning_rate = lr.OneCycleLR( + max_learning_rate=self.max_learning_rate, + total_steps=self.total_steps, + end_learning_rate=self.end_learning_rate, + divide_factor=self.divide_factor, + phase_pct=self.phase_pct, + anneal_strategy=self.anneal_strategy, + three_phase=self.three_phase, + last_epoch=self.last_epoch, + verbose=self.verbose) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate diff --git a/example/low_precision_training/ppcls/optimizer/optimizer.py b/example/low_precision_training/ppcls/optimizer/optimizer.py new file mode 100755 index 000000000..adc417c87 --- /dev/null +++ b/example/low_precision_training/ppcls/optimizer/optimizer.py @@ -0,0 +1,518 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import paddle +from paddle import optimizer as optim +from ppcls.utils import logger +from functools import partial + + +class SGD(object): + """ + Args: + learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``. + It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001. + parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. + If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . 
There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): The default value is None. Normally there is no need for user + to set this property. + """ + + def __init__(self, + learning_rate=0.001, + weight_decay=None, + grad_clip=None, + multi_precision=False, + name=None): + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.multi_precision = multi_precision + self.name = name + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + argspec = inspect.getfullargspec(optim.SGD.__init__).args + if 'multi_precision' in argspec: + opt = optim.SGD(learning_rate=self.learning_rate, + parameters=parameters, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + multi_precision=self.multi_precision, + name=self.name) + else: + opt = optim.SGD(learning_rate=self.learning_rate, + parameters=parameters, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + name=self.name) + return opt + + +class Momentum(object): + """ + Simple Momentum optimizer with velocity state. + Args: + learning_rate (float|Variable) - The learning rate used to update parameters. + Can be a float value or a Variable with one float value as data element. + momentum (float) - Momentum factor. + regularization (WeightDecayRegularizer, optional) - The strategy of regularization. + """ + + def __init__(self, + learning_rate, + momentum, + weight_decay=None, + grad_clip=None, + use_nesterov=False, + multi_precision=True, + no_weight_decay_name=None, + one_dim_param_no_weight_decay=False): + super().__init__() + self.learning_rate = learning_rate + self.momentum = momentum + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.multi_precision = multi_precision + self.use_nesterov = use_nesterov + self.no_weight_decay_name_list = no_weight_decay_name.split( + ) if no_weight_decay_name else [] + self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay + + def __call__(self, model_list): + # model_list is None in static graph + parameters = None + if model_list: + # TODO(gaotingquan): to avoid cause issues for unset no_weight_decay models + if len(self.no_weight_decay_name_list) > 0: + params_with_decay = [] + params_without_decay = [] + for m in model_list: + for n, p in m.named_parameters(): + if any(nd in n for nd in self.no_weight_decay_name_list) \ + or (self.one_dim_param_no_weight_decay and len(p.shape) == 1): + params_without_decay.append(p) + else: + params_with_decay.append(p) + parameters = [{ + "params": params_with_decay, + "weight_decay": self.weight_decay + }, { + "params": params_without_decay, + "weight_decay": 0.0 + }] + else: + parameters = sum([m.parameters() for m in model_list], []) + opt = optim.Momentum( + learning_rate=self.learning_rate, + momentum=self.momentum, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + multi_precision=self.multi_precision, + use_nesterov=self.use_nesterov, + parameters=parameters) + if hasattr(opt, '_use_multi_tensor'): + opt = optim.Momentum( + learning_rate=self.learning_rate, + momentum=self.momentum, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + multi_precision=self.multi_precision, + parameters=parameters, + use_nesterov=self.use_nesterov, + 
use_multi_tensor=True) + return opt + + +class Adam(object): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + parameter_list=None, + weight_decay=None, + grad_clip=None, + name=None, + lazy_mode=False, + multi_precision=False): + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.parameter_list = parameter_list + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.name = name + self.lazy_mode = lazy_mode + self.multi_precision = multi_precision + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + opt = optim.Adam( + learning_rate=self.learning_rate, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + name=self.name, + lazy_mode=self.lazy_mode, + multi_precision=self.multi_precision, + parameters=parameters) + return opt + + +class RMSProp(object): + """ + Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method. + Args: + learning_rate (float|Variable) - The learning rate used to update parameters. + Can be a float value or a Variable with one float value as data element. + momentum (float) - Momentum factor. + rho (float) - rho value in equation. + epsilon (float) - avoid division by zero, default is 1e-6. + regularization (WeightDecayRegularizer, optional) - The strategy of regularization. + """ + + def __init__(self, + learning_rate, + momentum=0.0, + rho=0.95, + epsilon=1e-6, + weight_decay=None, + grad_clip=None, + multi_precision=False, + no_weight_decay_name=None, + one_dim_param_no_weight_decay=False): + super().__init__() + self.learning_rate = learning_rate + self.momentum = momentum + self.rho = rho + self.epsilon = epsilon + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.no_weight_decay_name_list = no_weight_decay_name.split( + ) if no_weight_decay_name else [] + self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay + + def __call__(self, model_list): + # model_list is None in static graph + parameters = None + if model_list: + params_with_decay = [] + params_without_decay = [] + for m in model_list: + for n, p in m.named_parameters(): + if any(nd in n for nd in self.no_weight_decay_name_list) \ + or (self.one_dim_param_no_weight_decay and len(p.shape) == 1): + params_without_decay.append(p) + else: + params_with_decay.append(p) + if params_without_decay: + parameters = [{ + "params": params_with_decay, + "weight_decay": self.weight_decay + }, { + "params": params_without_decay, + "weight_decay": 0.0 + }] + else: + parameters = params_with_decay + opt = optim.RMSProp( + learning_rate=self.learning_rate, + momentum=self.momentum, + rho=self.rho, + epsilon=self.epsilon, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + parameters=parameters) + return opt + + +class AdamW(object): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + weight_decay=None, + multi_precision=False, + grad_clip=None, + no_weight_decay_name=None, + one_dim_param_no_weight_decay=False, + **args): + super().__init__() + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.grad_clip = grad_clip + self.weight_decay = weight_decay + self.multi_precision = multi_precision + self.no_weight_decay_name_list = 
no_weight_decay_name.split( + ) if no_weight_decay_name else [] + self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + + # TODO(gaotingquan): model_list is None when in static graph, "no_weight_decay" not work. + if model_list is None: + if self.one_dim_param_no_weight_decay or len( + self.no_weight_decay_name_list) != 0: + msg = "\"AdamW\" does not support setting \"no_weight_decay\" in static graph. Please use dynamic graph." + logger.error(Exception(msg)) + raise Exception(msg) + + self.no_weight_decay_param_name_list = [ + p.name for model in model_list for n, p in model.named_parameters() + if any(nd in n for nd in self.no_weight_decay_name_list) + ] if model_list else [] + + if self.one_dim_param_no_weight_decay: + self.no_weight_decay_param_name_list += [ + p.name + for model in model_list for n, p in model.named_parameters() + if len(p.shape) == 1 + ] if model_list else [] + + opt = optim.AdamW( + learning_rate=self.learning_rate, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon, + parameters=parameters, + weight_decay=self.weight_decay, + multi_precision=self.multi_precision, + grad_clip=self.grad_clip, + apply_decay_param_fun=self._apply_decay_param_fun) + return opt + + def _apply_decay_param_fun(self, name): + return name not in self.no_weight_decay_param_name_list + + +class AdamWDL(object): + """ + The AdamWDL optimizer is implemented based on the AdamW Optimization with dynamic lr setting. + Generally it's used for transformer model. + """ + + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + weight_decay=None, + multi_precision=False, + grad_clip=None, + layerwise_decay=None, + filter_bias_and_bn=True, + **args): + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.grad_clip = grad_clip + self.weight_decay = weight_decay + self.multi_precision = multi_precision + self.layerwise_decay = layerwise_decay + self.filter_bias_and_bn = filter_bias_and_bn + + class AdamWDLImpl(optim.AdamW): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + parameters=None, + weight_decay=0.01, + apply_decay_param_fun=None, + grad_clip=None, + lazy_mode=False, + multi_precision=False, + layerwise_decay=1.0, + n_layers=12, + name_dict=None, + name=None): + if not isinstance(layerwise_decay, float) and \ + not isinstance(layerwise_decay, paddle.static.Variable): + raise TypeError("coeff should be float or Tensor.") + self.layerwise_decay = layerwise_decay + self.name_dict = name_dict + self.n_layers = n_layers + self._coeff = weight_decay + self._lr_to_coeff = dict() + self.set_param_lr_func = partial( + self._layerwise_lr_decay, layerwise_decay, name_dict, n_layers) + super().__init__( + learning_rate=learning_rate, + parameters=parameters, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + grad_clip=grad_clip, + name=name, + apply_decay_param_fun=apply_decay_param_fun, + weight_decay=weight_decay, + lazy_mode=lazy_mode, + multi_precision=multi_precision,) + + # Layerwise decay + def _layerwise_lr_decay(self, decay_rate, name_dict, n_layers, param): + """ + Args: + decay_rate (float): + The layer-wise decay ratio. + name_dict (dict): + The keys of name_dict is dynamic name of model while the value + of name_dict is static name. + Use model.named_parameters() to get name_dict. 
+                n_layers (int):
+                    Total number of layers in the transformer encoder.
+            """
+            ratio = 1.0
+            static_name = name_dict[param.name]
+            if "blocks" in static_name:
+                idx = static_name.find("blocks.")
+                layer = int(static_name[idx:].split(".")[1])
+                ratio = decay_rate**(n_layers - layer)
+            elif any([
+                    key in static_name
+                    for key in ["embed", "token", "conv1", "ln_pre"]
+            ]):
+                ratio = decay_rate**(n_layers + 1)
+            # param.optimize_attr["learning_rate"] *= ratio
+            return ratio
+        def _append_decoupled_weight_decay(self, block, param_and_grad):
+            """
+            Add decoupled weight decay op.
+                parameter = parameter - parameter * coeff * lr
+            Args:
+                block: block in which variable is to be created
+                param_and_grad: (parameters, gradients) pairs,
+                    the parameters need to decay.
+            Raises:
+                Exception: The type of coeff and parameter is not consistent.
+            """
+            if isinstance(param_and_grad, dict):
+                param_and_grad = self._update_param_group(param_and_grad)
+            param, grad = param_and_grad
+
+            if self._apply_decay_param_fun is not None and not self._apply_decay_param_fun(param.name):
+                return
+
+            if isinstance(self._learning_rate, float):
+                learning_rate = self._learning_rate
+            else:
+                # NOTE. We add this function to the _append_optimize_op(),
+                # for we must make sure _create_param_lr() be called after
+                # optimizer._create_global_learning_rate().
+                learning_rate = self._create_param_lr(param_and_grad)
+
+            with block.program._optimized_guard([param, grad]), paddle.static.name_scope("weight decay"):
+                self._params_name.add(param.name)
+
+                # If it has been calculated, the result will be reused.
+                # NOTE(wangxi): In dygraph mode, apply_gradient will be executed
+                # every step, so need clear _lr_to_coeff every step,
+                # we do this in _create_optimization_pass
+                decay_coeff = self._lr_to_coeff.get(learning_rate, None)
+                if decay_coeff is None:
+                    # NOTE(wangxi): for pipeline to set device:all
+                    with paddle.static.device_guard(None):
+                        decay_coeff = 1.0 - learning_rate * self._coeff
+                    self._lr_to_coeff[learning_rate] = decay_coeff
+
+                find_master = self._multi_precision and param.dtype == paddle.float16
+                if find_master:
+                    master_weight = self._master_weights[param.name]
+                    scaled_param = master_weight * decay_coeff
+                    paddle.assign(scaled_param, output=master_weight)
+                else:
+                    scaled_param = param * decay_coeff
+                    paddle.assign(scaled_param, output=param)
+
+        def _append_optimize_op(self, block, param_and_grad):
+            if self.set_param_lr_func is None:
+                return super()._append_optimize_op(block, param_and_grad)
+
+            self._append_decoupled_weight_decay(block, param_and_grad)
+            prev_lr = param_and_grad[0].optimize_attr["learning_rate"]
+            ratio = self.set_param_lr_func(param_and_grad[0])
+            param_and_grad[0].optimize_attr["learning_rate"] *= ratio
+
+            # execute the Adam op
+            res = super()._append_optimize_op(block, param_and_grad)
+            param_and_grad[0].optimize_attr["learning_rate"] = prev_lr
+            return res
+
+    def __call__(self, model_list):
+        model = model_list[0]
+        if self.weight_decay and self.filter_bias_and_bn:
+            skip = {}
+            if hasattr(model, 'no_weight_decay'):
+                skip = model.no_weight_decay()
+            decay_dict = {
+                param.name: not (len(param.shape) == 1 or
+                                 name.endswith(".bias") or name in skip)
+                for name, param in model.named_parameters()
+                if 'teacher' not in name
+            }
+            parameters = [
+                param for param in model.parameters()
+                if 'teacher' not in param.name
+            ]
+            weight_decay = 0.
+        else:
+            parameters = model.parameters()
+        opt_args = dict(
+            learning_rate=self.learning_rate, weight_decay=self.weight_decay)
+        opt_args['parameters'] = parameters
+        if self.weight_decay and self.filter_bias_and_bn:  # decay_dict exists only on this branch
+            opt_args['apply_decay_param_fun'] = lambda n: decay_dict[n]
+        opt_args['epsilon'] = self.epsilon
+        opt_args['beta1'] = self.beta1
+        opt_args['beta2'] = self.beta2
+        if self.layerwise_decay and self.layerwise_decay < 1.0:
+            opt_args['layerwise_decay'] = self.layerwise_decay
+            name_dict = dict()
+            for n, p in model.named_parameters():
+                name_dict[p.name] = n
+            opt_args['name_dict'] = name_dict
+            opt_args['n_layers'] = model.get_num_layers()
+        optimizer = self.AdamWDLImpl(**opt_args)
+
+        return optimizer
\ No newline at end of file
diff --git a/example/low_precision_training/ppcls/static/README.md b/example/low_precision_training/ppcls/static/README.md
new file mode 100755
index 000000000..24093f4d0
--- /dev/null
+++ b/example/low_precision_training/ppcls/static/README.md
@@ -0,0 +1,19 @@
+# Training with static graph mode
+
+The PaddlePaddle framework supports both dynamic graph mode and static graph mode. In most scenarios, training with the dynamic graph mode is sufficient. After continuous optimization over several releases, the training performance of dynamic graph models is now comparable to that of static graph. If static graph training is really required in certain scenarios, we recommend using the dynamic-to-static training feature first: build the model with the easier-to-use dynamic graph APIs, and then convert it to static graph mode for training. Meanwhile, for compatibility with static graph training, PaddleClas also provides a static graph training entry.
+
+## 1. Dynamic-to-static training
+
+Simply set the parameter `Global.to_static` to `True` in the training config file, and pass the input data shape via the parameter `Global.image_shape`, e.g. `[3, 224, 224]`.
+
+## 2. Static graph training
+
+In PaddleClas, static graph training is similar to dynamic graph training: the training parameters are also specified with a config file, and the training entry script is `ppcls/static/train.py`. Taking the ResNet50 model as an example, the launch command is as follows:
+
+```bash
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+python3.7 -m paddle.distributed.launch \
+    --gpus="0,1,2,3" \
+    ppcls/static/train.py \
+        -c ./ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml
+```
diff --git a/example/low_precision_training/ppcls/static/program.py b/example/low_precision_training/ppcls/static/program.py
new file mode 100755
index 000000000..0e8d0990d
--- /dev/null
+++ b/example/low_precision_training/ppcls/static/program.py
@@ -0,0 +1,445 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
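The `program.py` module added below assembles the whole static-graph training pipeline. As a reference point, this is the core pattern that its `create_feeds` helper wraps, shown as a minimal standalone sketch (the shapes are the illustrative defaults):

```python
import paddle

# Static-graph mode must be enabled before using paddle.static APIs.
paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    # NCHW image placeholder plus an int64 label, as in create_feeds().
    data = paddle.static.data(
        name="data", shape=[None, 3, 224, 224], dtype="float32")
    label = paddle.static.data(name="label", shape=[None, 1], dtype="int64")
```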
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+import numpy as np
+
+from collections import OrderedDict
+
+import paddle
+import paddle.nn.functional as F
+
+from paddle.distributed import fleet
+from paddle.distributed.fleet import DistributedStrategy
+
+# from ppcls.optimizer import OptimizerBuilder
+# from ppcls.optimizer.learning_rate import LearningRateBuilder
+
+from ppcls.arch import build_model
+from ppcls.loss import build_loss
+from ppcls.metric import build_metrics
+from ppcls.optimizer import build_optimizer
+from ppcls.optimizer import build_lr_scheduler
+
+from ppcls.utils.misc import AverageMeter
+from ppcls.utils import logger, profiler
+
+
+def create_feeds(image_shape, use_mix=False, class_num=None, dtype="float32"):
+    """
+    Create feeds as model input
+
+    Args:
+        image_shape(list[int]): model input shape, such as [3, 224, 224]
+        use_mix(bool): whether to use mix (including mixup, cutmix, fmix)
+        class_num(int): the class number of the network, required if use_mix
+
+    Returns:
+        feeds(dict): dict of model input variables
+    """
+    feeds = OrderedDict()
+    feeds['data'] = paddle.static.data(
+        name="data", shape=[None] + image_shape, dtype=dtype)
+
+    if use_mix:
+        if class_num is None:
+            msg = "When using MixUp, CutMix and so on, you must set class_num."
+            logger.error(msg)
+            raise Exception(msg)
+        feeds['target'] = paddle.static.data(
+            name="target", shape=[None, class_num], dtype="float32")
+    else:
+        feeds['label'] = paddle.static.data(
+            name="label", shape=[None, 1], dtype="int64")
+
+    return feeds
+
+
+def create_fetchs(out,
+                  feeds,
+                  architecture,
+                  topk=5,
+                  epsilon=None,
+                  class_num=None,
+                  use_mix=False,
+                  config=None,
+                  mode="Train"):
+    """
+    Create fetchs as model outputs (including loss and metrics),
+    will call create_loss and create_metric (if use_mix).
+    Args:
+        out(variable): model output variable
+        feeds(dict): dict of model input variables.
+            If use mix_up, it will not include label.
+        architecture(dict): architecture information,
+            name (such as ResNet50) is needed
+        topk(int): usually top5
+        epsilon(float): parameter for label smoothing, 0.0 <= epsilon <= 1.0
+        class_num(int): the class number of the network, required if use_mix
+        use_mix(bool): whether to use mix (including mixup, cutmix, fmix)
+        config(dict): model config
+
+    Returns:
+        fetchs(dict): dict of model outputs (including loss and metrics)
+    """
+    fetchs = OrderedDict()
+    # build loss
+    if use_mix:
+        if class_num is None:
+            msg = "When using MixUp, CutMix and so on, you must set class_num."
+            logger.error(msg)
+            raise Exception(msg)
+        target = paddle.reshape(feeds['target'], [-1, class_num])
+    else:
+        target = paddle.reshape(feeds['label'], [-1, 1])
+
+    loss_func = build_loss(config["Loss"][mode])
+    loss_dict = loss_func(out, target)
+
+    loss_out = loss_dict["loss"]
+    fetchs['loss'] = (loss_out, AverageMeter('loss', '7.4f', need_avg=True))
+
+    # build metric
+    if not use_mix:
+        metric_func = build_metrics(config["Metric"][mode])
+
+        metric_dict = metric_func(out, target)
+
+        for key in metric_dict:
+            if mode != "Train" and paddle.distributed.get_world_size() > 1:
+                paddle.distributed.all_reduce(
+                    metric_dict[key], op=paddle.distributed.ReduceOp.SUM)
+                metric_dict[key] = metric_dict[
+                    key] / paddle.distributed.get_world_size()
+
+            fetchs[key] = (metric_dict[key], AverageMeter(
+                key, '7.4f', need_avg=True))
+
+    return fetchs
+
+
+def create_optimizer(config, step_each_epoch):
+    # create learning_rate instance
+    optimizer, lr_sch = build_optimizer(
+        config["Optimizer"], config["Global"]["epochs"], step_each_epoch)
+    return optimizer, lr_sch
+
+
+def create_strategy(config):
+    """
+    Create the build strategy.
+
+    Args:
+        config(dict): config
+
+    Returns:
+        build_strategy: build strategy (only the build strategy is created
+            and returned; no exec strategy is used)
+    """
+    build_strategy = paddle.static.BuildStrategy()
+
+    fuse_op = True if 'AMP' in config and config['AMP'].get('use_amp',
+                                                            True) else False
+
+    fuse_bn_act_ops = config.get('fuse_bn_act_ops', fuse_op)
+    fuse_elewise_add_act_ops = config.get('fuse_elewise_add_act_ops', fuse_op)
+    fuse_bn_add_act_ops = config.get('fuse_bn_add_act_ops', fuse_op)
+    enable_addto = config.get('enable_addto', fuse_op)
+
+    build_strategy.fuse_bn_act_ops = fuse_bn_act_ops
+    build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
+    build_strategy.fuse_bn_add_act_ops = fuse_bn_add_act_ops
+    build_strategy.enable_addto = enable_addto
+
+    return build_strategy
+
+
+def dist_optimizer(config, optimizer):
+    """
+    Create a distributed optimizer based on a normal optimizer
+
+    Args:
+        config(dict):
+        optimizer(): a normal optimizer
+
+    Returns:
+        optimizer: a distributed optimizer
+    """
+    build_strategy = create_strategy(config)
+
+    dist_strategy = DistributedStrategy()
+    dist_strategy.build_strategy = build_strategy
+
+    dist_strategy.nccl_comm_num = 1
+    dist_strategy.fuse_all_reduce_ops = True
+    dist_strategy.fuse_grad_size_in_MB = 16
+    optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy)
+
+    return optimizer
+
+
+def mixed_precision_optimizer(config, optimizer):
+    if 'AMP' in config and config['AMP'].get('use_amp', True):
+        amp_cfg = config.AMP if config.AMP else dict()
+        scale_loss = amp_cfg.get('scale_loss', 1.0)
+        use_dynamic_loss_scaling = amp_cfg.get('use_dynamic_loss_scaling',
+                                               False)
+        use_pure_fp16 = amp_cfg.get("level", "O1") == "O2"
+        optimizer = paddle.static.amp.decorate(
+            optimizer,
+            init_loss_scaling=scale_loss,
+            use_dynamic_loss_scaling=use_dynamic_loss_scaling,
+            use_pure_fp16=use_pure_fp16,
+            use_fp16_guard=True)
+
+    return optimizer
+
+
+def build(config,
+          main_prog,
+          startup_prog,
+          class_num=None,
+          step_each_epoch=100,
+          is_train=True,
+          is_distributed=True):
+    """
+    Build a program using a model and an optimizer
+        1. create feeds
+        2. create a dataloader
+        3. create a model
+        4. create fetchs
+        5. create an optimizer
+
+    Args:
+        config(dict): config
+        main_prog(): main program
+        startup_prog(): startup program
+        class_num(int): the class number of the network, required if use_mix
+        is_train(bool): train or eval
+        is_distributed(bool): whether to use distributed training method
+
+    Returns:
+        dataloader(): a bridge between the model and the data
+        fetchs(dict): dict of model outputs (including loss and metrics)
+    """
+    with paddle.static.program_guard(main_prog, startup_prog):
+        with paddle.utils.unique_name.guard():
+            mode = "Train" if is_train else "Eval"
+            use_mix = "batch_transform_ops" in config["DataLoader"][mode][
+                "dataset"]
+            data_dtype = "float32"
+            if 'AMP' in config and config['AMP'].get(
+                    'use_amp', True) and config["AMP"]["level"] == 'O2':
+                data_dtype = "float16"
+            feeds = create_feeds(
+                config["Global"]["image_shape"],
+                use_mix,
+                class_num=class_num,
+                dtype=data_dtype)
+
+            # build model
+            # data_format should be assigned in arch-dict
+            input_image_channel = config["Global"]["image_shape"][
+                0]  # default as [3, 224, 224]
+            model = build_model(config)
+            out = model(feeds["data"])
+            # end of build model
+
+            fetchs = create_fetchs(
+                out,
+                feeds,
+                config["Arch"],
+                epsilon=config.get('ls_epsilon'),
+                class_num=class_num,
+                use_mix=use_mix,
+                config=config,
+                mode=mode)
+            lr_scheduler = None
+            optimizer = None
+            if is_train:
+                optimizer, lr_scheduler = build_optimizer(
+                    config["Optimizer"], config["Global"]["epochs"],
+                    step_each_epoch)
+                optimizer = mixed_precision_optimizer(config, optimizer)
+                if is_distributed:
+                    optimizer = dist_optimizer(config, optimizer)
+                optimizer.minimize(fetchs['loss'][0])
+    return fetchs, lr_scheduler, feeds, optimizer
+
+
+def compile(config, program, loss_name=None, share_prog=None):
+    """
+    Compile the program
+
+    Args:
+        config(dict): config
+        program(): the program to be compiled
+        loss_name(str): loss name
+        share_prog(): the shared program, used for evaluation during training
+
+    Returns:
+        compiled_program(): a compiled program
+    """
+    build_strategy = create_strategy(config)
+
+    compiled_program = paddle.static.CompiledProgram(
+        program, build_strategy=build_strategy)
+
+    return compiled_program
+
+
+total_step = 0
+
+
+def run(dataloader,
+        exe,
+        program,
+        feeds,
+        fetchs,
+        epoch=0,
+        mode='train',
+        config=None,
+        vdl_writer=None,
+        lr_scheduler=None,
+        profiler_options=None):
+    """
+    Feed data to the model and fetch the measures and loss
+
+    Args:
+        dataloader(paddle io dataloader):
+        exe():
+        program():
+        fetchs(dict): dict of measures and the loss
+        epoch(int): epoch of training or evaluation
+        mode(str): 'train' or 'eval', used for logging only
+
+    Returns:
+    """
+    fetch_list = [f[0] for f in fetchs.values()]
+    metric_dict = OrderedDict([("lr", AverageMeter(
+        'lr', 'f', postfix=",", need_avg=False))])
+
+    for k in fetchs:
+        metric_dict[k] = fetchs[k][1]
+
+    metric_dict["batch_time"] = AverageMeter(
+        'batch_cost', '.5f', postfix=" s,")
+    metric_dict["reader_time"] = AverageMeter(
+        'reader_cost', '.5f', postfix=" s,")
+
+    for m in metric_dict.values():
+        m.reset()
+
+    use_dali = config["Global"].get('use_dali', False)
+    tic = time.time()
+
+    if not use_dali:
+        dataloader = dataloader()
+
+    idx = 0
+    batch_size = None
+    while True:
+        # DALI may raise a RuntimeError for some particular images, such as ImageNet1k/n04418357_26036.JPEG
+        try:
+            batch = next(dataloader)
+        except StopIteration:
+            break
+        except RuntimeError:
+            logger.warning(
+                "Caught RuntimeError when reading data from the dataloader, trying to read once again..."
+            )
+            continue
+        except IndexError:
+            logger.warning(
+                "Caught IndexError when reading data from the dataloader, trying to read once again..."
+            )
+            continue
+        idx += 1
+        # ignore the warmup iters
+        if idx == 5:
+            metric_dict["batch_time"].reset()
+            metric_dict["reader_time"].reset()
+
+        metric_dict['reader_time'].update(time.time() - tic)
+
+        profiler.add_profiler_step(profiler_options)
+
+        batch_size = batch[0].shape()[0]
+        feed_dict = {
+            key.name: batch[idx]
+            for idx, key in enumerate(feeds.values())
+        }
+
+        metrics = exe.run(program=program,
+                          feed=feed_dict,
+                          fetch_list=fetch_list)
+
+        for name, m in zip(fetchs.keys(), metrics):
+            metric_dict[name].update(np.mean(m), batch_size)
+        metric_dict["batch_time"].update(time.time() - tic)
+        if mode == "train":
+            metric_dict['lr'].update(lr_scheduler.get_lr())
+
+        fetchs_str = ' '.join([
+            str(metric_dict[key].mean)
+            if "time" in key else str(metric_dict[key].value)
+            for key in metric_dict
+        ])
+        ips_info = " ips: {:.5f} samples/sec.".format(
+            batch_size / metric_dict["batch_time"].avg)
+        fetchs_str += ips_info
+
+        if lr_scheduler is not None:
+            lr_scheduler.step()
+
+        if vdl_writer:
+            global total_step
+            logger.scaler('loss', metrics[0][0], total_step, vdl_writer)
+            total_step += 1
+        if mode == 'eval':
+            if idx % config.get('print_interval', 10) == 0:
+                logger.info("{:s} step:{:<4d} {:s}".format(mode, idx,
+                                                           fetchs_str))
+        else:
+            epoch_str = "epoch:{:<3d}".format(epoch)
+            step_str = "{:s} step:{:<4d}".format(mode, idx)
+            max_mem_reserved_str = f"max_mem_reserved: {paddle.device.cuda.max_memory_reserved()}"
+            max_mem_allocated_str = f"max_mem_allocated: {paddle.device.cuda.max_memory_allocated()}"
+            if idx % config.get('print_interval', 10) == 0:
+                logger.info(
+                    f"{epoch_str} {step_str} {fetchs_str} {max_mem_reserved_str} {max_mem_allocated_str}"
+                )
+
+        tic = time.time()
+
+    end_str = ' '.join([str(m.mean) for m in metric_dict.values()] +
+                       [metric_dict["batch_time"].total])
+    ips_info = "ips: {:.5f} samples/sec.".format(batch_size /
+                                                 metric_dict["batch_time"].avg)
+    if mode == 'eval':
+        logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info))
+    else:
+        end_epoch_str = "END epoch:{:<3d}".format(epoch)
+        logger.info("{:s} {:s} {:s}".format(end_epoch_str, mode, end_str))
+    if use_dali:
+        dataloader.reset()
+
+    # return top1_acc in order to save the best model
+    if mode == 'eval':
+        return fetchs["top1"][1].avg
diff --git a/example/low_precision_training/ppcls/static/run_dali.sh b/example/low_precision_training/ppcls/static/run_dali.sh
new file mode 100755
index 000000000..5bf0ef4ca
--- /dev/null
+++ b/example/low_precision_training/ppcls/static/run_dali.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+python3.7 -m paddle.distributed.launch \
+    --gpus="0,1,2,3" \
+    ppcls/static/train.py \
+        -c ./ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml
diff --git a/example/low_precision_training/ppcls/static/save_load.py b/example/low_precision_training/ppcls/static/save_load.py
new file mode 100755
index 000000000..5d124fcf7
--- /dev/null
+++ b/example/low_precision_training/ppcls/static/save_load.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import errno
+import os
+import re
+import shutil
+import tempfile
+
+import paddle
+
+from ppcls.utils import logger
+
+__all__ = ['init_model', 'save_model']
+
+
+def _mkdir_if_not_exist(path):
+    """
+    mkdir if not exists; ignore the exception raised when several
+    processes try to create the same directory concurrently
+    """
+    if not os.path.exists(path):
+        try:
+            os.makedirs(path)
+        except OSError as e:
+            if e.errno == errno.EEXIST and os.path.isdir(path):
+                logger.warning(
+                    '{} has already been created by another process'.format(
+                        path))
+            else:
+                raise OSError('Failed to mkdir {}'.format(path))
+
+
+def _load_state(path):
+    if os.path.exists(path + '.pdopt'):
+        # copy only the .pdparams file into a temporary directory so that
+        # the optimizer state (.pdopt) lying next to it is not loaded
+        tmp = tempfile.mkdtemp()
+        dst = os.path.join(tmp, os.path.basename(os.path.normpath(path)))
+        shutil.copy(path + '.pdparams', dst + '.pdparams')
+        state = paddle.static.load_program_state(dst)
+        shutil.rmtree(tmp)
+    else:
+        state = paddle.static.load_program_state(path)
+    return state
+
+
+def load_params(exe, prog, path, ignore_params=None):
+    """
+    Load model from the given path.
+    Args:
+        exe (paddle.static.Executor): The paddle.static.Executor object.
+        prog (paddle.static.Program): the Program to load the weights into.
+        path (string): URL string or local model path.
+        ignore_params (list): variables to skip when finetuning.
+            It can be specified by finetune_exclude_pretrained_params
+            and the usage can refer to the document
+            docs/advanced_tutorials/TRANSFER_LEARNING.md
+    """
+    if not (os.path.isdir(path) or os.path.exists(path + '.pdparams')):
+        raise ValueError("Model pretrain path {} does not "
+                         "exist.".format(path))
+
+    logger.info("Loading parameters from {}...".format(path))
+
+    ignore_set = set()
+    state = _load_state(path)
+
+    # ignore every parameter whose shape mismatches between the model
+    # and the pretrained weights (e.g. a head with a different class count)
+ all_var_shape = {} + for block in prog.blocks: + for param in block.all_parameters(): + all_var_shape[param.name] = param.shape + ignore_set.update([ + name for name, shape in all_var_shape.items() + if name in state and shape != state[name].shape + ]) + + if ignore_params: + all_var_names = [var.name for var in prog.list_vars()] + ignore_list = filter( + lambda var: any([re.match(name, var) for name in ignore_params]), + all_var_names) + ignore_set.update(list(ignore_list)) + + if len(ignore_set) > 0: + for k in ignore_set: + if k in state: + logger.warning( + 'variable {} is already excluded automatically'.format(k)) + del state[k] + + paddle.static.set_program_state(prog, state) + + +def init_model(config, program, exe): + """ + load model from checkpoint or pretrained_model + """ + checkpoints = config.get('checkpoints') + if checkpoints: + paddle.static.load(program, checkpoints, exe) + logger.info("Finish initing model from {}".format(checkpoints)) + return + + pretrained_model = config.get('pretrained_model') + if pretrained_model: + if not isinstance(pretrained_model, list): + pretrained_model = [pretrained_model] + for pretrain in pretrained_model: + load_params(exe, program, pretrain) + logger.info("Finish initing model from {}".format(pretrained_model)) + + +def save_model(program, model_path, epoch_id, prefix='ppcls'): + """ + save model to the target path + """ + if paddle.distributed.get_rank() != 0: + return + model_path = os.path.join(model_path, str(epoch_id)) + _mkdir_if_not_exist(model_path) + model_prefix = os.path.join(model_path, prefix) + paddle.static.save(program, model_prefix) + logger.info("Already save model in {}".format(model_path)) diff --git a/example/low_precision_training/ppcls/static/train.py b/example/low_precision_training/ppcls/static/train.py new file mode 100755 index 000000000..dce781390 --- /dev/null +++ b/example/low_precision_training/ppcls/static/train.py @@ -0,0 +1,227 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
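The shape filter in `load_params` above is worth seeing in isolation. Below is a minimal, self-contained sketch of the same rule; the parameter names and shapes are made up for illustration:

```python
import numpy as np

# Hypothetical pretrained state and target-program shapes.
state = {"conv1.w_0": np.zeros((64, 3, 7, 7)),
         "fc.w_0": np.zeros((2048, 1000))}    # 1000-class pretrained head
model_shapes = {"conv1.w_0": (64, 3, 7, 7),
                "fc.w_0": (2048, 10)}         # finetuning a 10-class model

# Same rule as load_params: drop every pretrained tensor whose shape
# disagrees with the corresponding parameter in the target program.
ignore_set = {name for name, shape in model_shapes.items()
              if name in state and shape != state[name].shape}
for name in ignore_set:
    del state[name]  # only "fc.w_0" is dropped; the backbone loads as usual
```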
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import sys +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../../'))) + +import numpy as np +import random + +import paddle +from paddle.distributed import fleet +from visualdl import LogWriter + +from ppcls.data import build_dataloader +from ppcls.utils.config import get_config, print_config +from ppcls.utils import logger +from ppcls.utils.logger import init_logger +from ppcls.static.save_load import init_model, save_model +from ppcls.static import program + + +def parse_args(): + parser = argparse.ArgumentParser("PaddleClas train script") + parser.add_argument( + '-c', + '--config', + type=str, + default='configs/ResNet/ResNet50.yaml', + help='config file path') + parser.add_argument( + '-p', + '--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".' + ) + parser.add_argument( + '-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + args = parser.parse_args() + return args + + +def main(args): + """ + all the config of training paradigm should be in config["Global"] + """ + config = get_config(args.config, overrides=args.override, show=False) + + # set seed + seed = config["Global"].get("seed", False) + if seed or seed == 0: + assert isinstance(seed, int), "The 'seed' must be a integer!" + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + + global_config = config["Global"] + + mode = "train" + + log_file = os.path.join(global_config['output_dir'], + config["Arch"]["name"], f"{mode}.log") + log_ranks = config["Global"].get("log_ranks", "0") + init_logger(log_file=log_file, log_ranks=log_ranks) + print_config(config) + + if global_config.get("is_distributed", True): + fleet.init(is_collective=True) + + # assign the device + assert global_config["device"] in [ + "cpu", "gpu", "xpu", "npu", "mlu", "ascend", "intel_gpu", "mps" + ] + device = paddle.set_device(global_config["device"]) + + # amp related config + amp_config = config.get("AMP", None) + use_amp = True if amp_config and amp_config.get("use_amp", True) else False + if use_amp: + AMP_RELATED_FLAGS_SETTING = { + 'FLAGS_cudnn_exhaustive_search': 1, + 'FLAGS_conv_workspace_size_limit': 1500, + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, + 'FLAGS_max_inplace_grad_add': 8, + } + os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1' + paddle.set_flags(AMP_RELATED_FLAGS_SETTING) + + # visualDL + # vdl_writer = None + # if global_config["use_visualdl"]: + # vdl_dir = global_config["output_dir"] + # vdl_writer = LogWriter(vdl_dir) + + # build dataloader + eval_dataloader = None + use_dali = global_config.get('use_dali', False) + + class_num = config["Arch"].get("class_num", None) + config["DataLoader"].update({"class_num": class_num}) + train_dataloader = build_dataloader( + config["DataLoader"], "Train", device=device, use_dali=use_dali) + if global_config["eval_during_train"]: + eval_dataloader = build_dataloader( + config["DataLoader"], "Eval", device=device, use_dali=use_dali) + + step_each_epoch = len(train_dataloader) + + # startup_prog is used to do some parameter init work, + # and train prog is used to hold the network + startup_prog = paddle.static.Program() + train_prog = paddle.static.Program() + + best_top1_acc = 0.0 # best top1 acc record + + 
train_fetchs, lr_scheduler, train_feeds, optimizer = program.build( + config, + train_prog, + startup_prog, + class_num, + step_each_epoch=step_each_epoch, + is_train=True, + is_distributed=global_config.get("is_distributed", True)) + + if global_config["eval_during_train"]: + eval_prog = paddle.static.Program() + eval_fetchs, _, eval_feeds, _ = program.build( + config, + eval_prog, + startup_prog, + is_train=False, + is_distributed=global_config.get("is_distributed", True)) + # clone to prune some content which is irrelevant in eval_prog + eval_prog = eval_prog.clone(for_test=True) + + # create the "Executor" with the statement of which device + exe = paddle.static.Executor(device) + # Parameter initialization + exe.run(startup_prog) + # load pretrained models or checkpoints + init_model(global_config, train_prog, exe) + + if use_amp: + # for AMP O2 + if config["AMP"].get("level", "O1").upper() == "O2": + use_fp16_test = True + msg = "Only support FP16 evaluation when AMP O2 is enabled." + logger.warning(msg) + # for AMP O1 + else: + use_fp16_test = config["AMP"].get("use_fp16_test", False) + + optimizer.amp_init( + device, + scope=paddle.static.global_scope(), + test_program=eval_prog + if global_config["eval_during_train"] else None, + use_fp16_test=use_fp16_test) + + if not global_config.get("is_distributed", True): + compiled_train_prog = program.compile( + config, train_prog, loss_name=train_fetchs["loss"][0].name) + else: + compiled_train_prog = train_prog + + if eval_dataloader is not None: + if not global_config.get("is_distributed", True): + compiled_eval_prog = program.compile(config, eval_prog) + else: + compiled_eval_prog = eval_prog + + for epoch_id in range(global_config["epochs"]): + # 1. train with train dataset + program.run(train_dataloader, exe, compiled_train_prog, train_feeds, + train_fetchs, epoch_id, 'train', config, vdl_writer, + lr_scheduler, args.profiler_options) + # 2. evaluate with eval dataset + if global_config["eval_during_train"] and epoch_id % global_config[ + "eval_interval"] == 0: + top1_acc = program.run(eval_dataloader, exe, compiled_eval_prog, + eval_feeds, eval_fetchs, epoch_id, "eval", + config) + if top1_acc > best_top1_acc: + best_top1_acc = top1_acc + message = "The best top1 acc {:.5f}, in epoch: {:d}".format( + best_top1_acc, epoch_id) + logger.info(message) + if epoch_id % global_config["save_interval"] == 0: + + model_path = os.path.join(global_config["output_dir"], + config["Arch"]["name"]) + save_model(train_prog, model_path, "best_model") + + # 3. 
save the persistable model + if epoch_id % global_config["save_interval"] == 0: + model_path = os.path.join(global_config["output_dir"], + config["Arch"]["name"]) + save_model(train_prog, model_path, epoch_id) + + +if __name__ == '__main__': + paddle.enable_static() + args = parse_args() + main(args) diff --git a/example/low_precision_training/ppcls/utils/COCO2017_label_list.txt b/example/low_precision_training/ppcls/utils/COCO2017_label_list.txt new file mode 100755 index 000000000..51dba82b4 --- /dev/null +++ b/example/low_precision_training/ppcls/utils/COCO2017_label_list.txt @@ -0,0 +1,80 @@ +0 person +1 bicycle +2 car +3 motorcycle +4 airplane +5 bus +6 train +7 truck +8 boat +9 traffic light +10 fire hydrant +11 stop sign +12 parking meter +13 bench +14 bird +15 cat +16 dog +17 horse +18 sheep +19 cow +20 elephant +21 bear +22 zebra +23 giraffe +24 backpack +25 umbrella +26 handbag +27 tie +28 suitcase +29 frisbee +30 skis +31 snowboard +32 sports ball +33 kite +34 baseball bat +35 baseball glove +36 skateboard +37 surfboard +38 tennis racket +39 bottle +40 wine glass +41 cup +42 fork +43 knife +44 spoon +45 bowl +46 banana +47 apple +48 sandwich +49 orange +50 broccoli +51 carrot +52 hot dog +53 pizza +54 donut +55 cake +56 chair +57 couch +58 potted plant +59 bed +60 dining table +61 toilet +62 tv +63 laptop +64 mouse +65 remote +66 keyboard +67 cell phone +68 microwave +69 oven +70 toaster +71 sink +72 refrigerator +73 book +74 clock +75 vase +76 scissors +77 teddy bear +78 hair drier +79 toothbrush \ No newline at end of file diff --git a/example/low_precision_training/ppcls/utils/NUS-WIDE-SCENE_label_list.txt b/example/low_precision_training/ppcls/utils/NUS-WIDE-SCENE_label_list.txt new file mode 100755 index 000000000..f4ee66ba7 --- /dev/null +++ b/example/low_precision_training/ppcls/utils/NUS-WIDE-SCENE_label_list.txt @@ -0,0 +1,33 @@ +0 airport +1 beach +2 bridge +3 buildings +4 castle +5 cityscape +6 clouds +7 frost +8 garden +9 glacier +10 grass +11 harbor +12 house +13 lake +14 moon +15 mountain +16 nighttime +17 ocean +18 plants +19 railroad +20 rainbow +21 reflection +22 road +23 sky +24 snow +25 street +26 sunset +27 temple +28 town +29 valley +30 water +31 waterfall +32 window diff --git a/example/low_precision_training/ppcls/utils/PULC_label_list/image_orientation_label_list.txt b/example/low_precision_training/ppcls/utils/PULC_label_list/image_orientation_label_list.txt new file mode 100755 index 000000000..f6c8ef322 --- /dev/null +++ b/example/low_precision_training/ppcls/utils/PULC_label_list/image_orientation_label_list.txt @@ -0,0 +1,4 @@ +0 0° +1 90° +2 180° +3 270° \ No newline at end of file diff --git a/example/low_precision_training/ppcls/utils/PULC_label_list/language_classification_label_list.txt b/example/low_precision_training/ppcls/utils/PULC_label_list/language_classification_label_list.txt new file mode 100755 index 000000000..8d9ee9dd8 --- /dev/null +++ b/example/low_precision_training/ppcls/utils/PULC_label_list/language_classification_label_list.txt @@ -0,0 +1,10 @@ +0 arabic +1 chinese_cht +2 cyrillic +3 devanagari +4 japan +5 ka +6 korean +7 ta +8 te +9 latin \ No newline at end of file diff --git a/example/low_precision_training/ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt b/example/low_precision_training/ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt new file mode 100755 index 000000000..051944a92 --- /dev/null +++ 
b/example/low_precision_training/ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt @@ -0,0 +1,4 @@ +0 0 +1 90 +2 180 +3 270 diff --git a/example/low_precision_training/ppcls/utils/PULC_label_list/textline_orientation_label_list.txt b/example/low_precision_training/ppcls/utils/PULC_label_list/textline_orientation_label_list.txt new file mode 100755 index 000000000..207b70c6b --- /dev/null +++ b/example/low_precision_training/ppcls/utils/PULC_label_list/textline_orientation_label_list.txt @@ -0,0 +1,2 @@ +0 0_degree +1 180_degree diff --git a/example/low_precision_training/ppcls/utils/PULC_label_list/traffic_sign_label_list.txt b/example/low_precision_training/ppcls/utils/PULC_label_list/traffic_sign_label_list.txt new file mode 100755 index 000000000..c1e41d539 --- /dev/null +++ b/example/low_precision_training/ppcls/utils/PULC_label_list/traffic_sign_label_list.txt @@ -0,0 +1,232 @@ +0 pl80 +1 w9 +2 p6 +3 ph4.2 +4 i8 +5 w14 +6 w33 +7 pa13 +8 im +9 w58 +10 pl90 +11 il70 +12 p5 +13 pm55 +14 pl60 +15 ip +16 p11 +17 pdd +18 wc +19 i2r +20 w30 +21 pmr +22 p23 +23 pl15 +24 pm10 +25 pss +26 w1 +27 p4 +28 w38 +29 w50 +30 w34 +31 pw3.5 +32 iz +33 w39 +34 w11 +35 p1n +36 pr70 +37 pd +38 pnl +39 pg +40 ph5.3 +41 w66 +42 il80 +43 pb +44 pbm +45 pm5 +46 w24 +47 w67 +48 w49 +49 pm40 +50 ph4 +51 w45 +52 i4 +53 w37 +54 ph2.6 +55 pl70 +56 ph5.5 +57 i14 +58 i11 +59 p7 +60 p29 +61 pne +62 pr60 +63 pm13 +64 ph4.5 +65 p12 +66 p3 +67 w40 +68 pl5 +69 w13 +70 pr10 +71 p14 +72 i4l +73 pr30 +74 pw4.2 +75 w16 +76 p17 +77 ph3 +78 i9 +79 w15 +80 w35 +81 pa8 +82 pt +83 pr45 +84 w17 +85 pl30 +86 pcs +87 pctl +88 pr50 +89 ph4.4 +90 pm46 +91 pm35 +92 i15 +93 pa12 +94 pclr +95 i1 +96 pcd +97 pbp +98 pcr +99 w28 +100 ps +101 pm8 +102 w18 +103 w2 +104 w52 +105 ph2.9 +106 ph1.8 +107 pe +108 p20 +109 w36 +110 p10 +111 pn +112 pa14 +113 w54 +114 ph3.2 +115 p2 +116 ph2.5 +117 w62 +118 w55 +119 pw3 +120 pw4.5 +121 i12 +122 ph4.3 +123 phclr +124 i10 +125 pr5 +126 i13 +127 w10 +128 p26 +129 w26 +130 p8 +131 w5 +132 w42 +133 il50 +134 p13 +135 pr40 +136 p25 +137 w41 +138 pl20 +139 ph4.8 +140 pnlc +141 ph3.3 +142 w29 +143 ph2.1 +144 w53 +145 pm30 +146 p24 +147 p21 +148 pl40 +149 w27 +150 pmb +151 pc +152 i6 +153 pr20 +154 p18 +155 ph3.8 +156 pm50 +157 pm25 +158 i2 +159 w22 +160 w47 +161 w56 +162 pl120 +163 ph2.8 +164 i7 +165 w12 +166 pm1.5 +167 pm2.5 +168 w32 +169 pm15 +170 ph5 +171 w19 +172 pw3.2 +173 pw2.5 +174 pl10 +175 il60 +176 w57 +177 w48 +178 w60 +179 pl100 +180 pr80 +181 p16 +182 pl110 +183 w59 +184 w64 +185 w20 +186 ph2 +187 p9 +188 il100 +189 w31 +190 w65 +191 ph2.4 +192 pr100 +193 p19 +194 ph3.5 +195 pa10 +196 pcl +197 pl35 +198 p15 +199 w7 +200 pa6 +201 phcs +202 w43 +203 p28 +204 w6 +205 w3 +206 w25 +207 pl25 +208 il110 +209 p1 +210 w46 +211 pn-2 +212 w51 +213 w44 +214 w63 +215 w23 +216 pm20 +217 w8 +218 pmblr +219 w4 +220 i5 +221 il90 +222 w21 +223 p27 +224 pl50 +225 pl65 +226 w61 +227 ph2.2 +228 pm2 +229 i3 +230 pa18 +231 pw4 diff --git a/example/low_precision_training/ppcls/utils/__init__.py b/example/low_precision_training/ppcls/utils/__init__.py new file mode 100755 index 000000000..294fbb125 --- /dev/null +++ b/example/low_precision_training/ppcls/utils/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import logger +from . import metrics +from . import misc +from . import model_zoo + +from .config import get_config, convert_to_dict +from .dist_utils import all_gather +from .metrics import accuracy_score +from .metrics import hamming_distance +from .metrics import mean_average_precision +from .metrics import multi_hot_encode +from .metrics import precision_recall_fscore +from .misc import AverageMeter +from .save_load import init_model, save_model +from .save_result import save_predict_result diff --git a/example/low_precision_training/ppcls/utils/amp.py b/example/low_precision_training/ppcls/utils/amp.py new file mode 100755 index 000000000..7ada7a404 --- /dev/null +++ b/example/low_precision_training/ppcls/utils/amp.py @@ -0,0 +1,61 @@ +from functools import partial +import contextlib +import paddle + + +class AutoCast: + def __init__(self, + use_amp=False, + amp_level="O1", + use_promote=False, + amp_eval=False): + self.use_amp = use_amp + self.amp_eval = amp_eval + + if self.use_amp: + # compatible with paddle 2.5 and older version + paddle_version = paddle.__version__[:3] + # paddle version >= 2.5.0 or develop + if paddle_version in ["2.5", "0.0"]: + self.cast_context = partial( + paddle.amp.auto_cast, + level=amp_level, + use_promote=use_promote) + # paddle version <= 2.4.x and not develop + else: + self.cast_context = partial( + paddle.amp.auto_cast, level=amp_level) + + def __call__(self, is_eval=False): + if self.use_amp: + # not is_eval: cast for all training + # is_eval and self.amp_eval: cast for evaluation only when amp_eval is True + if not is_eval or (is_eval and self.amp_eval): + return self.cast_context() + + return contextlib.nullcontext() + + +def build_scaler(use_amp=False, scale_loss=1.0, + use_dynamic_loss_scaling=False): + class Foo: + def __init__(self): + pass + + def scale(self, loss): + return loss + + def step(self, optimizer): + optimizer.step() + + def update(self): + return + + def minimize(self, optimizer, loss): + optimizer.step() + + if use_amp: + return paddle.amp.GradScaler( + init_loss_scaling=scale_loss, + use_dynamic_loss_scaling=use_dynamic_loss_scaling) + return Foo() diff --git a/example/low_precision_training/ppcls/utils/check.py b/example/low_precision_training/ppcls/utils/check.py new file mode 100755 index 000000000..8b743335e --- /dev/null +++ b/example/low_precision_training/ppcls/utils/check.py @@ -0,0 +1,149 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
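The `AutoCast`/`build_scaler` pair above is written so that one training step serves both FP32 and AMP runs: with `use_amp=False` both degrade to no-ops. A minimal dygraph sketch (the import path assumes the layout added by this patch):

```python
import paddle
from ppcls.utils.amp import AutoCast, build_scaler

model = paddle.nn.Linear(8, 2)
opt = paddle.optimizer.SGD(learning_rate=0.1, parameters=model.parameters())

auto_cast = AutoCast(use_amp=True, amp_level="O1")
scaler = build_scaler(use_amp=True, scale_loss=1024.0,
                      use_dynamic_loss_scaling=True)

x = paddle.randn([4, 8])
with auto_cast():                 # FP16 kernels where safe (O1 op lists)
    loss = model(x).mean()
scaler.scale(loss).backward()     # scale to avoid FP16 gradient underflow
scaler.step(opt)                  # unscale, then optimizer step
scaler.update()                   # adjust the dynamic loss scale
opt.clear_grad()
```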
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+import paddle
+from paddle import is_compiled_with_cuda
+
+from ..arch.utils import get_architectures, similar_architectures, get_blacklist_model_in_static_mode
+from . import logger
+
+
+def check_version():
+    """
+    Log an error and exit when the installed version of paddlepaddle
+    does not satisfy the requirement.
+    """
+    err = "PaddlePaddle 1.8.0 or higher (or a suitable develop build) " \
+          "is required.\n" \
+          "Please make sure the installed version matches your code."
+    try:
+        pass
+        # paddle.utils.require_version('0.0.0')
+    except Exception:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_gpu():
+    """
+    Log an error and exit when using the paddlepaddle cpu version.
+    """
+    err = "You are using the paddlepaddle cpu version! Please install " \
+          "paddlepaddle-gpu to run the model on GPU."
+
+    try:
+        assert is_compiled_with_cuda()
+    except AssertionError:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_architecture(architecture):
+    """
+    check architecture and recommend similar architectures
+    """
+    assert isinstance(architecture, dict), \
+        ("the type of architecture({}) should be dict".format(architecture))
+    assert "name" in architecture, \
+        ("name must be in the architecture keys, just contains: {}".format(
+            architecture.keys()))
+
+    similar_names = similar_architectures(architecture["name"],
+                                          get_architectures())
+    model_list = ', '.join(similar_names)
+    err = "Architecture [{}] does not exist! Maybe you want: [{}]" \
+          "".format(architecture["name"], model_list)
+    try:
+        assert architecture["name"] in similar_names
+    except AssertionError:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_model_with_running_mode(architecture):
+    """
+    check whether the model is consistent with the operating mode
+    """
+    # some models are not supported in the static mode
+    blacklist = get_blacklist_model_in_static_mode()
+    if not paddle.in_dynamic_mode() and architecture["name"] in blacklist:
+        logger.error("Model: {} is not supported in the static mode.".format(
+            architecture["name"]))
+        sys.exit(1)
+
+
+def check_mix(architecture, use_mix=False):
+    """
+    check mix parameter
+    """
+    err = "Cannot use mix processing in GoogLeNet, " \
+          "please set use_mix = False."
+    try:
+        if architecture["name"] == "GoogLeNet":
+            assert use_mix is not True
+    except AssertionError:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_classes_num(classes_num):
+    """
+    check classes_num
+    """
+    err = "classes_num({}) should be a positive integer " \
+          "and larger than 1".format(classes_num)
+    try:
+        assert isinstance(classes_num, int)
+        assert classes_num > 1
+    except AssertionError:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_data_dir(path):
+    """
+    check data_dir
+    """
+    err = "Data path {} does not exist, please provide a valid " \
+          "path".format(path)
+    try:
+        assert os.path.isdir(path)
+    except AssertionError:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_function_params(config, key):
+    """
+    check the given config section
+    """
+    k_config = config.get(key)
+    assert k_config is not None, \
+        ('{} is required in config'.format(key))
+
+    assert k_config.get('function'), \
+        ('function is required in {} config'.format(key))
+    params = k_config.get('params')
+    assert params is not None, \
+        ('params is required in {} config'.format(key))
+    assert isinstance(params, dict), \
+        ('the params in {} config should be a dict'.format(key))
diff --git a/example/low_precision_training/ppcls/utils/config.py b/example/low_precision_training/ppcls/utils/config.py
new file mode 100755
index 000000000..5df438c40
--- /dev/null
+++ b/example/low_precision_training/ppcls/utils/config.py
@@ -0,0 +1,323 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import copy
+import argparse
+import yaml
+from . import logger
+from . import check
+from collections import OrderedDict
+
+__all__ = ['get_config', 'convert_to_dict']
+
+
+def convert_to_dict(obj):
+    if isinstance(obj, dict):
+        return {k: convert_to_dict(v) for k, v in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_to_dict(i) for i in obj]
+    else:
+        return obj
+
+
+class AttrDict(dict):
+    def __getattr__(self, key):
+        return self[key]
+
+    def __setattr__(self, key, value):
+        if key in self.__dict__:
+            self.__dict__[key] = value
+        else:
+            self[key] = value
+
+    def __deepcopy__(self, content):
+        return AttrDict(copy.deepcopy(dict(self)))
+
+
+def create_attr_dict(yaml_config):
+    from ast import literal_eval
+    for key, value in yaml_config.items():
+        if type(value) is dict:
+            yaml_config[key] = value = AttrDict(value)
+        if isinstance(value, str):
+            try:
+                value = literal_eval(value)
+            except BaseException:
+                pass
+        if isinstance(value, AttrDict):
+            create_attr_dict(yaml_config[key])
+        else:
+            yaml_config[key] = value
+
+
+def parse_config(cfg_file):
+    """Load a config file into AttrDict"""
+    with open(cfg_file, 'r') as fopen:
+        yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader))
+    create_attr_dict(yaml_config)
+    return yaml_config
+
+
+def print_dict(d, delimiter=0):
+    """
+    Recursively visualize a dict, indenting
+    according to the nesting of its keys.
+ """ + placeholder = "-" * 60 + for k, v in d.items(): + if isinstance(v, dict): + logger.info("{}{} : ".format(delimiter * " ", k)) + print_dict(v, delimiter + 4) + elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): + logger.info("{}{} : ".format(delimiter * " ", k)) + for value in v: + print_dict(value, delimiter + 4) + else: + logger.info("{}{} : {}".format(delimiter * " ", k, v)) + + if k[0].isupper() and delimiter == 0: + logger.info(placeholder) + + +def print_config(config): + """ + visualize configs + Arguments: + config: configs + """ + logger.advertise() + print_dict(config) + + +def check_config(config): + """ + Check config + """ + check.check_version() + use_gpu = config.get('use_gpu', True) + if use_gpu: + check.check_gpu() + architecture = config.get('ARCHITECTURE') + #check.check_architecture(architecture) + use_mix = config.get('use_mix', False) + check.check_mix(architecture, use_mix) + classes_num = config.get('classes_num') + check.check_classes_num(classes_num) + mode = config.get('mode', 'train') + if mode.lower() == 'train': + check.check_function_params(config, 'LEARNING_RATE') + check.check_function_params(config, 'OPTIMIZER') + + +def override(dl, ks, v): + """ + Recursively replace dict of list + Args: + dl(dict or list): dict or list to be replaced + ks(list): list of keys + v(str): value to be replaced + """ + + def str2num(v): + try: + return eval(v) + except Exception: + return v + + assert isinstance(dl, (list, dict)), ("{} should be a list or a dict") + assert len(ks) > 0, ('lenght of keys should larger than 0') + if isinstance(dl, list): + k = str2num(ks[0]) + if len(ks) == 1: + assert k < len(dl), ('index({}) out of range({})'.format(k, dl)) + dl[k] = str2num(v) + else: + override(dl[k], ks[1:], v) + else: + if len(ks) == 1: + # assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl)) + if not ks[0] in dl: + print('A new field ({}) detected!'.format(ks[0], dl)) + dl[ks[0]] = str2num(v) + else: + if ks[0] not in dl.keys(): + dl[ks[0]] = {} + print("A new Series field ({}) detected!".format(ks[0], dl)) + override(dl[ks[0]], ks[1:], v) + + +def override_config(config, options=None): + """ + Recursively override the config + Args: + config(dict): dict to be replaced + options(list): list of pairs(key0.key1.idx.key2=value) + such as: [ + 'topk=2', + 'VALID.transforms.1.ResizeImage.resize_short=300' + ] + Returns: + config(dict): replaced config + """ + if options is not None: + for opt in options: + assert isinstance(opt, str), ( + "option({}) should be a str".format(opt)) + assert "=" in opt, ( + "option({}) should contain a =" + "to distinguish between key and value".format(opt)) + pair = opt.split('=') + assert len(pair) == 2, ("there can be only a = in the option") + key, value = pair + keys = key.split('.') + override(config, keys, value) + return config + + +def get_config(fname, overrides=None, show=False): + """ + Read config from file + """ + assert os.path.exists(fname), ('config file({}) is not exist'.format(fname)) + config = parse_config(fname) + override_config(config, overrides) + if show: + print_config(config) + # check_config(config) + return config + + +def parse_args(): + parser = argparse.ArgumentParser("generic-image-rec train script") + parser.add_argument( + '-c', + '--config', + type=str, + default='configs/config.yaml', + help='config file path') + parser.add_argument( + '-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument( + '-p', + 
'--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".' + ) + args = parser.parse_args() + return args + + +def represent_dictionary_order(self, dict_data): + return self.represent_mapping('tag:yaml.org,2002:map', dict_data.items()) + + +def setup_orderdict(): + yaml.add_representer(OrderedDict, represent_dictionary_order) + + +def dump_infer_config(inference_config, path): + setup_orderdict() + infer_cfg = OrderedDict() + config = copy.deepcopy(inference_config) + if config["Global"].get("pdx_model_name", None): + infer_cfg["Global"] = {"model_name": config["Global"]["pdx_model_name"]} + if config.get("Infer"): + transforms = config["Infer"]["transforms"] + elif config["DataLoader"]["Eval"].get("Query"): + transforms = config["DataLoader"]["Eval"]["Query"]["dataset"][ + "transform_ops"] + transforms.append({"ToCHWImage": None}) + else: + logger.error("This config does not support dump transform config!") + transform = next((item for item in transforms if 'CropImage' in item), None) + if transform: + dynamic_shapes = transform["CropImage"]["size"] + else: + transform = next((item for item in transforms + if 'ResizeImage' in item), None) + if transform: + if isinstance(transform["ResizeImage"]["size"], list): + dynamic_shapes = transform["ResizeImage"]["size"][0] + elif isinstance(transform["ResizeImage"]["size"], int): + dynamic_shapes = transform["ResizeImage"]["size"] + else: + raise ValueError( + "ResizeImage size must be either a list or an int.") + else: + raise ValueError("No valid transform found.") + # Configuration required config for high-performance inference. + if config["Global"].get("hpi_config_path", None): + hpi_config = convert_to_dict( + parse_config(config["Global"]["hpi_config_path"])) + if hpi_config["Hpi"]["backend_config"].get("paddle_tensorrt", None): + hpi_config["Hpi"]["backend_config"]["paddle_tensorrt"][ + "dynamic_shapes"]["x"] = [[ + 1, 3, dynamic_shapes, dynamic_shapes + ] for i in range(3)] + hpi_config["Hpi"]["backend_config"]["paddle_tensorrt"][ + "max_batch_size"] = 1 + if hpi_config["Hpi"]["backend_config"].get("tensorrt", None): + hpi_config["Hpi"]["backend_config"]["tensorrt"]["dynamic_shapes"][ + "x"] = [[1, 3, dynamic_shapes, dynamic_shapes] + for i in range(3)] + hpi_config["Hpi"]["backend_config"]["tensorrt"][ + "max_batch_size"] = 1 + infer_cfg["Hpi"] = hpi_config["Hpi"] + for transform in transforms: + if "NormalizeImage" in transform: + transform["NormalizeImage"]["channel_num"] = 3 + scale_str = transform["NormalizeImage"]["scale"] + numerator, denominator = scale_str.split('/') + numerator, denominator = float(numerator), float(denominator) + transform["NormalizeImage"]["scale"] = float(numerator / + denominator) + infer_cfg["PreProcess"] = { + "transform_ops": [ + infer_preprocess for infer_preprocess in transforms + if "DecodeImage" not in infer_preprocess + ] + } + if config.get("Infer"): + postprocess_dict = config["Infer"]["PostProcess"] + + with open(postprocess_dict["class_id_map_file"], 'r') as f: + label_id_maps = f.readlines() + label_names = [] + for line in label_id_maps: + line = line.strip().split(' ', 1) + label_names.append(line[1:][0]) + + postprocess_name = postprocess_dict.get("name", None) + postprocess_dict.pop("class_id_map_file") + postprocess_dict.pop("name") + dic = OrderedDict() + for item in postprocess_dict.items(): + dic[item[0]] = item[1] + dic['label_list'] = label_names + + if postprocess_name: + 
infer_cfg["PostProcess"] = {postprocess_name: dic} + else: + raise ValueError("PostProcess name is not specified") + else: + infer_cfg["PostProcess"] = {"NormalizeFeatures": None} + with open(path, 'w') as f: + yaml.dump(infer_cfg, f) + logger.info("Export inference config file to {}".format(os.path.join(path))) diff --git a/example/low_precision_training/ppcls/utils/create_cls_trainval_lists.py b/example/low_precision_training/ppcls/utils/create_cls_trainval_lists.py new file mode 100755 index 000000000..11aa32f10 --- /dev/null +++ b/example/low_precision_training/ppcls/utils/create_cls_trainval_lists.py @@ -0,0 +1,111 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import random +import string + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--dataset_path', type=str, default='./data') + parser.add_argument('--save_img_list_path', type=str, default='train.txt') + parser.add_argument( + '--train', action='store_true', help='Create train list.') + parser.add_argument('--val', action='store_true', help='Create val list.') + + args = parser.parse_args() + return args + + +def parse_class_id_map(class_id_map_file): + class_id_map = {} + with open(class_id_map_file, "r") as f: + lines = f.readlines() + for line in lines: + partition = line.split("\n")[0].partition(" ") + class_id_map[str(partition[-1])] = int(partition[0]) + return class_id_map + + +def main(args): + img_list = [] + label_list = [] + img_end = ['jpg', 'JPG', 'png', 'PNG', 'jpeg', 'JPEG', 'bmp'] + if args.dataset_path[-1] == "/": + args.dataset_path = args.dataset_path[:-1] + + if not os.path.exists(args.dataset_path): + raise Exception(f"The data path {args.dataset_path} not exists.") + else: + label_name_list = [ + label for label in os.listdir(args.dataset_path) + if os.path.isdir(os.path.join(args.dataset_path, label)) + ] + + if not os.path.exists( + os.path.join(os.path.dirname(args.dataset_path), + 'label.txt')) and args.val: + raise Exception( + 'The label file is not exist. Please set "--train" first.') + + for index, label_name in enumerate(label_name_list): + for root, dirs, files in os.walk( + os.path.join(args.dataset_path, label_name)): + for single_file in files: + if single_file.split('.')[-1] in img_end: + img_path = os.path.relpath( + os.path.join(root, single_file), + os.path.dirname(args.dataset_path)) + if args.val: + class_id_map = parse_class_id_map( + os.path.join( + os.path.dirname(args.dataset_path), + 'label.txt')) + img_list.append( + f'{img_path} {class_id_map[label_name]}') + else: + img_list.append(f'{img_path} {index}') + else: + print( + f'WARNING: File {os.path.join(root, single_file)} end with {single_file.split(".")[-1]} is not supported.' 
+                    )
+        label_list.append(f'{index} {label_name}')
+
+    if len(img_list) == 0:
+        raise Exception(f"No image files found in {args.dataset_path}.")
+
+    with open(
+            os.path.join(
+                os.path.dirname(args.dataset_path), args.save_img_list_path),
+            'w') as f:
+        f.write('\n'.join(img_list))
+    print(
+        f'Saved {args.save_img_list_path} to {os.path.join(os.path.dirname(args.dataset_path), args.save_img_list_path)}.'
+    )
+
+    if not args.val:
+        with open(
+                os.path.join(os.path.dirname(args.dataset_path), 'label.txt'),
+                'w') as f:
+            f.write('\n'.join(label_list))
+        print(
+            f'Saved label.txt to {os.path.join(os.path.dirname(args.dataset_path), "label.txt")}.'
+        )
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    main(args)
diff --git a/example/low_precision_training/ppcls/utils/create_coco_multilabel_lists.py b/example/low_precision_training/ppcls/utils/create_coco_multilabel_lists.py
new file mode 100755
index 000000000..3082b4a6c
--- /dev/null
+++ b/example/low_precision_training/ppcls/utils/create_coco_multilabel_lists.py
@@ -0,0 +1,126 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+
+from ppcls.utils import logger
+from ppcls.utils.logger import init_logger
+from pycocotools.coco import COCO
+from tqdm import tqdm
+
+init_logger()
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        '--dataset_dir', required=True, help='root directory for dataset')
+    parser.add_argument(
+        '--image_dir', required=True, help='directory for images')
+    parser.add_argument(
+        '--anno_path', required=True, help='coco annotation file path')
+    parser.add_argument(
+        '--save_name',
+        default=None,
+        help='defaults to the basename of anno_path if not set')
+    parser.add_argument(
+        '--output_dir',
+        default=None,
+        help='output directory; defaults to dataset_dir if not set')
+    parser.add_argument(
+        '--save_label_name', action='store_true', help='save label name file')
+
+    args = parser.parse_args()
+    if args.output_dir is None:
+        args.output_dir = args.dataset_dir
+    else:
+        os.makedirs(args.output_dir, exist_ok=True)
+    if args.save_name is None:
+        args.save_name = os.path.splitext(os.path.basename(args.anno_path))[0]
+
+    image_dir = os.path.join(args.dataset_dir, args.image_dir)
+    anno_path = os.path.join(args.dataset_dir, args.anno_path)
+    assert os.path.exists(image_dir) and os.path.exists(anno_path), \
+        "The dataset was not found or the folder structure is non-conformant."
+    coco = COCO(anno_path)
+    cat_id_map = {
+        old_cat_id: new_cat_id
+        for new_cat_id, old_cat_id in enumerate(coco.getCatIds())
+    }
+    num_classes = len(list(cat_id_map.keys()))
+
+    assert 'annotations' in coco.dataset, \
+        'Annotation file: {} does not contain ground truth!'.format(anno_path)
+
+    save_path = os.path.join(args.output_dir, args.save_name + '.txt')
+    logger.info("Start converting {}:".format(anno_path))
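+    # Each image becomes one line "<file_name>\t<l_0>,<l_1>,...,<l_{C-1}>",
+    # where l_k is 1 if the image carries at least one annotation of
+    # (remapped) category k -- i.e. a multi-label training list.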
+ with open(save_path, 'w') as fp: + lines = [] + for img_id in tqdm(sorted(coco.getImgIds())): + img_info = coco.loadImgs([img_id])[0] + img_filename = img_info['file_name'] + img_w = img_info['width'] + img_h = img_info['height'] + + img_filepath = os.path.join(image_dir, img_filename) + if not os.path.exists(img_filepath): + logger.warning('Illegal image file: {}, ' + 'and it will be ignored'.format(img_filepath)) + continue + + if img_w < 0 or img_h < 0: + logger.warning( + 'Illegal width: {} or height: {} in annotation, ' + 'and im_id: {} will be ignored'.format(img_w, img_h, img_id)) + continue + + ins_anno_ids = coco.getAnnIds(imgIds=[img_id]) + instances = coco.loadAnns(ins_anno_ids) + + label = [0] * num_classes + for instance in instances: + label[cat_id_map[instance['category_id']]] = 1 + lines.append(img_filename + '\t' + ','.join(map(str, label))) + + fp.write('\n'.join(lines)) + fp.close() + logger.info("Conversion completed, save to {}:".format(save_path)) + + if args.save_label_name: + label_txt_save_name = os.path.basename( + os.path.abspath(args.dataset_dir)) + '_labels.txt' + label_txt_save_path = os.path.join(args.dataset_dir, label_txt_save_name) + with open(label_txt_save_path, 'w') as fp: + label_name_list = [] + for cat in coco.cats.values(): + label_name_list.append(cat['name']) + fp.write('\n'.join(label_name_list)) + fp.close() + logger.info("Save label names to {}.".format(label_txt_save_path)) + + +if __name__ == '__main__': + main() diff --git a/example/low_precision_training/ppcls/utils/dist_utils.py b/example/low_precision_training/ppcls/utils/dist_utils.py new file mode 100755 index 000000000..6b7d889d6 --- /dev/null +++ b/example/low_precision_training/ppcls/utils/dist_utils.py @@ -0,0 +1,36 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Union + +import paddle + + +def all_gather(tensor: paddle.Tensor, concat: bool=True, + axis: int=0) -> Union[paddle.Tensor, List[paddle.Tensor]]: + """Gather tensor from all devices, concatenate them along given axis if specified. + + Args: + tensor (paddle.Tensor): Tensor to be gathered from all GPUs. + concat (bool, optional): Whether to concatenate gathered Tensors. Defaults to True. + axis (int, optional): Axis which concatenated along. Defaults to 0. + + Returns: + Union[paddle.Tensor, List[paddle.Tensor]]: Gathered Tensors + """ + result = [] + paddle.distributed.all_gather(result, tensor) + if concat: + return paddle.concat(result, axis) + return result diff --git a/example/low_precision_training/ppcls/utils/download.py b/example/low_precision_training/ppcls/utils/download.py new file mode 100755 index 000000000..decad654a --- /dev/null +++ b/example/low_precision_training/ppcls/utils/download.py @@ -0,0 +1,304 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import os.path as osp +import shutil +import requests +import hashlib +import tarfile +import zipfile +import time +from collections import OrderedDict +from tqdm import tqdm + +from . import logger + +__all__ = ['get_weights_path_from_url'] + +WEIGHTS_HOME = osp.expanduser("~/.paddleclas/weights") + +DOWNLOAD_RETRY_LIMIT = 3 + + +def is_url(path): + """ + Whether path is URL. + Args: + path (string): URL string or not. + """ + return path.startswith('http://') or path.startswith('https://') + + +def get_weights_path_from_url(url, md5sum=None): + """Get weights path from WEIGHT_HOME, if not exists, + download it from url. + + Args: + url (str): download url + md5sum (str): md5 sum of download package + + Returns: + str: a local path to save downloaded weights. + + Examples: + .. code-block:: python + + from paddle.utils.download import get_weights_path_from_url + + resnet18_pretrained_weight_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams' + local_weight_path = get_weights_path_from_url(resnet18_pretrained_weight_url) + + """ + path = get_path_from_url(url, WEIGHTS_HOME, md5sum) + return path + + +def _map_path(url, root_dir): + # parse path after download under root_dir + fname = osp.split(url)[-1] + fpath = fname + return osp.join(root_dir, fpath) + + +def get_path_from_url(url, + root_dir, + md5sum=None, + check_exist=True, + decompress=True): + """ Download from given url to root_dir. + if file or directory specified by url is exists under + root_dir, return the path directly, otherwise download + from url and decompress it, return the path. + + Args: + url (str): download url + root_dir (str): root dir for downloading, it should be + WEIGHTS_HOME or DATASET_HOME + md5sum (str): md5 sum of download package + + Returns: + str: a local path to save downloaded models & weights & datasets. + """ + + from paddle.distributed import ParallelEnv + + assert is_url(url), "downloading from {} not a url".format(url) + # parse path after download to decompress under root_dir + fullpath = _map_path(url, root_dir) + # Mainly used to solve the problem of downloading data from different + # machines in the case of multiple machines. Different nodes will download + # data, and the same node will only download data once. + rank_id_curr_node = int(os.environ.get("PADDLE_RANK_IN_NODE", 0)) + + if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum): + logger.info("Found {}".format(fullpath)) + else: + if rank_id_curr_node == 0: + fullpath = _download(url, root_dir, md5sum) + else: + while not os.path.exists(fullpath): + time.sleep(1) + + if rank_id_curr_node == 0: + if decompress and (tarfile.is_tarfile(fullpath) or + zipfile.is_zipfile(fullpath)): + fullpath = _decompress(fullpath) + + return fullpath + + +def _download(url, path, md5sum=None): + """ + Download from url, save to path. 
+ + url (str): download url + path (str): download to given path + """ + if not osp.exists(path): + os.makedirs(path) + + fname = osp.split(url)[-1] + fullname = osp.join(path, fname) + retry_cnt = 0 + + while not (osp.exists(fullname) and _md5check(fullname, md5sum)): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RuntimeError("Download from {} failed. " + "Retry limit reached".format(url)) + + logger.info("Downloading {} from {}".format(fname, url)) + + try: + req = requests.get(url, stream=True) + except Exception as e: # requests.exceptions.ConnectionError + logger.info( + "Downloading {} from {} failed {} times with exception {}". + format(fname, url, retry_cnt + 1, str(e))) + time.sleep(1) + continue + + if req.status_code != 200: + raise RuntimeError("Downloading from {} failed with code " + "{}!".format(url, req.status_code)) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get('content-length') + with open(tmp_fullname, 'wb') as f: + if total_size: + with tqdm(total=(int(total_size) + 1023) // 1024) as pbar: + for chunk in req.iter_content(chunk_size=1024): + f.write(chunk) + pbar.update(1) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + + return fullname + + +def _md5check(fullname, md5sum=None): + if md5sum is None: + return True + + logger.info("File {} md5 checking...".format(fullname)) + md5 = hashlib.md5() + with open(fullname, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + md5.update(chunk) + calc_md5sum = md5.hexdigest() + + if calc_md5sum != md5sum: + logger.info("File {} md5 check failed, {}(calc) != " + "{}(base)".format(fullname, calc_md5sum, md5sum)) + return False + return True + + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. 
+
+    if tarfile.is_tarfile(fname):
+        uncompressed_path = _uncompress_file_tar(fname)
+    elif zipfile.is_zipfile(fname):
+        uncompressed_path = _uncompress_file_zip(fname)
+    else:
+        raise TypeError("Unsupported compressed file type {}".format(fname))
+
+    return uncompressed_path
+
+
+def _uncompress_file_zip(filepath):
+    files = zipfile.ZipFile(filepath, 'r')
+    file_list = files.namelist()
+
+    file_dir = os.path.dirname(filepath)
+
+    if _is_a_single_file(file_list):
+        rootpath = file_list[0]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+
+        for item in file_list:
+            files.extract(item, file_dir)
+
+    elif _is_a_single_dir(file_list):
+        rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+
+        for item in file_list:
+            files.extract(item, file_dir)
+
+    else:
+        rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+        if not os.path.exists(uncompressed_path):
+            os.makedirs(uncompressed_path)
+        for item in file_list:
+            files.extract(item, os.path.join(file_dir, rootpath))
+
+    files.close()
+
+    return uncompressed_path
+
+
+def _uncompress_file_tar(filepath, mode="r:*"):
+    files = tarfile.open(filepath, mode)
+    file_list = files.getnames()
+
+    file_dir = os.path.dirname(filepath)
+
+    if _is_a_single_file(file_list):
+        rootpath = file_list[0]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+        for item in file_list:
+            files.extract(item, file_dir)
+    elif _is_a_single_dir(file_list):
+        rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+        for item in file_list:
+            files.extract(item, file_dir)
+    else:
+        rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+        if not os.path.exists(uncompressed_path):
+            os.makedirs(uncompressed_path)
+
+        for item in file_list:
+            files.extract(item, os.path.join(file_dir, rootpath))
+
+    files.close()
+
+    return uncompressed_path
+
+
+def _is_a_single_file(file_list):
+    # str.find returns -1 when os.sep is absent, i.e. the archive holds a
+    # single bare file with no directory component
+    if len(file_list) == 1 and file_list[0].find(os.sep) < 0:
+        return True
+    return False
+
+
+def _is_a_single_dir(file_list):
+    new_file_list = []
+    for file_path in file_list:
+        if '/' in file_path:
+            file_path = file_path.replace('/', os.sep)
+        elif '\\' in file_path:
+            file_path = file_path.replace('\\', os.sep)
+        new_file_list.append(file_path)
+
+    file_name = new_file_list[0].split(os.sep)[0]
+    for i in range(1, len(new_file_list)):
+        if file_name != new_file_list[i].split(os.sep)[0]:
+            return False
+    return True
diff --git a/example/low_precision_training/ppcls/utils/ema.py b/example/low_precision_training/ppcls/utils/ema.py
new file mode 100755
index 000000000..8cdb3dfab
--- /dev/null
+++ b/example/low_precision_training/ppcls/utils/ema.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
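Tying the download helpers together — a usage sketch (the URL is the one already quoted in the `get_weights_path_from_url` docstring; running it needs network access):

```python
from ppcls.utils.download import get_weights_path_from_url, is_url

url = "https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams"
assert is_url(url)

# Rank 0 of each node downloads (with optional md5 verification and
# automatic tar/zip decompression); other ranks wait for the file.
local_path = get_weights_path_from_url(url, md5sum=None)
print(local_path)  # e.g. ~/.paddleclas/weights/resnet18.pdparams
```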
+ +from copy import deepcopy + +import paddle + + +class ExponentialMovingAverage(): + """ + Exponential Moving Average + Code was heavily based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/utils/model_ema.py + """ + + def __init__(self, model, decay=0.9999): + super().__init__() + # make a copy of the model for accumulating moving average of weights + self.module = deepcopy(model) + self.module.eval() + self.decay = decay + + @paddle.no_grad() + def _update(self, model, update_fn): + for ema_v, model_v in zip(self.module.state_dict().values(), + model.state_dict().values()): + paddle.assign(update_fn(ema_v, model_v), ema_v) + + def update(self, model): + self._update( + model, + update_fn=lambda e, m: self.decay * e + (1. - self.decay) * m) + + def set(self, model): + self._update(model, update_fn=lambda e, m: m) diff --git a/example/low_precision_training/ppcls/utils/feature_maps_visualization/fm_vis.py b/example/low_precision_training/ppcls/utils/feature_maps_visualization/fm_vis.py new file mode 100755 index 000000000..a5368b10e --- /dev/null +++ b/example/low_precision_training/ppcls/utils/feature_maps_visualization/fm_vis.py @@ -0,0 +1,97 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
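`ExponentialMovingAverage` above keeps a frozen copy of the model whose weights trail the live ones; updated once per step, the smoothed copy often evaluates better than the raw weights. A minimal sketch:

```python
import paddle
from ppcls.utils.ema import ExponentialMovingAverage

model = paddle.nn.Linear(4, 2)
opt = paddle.optimizer.SGD(learning_rate=0.1, parameters=model.parameters())
ema = ExponentialMovingAverage(model, decay=0.999)

for step in range(3):
    loss = model(paddle.randn([8, 4])).mean()
    loss.backward()
    opt.step()
    opt.clear_grad()
    ema.update(model)  # shadow = decay * shadow + (1 - decay) * weights

# Evaluate with the smoothed copy; ema.module was already set to eval mode.
logits = ema.module(paddle.randn([1, 4]))
```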
+import numpy as np +import cv2 +import utils +import argparse +import os +import sys +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../../..'))) + +import paddle +from paddle.distributed import ParallelEnv + +from resnet import ResNet50 +from ppcls.utils.save_load import load_dygraph_pretrain + + +def parse_args(): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--image_file", required=True, type=str) + parser.add_argument("-c", "--channel_num", type=int) + parser.add_argument("-p", "--pretrained_model", type=str) + parser.add_argument("--show", type=str2bool, default=False) + parser.add_argument("--interpolation", type=int, default=1) + parser.add_argument("--save_path", type=str, default=None) + parser.add_argument("--use_gpu", type=str2bool, default=True) + + return parser.parse_args() + + +def create_operators(interpolation=1): + size = 224 + img_mean = [0.485, 0.456, 0.406] + img_std = [0.229, 0.224, 0.225] + img_scale = 1.0 / 255.0 + + resize_op = utils.ResizeImage( + resize_short=256, interpolation=interpolation) + crop_op = utils.CropImage(size=(size, size)) + normalize_op = utils.NormalizeImage( + scale=img_scale, mean=img_mean, std=img_std) + totensor_op = utils.ToTensor() + + return [resize_op, crop_op, normalize_op, totensor_op] + + +def preprocess(data, ops): + for op in ops: + data = op(data) + return data + + +def main(): + args = parse_args() + operators = create_operators(args.interpolation) + # assign the place + place = 'gpu:{}'.format(ParallelEnv().dev_id) if args.use_gpu else 'cpu' + place = paddle.set_device(place) + + net = ResNet50() + load_dygraph_pretrain(net, args.pretrained_model) + + img = cv2.imread(args.image_file, cv2.IMREAD_COLOR) + data = preprocess(img, operators) + data = np.expand_dims(data, axis=0) + data = paddle.to_tensor(data) + net.eval() + _, fm = net(data) + assert args.channel_num >= 0 and args.channel_num <= fm.shape[ + 1], "the channel is out of the range, should be in {} but got {}".format( + [0, fm.shape[1]], args.channel_num) + + fm = (np.squeeze(fm[0][args.channel_num].numpy()) * 255).astype(np.uint8) + fm = cv2.resize(fm, (img.shape[1], img.shape[0])) + if args.save_path is not None: + print("the feature map is saved in path: {}".format(args.save_path)) + cv2.imwrite(args.save_path, fm) + + +if __name__ == "__main__": + main() diff --git a/example/low_precision_training/ppcls/utils/feature_maps_visualization/resnet.py b/example/low_precision_training/ppcls/utils/feature_maps_visualization/resnet.py new file mode 100755 index 000000000..0bea3401a --- /dev/null +++ b/example/low_precision_training/ppcls/utils/feature_maps_visualization/resnet.py @@ -0,0 +1,535 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
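`fm_vis.py` above is a CLI (for example `python fm_vis.py -i demo.jpg -c 5 -p <pretrained> --save_path fm.png`), but its preprocessing can also be driven directly. A sketch mirroring `create_operators`, assuming the sibling `utils.py` provides the four ops imported at the top of the script; `demo.jpg` is a placeholder:

```python
import cv2
import numpy as np
import utils  # sibling module: ResizeImage / CropImage / NormalizeImage / ToTensor

ops = [
    utils.ResizeImage(resize_short=256, interpolation=1),
    utils.CropImage(size=(224, 224)),
    utils.NormalizeImage(scale=1.0 / 255.0,
                         mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
    utils.ToTensor(),  # HWC -> CHW, matching create_operators above
]

img = cv2.imread("demo.jpg", cv2.IMREAD_COLOR)  # placeholder image path
for op in ops:
    img = op(img)
batch = np.expand_dims(img, axis=0)  # NCHW batch of one, as fed to net(data)
```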
+ +from __future__ import absolute_import, division, print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer +from ppcls.utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ResNet18": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_pretrained.pdparams", + "ResNet18_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams", + "ResNet34": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_pretrained.pdparams", + "ResNet34_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_vd_pretrained.pdparams", + "ResNet50": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_pretrained.pdparams", + "ResNet50_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_vd_pretrained.pdparams", + "ResNet101": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_pretrained.pdparams", + "ResNet101_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_vd_pretrained.pdparams", + "ResNet152": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_pretrained.pdparams", + "ResNet152_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams", + "ResNet200_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams", +} + +__all__ = MODEL_URLS.keys() +''' +ResNet config: dict. + key: depth of ResNet. + values: config's dict of specific model. + keys: + block_type: Two different blocks in ResNet, BasicBlock and BottleneckBlock are optional. + block_depth: The number of blocks in different stages in ResNet. + num_channels: The number of channels to enter the next stage. 
+''' +NET_CONFIG = { + "18": { + "block_type": "BasicBlock", + "block_depth": [2, 2, 2, 2], + "num_channels": [64, 64, 128, 256] + }, + "34": { + "block_type": "BasicBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 64, 128, 256] + }, + "50": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 256, 512, 1024] + }, + "101": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 23, 3], + "num_channels": [64, 256, 512, 1024] + }, + "152": { + "block_type": "BottleneckBlock", + "block_depth": [3, 8, 36, 3], + "num_channels": [64, 256, 512, 1024] + }, + "200": { + "block_type": "BottleneckBlock", + "block_depth": [3, 12, 48, 3], + "num_channels": [64, 256, 512, 1024] + }, +} + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + self.is_vd_mode = is_vd_mode + self.act = act + self.avg_pool = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=False, + data_format=data_format) + self.bn = BatchNorm( + num_filters, + param_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + data_layout=data_format) + self.relu = nn.ReLU() + + def forward(self, x): + if self.is_vd_mode: + x = self.avg_pool(x) + x = self.conv(x) + x = self.bn(x) + if self.act: + x = self.relu(x) + return x + + +class BottleneckBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + lr_mult=lr_mult, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.relu = nn.ReLU() + self.shortcut = shortcut + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + x = self.conv2(x) + + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class BasicBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + lr_mult=lr_mult, + data_format=data_format) + if not shortcut: + self.short = ConvBNLayer( + 
num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.shortcut = shortcut + self.relu = nn.ReLU() + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class ResNet(TheseusLayer): + """ + ResNet + Args: + config: dict. config of ResNet. + version: str="vb". Different version of ResNet, version vd can perform better. + class_num: int=1000. The number of classes. + lr_mult_list: list. Control the learning rate of different stages. + Returns: + model: nn.Layer. Specific ResNet model depends on args. + """ + + def __init__(self, + config, + version="vb", + class_num=1000, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + data_format="NCHW", + input_image_channel=3, + return_patterns=None): + super().__init__() + + self.cfg = config + self.lr_mult_list = lr_mult_list + self.is_vd_mode = version == "vd" + self.class_num = class_num + self.num_filters = [64, 128, 256, 512] + self.block_depth = self.cfg["block_depth"] + self.block_type = self.cfg["block_type"] + self.num_channels = self.cfg["num_channels"] + self.channels_mult = 1 if self.num_channels[-1] == 256 else 4 + + assert isinstance(self.lr_mult_list, ( + list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len(self.lr_mult_list + ) == 5, "lr_mult_list length should be 5 but got {}".format( + len(self.lr_mult_list)) + + self.stem_cfg = { + #num_channels, num_filters, filter_size, stride + "vb": [[input_image_channel, 64, 7, 2]], + "vd": + [[input_image_channel, 32, 3, 2], [32, 32, 3, 1], [32, 64, 3, 1]] + } + + self.stem = nn.Sequential(* [ + ConvBNLayer( + num_channels=in_c, + num_filters=out_c, + filter_size=k, + stride=s, + act="relu", + lr_mult=self.lr_mult_list[0], + data_format=data_format) + for in_c, out_c, k, s in self.stem_cfg[version] + ]) + + self.max_pool = MaxPool2D( + kernel_size=3, stride=2, padding=1, data_format=data_format) + block_list = [] + for block_idx in range(len(self.block_depth)): + shortcut = False + for i in range(self.block_depth[block_idx]): + block_list.append(globals()[self.block_type]( + num_channels=self.num_channels[block_idx] if i == 0 else + self.num_filters[block_idx] * self.channels_mult, + num_filters=self.num_filters[block_idx], + stride=2 if i == 0 and block_idx != 0 else 1, + shortcut=shortcut, + if_first=block_idx == i == 0 if version == "vd" else True, + lr_mult=self.lr_mult_list[block_idx + 1], + data_format=data_format)) + shortcut = True + self.blocks = nn.Sequential(*block_list) + + self.avg_pool = AdaptiveAvgPool2D(1, data_format=data_format) + self.flatten = nn.Flatten() + self.avg_pool_channels = self.num_channels[-1] * 2 + stdv = 1.0 / math.sqrt(self.avg_pool_channels * 1.0) + self.fc = Linear( + self.avg_pool_channels, + self.class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + self.data_format = data_format + if return_patterns is not None: + self.update_res(return_patterns) + self.register_forward_post_hook(self._return_dict_hook) + + def forward(self, x): + with paddle.static.amp.fp16_guard(): + if self.data_format == "NHWC": + x = paddle.transpose(x, [0, 2, 3, 1]) + x.stop_gradient = True + x = self.stem(x) + fm = x + x = self.max_pool(x) + x = self.blocks(x) + x = self.avg_pool(x) + x = 
self.flatten(x) + x = self.fc(x) + return x, fm + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNet18(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18` model depends on args. + """ + model = ResNet(config=NET_CONFIG["18"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18"], use_ssld) + return model + + +def ResNet18_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["18"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18_vd"], use_ssld) + return model + + +def ResNet34(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34` model depends on args. + """ + model = ResNet(config=NET_CONFIG["34"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34"], use_ssld) + return model + + +def ResNet34_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["34"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34_vd"], use_ssld) + return model + + +def ResNet50(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet50` model depends on args. + """ + model = ResNet(config=NET_CONFIG["50"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model + + +def ResNet50_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. 
Specific `ResNet50_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["50"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50_vd"], use_ssld) + return model + + +def ResNet101(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101` model depends on args. + """ + model = ResNet(config=NET_CONFIG["101"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101"], use_ssld) + return model + + +def ResNet101_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["101"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101_vd"], use_ssld) + return model + + +def ResNet152(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152` model depends on args. + """ + model = ResNet(config=NET_CONFIG["152"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152"], use_ssld) + return model + + +def ResNet152_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["152"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152_vd"], use_ssld) + return model + + +def ResNet200_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet200_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet200_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["200"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet200_vd"], use_ssld) + return model diff --git a/example/low_precision_training/ppcls/utils/feature_maps_visualization/utils.py b/example/low_precision_training/ppcls/utils/feature_maps_visualization/utils.py new file mode 100755 index 000000000..7c7014932 --- /dev/null +++ b/example/low_precision_training/ppcls/utils/feature_maps_visualization/utils.py @@ -0,0 +1,85 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import numpy as np + + +class DecodeImage(object): + def __init__(self, to_rgb=True): + self.to_rgb = to_rgb + + def __call__(self, img): + data = np.frombuffer(img, dtype='uint8') + img = cv2.imdecode(data, 1) + if self.to_rgb: + assert img.shape[2] == 3, 'invalid shape of image[%s]' % ( + img.shape) + img = img[:, :, ::-1] + + return img + + +class ResizeImage(object): + def __init__(self, resize_short=None, interpolation=1): + self.resize_short = resize_short + self.interpolation = interpolation + + def __call__(self, img): + img_h, img_w = img.shape[:2] + percent = float(self.resize_short) / min(img_w, img_h) + w = int(round(img_w * percent)) + h = int(round(img_h * percent)) + return cv2.resize(img, (w, h), interpolation=self.interpolation) + + +class CropImage(object): + def __init__(self, size): + if type(size) is int: + self.size = (size, size) + else: + self.size = size + + def __call__(self, img): + w, h = self.size + img_h, img_w = img.shape[:2] + w_start = (img_w - w) // 2 + h_start = (img_h - h) // 2 + + w_end = w_start + w + h_end = h_start + h + return img[h_start:h_end, w_start:w_end, :] + + +class NormalizeImage(object): + def __init__(self, scale=None, mean=None, std=None): + self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (1, 1, 3) + self.mean = np.array(mean).reshape(shape).astype('float32') + self.std = np.array(std).reshape(shape).astype('float32') + + def __call__(self, img): + return (img.astype('float32') * self.scale - self.mean) / self.std + + +class ToTensor(object): + def __init__(self): + pass + + def __call__(self, img): + img = img.transpose((2, 0, 1)) + return img diff --git a/example/low_precision_training/ppcls/utils/imagenet1k_label_list.txt b/example/low_precision_training/ppcls/utils/imagenet1k_label_list.txt new file mode 100755 index 000000000..376e18021 --- /dev/null +++ b/example/low_precision_training/ppcls/utils/imagenet1k_label_list.txt @@ -0,0 +1,1000 @@ +0 tench, Tinca tinca +1 goldfish, Carassius auratus +2 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +3 tiger shark, Galeocerdo cuvieri +4 hammerhead, hammerhead shark +5 electric ray, crampfish, numbfish, torpedo +6 stingray +7 cock +8 hen +9 ostrich, Struthio camelus +10 brambling, Fringilla montifringilla +11 goldfinch, Carduelis carduelis +12 house finch, linnet, Carpodacus mexicanus +13 junco, snowbird +14 indigo bunting, indigo finch, indigo bird, Passerina cyanea +15 robin, American robin, Turdus migratorius +16 bulbul +17 jay +18 magpie +19 chickadee +20 water ouzel, dipper +21 kite +22 bald eagle, American eagle, Haliaeetus leucocephalus +23 vulture +24 great grey owl, great gray owl, Strix nebulosa +25 European fire salamander, Salamandra salamandra +26 common newt, Triturus vulgaris +27 eft +28 spotted salamander, Ambystoma maculatum +29 
axolotl, mud puppy, Ambystoma mexicanum +30 bullfrog, Rana catesbeiana +31 tree frog, tree-frog +32 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +33 loggerhead, loggerhead turtle, Caretta caretta +34 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +35 mud turtle +36 terrapin +37 box turtle, box tortoise +38 banded gecko +39 common iguana, iguana, Iguana iguana +40 American chameleon, anole, Anolis carolinensis +41 whiptail, whiptail lizard +42 agama +43 frilled lizard, Chlamydosaurus kingi +44 alligator lizard +45 Gila monster, Heloderma suspectum +46 green lizard, Lacerta viridis +47 African chameleon, Chamaeleo chamaeleon +48 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +49 African crocodile, Nile crocodile, Crocodylus niloticus +50 American alligator, Alligator mississipiensis +51 triceratops +52 thunder snake, worm snake, Carphophis amoenus +53 ringneck snake, ring-necked snake, ring snake +54 hognose snake, puff adder, sand viper +55 green snake, grass snake +56 king snake, kingsnake +57 garter snake, grass snake +58 water snake +59 vine snake +60 night snake, Hypsiglena torquata +61 boa constrictor, Constrictor constrictor +62 rock python, rock snake, Python sebae +63 Indian cobra, Naja naja +64 green mamba +65 sea snake +66 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +67 diamondback, diamondback rattlesnake, Crotalus adamanteus +68 sidewinder, horned rattlesnake, Crotalus cerastes +69 trilobite +70 harvestman, daddy longlegs, Phalangium opilio +71 scorpion +72 black and gold garden spider, Argiope aurantia +73 barn spider, Araneus cavaticus +74 garden spider, Aranea diademata +75 black widow, Latrodectus mactans +76 tarantula +77 wolf spider, hunting spider +78 tick +79 centipede +80 black grouse +81 ptarmigan +82 ruffed grouse, partridge, Bonasa umbellus +83 prairie chicken, prairie grouse, prairie fowl +84 peacock +85 quail +86 partridge +87 African grey, African gray, Psittacus erithacus +88 macaw +89 sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita +90 lorikeet +91 coucal +92 bee eater +93 hornbill +94 hummingbird +95 jacamar +96 toucan +97 drake +98 red-breasted merganser, Mergus serrator +99 goose +100 black swan, Cygnus atratus +101 tusker +102 echidna, spiny anteater, anteater +103 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus +104 wallaby, brush kangaroo +105 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus +106 wombat +107 jellyfish +108 sea anemone, anemone +109 brain coral +110 flatworm, platyhelminth +111 nematode, nematode worm, roundworm +112 conch +113 snail +114 slug +115 sea slug, nudibranch +116 chiton, coat-of-mail shell, sea cradle, polyplacophore +117 chambered nautilus, pearly nautilus, nautilus +118 Dungeness crab, Cancer magister +119 rock crab, Cancer irroratus +120 fiddler crab +121 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +122 American lobster, Northern lobster, Maine lobster, Homarus americanus +123 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +124 crayfish, crawfish, crawdad, crawdaddy +125 hermit crab +126 isopod +127 white stork, Ciconia ciconia +128 black stork, Ciconia nigra +129 spoonbill +130 flamingo +131 little blue heron, Egretta caerulea +132 American egret, great white heron, Egretta albus +133 bittern +134 crane +135 limpkin, Aramus pictus +136 European gallinule, Porphyrio porphyrio +137 American 
coot, marsh hen, mud hen, water hen, Fulica americana +138 bustard +139 ruddy turnstone, Arenaria interpres +140 red-backed sandpiper, dunlin, Erolia alpina +141 redshank, Tringa totanus +142 dowitcher +143 oystercatcher, oyster catcher +144 pelican +145 king penguin, Aptenodytes patagonica +146 albatross, mollymawk +147 grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus +148 killer whale, killer, orca, grampus, sea wolf, Orcinus orca +149 dugong, Dugong dugon +150 sea lion +151 Chihuahua +152 Japanese spaniel +153 Maltese dog, Maltese terrier, Maltese +154 Pekinese, Pekingese, Peke +155 Shih-Tzu +156 Blenheim spaniel +157 papillon +158 toy terrier +159 Rhodesian ridgeback +160 Afghan hound, Afghan +161 basset, basset hound +162 beagle +163 bloodhound, sleuthhound +164 bluetick +165 black-and-tan coonhound +166 Walker hound, Walker foxhound +167 English foxhound +168 redbone +169 borzoi, Russian wolfhound +170 Irish wolfhound +171 Italian greyhound +172 whippet +173 Ibizan hound, Ibizan Podenco +174 Norwegian elkhound, elkhound +175 otterhound, otter hound +176 Saluki, gazelle hound +177 Scottish deerhound, deerhound +178 Weimaraner +179 Staffordshire bullterrier, Staffordshire bull terrier +180 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier +181 Bedlington terrier +182 Border terrier +183 Kerry blue terrier +184 Irish terrier +185 Norfolk terrier +186 Norwich terrier +187 Yorkshire terrier +188 wire-haired fox terrier +189 Lakeland terrier +190 Sealyham terrier, Sealyham +191 Airedale, Airedale terrier +192 cairn, cairn terrier +193 Australian terrier +194 Dandie Dinmont, Dandie Dinmont terrier +195 Boston bull, Boston terrier +196 miniature schnauzer +197 giant schnauzer +198 standard schnauzer +199 Scotch terrier, Scottish terrier, Scottie +200 Tibetan terrier, chrysanthemum dog +201 silky terrier, Sydney silky +202 soft-coated wheaten terrier +203 West Highland white terrier +204 Lhasa, Lhasa apso +205 flat-coated retriever +206 curly-coated retriever +207 golden retriever +208 Labrador retriever +209 Chesapeake Bay retriever +210 German short-haired pointer +211 vizsla, Hungarian pointer +212 English setter +213 Irish setter, red setter +214 Gordon setter +215 Brittany spaniel +216 clumber, clumber spaniel +217 English springer, English springer spaniel +218 Welsh springer spaniel +219 cocker spaniel, English cocker spaniel, cocker +220 Sussex spaniel +221 Irish water spaniel +222 kuvasz +223 schipperke +224 groenendael +225 malinois +226 briard +227 kelpie +228 komondor +229 Old English sheepdog, bobtail +230 Shetland sheepdog, Shetland sheep dog, Shetland +231 collie +232 Border collie +233 Bouvier des Flandres, Bouviers des Flandres +234 Rottweiler +235 German shepherd, German shepherd dog, German police dog, alsatian +236 Doberman, Doberman pinscher +237 miniature pinscher +238 Greater Swiss Mountain dog +239 Bernese mountain dog +240 Appenzeller +241 EntleBucher +242 boxer +243 bull mastiff +244 Tibetan mastiff +245 French bulldog +246 Great Dane +247 Saint Bernard, St Bernard +248 Eskimo dog, husky +249 malamute, malemute, Alaskan malamute +250 Siberian husky +251 dalmatian, coach dog, carriage dog +252 affenpinscher, monkey pinscher, monkey dog +253 basenji +254 pug, pug-dog +255 Leonberg +256 Newfoundland, Newfoundland dog +257 Great Pyrenees +258 Samoyed, Samoyede +259 Pomeranian +260 chow, chow chow +261 keeshond +262 Brabancon griffon +263 Pembroke, Pembroke Welsh corgi +264 Cardigan, Cardigan 
Welsh corgi +265 toy poodle +266 miniature poodle +267 standard poodle +268 Mexican hairless +269 timber wolf, grey wolf, gray wolf, Canis lupus +270 white wolf, Arctic wolf, Canis lupus tundrarum +271 red wolf, maned wolf, Canis rufus, Canis niger +272 coyote, prairie wolf, brush wolf, Canis latrans +273 dingo, warrigal, warragal, Canis dingo +274 dhole, Cuon alpinus +275 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +276 hyena, hyaena +277 red fox, Vulpes vulpes +278 kit fox, Vulpes macrotis +279 Arctic fox, white fox, Alopex lagopus +280 grey fox, gray fox, Urocyon cinereoargenteus +281 tabby, tabby cat +282 tiger cat +283 Persian cat +284 Siamese cat, Siamese +285 Egyptian cat +286 cougar, puma, catamount, mountain lion, painter, panther, Felis concolor +287 lynx, catamount +288 leopard, Panthera pardus +289 snow leopard, ounce, Panthera uncia +290 jaguar, panther, Panthera onca, Felis onca +291 lion, king of beasts, Panthera leo +292 tiger, Panthera tigris +293 cheetah, chetah, Acinonyx jubatus +294 brown bear, bruin, Ursus arctos +295 American black bear, black bear, Ursus americanus, Euarctos americanus +296 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +297 sloth bear, Melursus ursinus, Ursus ursinus +298 mongoose +299 meerkat, mierkat +300 tiger beetle +301 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +302 ground beetle, carabid beetle +303 long-horned beetle, longicorn, longicorn beetle +304 leaf beetle, chrysomelid +305 dung beetle +306 rhinoceros beetle +307 weevil +308 fly +309 bee +310 ant, emmet, pismire +311 grasshopper, hopper +312 cricket +313 walking stick, walkingstick, stick insect +314 cockroach, roach +315 mantis, mantid +316 cicada, cicala +317 leafhopper +318 lacewing, lacewing fly +319 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk +320 damselfly +321 admiral +322 ringlet, ringlet butterfly +323 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +324 cabbage butterfly +325 sulphur butterfly, sulfur butterfly +326 lycaenid, lycaenid butterfly +327 starfish, sea star +328 sea urchin +329 sea cucumber, holothurian +330 wood rabbit, cottontail, cottontail rabbit +331 hare +332 Angora, Angora rabbit +333 hamster +334 porcupine, hedgehog +335 fox squirrel, eastern fox squirrel, Sciurus niger +336 marmot +337 beaver +338 guinea pig, Cavia cobaya +339 sorrel +340 zebra +341 hog, pig, grunter, squealer, Sus scrofa +342 wild boar, boar, Sus scrofa +343 warthog +344 hippopotamus, hippo, river horse, Hippopotamus amphibius +345 ox +346 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +347 bison +348 ram, tup +349 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis +350 ibex, Capra ibex +351 hartebeest +352 impala, Aepyceros melampus +353 gazelle +354 Arabian camel, dromedary, Camelus dromedarius +355 llama +356 weasel +357 mink +358 polecat, fitch, foulmart, foumart, Mustela putorius +359 black-footed ferret, ferret, Mustela nigripes +360 otter +361 skunk, polecat, wood pussy +362 badger +363 armadillo +364 three-toed sloth, ai, Bradypus tridactylus +365 orangutan, orang, orangutang, Pongo pygmaeus +366 gorilla, Gorilla gorilla +367 chimpanzee, chimp, Pan troglodytes +368 gibbon, Hylobates lar +369 siamang, Hylobates syndactylus, Symphalangus syndactylus +370 guenon, guenon monkey +371 patas, hussar monkey, Erythrocebus patas +372 baboon +373 macaque +374 langur +375 colobus, colobus monkey 
+376 proboscis monkey, Nasalis larvatus +377 marmoset +378 capuchin, ringtail, Cebus capucinus +379 howler monkey, howler +380 titi, titi monkey +381 spider monkey, Ateles geoffroyi +382 squirrel monkey, Saimiri sciureus +383 Madagascar cat, ring-tailed lemur, Lemur catta +384 indri, indris, Indri indri, Indri brevicaudatus +385 Indian elephant, Elephas maximus +386 African elephant, Loxodonta africana +387 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +388 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +389 barracouta, snoek +390 eel +391 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +392 rock beauty, Holocanthus tricolor +393 anemone fish +394 sturgeon +395 gar, garfish, garpike, billfish, Lepisosteus osseus +396 lionfish +397 puffer, pufferfish, blowfish, globefish +398 abacus +399 abaya +400 academic gown, academic robe, judge's robe +401 accordion, piano accordion, squeeze box +402 acoustic guitar +403 aircraft carrier, carrier, flattop, attack aircraft carrier +404 airliner +405 airship, dirigible +406 altar +407 ambulance +408 amphibian, amphibious vehicle +409 analog clock +410 apiary, bee house +411 apron +412 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +413 assault rifle, assault gun +414 backpack, back pack, knapsack, packsack, rucksack, haversack +415 bakery, bakeshop, bakehouse +416 balance beam, beam +417 balloon +418 ballpoint, ballpoint pen, ballpen, Biro +419 Band Aid +420 banjo +421 bannister, banister, balustrade, balusters, handrail +422 barbell +423 barber chair +424 barbershop +425 barn +426 barometer +427 barrel, cask +428 barrow, garden cart, lawn cart, wheelbarrow +429 baseball +430 basketball +431 bassinet +432 bassoon +433 bathing cap, swimming cap +434 bath towel +435 bathtub, bathing tub, bath, tub +436 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +437 beacon, lighthouse, beacon light, pharos +438 beaker +439 bearskin, busby, shako +440 beer bottle +441 beer glass +442 bell cote, bell cot +443 bib +444 bicycle-built-for-two, tandem bicycle, tandem +445 bikini, two-piece +446 binder, ring-binder +447 binoculars, field glasses, opera glasses +448 birdhouse +449 boathouse +450 bobsled, bobsleigh, bob +451 bolo tie, bolo, bola tie, bola +452 bonnet, poke bonnet +453 bookcase +454 bookshop, bookstore, bookstall +455 bottlecap +456 bow +457 bow tie, bow-tie, bowtie +458 brass, memorial tablet, plaque +459 brassiere, bra, bandeau +460 breakwater, groin, groyne, mole, bulwark, seawall, jetty +461 breastplate, aegis, egis +462 broom +463 bucket, pail +464 buckle +465 bulletproof vest +466 bullet train, bullet +467 butcher shop, meat market +468 cab, hack, taxi, taxicab +469 caldron, cauldron +470 candle, taper, wax light +471 cannon +472 canoe +473 can opener, tin opener +474 cardigan +475 car mirror +476 carousel, carrousel, merry-go-round, roundabout, whirligig +477 carpenter's kit, tool kit +478 carton +479 car wheel +480 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM +481 cassette +482 cassette player +483 castle +484 catamaran +485 CD player +486 cello, violoncello +487 cellular telephone, cellular phone, cellphone, cell, mobile phone +488 chain +489 chainlink fence +490 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour +491 chain saw, chainsaw +492 chest +493 chiffonier, commode +494 chime, bell, 
gong +495 china cabinet, china closet +496 Christmas stocking +497 church, church building +498 cinema, movie theater, movie theatre, movie house, picture palace +499 cleaver, meat cleaver, chopper +500 cliff dwelling +501 cloak +502 clog, geta, patten, sabot +503 cocktail shaker +504 coffee mug +505 coffeepot +506 coil, spiral, volute, whorl, helix +507 combination lock +508 computer keyboard, keypad +509 confectionery, confectionary, candy store +510 container ship, containership, container vessel +511 convertible +512 corkscrew, bottle screw +513 cornet, horn, trumpet, trump +514 cowboy boot +515 cowboy hat, ten-gallon hat +516 cradle +517 crane +518 crash helmet +519 crate +520 crib, cot +521 Crock Pot +522 croquet ball +523 crutch +524 cuirass +525 dam, dike, dyke +526 desk +527 desktop computer +528 dial telephone, dial phone +529 diaper, nappy, napkin +530 digital clock +531 digital watch +532 dining table, board +533 dishrag, dishcloth +534 dishwasher, dish washer, dishwashing machine +535 disk brake, disc brake +536 dock, dockage, docking facility +537 dogsled, dog sled, dog sleigh +538 dome +539 doormat, welcome mat +540 drilling platform, offshore rig +541 drum, membranophone, tympan +542 drumstick +543 dumbbell +544 Dutch oven +545 electric fan, blower +546 electric guitar +547 electric locomotive +548 entertainment center +549 envelope +550 espresso maker +551 face powder +552 feather boa, boa +553 file, file cabinet, filing cabinet +554 fireboat +555 fire engine, fire truck +556 fire screen, fireguard +557 flagpole, flagstaff +558 flute, transverse flute +559 folding chair +560 football helmet +561 forklift +562 fountain +563 fountain pen +564 four-poster +565 freight car +566 French horn, horn +567 frying pan, frypan, skillet +568 fur coat +569 garbage truck, dustcart +570 gasmask, respirator, gas helmet +571 gas pump, gasoline pump, petrol pump, island dispenser +572 goblet +573 go-kart +574 golf ball +575 golfcart, golf cart +576 gondola +577 gong, tam-tam +578 gown +579 grand piano, grand +580 greenhouse, nursery, glasshouse +581 grille, radiator grille +582 grocery store, grocery, food market, market +583 guillotine +584 hair slide +585 hair spray +586 half track +587 hammer +588 hamper +589 hand blower, blow dryer, blow drier, hair dryer, hair drier +590 hand-held computer, hand-held microcomputer +591 handkerchief, hankie, hanky, hankey +592 hard disc, hard disk, fixed disk +593 harmonica, mouth organ, harp, mouth harp +594 harp +595 harvester, reaper +596 hatchet +597 holster +598 home theater, home theatre +599 honeycomb +600 hook, claw +601 hoopskirt, crinoline +602 horizontal bar, high bar +603 horse cart, horse-cart +604 hourglass +605 iPod +606 iron, smoothing iron +607 jack-o'-lantern +608 jean, blue jean, denim +609 jeep, landrover +610 jersey, T-shirt, tee shirt +611 jigsaw puzzle +612 jinrikisha, ricksha, rickshaw +613 joystick +614 kimono +615 knee pad +616 knot +617 lab coat, laboratory coat +618 ladle +619 lampshade, lamp shade +620 laptop, laptop computer +621 lawn mower, mower +622 lens cap, lens cover +623 letter opener, paper knife, paperknife +624 library +625 lifeboat +626 lighter, light, igniter, ignitor +627 limousine, limo +628 liner, ocean liner +629 lipstick, lip rouge +630 Loafer +631 lotion +632 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +633 loupe, jeweler's loupe +634 lumbermill, sawmill +635 magnetic compass +636 mailbag, postbag +637 mailbox, letter box +638 maillot +639 maillot, tank suit +640 manhole cover 
+641 maraca +642 marimba, xylophone +643 mask +644 matchstick +645 maypole +646 maze, labyrinth +647 measuring cup +648 medicine chest, medicine cabinet +649 megalith, megalithic structure +650 microphone, mike +651 microwave, microwave oven +652 military uniform +653 milk can +654 minibus +655 miniskirt, mini +656 minivan +657 missile +658 mitten +659 mixing bowl +660 mobile home, manufactured home +661 Model T +662 modem +663 monastery +664 monitor +665 moped +666 mortar +667 mortarboard +668 mosque +669 mosquito net +670 motor scooter, scooter +671 mountain bike, all-terrain bike, off-roader +672 mountain tent +673 mouse, computer mouse +674 mousetrap +675 moving van +676 muzzle +677 nail +678 neck brace +679 necklace +680 nipple +681 notebook, notebook computer +682 obelisk +683 oboe, hautboy, hautbois +684 ocarina, sweet potato +685 odometer, hodometer, mileometer, milometer +686 oil filter +687 organ, pipe organ +688 oscilloscope, scope, cathode-ray oscilloscope, CRO +689 overskirt +690 oxcart +691 oxygen mask +692 packet +693 paddle, boat paddle +694 paddlewheel, paddle wheel +695 padlock +696 paintbrush +697 pajama, pyjama, pj's, jammies +698 palace +699 panpipe, pandean pipe, syrinx +700 paper towel +701 parachute, chute +702 parallel bars, bars +703 park bench +704 parking meter +705 passenger car, coach, carriage +706 patio, terrace +707 pay-phone, pay-station +708 pedestal, plinth, footstall +709 pencil box, pencil case +710 pencil sharpener +711 perfume, essence +712 Petri dish +713 photocopier +714 pick, plectrum, plectron +715 pickelhaube +716 picket fence, paling +717 pickup, pickup truck +718 pier +719 piggy bank, penny bank +720 pill bottle +721 pillow +722 ping-pong ball +723 pinwheel +724 pirate, pirate ship +725 pitcher, ewer +726 plane, carpenter's plane, woodworking plane +727 planetarium +728 plastic bag +729 plate rack +730 plow, plough +731 plunger, plumber's helper +732 Polaroid camera, Polaroid Land camera +733 pole +734 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +735 poncho +736 pool table, billiard table, snooker table +737 pop bottle, soda bottle +738 pot, flowerpot +739 potter's wheel +740 power drill +741 prayer rug, prayer mat +742 printer +743 prison, prison house +744 projectile, missile +745 projector +746 puck, hockey puck +747 punching bag, punch bag, punching ball, punchball +748 purse +749 quill, quill pen +750 quilt, comforter, comfort, puff +751 racer, race car, racing car +752 racket, racquet +753 radiator +754 radio, wireless +755 radio telescope, radio reflector +756 rain barrel +757 recreational vehicle, RV, R.V. 
+758 reel +759 reflex camera +760 refrigerator, icebox +761 remote control, remote +762 restaurant, eating house, eating place, eatery +763 revolver, six-gun, six-shooter +764 rifle +765 rocking chair, rocker +766 rotisserie +767 rubber eraser, rubber, pencil eraser +768 rugby ball +769 rule, ruler +770 running shoe +771 safe +772 safety pin +773 saltshaker, salt shaker +774 sandal +775 sarong +776 sax, saxophone +777 scabbard +778 scale, weighing machine +779 school bus +780 schooner +781 scoreboard +782 screen, CRT screen +783 screw +784 screwdriver +785 seat belt, seatbelt +786 sewing machine +787 shield, buckler +788 shoe shop, shoe-shop, shoe store +789 shoji +790 shopping basket +791 shopping cart +792 shovel +793 shower cap +794 shower curtain +795 ski +796 ski mask +797 sleeping bag +798 slide rule, slipstick +799 sliding door +800 slot, one-armed bandit +801 snorkel +802 snowmobile +803 snowplow, snowplough +804 soap dispenser +805 soccer ball +806 sock +807 solar dish, solar collector, solar furnace +808 sombrero +809 soup bowl +810 space bar +811 space heater +812 space shuttle +813 spatula +814 speedboat +815 spider web, spider's web +816 spindle +817 sports car, sport car +818 spotlight, spot +819 stage +820 steam locomotive +821 steel arch bridge +822 steel drum +823 stethoscope +824 stole +825 stone wall +826 stopwatch, stop watch +827 stove +828 strainer +829 streetcar, tram, tramcar, trolley, trolley car +830 stretcher +831 studio couch, day bed +832 stupa, tope +833 submarine, pigboat, sub, U-boat +834 suit, suit of clothes +835 sundial +836 sunglass +837 sunglasses, dark glasses, shades +838 sunscreen, sunblock, sun blocker +839 suspension bridge +840 swab, swob, mop +841 sweatshirt +842 swimming trunks, bathing trunks +843 swing +844 switch, electric switch, electrical switch +845 syringe +846 table lamp +847 tank, army tank, armored combat vehicle, armoured combat vehicle +848 tape player +849 teapot +850 teddy, teddy bear +851 television, television system +852 tennis ball +853 thatch, thatched roof +854 theater curtain, theatre curtain +855 thimble +856 thresher, thrasher, threshing machine +857 throne +858 tile roof +859 toaster +860 tobacco shop, tobacconist shop, tobacconist +861 toilet seat +862 torch +863 totem pole +864 tow truck, tow car, wrecker +865 toyshop +866 tractor +867 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +868 tray +869 trench coat +870 tricycle, trike, velocipede +871 trimaran +872 tripod +873 triumphal arch +874 trolleybus, trolley coach, trackless trolley +875 trombone +876 tub, vat +877 turnstile +878 typewriter keyboard +879 umbrella +880 unicycle, monocycle +881 upright, upright piano +882 vacuum, vacuum cleaner +883 vase +884 vault +885 velvet +886 vending machine +887 vestment +888 viaduct +889 violin, fiddle +890 volleyball +891 waffle iron +892 wall clock +893 wallet, billfold, notecase, pocketbook +894 wardrobe, closet, press +895 warplane, military plane +896 washbasin, handbasin, washbowl, lavabo, wash-hand basin +897 washer, automatic washer, washing machine +898 water bottle +899 water jug +900 water tower +901 whiskey jug +902 whistle +903 wig +904 window screen +905 window shade +906 Windsor tie +907 wine bottle +908 wing +909 wok +910 wooden spoon +911 wool, woolen, woollen +912 worm fence, snake fence, snake-rail fence, Virginia fence +913 wreck +914 yawl +915 yurt +916 web site, website, internet site, site +917 comic book +918 crossword puzzle, crossword +919 street sign +920 traffic light, 
traffic signal, stoplight +921 book jacket, dust cover, dust jacket, dust wrapper +922 menu +923 plate +924 guacamole +925 consomme +926 hot pot, hotpot +927 trifle +928 ice cream, icecream +929 ice lolly, lolly, lollipop, popsicle +930 French loaf +931 bagel, beigel +932 pretzel +933 cheeseburger +934 hotdog, hot dog, red hot +935 mashed potato +936 head cabbage +937 broccoli +938 cauliflower +939 zucchini, courgette +940 spaghetti squash +941 acorn squash +942 butternut squash +943 cucumber, cuke +944 artichoke, globe artichoke +945 bell pepper +946 cardoon +947 mushroom +948 Granny Smith +949 strawberry +950 orange +951 lemon +952 fig +953 pineapple, ananas +954 banana +955 jackfruit, jak, jack +956 custard apple +957 pomegranate +958 hay +959 carbonara +960 chocolate sauce, chocolate syrup +961 dough +962 meat loaf, meatloaf +963 pizza, pizza pie +964 potpie +965 burrito +966 red wine +967 espresso +968 cup +969 eggnog +970 alp +971 bubble +972 cliff, drop, drop-off +973 coral reef +974 geyser +975 lakeside, lakeshore +976 promontory, headland, head, foreland +977 sandbar, sand bar +978 seashore, coast, seacoast, sea-coast +979 valley, vale +980 volcano +981 ballplayer, baseball player +982 groom, bridegroom +983 scuba diver +984 rapeseed +985 daisy +986 yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum +987 corn +988 acorn +989 hip, rose hip, rosehip +990 buckeye, horse chestnut, conker +991 coral fungus +992 agaric +993 gyromitra +994 stinkhorn, carrion fungus +995 earthstar +996 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa +997 bolete +998 ear, spike, capitulum +999 toilet tissue, toilet paper, bathroom tissue diff --git a/example/low_precision_training/ppcls/utils/initializer.py b/example/low_precision_training/ppcls/utils/initializer.py new file mode 100755 index 000000000..b044e8088 --- /dev/null +++ b/example/low_precision_training/ppcls/utils/initializer.py @@ -0,0 +1,318 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py +Ths copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file. 
+""" + +import math +import numpy as np + +import paddle +import paddle.nn as nn + +__all__ = [ + 'uniform_', + 'normal_', + 'constant_', + 'ones_', + 'zeros_', + 'xavier_uniform_', + 'xavier_normal_', + 'kaiming_uniform_', + 'kaiming_normal_', + 'linear_init_', + 'conv_init_', + 'reset_initialized_parameter', +] + + +def _no_grad_uniform_(tensor, a, b): + with paddle.no_grad(): + tensor.set_value( + paddle.uniform( + shape=tensor.shape, dtype=tensor.dtype, min=a, max=b)) + return tensor + + +def _no_grad_normal_(tensor, mean=0., std=1.): + with paddle.no_grad(): + tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape)) + return tensor + + +def _no_grad_fill_(tensor, value=0.): + with paddle.no_grad(): + tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype)) + return tensor + + +def uniform_(tensor, a, b): + """ + Modified tensor inspace using uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + a (float|int): min value. + b (float|int): max value. + Return: + tensor + """ + return _no_grad_uniform_(tensor, a, b) + + +def normal_(tensor, mean=0., std=1.): + """ + Modified tensor inspace using normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mean (float|int): mean value. + std (float|int): std value. + Return: + tensor + """ + return _no_grad_normal_(tensor, mean, std) + + +def constant_(tensor, value=0.): + """ + Modified tensor inspace using constant_ + Args: + tensor (paddle.Tensor): paddle Tensor + value (float|int): value to fill tensor. + Return: + tensor + """ + return _no_grad_fill_(tensor, value) + + +def ones_(tensor): + """ + Modified tensor inspace using ones_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 1) + + +def zeros_(tensor): + """ + Modified tensor inspace using zeros_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 0) + + +def _calculate_fan_in_and_fan_out(tensor, reverse=False): + """ + Calculate (fan_in, _fan_out) for tensor + + Args: + tensor (Tensor): paddle.Tensor + reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True + + Return: + Tuple[fan_in, fan_out] + """ + if tensor.ndim < 2: + raise ValueError( + "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" + ) + + if reverse: + num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] + else: + num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0] + + receptive_field_size = 1 + if tensor.ndim > 2: + receptive_field_size = np.prod(tensor.shape[2:]) + + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def xavier_uniform_(tensor, gain=1., reverse=False): + """ + Modified tensor inspace using xavier_uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+    Return:
+        tensor
+    """
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
+    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+    k = math.sqrt(3.0) * std
+    return _no_grad_uniform_(tensor, -k, k)
+
+
+def xavier_normal_(tensor, gain=1., reverse=False):
+    """
+    Modify tensor in place using xavier_normal_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        gain (float): scaling factor, 1.0 by default.
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...].
+    Return:
+        tensor
+    """
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
+    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+    return _no_grad_normal_(tensor, 0, std)
+
+
+# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html
+def _calculate_correct_fan(tensor, mode, reverse=False):
+    mode = mode.lower()
+    valid_modes = ['fan_in', 'fan_out']
+    if mode not in valid_modes:
+        raise ValueError("Mode {} not supported, please use one of {}".format(
+            mode, valid_modes))
+
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse)
+
+    return fan_in if mode == 'fan_in' else fan_out
+
+
+def _calculate_gain(nonlinearity, param=None):
+    linear_fns = [
+        'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',
+        'conv_transpose2d', 'conv_transpose3d'
+    ]
+    if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
+        return 1
+    elif nonlinearity == 'tanh':
+        return 5.0 / 3
+    elif nonlinearity == 'relu':
+        return math.sqrt(2.0)
+    elif nonlinearity == 'leaky_relu':
+        if param is None:
+            negative_slope = 0.01
+        elif not isinstance(param, bool) and isinstance(
+                param, int) or isinstance(param, float):
+            # True/False are instances of int, hence check above
+            negative_slope = param
+        else:
+            raise ValueError("negative_slope {} not a valid number".format(
+                param))
+        return math.sqrt(2.0 / (1 + negative_slope**2))
+    elif nonlinearity == 'selu':
+        return 3.0 / 4
+    else:
+        raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
+
+
+def kaiming_uniform_(tensor,
+                     a=0,
+                     mode='fan_in',
+                     nonlinearity='leaky_relu',
+                     reverse=False):
+    """
+    Modify tensor in place using the kaiming_uniform method
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        mode (str): ['fan_in', 'fan_out'], 'fan_in' by default
+        nonlinearity (str): nonlinearity method name
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...].
+    Return:
+        tensor
+    """
+    fan = _calculate_correct_fan(tensor, mode, reverse)
+    gain = _calculate_gain(nonlinearity, a)
+    std = gain / math.sqrt(fan)
+    k = math.sqrt(3.0) * std
+    return _no_grad_uniform_(tensor, -k, k)
+
+
+def kaiming_normal_(tensor,
+                    a=0,
+                    mode='fan_in',
+                    nonlinearity='leaky_relu',
+                    reverse=False):
+    """
+    Modify tensor in place using kaiming_normal_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        mode (str): ['fan_in', 'fan_out'], 'fan_in' by default
+        nonlinearity (str): nonlinearity method name
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...].
+    Return:
+        tensor
+    """
+    fan = _calculate_correct_fan(tensor, mode, reverse)
+    gain = _calculate_gain(nonlinearity, a)
+    std = gain / math.sqrt(fan)
+    return _no_grad_normal_(tensor, 0, std)
+
+
+def linear_init_(module):
+    bound = 1 / math.sqrt(module.weight.shape[0])
+    uniform_(module.weight, -bound, bound)
+    uniform_(module.bias, -bound, bound)
+
+
+def conv_init_(module):
+    bound = 1 / np.sqrt(np.prod(module.weight.shape[1:]))
+    uniform_(module.weight, -bound, bound)
+    if module.bias is not None:
+        uniform_(module.bias, -bound, bound)
+
+
+def bias_init_with_prob(prior_prob=0.01):
+    """initialize conv/fc bias value according to a given probability value."""
+    bias_init = float(-np.log((1 - prior_prob) / prior_prob))
+    return bias_init
+
+
+@paddle.no_grad()
+def reset_initialized_parameter(model, include_self=True):
+    """
+    Reset initialized parameters using the schemes above for [conv, linear, embedding, bn]
+
+    Args:
+        model (paddle.Layer): paddle Layer
+        include_self (bool): passed to Layer.named_sublayers; indicates whether
+            the layer itself is included. True by default.
+    Return:
+        None
+    """
+    for _, m in model.named_sublayers(include_self=include_self):
+        if isinstance(m, nn.Conv2D):
+            k = float(m._groups) / (m._in_channels * m._kernel_size[0] *
+                                    m._kernel_size[1])
+            k = math.sqrt(k)
+            _no_grad_uniform_(m.weight, -k, k)
+            if hasattr(m, 'bias') and getattr(m, 'bias') is not None:
+                _no_grad_uniform_(m.bias, -k, k)
+
+        elif isinstance(m, nn.Linear):
+            k = math.sqrt(1. / m.weight.shape[0])
+            _no_grad_uniform_(m.weight, -k, k)
+            if hasattr(m, 'bias') and getattr(m, 'bias') is not None:
+                _no_grad_uniform_(m.bias, -k, k)
+
+        elif isinstance(m, nn.Embedding):
+            _no_grad_normal_(m.weight, mean=0., std=1.)
+
+        elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)):
+            _no_grad_fill_(m.weight, 1.)
+            if hasattr(m, 'bias') and getattr(m, 'bias') is not None:
+                _no_grad_fill_(m.bias, 0)
diff --git a/example/low_precision_training/ppcls/utils/logger.py b/example/low_precision_training/ppcls/utils/logger.py
new file mode 100755
index 000000000..b8d1ebaf8
--- /dev/null
+++ b/example/low_precision_training/ppcls/utils/logger.py
@@ -0,0 +1,173 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import datetime
+import logging
+import os
+import sys
+
+import paddle.distributed as dist
+
+_logger = None
+
+
+class LoggerHook(object):
+    """
+    Logs are printed multiple times when calling Fleet APIs.
+    Commonly, only the log of rank 0 needs to be displayed; the others are ignored.
+    """
+    block = False
+
+    def __init__(self, log):
+        self.log = log
+
+    def __call__(self, *args, **kwargs):
+        if not self.block:
+            self.log(*args, **kwargs)
+
+
+def init_logger(name='ppcls',
+                log_file=None,
+                log_level=logging.INFO,
+                log_ranks="0"):
+    """Initialize and get a logger by name.
+    If the logger has not been initialized, this method will initialize the
+    logger by adding one or two handlers; otherwise the initialized logger will
+    be directly returned.
+    During initialization, a StreamHandler will always be added. If
+    `log_file` is specified, a FileHandler will also be added.
+    Args:
+        name (str): Logger name.
+        log_file (str | None): The log filename. If specified, a FileHandler
+            will be added to the logger.
+        log_level (int): The logger level. Note that only the process of
+            rank 0 is affected, and other processes will set the level to
+            "Error" and thus be silent most of the time.
+        log_ranks (str): The ids of the GPUs to log, separated by "," when
+            there is more than one, "0" by default.
+    Returns:
+        logging.Logger: The expected logger.
+    """
+    global _logger
+
+    # avoid the multiple-initialization issue when using paddleclas.py and engine.Engine
+    init_flag = False
+    if _logger is None:
+        _logger = logging.getLogger(name)
+        init_flag = True
+
+    formatter = logging.Formatter(
+        '[%(asctime)s] %(name)s %(levelname)s: %(message)s',
+        datefmt="%Y/%m/%d %H:%M:%S")
+
+    stream_handler = logging.StreamHandler(stream=sys.stdout)
+    stream_handler.setFormatter(formatter)
+    stream_handler._name = 'stream_handler'
+
+    # add stream_handler when _logger does not contain a stream_handler
+    for i, h in enumerate(_logger.handlers):
+        if h.get_name() == stream_handler.get_name():
+            break
+        if i == len(_logger.handlers) - 1:
+            _logger.addHandler(stream_handler)
+    if init_flag:
+        _logger.addHandler(stream_handler)
+
+    if log_file is not None and dist.get_rank() == 0:
+        log_file_folder = os.path.split(log_file)[0]
+        os.makedirs(log_file_folder, exist_ok=True)
+        file_handler = logging.FileHandler(log_file, 'a')
+        file_handler.setFormatter(formatter)
+        file_handler._name = 'file_handler'
+
+        # add file_handler when _logger does not contain the same file_handler
+        for i, h in enumerate(_logger.handlers):
+            if h.get_name() == file_handler.get_name() and \
+                    h.baseFilename == file_handler.baseFilename:
+                break
+            if i == len(_logger.handlers) - 1:
+                _logger.addHandler(file_handler)
+
+    if isinstance(log_ranks, str):
+        log_ranks = [int(i) for i in log_ranks.split(',')]
+    elif isinstance(log_ranks, int):
+        log_ranks = [log_ranks]
+    if dist.get_rank() in log_ranks:
+        _logger.setLevel(log_level)
+        LoggerHook.block = False
+    else:
+        _logger.setLevel(logging.ERROR)
+        LoggerHook.block = True
+    _logger.propagate = False
+
+
+@LoggerHook
+def info(fmt, *args):
+    _logger.info(fmt, *args)
+
+
+@LoggerHook
+def debug(fmt, *args):
+    _logger.debug(fmt, *args)
+
+
+@LoggerHook
+def warning(fmt, *args):
+    _logger.warning(fmt, *args)
+
+
+@LoggerHook
+def error(fmt, *args):
+    _logger.error(fmt, *args)
+
+
+def scaler(name, value, step, writer):
+    """
+    This function will draw a scalar curve generated by visualdl.
+    Usage: install visualdl first: pip3 install visualdl==2.0.0b4
+    and then:
+    visualdl --logdir ./scalar --host 0.0.0.0 --port 8830
+    to preview the loss curve in real time.
+    """
+    if writer is None:
+        return
+    writer.add_scalar(tag=name, step=step, value=value)
+
+
+def advertise():
+    """
+    Show the advertising message like the following:
+
+    ===========================================================
+    ==      PaddleClas is powered by PaddlePaddle !          ==
+    ===========================================================
+    ==                                                       ==
+    ==   For more info please go to the following website.   ==
+    ==                                                       ==
+    ==       https://github.com/PaddlePaddle/PaddleClas      ==
+    ===========================================================
+
+    """
+    copyright = "PaddleClas is powered by PaddlePaddle !"
+    ad = "For more info please go to the following website."
+ website = "https://github.com/PaddlePaddle/PaddleClas" + AD_LEN = 6 + len(max([copyright, ad, website], key=len)) + + info("\n{0}\n{1}\n{2}\n{3}\n{4}\n{5}\n{6}\n{7}\n".format( + "=" * (AD_LEN + 4), + "=={}==".format(copyright.center(AD_LEN)), + "=" * (AD_LEN + 4), + "=={}==".format(' ' * AD_LEN), + "=={}==".format(ad.center(AD_LEN)), + "=={}==".format(' ' * AD_LEN), + "=={}==".format(website.center(AD_LEN)), + "=" * (AD_LEN + 4), )) diff --git a/example/low_precision_training/ppcls/utils/metrics.py b/example/low_precision_training/ppcls/utils/metrics.py new file mode 100755 index 000000000..b0db68a75 --- /dev/null +++ b/example/low_precision_training/ppcls/utils/metrics.py @@ -0,0 +1,107 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from sklearn.metrics import hamming_loss +from sklearn.metrics import accuracy_score as accuracy_metric +from sklearn.metrics import multilabel_confusion_matrix +from sklearn.metrics import precision_recall_fscore_support +from sklearn.metrics import average_precision_score +from sklearn.preprocessing import binarize + +import numpy as np + +__all__ = ["multi_hot_encode", "hamming_distance", "accuracy_score", "precision_recall_fscore", "mean_average_precision"] + + +def multi_hot_encode(logits, threshold=0.5): + """ + Encode logits to multi-hot by elementwise for multilabel + """ + + return binarize(logits, threshold=threshold) + + +def hamming_distance(output, target): + """ + Soft metric based label for multilabel classification + Returns: + The smaller the return value is, the better model is. + """ + + return hamming_loss(target, output) + + +def accuracy_score(output, target, base="sample"): + """ + Hard metric for multilabel classification + Args: + output: + target: + base: ["sample", "label"], default="sample" + if "sample", return metric score based sample, + if "label", return metric score based label. 
+ Returns: + accuracy: + """ + + assert base in ["sample", "label"], 'must be one of ["sample", "label"]' + + if base == "sample": + accuracy = accuracy_metric(target, output) + elif base == "label": + mcm = multilabel_confusion_matrix(target, output) + tns = mcm[:, 0, 0] + fns = mcm[:, 1, 0] + tps = mcm[:, 1, 1] + fps = mcm[:, 0, 1] + + accuracy = (sum(tps) + sum(tns)) / (sum(tps) + sum(tns) + sum(fns) + sum(fps)) + + return accuracy + + +def precision_recall_fscore(output, target): + """ + Metric based label for multilabel classification + Returns: + precisions: + recalls: + fscores: + """ + + precisions, recalls, fscores, _ = precision_recall_fscore_support(target, output) + + return precisions, recalls, fscores + + +def mean_average_precision(logits, target): + """ + Calculate average precision + Args: + logits: probability from network before sigmoid or softmax + target: ground truth, 0 or 1 + """ + if not (isinstance(logits, np.ndarray) and isinstance(target, np.ndarray)): + raise TypeError("logits and target should be np.ndarray.") + + aps = [] + for i in range(target.shape[1]): + ap = average_precision_score(target[:, i], logits[:, i]) + aps.append(ap) + + return np.mean(aps) diff --git a/example/low_precision_training/ppcls/utils/misc.py b/example/low_precision_training/ppcls/utils/misc.py new file mode 100755 index 000000000..b63da7c5f --- /dev/null +++ b/example/low_precision_training/ppcls/utils/misc.py @@ -0,0 +1,155 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
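+
+# Usage sketch for the meters below (illustrative only; `loader`,
+# `batch_size` and `cost_of_batch` are hypothetical):
+#
+#     cost = AverageMeter('batch_cost', '.5f', postfix=" s")
+#     for batch in loader:
+#         cost.update(cost_of_batch, n=batch_size)
+#     print(cost.mean)   # e.g. "batch_cost: 0.01234 s"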
+ +import paddle + +__all__ = ['AverageMeter'] + + +class AverageMeter(object): + """ + Computes and stores the average and current value + Code was based on https://github.com/pytorch/examples/blob/master/imagenet/main.py + """ + + def __init__(self, name='', fmt='f', postfix="", need_avg=True): + self.name = name + self.fmt = fmt + self.postfix = postfix + self.need_avg = need_avg + self.reset() + + def reset(self): + """ reset """ + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + """ update """ + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + @property + def avg_info(self): + if isinstance(self.avg, paddle.Tensor): + self.avg = float(self.avg) + return "{}: {:.5f}".format(self.name, self.avg) + + @property + def total(self): + return '{self.name}_sum: {self.sum:{self.fmt}}{self.postfix}'.format( + self=self) + + @property + def total_minute(self): + return '{self.name} {s:{self.fmt}}{self.postfix} min'.format( + s=self.sum / 60, self=self) + + @property + def mean(self): + return '{self.name}: {self.avg:{self.fmt}}{self.postfix}'.format( + self=self) if self.need_avg else '' + + @property + def value(self): + return '{self.name}: {self.val:{self.fmt}}{self.postfix}'.format( + self=self) + + +class AttrMeter(object): + """ + Computes and stores the average and current value + Code was based on https://github.com/pytorch/examples/blob/master/imagenet/main.py + """ + + def __init__(self, threshold=0.5): + self.threshold = threshold + self.reset() + + def reset(self): + self.gt_pos = 0 + self.gt_neg = 0 + self.true_pos = 0 + self.true_neg = 0 + self.false_pos = 0 + self.false_neg = 0 + + self.gt_pos_ins = [] + self.true_pos_ins = [] + self.intersect_pos = [] + self.union_pos = [] + + def update(self, metric_dict): + self.gt_pos += metric_dict['gt_pos'] + self.gt_neg += metric_dict['gt_neg'] + self.true_pos += metric_dict['true_pos'] + self.true_neg += metric_dict['true_neg'] + self.false_pos += metric_dict['false_pos'] + self.false_neg += metric_dict['false_neg'] + + self.gt_pos_ins += metric_dict['gt_pos_ins'].tolist() + self.true_pos_ins += metric_dict['true_pos_ins'].tolist() + self.intersect_pos += metric_dict['intersect_pos'].tolist() + self.union_pos += metric_dict['union_pos'].tolist() + + def res(self): + import numpy as np + eps = 1e-20 + label_pos_recall = 1.0 * self.true_pos / ( + self.gt_pos + eps) # true positive + label_neg_recall = 1.0 * self.true_neg / ( + self.gt_neg + eps) # true negative + # mean accuracy + label_ma = (label_pos_recall + label_neg_recall) / 2 + + label_pos_recall = np.mean(label_pos_recall) + label_neg_recall = np.mean(label_neg_recall) + label_prec = (self.true_pos / (self.true_pos + self.false_pos + eps)) + label_acc = (self.true_pos / + (self.true_pos + self.false_pos + self.false_neg + eps)) + label_f1 = np.mean(2 * label_prec * label_pos_recall / + (label_prec + label_pos_recall + eps)) + + ma = (np.mean(label_ma)) + + self.gt_pos_ins = np.array(self.gt_pos_ins) + self.true_pos_ins = np.array(self.true_pos_ins) + self.intersect_pos = np.array(self.intersect_pos) + self.union_pos = np.array(self.union_pos) + instance_acc = self.intersect_pos / (self.union_pos + eps) + instance_prec = self.intersect_pos / (self.true_pos_ins + eps) + instance_recall = self.intersect_pos / (self.gt_pos_ins + eps) + instance_f1 = 2 * instance_prec * instance_recall / ( + instance_prec + instance_recall + eps) + + instance_acc = np.mean(instance_acc) + instance_prec = 
np.mean(instance_prec)
+        instance_recall = np.mean(instance_recall)
+        instance_f1 = np.mean(instance_f1)
+
+        res = [
+            ma, label_f1, label_pos_recall, label_neg_recall, instance_f1,
+            instance_acc, instance_prec, instance_recall
+        ]
+        return res
diff --git a/example/low_precision_training/ppcls/utils/model_zoo.py b/example/low_precision_training/ppcls/utils/model_zoo.py
new file mode 100755
index 000000000..e9ab5992d
--- /dev/null
+++ b/example/low_precision_training/ppcls/utils/model_zoo.py
@@ -0,0 +1,213 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import requests
+import shutil
+import tarfile
+import tqdm
+import zipfile
+
+from ..arch.utils import similar_architectures
+from . import logger
+
+__all__ = ['get']
+
+DOWNLOAD_RETRY_LIMIT = 3
+
+
+class UrlError(Exception):
+    """ UrlError
+    """
+
+    def __init__(self, url='', code=''):
+        message = "Downloading from {} failed with code {}!".format(url, code)
+        super(UrlError, self).__init__(message)
+
+
+class ModelNameError(Exception):
+    """ ModelNameError
+    """
+
+    def __init__(self, message=''):
+        super(ModelNameError, self).__init__(message)
+
+
+class RetryError(Exception):
+    """ RetryError
+    """
+
+    def __init__(self, url='', times=''):
+        message = "Download from {} failed. Retry({}) limit reached".format(
+            url, times)
+        super(RetryError, self).__init__(message)
+
+
+def _get_url(architecture, postfix="pdparams"):
+    prefix = "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/"
+    fname = architecture + "_pretrained." + postfix
+    return prefix + fname
+
+
+def _move_and_merge_tree(src, dst):
+    """
+    Move the src directory to dst; if dst already exists,
+    merge src into dst.
+    """
+    if not os.path.exists(dst):
+        shutil.move(src, dst)
+    elif os.path.isfile(src):
+        shutil.move(src, dst)
+    else:
+        for fp in os.listdir(src):
+            src_fp = os.path.join(src, fp)
+            dst_fp = os.path.join(dst, fp)
+            if os.path.isdir(src_fp):
+                if os.path.isdir(dst_fp):
+                    _move_and_merge_tree(src_fp, dst_fp)
+                else:
+                    shutil.move(src_fp, dst_fp)
+            elif os.path.isfile(src_fp) and \
+                    not os.path.isfile(dst_fp):
+                shutil.move(src_fp, dst_fp)
+
+
+def _download(url, path):
+    """
+    Download from url, save to path.
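+    Retries up to DOWNLOAD_RETRY_LIMIT times before raising RetryError;
+    a non-200 response raises UrlError.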
+    Args:
+        url (str): the download url.
+        path (str): the directory the file is saved to.
+    """
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+    fname = os.path.split(url)[-1]
+    fullname = os.path.join(path, fname)
+    retry_cnt = 0
+
+    while not os.path.exists(fullname):
+        if retry_cnt < DOWNLOAD_RETRY_LIMIT:
+            retry_cnt += 1
+        else:
+            raise RetryError(url, DOWNLOAD_RETRY_LIMIT)
+
+        logger.info("Downloading {} from {}".format(fname, url))
+
+        req = requests.get(url, stream=True)
+        if req.status_code != 200:
+            raise UrlError(url, req.status_code)
+
+        # To guard against an interrupted download, download to
+        # tmp_fullname first, then move tmp_fullname to fullname
+        # once the download has finished.
+        tmp_fullname = fullname + "_tmp"
+        total_size = req.headers.get('content-length')
+        with open(tmp_fullname, 'wb') as f:
+            if total_size:
+                for chunk in tqdm.tqdm(
+                        req.iter_content(chunk_size=1024),
+                        total=(int(total_size) + 1023) // 1024,
+                        unit='KB'):
+                    f.write(chunk)
+            else:
+                for chunk in req.iter_content(chunk_size=1024):
+                    if chunk:
+                        f.write(chunk)
+        shutil.move(tmp_fullname, fullname)
+
+    return fullname
+
+
+def _decompress(fname):
+    """
+    Decompress zip and tar files
+    """
+    logger.info("Decompressing {}...".format(fname))
+
+    # To guard against an interrupted decompression, decompress into
+    # the fpath_tmp directory first; if decompression succeeds, move
+    # the decompressed files to fpath, then delete fpath_tmp and
+    # remove the downloaded archive.
+    fpath = os.path.split(fname)[0]
+    fpath_tmp = os.path.join(fpath, 'tmp')
+    if os.path.isdir(fpath_tmp):
+        shutil.rmtree(fpath_tmp)
+    os.makedirs(fpath_tmp)
+
+    if fname.find('tar') >= 0:
+        with tarfile.open(fname) as tf:
+            tf.extractall(path=fpath_tmp)
+    elif fname.find('zip') >= 0:
+        with zipfile.ZipFile(fname) as zf:
+            zf.extractall(path=fpath_tmp)
+    else:
+        raise TypeError("Unsupported compressed file type {}".format(fname))
+
+    fs = os.listdir(fpath_tmp)
+    assert len(
+        fs
+    ) == 1, "There should be exactly 1 pretrained path in an archive file but got {}.".format(
+        len(fs))
+
+    f = fs[0]
+    src_dir = os.path.join(fpath_tmp, f)
+    dst_dir = os.path.join(fpath, f)
+    _move_and_merge_tree(src_dir, dst_dir)
+
+    shutil.rmtree(fpath_tmp)
+    os.remove(fname)
+
+    return f
+
+
+def _get_pretrained():
+    with open('./ppcls/utils/pretrained.list') as flist:
+        pretrained = [line.strip() for line in flist]
+    return pretrained
+
+
+def _check_pretrained_name(architecture):
+    assert isinstance(architecture, str), \
+        ("the type of architecture({}) should be str".format(architecture))
+    pretrained = _get_pretrained()
+    similar_names = similar_architectures(architecture, pretrained)
+    model_list = ', '.join(similar_names)
+    err = "{} does not exist! Maybe you want: [{}]" \
+          "".format(architecture, model_list)
+    if architecture not in similar_names:
+        raise ModelNameError(err)
+
+
+def list_models():
+    pretrained = _get_pretrained()
+    msg = "All available pretrained models are as follows: {}".format(
+        pretrained)
+    logger.info(msg)
+    return
+
+
+def get(architecture, path, decompress=False, postfix="pdparams"):
+    """
+    Get the pretrained model.
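+    Args:
+        architecture (str): the name of a model in pretrained.list.
+        path (str): the directory the weights are downloaded to.
+        decompress (bool): whether to decompress a downloaded "tar" archive.
+        postfix (str): the weight-file suffix, "pdparams" by default.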
+ """ + _check_pretrained_name(architecture) + url = _get_url(architecture, postfix=postfix) + fname = _download(url, path) + if postfix == "tar" and decompress: + _decompress(fname) + logger.info("download {} finished ".format(fname)) diff --git a/example/low_precision_training/ppcls/utils/pedestrian_attribute_label_list.txt b/example/low_precision_training/ppcls/utils/pedestrian_attribute_label_list.txt new file mode 100755 index 000000000..af6e0df1c --- /dev/null +++ b/example/low_precision_training/ppcls/utils/pedestrian_attribute_label_list.txt @@ -0,0 +1,26 @@ +0 Hat(帽子) +1 Glasses(眼镜) +2 ShortSleeve(短袖) +3 LongSleeve(长袖) +4 UpperStride(上衣条纹) +5 UpperLogo(上衣有标志) +6 UpperPlaid(上衣格子) +7 UpperSplice(上衣拼接) +8 LowerStripe(裤子条纹) +9 LowerPattern(裤子图案) +10 LongCoat(长外套) +11 Trousers(长裤) +12 Shorts(短裤) +13 Skirt&Dress(裙子或连衣裙) +14 Boots(靴子) +15 HandBag(手提包) +16 ShoulderBag(单肩包) +17 Backpack(背包) +18 HoldObjectsInFront(手持物品在前) +19 AgeLess18(年龄小于18岁) +20 Age18-60(年龄在18-60岁之间) +21 AgeOver60(年龄大于60岁) +22 Female(女性) +23 Front(面朝前) +24 Side(侧面) +25 Back(背面) diff --git a/example/low_precision_training/ppcls/utils/pretrained.list b/example/low_precision_training/ppcls/utils/pretrained.list new file mode 100755 index 000000000..36d70f5a2 --- /dev/null +++ b/example/low_precision_training/ppcls/utils/pretrained.list @@ -0,0 +1,121 @@ +ResNet18 +ResNet34 +ResNet50 +ResNet101 +ResNet152 +ResNet50_vc +ResNet18_vd +ResNet34_vd +ResNet50_vd +ResNet50_vd_v2 +ResNet101_vd +ResNet152_vd +ResNet200_vd +ResNet50_vd_ssld +ResNet50_vd_ssld_v2 +Fix_ResNet50_vd_ssld_v2 +ResNet101_vd_ssld +MobileNetV3_large_x0_35 +MobileNetV3_large_x0_5 +MobileNetV3_large_x0_75 +MobileNetV3_large_x1_0 +MobileNetV3_large_x1_25 +MobileNetV3_small_x0_35 +MobileNetV3_small_x0_5 +MobileNetV3_small_x0_75 +MobileNetV3_small_x1_0 +MobileNetV3_small_x1_25 +MobileNetV3_large_x1_0_ssld +MobileNetV3_large_x1_0_ssld_int8 +MobileNetV3_small_x1_0_ssld +MobileNetV2_x0_25 +MobileNetV2_x0_5 +MobileNetV2_x0_75 +MobileNetV2 +MobileNetV2_x1_5 +MobileNetV2_x2_0 +MobileNetV2_ssld +MobileNetV1_x0_25 +MobileNetV1_x0_5 +MobileNetV1_x0_75 +MobileNetV1 +MobileNetV1_ssld +ShuffleNetV2_x0_25 +ShuffleNetV2_x0_33 +ShuffleNetV2_x0_5 +ShuffleNetV2 +ShuffleNetV2_x1_5 +ShuffleNetV2_x2_0 +ShuffleNetV2_swish +ResNeXt50_32x4d +ResNeXt50_64x4d +ResNeXt101_32x4d +ResNeXt101_64x4d +ResNeXt152_32x4d +ResNeXt152_64x4d +ResNeXt50_vd_32x4d +ResNeXt50_vd_64x4d +ResNeXt101_vd_32x4d +ResNeXt101_vd_64x4d +ResNeXt152_vd_32x4d +ResNeXt152_vd_64x4d +SE_ResNet18_vd +SE_ResNet34_vd +SE_ResNet50_vd +SE_ResNeXt50_32x4d +SE_ResNeXt101_32x4d +SE_ResNeXt50_vd_32x4d +SENet154_vd +Res2Net50_26w_4s +Res2Net50_vd_26w_4s +Res2Net50_14w_8s +Res2Net101_vd_26w_4s +Res2Net200_vd_26w_4s +GoogLeNet +InceptionV4 +Xception41 +Xception41_deeplab +Xception65 +Xception65_deeplab +Xception71 +HRNet_W18_C +HRNet_W30_C +HRNet_W32_C +HRNet_W40_C +HRNet_W44_C +HRNet_W48_C +HRNet_W64_C +DPN68 +DPN92 +DPN98 +DPN107 +DPN131 +DenseNet121 +DenseNet161 +DenseNet169 +DenseNet201 +DenseNet264 +EfficientNetB0_small +EfficientNetB0 +EfficientNetB1 +EfficientNetB2 +EfficientNetB3 +EfficientNetB4 +EfficientNetB5 +EfficientNetB6 +EfficientNetB7 +ResNeXt101_32x8d_wsl +ResNeXt101_32x16d_wsl +ResNeXt101_32x32d_wsl +ResNeXt101_32x48d_wsl +Fix_ResNeXt101_32x48d_wsl +AlexNet +SqueezeNet1_0 +SqueezeNet1_1 +VGG11 +VGG13 +VGG16 +VGG19 +DarkNet53_ImageNet1k +ResNet50_ACNet_deploy +CSPResNet50_leaky diff --git a/example/low_precision_training/ppcls/utils/profiler.py b/example/low_precision_training/ppcls/utils/profiler.py new file mode 
100755
index 000000000..28ac46736
--- /dev/null
+++ b/example/low_precision_training/ppcls/utils/profiler.py
@@ -0,0 +1,129 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import paddle
+import paddle.profiler as profiler
+
+# A global variable to record the number of calling times for profiler
+# functions. It is used to specify the tracing range of training steps.
+_profiler_step_id = 0
+
+# A global variable to avoid parsing from string every time.
+_profiler_options = None
+_prof = None
+
+
+class ProfilerOptions(object):
+    '''
+    Use a string to initialize a ProfilerOptions.
+    The string should be in the format: "key1=value1;key2=value2;key3=value3".
+    For example:
+      "profile_path=model.profile"
+      "batch_range=[50, 60]; profile_path=model.profile"
+      "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
+
+    ProfilerOptions supports the following key-value pairs:
+      batch_range      - an integer list, e.g. [100, 110].
+      state            - a string, the optional values are 'CPU', 'GPU' or 'All'.
+      sorted_key       - a string, the optional values are 'calls', 'total',
+                         'max', 'min' or 'ave'.
+      tracer_option    - a string, the optional values are 'Default', 'OpDetail',
+                         'AllOpDetail'.
+      profile_path     - a string, the path to save the serialized profile data,
+                         which can be used to generate a timeline.
+      exit_on_finished - a boolean.
+    '''

+    def __init__(self, options_str):
+        assert isinstance(options_str, str)
+
+        self._options = {
+            'batch_range': [10, 20],
+            'state': 'All',
+            'sorted_key': 'total',
+            'tracer_option': 'Default',
+            'profile_path': '/tmp/profile',
+            'exit_on_finished': True,
+            'timer_only': True
+        }
+        self._parse_from_string(options_str)
+
+    def _parse_from_string(self, options_str):
+        for kv in options_str.replace(' ', '').split(';'):
+            key, value = kv.split('=')
+            if key == 'batch_range':
+                value_list = value.replace('[', '').replace(']', '').split(',')
+                value_list = list(map(int, value_list))
+                if len(value_list) >= 2 and value_list[0] >= 0 and value_list[
+                        1] > value_list[0]:
+                    self._options[key] = value_list
+            elif key == 'exit_on_finished':
+                self._options[key] = value.lower() in ("yes", "true", "t", "1")
+            elif key in [
+                    'state', 'sorted_key', 'tracer_option', 'profile_path'
+            ]:
+                self._options[key] = value
+            elif key == 'timer_only':
+                self._options[key] = value
+
+    def __getitem__(self, name):
+        if self._options.get(name, None) is None:
+            raise ValueError(
+                "ProfilerOptions does not have an option named %s." % name)
+        return self._options[name]
+
+
+def add_profiler_step(options_str=None):
+    '''
+    Enable the operator-level timing using PaddlePaddle's profiler.
+    The profiler uses an independent variable to count the profiler steps.
+    One call of this function is treated as a profiler step.
+    Args:
+        options_str - a string used to initialize the ProfilerOptions.
+            Default is None, meaning the profiler is disabled.
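+    Example (illustrative; `loader` is a hypothetical data loader, and the
+    option string follows the ProfilerOptions format above):
+        for batch_id, batch in enumerate(loader):
+            add_profiler_step("batch_range=[10, 20]; profile_path=model.profile")
+            ...  # forward / backward / optimizer step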
+ ''' + if options_str is None: + return + + global _prof + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + # profile : https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/profiling_model.html#chakanxingnengshujudetongjibiaodan + # timer_only = True only the model's throughput and time overhead are displayed + # timer_only = False calling summary can print a statistical form that presents performance data from different perspectives. + # timer_only = False the output Timeline information can be found in the profiler_log directory + if _prof is None: + _timer_only = str(_profiler_options['timer_only']) == str(True) + _prof = profiler.Profiler( + scheduler = (_profiler_options['batch_range'][0], _profiler_options['batch_range'][1]), + on_trace_ready = profiler.export_chrome_tracing('./profiler_log'), + timer_only = _timer_only) + _prof.start() + else: + _prof.step() + + if _profiler_step_id == _profiler_options['batch_range'][1]: + _prof.stop() + _prof.summary( + op_detail=True, + thread_sep=False, + time_unit='ms') + _prof = None + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/example/low_precision_training/ppcls/utils/save_load.py b/example/low_precision_training/ppcls/utils/save_load.py new file mode 100755 index 000000000..13ccab2ad --- /dev/null +++ b/example/low_precision_training/ppcls/utils/save_load.py @@ -0,0 +1,225 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import errno +import os +import json + +import paddle +from . 
import logger
+from .download import get_weights_path_from_url
+
+__all__ = ['init_model', 'save_model', 'load_dygraph_pretrain']
+
+
+def _mkdir_if_not_exist(path):
+    """
+    Make a directory if it does not exist, ignoring the exception raised
+    when multiple processes try to create it at the same time.
+    """
+    if not os.path.exists(path):
+        try:
+            os.makedirs(path)
+        except OSError as e:
+            if e.errno == errno.EEXIST and os.path.isdir(path):
+                logger.warning(
+                    '{} has already been created by another process'.format(
+                        path))
+            else:
+                raise OSError('Failed to mkdir {}'.format(path))
+
+
+def _extract_student_weights(all_params, student_prefix="Student."):
+    s_params = {
+        key[len(student_prefix):]: all_params[key]
+        for key in all_params if student_prefix in key
+    }
+    return s_params
+
+
+def _set_ssld_pretrained(pretrained_path,
+                         use_ssld=False,
+                         use_ssld_stage1_pretrained=False):
+    if use_ssld and "ssld" not in pretrained_path:
+        pretrained_path = pretrained_path.replace("_pretrained",
+                                                  "_ssld_pretrained")
+    if use_ssld_stage1_pretrained and "ssld" in pretrained_path:
+        pretrained_path = pretrained_path.replace("ssld_pretrained",
+                                                  "ssld_stage1_pretrained")
+    return pretrained_path
+
+
+def load_dygraph_pretrain(model,
+                          pretrained_path,
+                          use_ssld=False,
+                          use_ssld_stage1_pretrained=False,
+                          use_imagenet22k_pretrained=False,
+                          use_imagenet22kto1k_pretrained=False):
+    if pretrained_path.startswith(("http://", "https://")):
+        pretrained_path = _set_ssld_pretrained(
+            pretrained_path,
+            use_ssld=use_ssld,
+            use_ssld_stage1_pretrained=use_ssld_stage1_pretrained)
+        if use_imagenet22k_pretrained:
+            pretrained_path = pretrained_path.replace("_pretrained",
+                                                      "_22k_pretrained")
+        if use_imagenet22kto1k_pretrained:
+            pretrained_path = pretrained_path.replace("_pretrained",
+                                                      "_22kto1k_pretrained")
+        pretrained_path = get_weights_path_from_url(pretrained_path)
+    if not pretrained_path.endswith('.pdparams'):
+        pretrained_path = pretrained_path + '.pdparams'
+    if not os.path.exists(pretrained_path):
+        raise ValueError("Model pretrain path {} does not "
+                         "exist.".format(pretrained_path))
+    param_state_dict = paddle.load(pretrained_path)
+    if isinstance(model, list):
+        for m in model:
+            if hasattr(m, 'set_dict'):
+                m.set_dict(param_state_dict)
+    else:
+        model.set_dict(param_state_dict)
+    logger.info("Finished loading pretrained model from {}".format(
+        pretrained_path))
+    return
+
+
+def load_distillation_model(model, pretrained_model):
+    logger.info("In distillation mode, the teacher model will be "
+                "loaded before the student model.")
+
+    if not isinstance(pretrained_model, list):
+        pretrained_model = [pretrained_model]
+
+    teacher = model.teacher if hasattr(model,
+                                       "teacher") else model._layers.teacher
+    student = model.student if hasattr(model,
+                                       "student") else model._layers.student
+    load_dygraph_pretrain(teacher, pretrained_model[0])
+    logger.info("Finished loading teacher model from {}".format(
+        pretrained_model))
+    # load student model
+    if len(pretrained_model) >= 2:
+        load_dygraph_pretrain(student, pretrained_model[1])
+        logger.info("Finished loading student model from {}".format(
+            pretrained_model))
+
+
+def init_model(config,
+               net,
+               optimizer=None,
+               loss: paddle.nn.Layer=None,
+               ema=None):
+    """
+    load model from checkpoint or pretrained_model
+    """
+    checkpoints = config.get('checkpoints')
+    if checkpoints and optimizer is not None:
+        assert os.path.exists(checkpoints + ".pdparams"), \
+            "Given dir {}.pdparams does not exist.".format(checkpoints)
+        assert os.path.exists(checkpoints + ".pdopt"), \
+            "Given dir {}.pdopt does not exist.".format(checkpoints)
exist.".format(checkpoints) + # load state dict + opti_dict = paddle.load(checkpoints + ".pdopt") + metric_dict = paddle.load(checkpoints + ".pdstates") + if ema is not None: + assert os.path.exists(checkpoints + ".pdema"), \ + "Given dir {}.pdema not exist.".format(checkpoints) + para_dict = paddle.load(checkpoints + ".pdema") + para_ema_dict = paddle.load(checkpoints + ".pdparams") + ema.set_state_dict(para_ema_dict) + else: + para_dict = paddle.load(checkpoints + ".pdparams") + metric_dict["metric"] = 0.0 + # set state dict + net.set_state_dict(para_dict) + loss.set_state_dict(para_dict) + for i in range(len(optimizer)): + optimizer[i].set_state_dict(opti_dict[i] if isinstance( + opti_dict, list) else opti_dict) + logger.info("Finish load checkpoints from {}".format(checkpoints)) + return metric_dict + + pretrained_model = config.get('pretrained_model') + use_distillation = config.get('use_distillation', False) + if pretrained_model: + if use_distillation: + load_distillation_model(net, pretrained_model) + else: # common load + load_dygraph_pretrain(net, path=pretrained_model) + logger.info("Finish load pretrained model from {}".format( + pretrained_model)) + + +def save_model(net, + optimizer, + metric_info, + model_path, + ema=None, + model_name="", + prefix='ppcls', + loss: paddle.nn.Layer=None, + save_student_model=False): + """ + save model to the target path + """ + if paddle.distributed.get_rank() != 0: + return + + if prefix == 'best_model': + best_model_path = os.path.join(model_path, 'best_model') + _mkdir_if_not_exist(best_model_path) + + _mkdir_if_not_exist(model_path) + model_path = os.path.join(model_path, prefix) + + params_state_dict = net.state_dict() + if loss is not None: + loss_state_dict = loss.state_dict() + keys_inter = set(params_state_dict.keys()) & set(loss_state_dict.keys()) + assert len(keys_inter) == 0, \ + f"keys in model and loss state_dict must be unique, but got intersection {keys_inter}" + params_state_dict.update(loss_state_dict) + + if save_student_model: + s_params = _extract_student_weights(params_state_dict) + if len(s_params) > 0: + paddle.save(s_params, model_path + "_student.pdparams") + if ema is not None: + paddle.save(params_state_dict, model_path + ".pdema") + paddle.save(ema.state_dict(), model_path + ".pdparams") + else: + paddle.save(params_state_dict, model_path + ".pdparams") + + if prefix == 'best_model': + best_model_path = os.path.join(best_model_path, 'model') + paddle.save(params_state_dict, best_model_path + ".pdparams") + paddle.save([opt.state_dict() for opt in optimizer], model_path + ".pdopt") + paddle.save(metric_info, model_path + ".pdstates") + logger.info("Already save model in {}".format(model_path)) + + +def save_model_info(model_info, save_path, prefix): + """ + save model info to the target path + """ + if paddle.distributed.get_rank() != 0: + return + save_path = os.path.join(save_path, prefix) + if not os.path.exists(save_path): + os.makedirs(save_path) + with open(os.path.join(save_path, f'{prefix}.info.json'), 'w') as f: + json.dump(model_info, f) + logger.info("Already save model info in {}".format(save_path)) diff --git a/example/low_precision_training/ppcls/utils/save_result.py b/example/low_precision_training/ppcls/utils/save_result.py new file mode 100755 index 000000000..f7613db36 --- /dev/null +++ b/example/low_precision_training/ppcls/utils/save_result.py @@ -0,0 +1,102 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import json +import yaml +import paddle + +from . import logger + + +def save_predict_result(save_path, result): + if os.path.splitext(save_path)[-1] == '': + if save_path[-1] == "/": + save_path = save_path[:-1] + save_path = save_path + '.json' + elif os.path.splitext(save_path)[-1] == '.json': + save_path = save_path + else: + raise Exception( + f"{save_path} is invalid input path, only files in json format are supported." + ) + + if os.path.exists(save_path): + logger.warning(f"The file {save_path} will be overwritten.") + with open(save_path, 'w', encoding='utf-8') as f: + json.dump(result, f) + + +def update_train_results(config, + prefix, + metric_info, + done_flag=False, + last_num=5, + ema=False): + + if paddle.distributed.get_rank() != 0: + return + + assert last_num >= 1 + train_results_path = os.path.join(config["Global"]["output_dir"], + "train_results.json") + save_model_tag = ["pdparams", "pdopt", "pdstates"] + save_inference_tag = [ + "inference_config", "pdmodel", "pdiparams", "pdiparams.info" + ] + if ema: + save_model_tag.append("pdema") + if os.path.exists(train_results_path): + with open(train_results_path, "r") as fp: + train_results = json.load(fp) + else: + train_results = {} + train_results["model_name"] = config["Global"].get("pdx_model_name", + None) + if config.get("infer", None): + train_results["label_dict"] = config["Infer"]["PostProcess"].get( + "class_id_map_file", "") + else: + train_results["label_dict"] = "" + train_results["train_log"] = "train.log" + train_results["visualdl_log"] = "" + train_results["config"] = "config.yaml" + train_results["models"] = {} + for i in range(1, last_num + 1): + train_results["models"][f"last_{i}"] = {} + train_results["models"]["best"] = {} + train_results["done_flag"] = done_flag + if prefix == "best_model": + train_results["models"]["best"]["score"] = metric_info["metric"] + for tag in save_model_tag: + train_results["models"]["best"][tag] = os.path.join( + prefix, f"{prefix}.{tag}") + for tag in save_inference_tag: + train_results["models"]["best"][tag] = os.path.join( + prefix, "inference", f"inference.{tag}" + if tag != "inference_config" else "inference.yml") + else: + for i in range(last_num - 1, 0, -1): + train_results["models"][f"last_{i + 1}"] = train_results["models"][ + f"last_{i}"].copy() + train_results["models"][f"last_{1}"]["score"] = metric_info["metric"] + for tag in save_model_tag: + train_results["models"][f"last_{1}"][tag] = os.path.join( + prefix, f"{prefix}.{tag}") + for tag in save_inference_tag: + train_results["models"][f"last_{1}"][tag] = os.path.join( + prefix, "inference", f"inference.{tag}" + if tag != "inference_config" else "inference.yml") + + with open(train_results_path, "w") as fp: + json.dump(train_results, fp) diff --git a/example/low_precision_training/ppcls/utils/vehicle_attribute_label_list.txt b/example/low_precision_training/ppcls/utils/vehicle_attribute_label_list.txt new file mode 100755 index 000000000..03ad382fe --- 
/dev/null +++ b/example/low_precision_training/ppcls/utils/vehicle_attribute_label_list.txt @@ -0,0 +1,19 @@ +0 yellow(黄色) +1 orange(橙色) +2 green(绿色) +3 gray(灰色) +4 red(红色) +5 blue(蓝色) +6 white(白色) +7 golden(金色) +8 brown(棕色) +9 black(黑色) +10 sedan(轿车) +11 suv(SUV) +12 van(厢式车) +13 hatchback(掀背车) +14 mpv(多用途车) +15 pickup(皮卡) +16 bus(公共汽车) +17 truck(卡车) +18 estate(旅行车) diff --git a/example/low_precision_training/qat_config/qat_a4_mse.yaml b/example/low_precision_training/qat_config/qat_a4_mse.yaml new file mode 100755 index 000000000..19c20a696 --- /dev/null +++ b/example/low_precision_training/qat_config/qat_a4_mse.yaml @@ -0,0 +1,15 @@ +quantization_manager: qat_quantization_manager + +weight_config: + qtype: "full" + granularity: "per-layer" + filters: + - + id: 0 + qtype: "full" + - + name: "fc" + qtype: "full" + +calibration_config: + step: 50 \ No newline at end of file diff --git a/example/low_precision_training/qt_config/qt_W4_mse_det__A4_mse_det__G4_minmax_sto.yaml b/example/low_precision_training/qt_config/qt_W4_mse_det__A4_mse_det__G4_minmax_sto.yaml new file mode 100755 index 000000000..7a5805a48 --- /dev/null +++ b/example/low_precision_training/qt_config/qt_W4_mse_det__A4_mse_det__G4_minmax_sto.yaml @@ -0,0 +1,71 @@ +quantization_manager: qt_quantization_manager + +weight_forward_config: + qtype: "int4" + granularity: "N" + scaling: "dynamic" + observer: "mse" + stochastic: False + filters: + - + id: 0 + qtype: "full" + - + name: "fc" + qtype: "full" + +weight_backward_config: + qtype: "int4" + granularity: "C" + scaling: "dynamic" + observer: "mse" + stochastic: False + filters: + - + id: 0 + qtype: "full" + - + name: "fc" + qtype: "full" + +input_config: + qtype: "int4" + granularity: "N" + scaling: "dynamic" + observer: "mse" + stochastic: False + filters: + - + id: 0 + qtype: "full" + - + name: "fc" + qtype: "full" + +grad_input_config: + qtype: "int4" + granularity: "NHW" + scaling: "dynamic" + observer: "minmax" + stochastic: True + filters: + - + id: 0 + qtype: "full" + - + name: "fc" + qtype: "full" + +grad_weight_config: + qtype: "int4" + granularity: "NC" + scaling: "dynamic" + observer: "minmax" + stochastic: True + filters: + - + id: 0 + qtype: "full" + - + name: "fc" + qtype: "full" diff --git a/example/low_precision_training/quantization/__init__.py b/example/low_precision_training/quantization/__init__.py new file mode 100755 index 000000000..22d10f86f --- /dev/null +++ b/example/low_precision_training/quantization/__init__.py @@ -0,0 +1,6 @@ +from .config import Config +from .managers import build_quantization_manager + +__all__ = [ + 'Config', 'build_quantization_manager' +] \ No newline at end of file diff --git a/example/low_precision_training/quantization/config.py b/example/low_precision_training/quantization/config.py new file mode 100755 index 000000000..afb5b36a1 --- /dev/null +++ b/example/low_precision_training/quantization/config.py @@ -0,0 +1,743 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
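+#
+# This Config facility follows mmcv's implementation: configs may be
+# python/json/yaml files, support "_base_" inheritance with "_delete_"
+# overrides, and "{{ fileDirname }}"-style predefined variables (see the
+# helpers below).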
+import ast +import copy +import os +import os.path as osp +import platform +import shutil +import sys +import tempfile +import types +import uuid +import warnings +from argparse import Action, ArgumentParser +from collections import abc +from importlib import import_module +from pathlib import Path + +from addict import Dict +from yapf.yapflib.yapf_api import FormatCode + +from .utils import check_file_exist, import_modules_from_strings + +if platform.system() == 'Windows': + import regex as re # type: ignore +else: + import re # type: ignore + +BASE_KEY = '_base_' +DELETE_KEY = '_delete_' +DEPRECATION_KEY = '_deprecation_' +RESERVED_KEYS = ['filename', 'text', 'pretty_text'] + + +class ConfigDict(Dict): + + def __missing__(self, name): + raise KeyError(name) + + def __getattr__(self, name): + try: + value = super().__getattr__(name) + except KeyError: + ex = AttributeError(f"'{self.__class__.__name__}' object has no " + f"attribute '{name}'") + except Exception as e: + ex = e + else: + return value + raise ex + + def has_key(self, key): + return key in self.keys() + + +def add_args(parser, cfg, prefix=''): + for k, v in cfg.items(): + if isinstance(v, str): + parser.add_argument('--' + prefix + k) + elif isinstance(v, int): + parser.add_argument('--' + prefix + k, type=int) + elif isinstance(v, float): + parser.add_argument('--' + prefix + k, type=float) + elif isinstance(v, bool): + parser.add_argument('--' + prefix + k, action='store_true') + elif isinstance(v, dict): + add_args(parser, v, prefix + k + '.') + elif isinstance(v, abc.Iterable): + parser.add_argument('--' + prefix + k, type=type(v[0]), nargs='+') + else: + print(f'cannot parse key {prefix + k} of type {type(v)}') + return parser + + +class Config: + """A facility for config and config files. + + It supports common file formats as configs: python/json/yaml. The interface + is the same as a dict object and also allows access config values as + attributes. 
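+    Configs may also inherit from other config files through the reserved
+    ``_base_`` key; the inheritance is resolved in ``_file2dict`` below.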
+ + Example: + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> cfg.a + 1 + >>> cfg.b + {'b1': [0, 1]} + >>> cfg.b.b1 + [0, 1] + >>> cfg = Config.fromfile('tests/data/config/a.py') + >>> cfg.filename + "/home/kchen/projects/mmcv/tests/data/config/a.py" + >>> cfg.item4 + 'test' + >>> cfg + "Config [path: /home/kchen/projects/mmcv/tests/data/config/a.py]: " + "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}" + """ + + @staticmethod + def _validate_py_syntax(filename): + with open(filename, encoding='utf-8') as f: + # Setting encoding explicitly to resolve coding issue on windows + content = f.read() + try: + ast.parse(content) + except SyntaxError as e: + raise SyntaxError('There are syntax errors in config ' + f'file {filename}: {e}') + + @staticmethod + def _substitute_predefined_vars(filename, temp_config_name): + file_dirname = osp.dirname(filename) + file_basename = osp.basename(filename) + file_basename_no_extension = osp.splitext(file_basename)[0] + file_extname = osp.splitext(filename)[1] + support_templates = dict( + fileDirname=file_dirname, + fileBasename=file_basename, + fileBasenameNoExtension=file_basename_no_extension, + fileExtname=file_extname) + with open(filename, encoding='utf-8') as f: + # Setting encoding explicitly to resolve coding issue on windows + config_file = f.read() + for key, value in support_templates.items(): + regexp = r'\{\{\s*' + str(key) + r'\s*\}\}' + value = value.replace('\\', '/') + config_file = re.sub(regexp, value, config_file) + with open(temp_config_name, 'w', encoding='utf-8') as tmp_config_file: + tmp_config_file.write(config_file) + + @staticmethod + def _pre_substitute_base_vars(filename, temp_config_name): + """Substitute base variable placehoders to string, so that parsing + would work.""" + with open(filename, encoding='utf-8') as f: + # Setting encoding explicitly to resolve coding issue on windows + config_file = f.read() + base_var_dict = {} + regexp = r'\{\{\s*' + BASE_KEY + r'\.([\w\.]+)\s*\}\}' + base_vars = set(re.findall(regexp, config_file)) + for base_var in base_vars: + randstr = f'_{base_var}_{uuid.uuid4().hex.lower()[:6]}' + base_var_dict[randstr] = base_var + regexp = r'\{\{\s*' + BASE_KEY + r'\.' 
+ base_var + r'\s*\}\}' + config_file = re.sub(regexp, f'"{randstr}"', config_file) + with open(temp_config_name, 'w', encoding='utf-8') as tmp_config_file: + tmp_config_file.write(config_file) + return base_var_dict + + @staticmethod + def _substitute_base_vars(cfg, base_var_dict, base_cfg): + """Substitute variable strings to their actual values.""" + cfg = copy.deepcopy(cfg) + + if isinstance(cfg, dict): + for k, v in cfg.items(): + if isinstance(v, str) and v in base_var_dict: + new_v = base_cfg + for new_k in base_var_dict[v].split('.'): + new_v = new_v[new_k] + cfg[k] = new_v + elif isinstance(v, (list, tuple, dict)): + cfg[k] = Config._substitute_base_vars( + v, base_var_dict, base_cfg) + elif isinstance(cfg, tuple): + cfg = tuple( + Config._substitute_base_vars(c, base_var_dict, base_cfg) + for c in cfg) + elif isinstance(cfg, list): + cfg = [ + Config._substitute_base_vars(c, base_var_dict, base_cfg) + for c in cfg + ] + elif isinstance(cfg, str) and cfg in base_var_dict: + new_v = base_cfg + for new_k in base_var_dict[cfg].split('.'): + new_v = new_v[new_k] + cfg = new_v + + return cfg + + @staticmethod + def _file2dict(filename, use_predefined_variables=True): + filename = osp.abspath(osp.expanduser(filename)) + check_file_exist(filename) + fileExtname = osp.splitext(filename)[1] + if fileExtname not in ['.py', '.json', '.yaml', '.yml']: + raise OSError('Only py/yml/yaml/json type are supported now!') + + with tempfile.TemporaryDirectory() as temp_config_dir: + temp_config_file = tempfile.NamedTemporaryFile( + dir=temp_config_dir, suffix=fileExtname) + if platform.system() == 'Windows': + temp_config_file.close() + temp_config_name = osp.basename(temp_config_file.name) + # Substitute predefined variables + if use_predefined_variables: + Config._substitute_predefined_vars(filename, + temp_config_file.name) + else: + shutil.copyfile(filename, temp_config_file.name) + # Substitute base variables from placeholders to strings + base_var_dict = Config._pre_substitute_base_vars( + temp_config_file.name, temp_config_file.name) + + if filename.endswith('.py'): + temp_module_name = osp.splitext(temp_config_name)[0] + sys.path.insert(0, temp_config_dir) + Config._validate_py_syntax(filename) + mod = import_module(temp_module_name) + sys.path.pop(0) + cfg_dict = { + name: value + for name, value in mod.__dict__.items() + if not name.startswith('__') + and not isinstance(value, types.ModuleType) + and not isinstance(value, types.FunctionType) + } + # delete imported module + del sys.modules[temp_module_name] + elif filename.endswith(('.yml', '.yaml', '.json')): + import mmcv + cfg_dict = mmcv.load(temp_config_file.name) + # close temp file + temp_config_file.close() + + # check deprecation information + if DEPRECATION_KEY in cfg_dict: + deprecation_info = cfg_dict.pop(DEPRECATION_KEY) + warning_msg = f'The config file {filename} will be deprecated ' \ + 'in the future.' + if 'expected' in deprecation_info: + warning_msg += f' Please use {deprecation_info["expected"]} ' \ + 'instead.' 
+ if 'reference' in deprecation_info: + warning_msg += ' More information can be found at ' \ + f'{deprecation_info["reference"]}' + warnings.warn(warning_msg, DeprecationWarning) + + cfg_text = filename + '\n' + with open(filename, encoding='utf-8') as f: + # Setting encoding explicitly to resolve coding issue on windows + cfg_text += f.read() + + if BASE_KEY in cfg_dict: + cfg_dir = osp.dirname(filename) + base_filename = cfg_dict.pop(BASE_KEY) + base_filename = base_filename if isinstance( + base_filename, list) else [base_filename] + + cfg_dict_list = list() + cfg_text_list = list() + for f in base_filename: + _cfg_dict, _cfg_text = Config._file2dict(osp.join(cfg_dir, f)) + cfg_dict_list.append(_cfg_dict) + cfg_text_list.append(_cfg_text) + + base_cfg_dict = dict() + for c in cfg_dict_list: + duplicate_keys = base_cfg_dict.keys() & c.keys() + if len(duplicate_keys) > 0: + raise KeyError('Duplicate key is not allowed among bases. ' + f'Duplicate keys: {duplicate_keys}') + base_cfg_dict.update(c) + + # Substitute base variables from strings to their actual values + cfg_dict = Config._substitute_base_vars(cfg_dict, base_var_dict, + base_cfg_dict) + + base_cfg_dict = Config._merge_a_into_b(cfg_dict, base_cfg_dict) + cfg_dict = base_cfg_dict + + # merge cfg_text + cfg_text_list.append(cfg_text) + cfg_text = '\n'.join(cfg_text_list) + + return cfg_dict, cfg_text + + @staticmethod + def _merge_a_into_b(a, b, allow_list_keys=False): + """merge dict ``a`` into dict ``b`` (non-inplace). + + Values in ``a`` will overwrite ``b``. ``b`` is copied first to avoid + in-place modifications. + + Args: + a (dict): The source dict to be merged into ``b``. + b (dict): The origin dict to be fetch keys from ``a``. + allow_list_keys (bool): If True, int string keys (e.g. '0', '1') + are allowed in source ``a`` and will replace the element of the + corresponding index in b if b is a list. Default: False. + + Returns: + dict: The modified dict of ``b`` using ``a``. + + Examples: + # Normally merge a into b. + >>> Config._merge_a_into_b( + ... dict(obj=dict(a=2)), dict(obj=dict(a=1))) + {'obj': {'a': 2}} + + # Delete b first and merge a into b. + >>> Config._merge_a_into_b( + ... dict(obj=dict(_delete_=True, a=2)), dict(obj=dict(a=1))) + {'obj': {'a': 2}} + + # b is a list + >>> Config._merge_a_into_b( + ... {'0': dict(a=2)}, [dict(a=1), dict(b=2)], True) + [{'a': 2}, {'b': 2}] + """ + b = b.copy() + for k, v in a.items(): + if allow_list_keys and k.isdigit() and isinstance(b, list): + k = int(k) + if len(b) <= k: + raise KeyError(f'Index {k} exceeds the length of list {b}') + b[k] = Config._merge_a_into_b(v, b[k], allow_list_keys) + elif isinstance(v, dict): + if k in b and not v.pop(DELETE_KEY, False): + allowed_types = (dict, list) if allow_list_keys else dict + if not isinstance(b[k], allowed_types): + raise TypeError( + f'{k}={v} in child config cannot inherit from ' + f'base because {k} is a dict in the child config ' + f'but is of type {type(b[k])} in base config. 
' + f'You may set `{DELETE_KEY}=True` to ignore the ' + f'base config.') + b[k] = Config._merge_a_into_b(v, b[k], allow_list_keys) + else: + b[k] = ConfigDict(v) + else: + b[k] = v + return b + + @staticmethod + def fromfile(filename, + use_predefined_variables=True, + import_custom_modules=True): + if isinstance(filename, Path): + filename = str(filename) + cfg_dict, cfg_text = Config._file2dict(filename, + use_predefined_variables) + if import_custom_modules and cfg_dict.get('custom_imports', None): + import_modules_from_strings(**cfg_dict['custom_imports']) + return Config(cfg_dict, cfg_text=cfg_text, filename=filename) + + @staticmethod + def fromstring(cfg_str, file_format): + """Generate config from config str. + + Args: + cfg_str (str): Config str. + file_format (str): Config file format corresponding to the + config str. Only py/yml/yaml/json type are supported now! + + Returns: + :obj:`Config`: Config obj. + """ + if file_format not in ['.py', '.json', '.yaml', '.yml']: + raise OSError('Only py/yml/yaml/json type are supported now!') + if file_format != '.py' and 'dict(' in cfg_str: + # check if users specify a wrong suffix for python + warnings.warn( + 'Please check "file_format", the file format may be .py') + with tempfile.NamedTemporaryFile( + 'w', encoding='utf-8', suffix=file_format, + delete=False) as temp_file: + temp_file.write(cfg_str) + # on windows, previous implementation cause error + # see PR 1077 for details + cfg = Config.fromfile(temp_file.name) + os.remove(temp_file.name) + return cfg + + @staticmethod + def auto_argparser(description=None): + """Generate argparser from config file automatically (experimental)""" + partial_parser = ArgumentParser(description=description) + partial_parser.add_argument('config', help='config file path') + cfg_file = partial_parser.parse_known_args()[0].config + cfg = Config.fromfile(cfg_file) + parser = ArgumentParser(description=description) + parser.add_argument('config', help='config file path') + add_args(parser, cfg) + return parser, cfg + + def __init__(self, cfg_dict=None, cfg_text=None, filename=None): + if cfg_dict is None: + cfg_dict = dict() + elif not isinstance(cfg_dict, dict): + raise TypeError('cfg_dict must be a dict, but ' + f'got {type(cfg_dict)}') + for key in cfg_dict: + if key in RESERVED_KEYS: + raise KeyError(f'{key} is reserved for config file') + + if isinstance(filename, Path): + filename = str(filename) + + super().__setattr__('_cfg_dict', ConfigDict(cfg_dict)) + super().__setattr__('_filename', filename) + if cfg_text: + text = cfg_text + elif filename: + with open(filename) as f: + text = f.read() + else: + text = '' + super().__setattr__('_text', text) + + @property + def filename(self): + return self._filename + + @property + def text(self): + return self._text + + @property + def pretty_text(self): + + indent = 4 + + def _indent(s_, num_spaces): + s = s_.split('\n') + if len(s) == 1: + return s_ + first = s.pop(0) + s = [(num_spaces * ' ') + line for line in s] + s = '\n'.join(s) + s = first + '\n' + s + return s + + def _format_basic_types(k, v, use_mapping=False): + if isinstance(v, str): + v_str = f"'{v}'" + else: + v_str = str(v) + + if use_mapping: + k_str = f"'{k}'" if isinstance(k, str) else str(k) + attr_str = f'{k_str}: {v_str}' + else: + attr_str = f'{str(k)}={v_str}' + attr_str = _indent(attr_str, indent) + + return attr_str + + def _format_list(k, v, use_mapping=False): + # check if all items in the list are dict + if all(isinstance(_, dict) for _ in v): + v_str = '[\n' + v_str += 
'\n'.join( + f'dict({_indent(_format_dict(v_), indent)}),' + for v_ in v).rstrip(',') + if use_mapping: + k_str = f"'{k}'" if isinstance(k, str) else str(k) + attr_str = f'{k_str}: {v_str}' + else: + attr_str = f'{str(k)}={v_str}' + attr_str = _indent(attr_str, indent) + ']' + else: + attr_str = _format_basic_types(k, v, use_mapping) + return attr_str + + def _contain_invalid_identifier(dict_str): + contain_invalid_identifier = False + for key_name in dict_str: + contain_invalid_identifier |= \ + (not str(key_name).isidentifier()) + return contain_invalid_identifier + + def _format_dict(input_dict, outest_level=False): + r = '' + s = [] + + use_mapping = _contain_invalid_identifier(input_dict) + if use_mapping: + r += '{' + for idx, (k, v) in enumerate(input_dict.items()): + is_last = idx >= len(input_dict) - 1 + end = '' if outest_level or is_last else ',' + if isinstance(v, dict): + v_str = '\n' + _format_dict(v) + if use_mapping: + k_str = f"'{k}'" if isinstance(k, str) else str(k) + attr_str = f'{k_str}: dict({v_str}' + else: + attr_str = f'{str(k)}=dict({v_str}' + attr_str = _indent(attr_str, indent) + ')' + end + elif isinstance(v, list): + attr_str = _format_list(k, v, use_mapping) + end + else: + attr_str = _format_basic_types(k, v, use_mapping) + end + + s.append(attr_str) + r += '\n'.join(s) + if use_mapping: + r += '}' + return r + + cfg_dict = self._cfg_dict.to_dict() + text = _format_dict(cfg_dict, outest_level=True) + # copied from setup.cfg + yapf_style = dict( + based_on_style='pep8', + blank_line_before_nested_class_or_def=True, + split_before_expression_after_opening_paren=True) + text, _ = FormatCode(text, style_config=yapf_style, verify=True) + + return text + + def __repr__(self): + return f'Config (path: {self.filename}): {self._cfg_dict.__repr__()}' + + def __len__(self): + return len(self._cfg_dict) + + def __getattr__(self, name): + return getattr(self._cfg_dict, name) + + def __getitem__(self, name): + return self._cfg_dict.__getitem__(name) + + def __setattr__(self, name, value): + if isinstance(value, dict): + value = ConfigDict(value) + self._cfg_dict.__setattr__(name, value) + + def __setitem__(self, name, value): + if isinstance(value, dict): + value = ConfigDict(value) + self._cfg_dict.__setitem__(name, value) + + def __iter__(self): + return iter(self._cfg_dict) + + def __getstate__(self): + return (self._cfg_dict, self._filename, self._text) + + def __copy__(self): + cls = self.__class__ + other = cls.__new__(cls) + other.__dict__.update(self.__dict__) + + return other + + def __deepcopy__(self, memo): + cls = self.__class__ + other = cls.__new__(cls) + memo[id(self)] = other + + for key, value in self.__dict__.items(): + super(Config, other).__setattr__(key, copy.deepcopy(value, memo)) + + return other + + def __setstate__(self, state): + _cfg_dict, _filename, _text = state + super().__setattr__('_cfg_dict', _cfg_dict) + super().__setattr__('_filename', _filename) + super().__setattr__('_text', _text) + + def dump(self, file=None): + """Dumps config into a file or returns a string representation of the + config. + + If a file argument is given, saves the config to that file using the + format defined by the file argument extension. + + Otherwise, returns a string representing the config. The formatting of + this returned string is defined by the extension of `self.filename`. If + `self.filename` is not defined, returns a string representation of a + dict (lowercased and using ' for strings). 
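+        Dumping to json/yaml delegates to ``mmcv.dump``, so mmcv must be
+        importable for non-python formats.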
+
+        Examples:
+            >>> cfg_dict = dict(item1=[1, 2], item2=dict(a=0),
+            ...     item3=True, item4='test')
+            >>> cfg = Config(cfg_dict=cfg_dict)
+            >>> dump_file = "a.py"
+            >>> cfg.dump(dump_file)
+
+        Args:
+            file (str, optional): Path of the output file where the config
+                will be dumped. Defaults to None.
+        """
+        import mmcv
+        cfg_dict = super().__getattribute__('_cfg_dict').to_dict()
+        if file is None:
+            if self.filename is None or self.filename.endswith('.py'):
+                return self.pretty_text
+            else:
+                file_format = self.filename.split('.')[-1]
+                return mmcv.dump(cfg_dict, file_format=file_format)
+        elif file.endswith('.py'):
+            with open(file, 'w', encoding='utf-8') as f:
+                f.write(self.pretty_text)
+        else:
+            file_format = file.split('.')[-1]
+            return mmcv.dump(cfg_dict, file=file, file_format=file_format)
+
+    def merge_from_dict(self, options, allow_list_keys=True):
+        """Merge list into cfg_dict.
+
+        Merge the dict parsed by MultipleKVAction into this cfg.
+
+        Examples:
+            >>> options = {'model.backbone.depth': 50,
+            ...            'model.backbone.with_cp': True}
+            >>> cfg = Config(dict(model=dict(backbone=dict(type='ResNet'))))
+            >>> cfg.merge_from_dict(options)
+            >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict')
+            >>> assert cfg_dict == dict(
+            ...     model=dict(backbone=dict(depth=50, with_cp=True)))
+
+            >>> # Merge list element
+            >>> cfg = Config(dict(pipeline=[
+            ...     dict(type='LoadImage'), dict(type='LoadAnnotations')]))
+            >>> options = dict(pipeline={'0': dict(type='SelfLoadImage')})
+            >>> cfg.merge_from_dict(options, allow_list_keys=True)
+            >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict')
+            >>> assert cfg_dict == dict(pipeline=[
+            ...     dict(type='SelfLoadImage'), dict(type='LoadAnnotations')])
+
+        Args:
+            options (dict): dict of configs to merge from.
+            allow_list_keys (bool): If True, int string keys (e.g. '0', '1')
+                are allowed in ``options`` and will replace the element of the
+                corresponding index in the config if the config is a list.
+                Default: True.
+        """
+        option_cfg_dict = {}
+        for full_key, v in options.items():
+            d = option_cfg_dict
+            key_list = full_key.split('.')
+            for subkey in key_list[:-1]:
+                d.setdefault(subkey, ConfigDict())
+                d = d[subkey]
+            subkey = key_list[-1]
+            d[subkey] = v
+
+        cfg_dict = super().__getattribute__('_cfg_dict')
+        super().__setattr__(
+            '_cfg_dict',
+            Config._merge_a_into_b(
+                option_cfg_dict, cfg_dict, allow_list_keys=allow_list_keys))
+
+
+class DictAction(Action):
+    """
+    argparse action to split an argument into KEY=VALUE form
+    on the first = and append to a dictionary. List options can
+    be passed as comma separated values, e.g. 'KEY=V1,V2,V3', or with explicit
+    brackets, e.g. 'KEY=[V1,V2,V3]'. It also supports nested brackets to build
+    list/tuple values, e.g. 'KEY=[(V1,V2),(V3,V4)]'.
+    """
+
+    @staticmethod
+    def _parse_int_float_bool(val):
+        try:
+            return int(val)
+        except ValueError:
+            pass
+        try:
+            return float(val)
+        except ValueError:
+            pass
+        if val.lower() in ['true', 'false']:
+            return True if val.lower() == 'true' else False
+        if val == 'None':
+            return None
+        return val
+
+    @staticmethod
+    def _parse_iterable(val):
+        """Parse iterable values in the string.
+
+        All elements inside '()' or '[]' are treated as iterable values.
+
+        Args:
+            val (str): Value string.
+
+        Returns:
+            list | tuple: The expanded list or tuple from the string.
+
+        Examples:
+            >>> DictAction._parse_iterable('1,2,3')
+            [1, 2, 3]
+            >>> DictAction._parse_iterable('[a, b, c]')
+            ['a', 'b', 'c']
+            >>> DictAction._parse_iterable('[(1, 2, 3), [a, b], c]')
+            [(1, 2, 3), ['a', 'b'], 'c']
+        """
+
+        def find_next_comma(string):
+            """Find the position of next comma in the string.
+
+            If no ',' is found in the string, return the string length. All
+            chars inside '()' and '[]' are treated as one element and thus ','
+            inside these brackets are ignored.
+            """
+            assert (string.count('(') == string.count(')')) and (
+                    string.count('[') == string.count(']')), \
+                f'Imbalanced brackets exist in {string}'
+            end = len(string)
+            for idx, char in enumerate(string):
+                pre = string[:idx]
+                # The string before this ',' is balanced
+                if ((char == ',') and (pre.count('(') == pre.count(')'))
+                        and (pre.count('[') == pre.count(']'))):
+                    end = idx
+                    break
+            return end
+
+        # Strip ' and " characters and replace whitespace.
+        val = val.strip('\'\"').replace(' ', '')
+        is_tuple = False
+        if val.startswith('(') and val.endswith(')'):
+            is_tuple = True
+            val = val[1:-1]
+        elif val.startswith('[') and val.endswith(']'):
+            val = val[1:-1]
+        elif ',' not in val:
+            # val is a single value
+            return DictAction._parse_int_float_bool(val)
+
+        values = []
+        while len(val) > 0:
+            comma_idx = find_next_comma(val)
+            element = DictAction._parse_iterable(val[:comma_idx])
+            values.append(element)
+            val = val[comma_idx + 1:]
+        if is_tuple:
+            values = tuple(values)
+        return values
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        options = {}
+        for kv in values:
+            key, val = kv.split('=', maxsplit=1)
+            options[key] = self._parse_iterable(val)
+        setattr(namespace, self.dest, options)
\ No newline at end of file
diff --git a/example/low_precision_training/quantization/cudnn_convolution.cpp b/example/low_precision_training/quantization/cudnn_convolution.cpp
new file mode 100755
index 000000000..fb974e453
--- /dev/null
+++ b/example/low_precision_training/quantization/cudnn_convolution.cpp
@@ -0,0 +1,41 @@
+// NOTE: the include set was reconstructed for this example; adjust to your
+// Paddle version if the build complains.
+#include "paddle/extension.h"
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+namespace paddle {
+namespace experimental {
+extern PADDLE_API void conv2d_grad(const paddle::Tensor& input, const paddle::Tensor& filter,
+                                   const paddle::Tensor& out_grad, const std::vector<int>& strides,
+                                   const std::vector<int>& paddings, const std::string& padding_algorithm,
+                                   const std::vector<int>& dilations, int groups,
+                                   const std::string& data_format, paddle::Tensor* input_grad,
+                                   paddle::Tensor* filter_grad);
+}  // namespace experimental
+}  // namespace paddle
+
+// Thin wrapper over Paddle's internal conv2d_grad: computes the input and
+// weight gradients of a 2-D convolution in a single call.
+std::vector<paddle::Tensor> convolution_bwd(
+    const paddle::Tensor& input,
+    const paddle::Tensor& weight,
+    const paddle::Tensor& grad_output,
+    std::vector<int> stride,
+    std::vector<int> padding,
+    std::vector<int> dilation,
+    int groups) {
+
+  paddle::Tensor input_grad, weight_grad;
+
+  paddle::experimental::conv2d_grad(input, weight, grad_output, stride, padding, "EXPLICIT",
+                                    dilation, groups, "NCHW", &input_grad, &weight_grad);
+
+  return {input_grad, weight_grad};
+}
+
+PYBIND11_MODULE(cudnn_convolution_custom, m) {
+  // expose the backward convolution to Python
+  m.def("convolution_bwd", &convolution_bwd, "convolution_bwd");
+}
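For a quick sanity check of the binding, something like the following should work once the extension is built and installed (e.g. via setup.py and install_paddle_so.py further below); the shapes are illustrative:

```python
# Smoke test sketch for cudnn_convolution_custom.convolution_bwd; assumes a
# GPU build of PaddlePaddle and that the extension module is importable.
import paddle
import cudnn_convolution_custom

x = paddle.randn([8, 16, 32, 32])    # NCHW input
w = paddle.randn([32, 16, 3, 3])     # [out_c, in_c, kh, kw]
gy = paddle.randn([8, 32, 30, 30])   # grad of the conv output (stride 1, pad 0)

# stride / padding / dilation are int vectors, groups is an int
gx, gw = cudnn_convolution_custom.convolution_bwd(x, w, gy, [1, 1], [0, 0], [1, 1], 1)
print(gx.shape, gw.shape)            # should match x.shape and w.shape
```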
diff --git a/example/low_precision_training/quantization/install_paddle_so.py b/example/low_precision_training/quantization/install_paddle_so.py
new file mode 100755
index 000000000..bbca2a88a
--- /dev/null
+++ b/example/low_precision_training/quantization/install_paddle_so.py
@@ -0,0 +1,29 @@
+import os
+import sys
+import shutil
+import paddle
+from paddle.utils.cpp_extension.extension_utils import CustomOpInfo, custom_write_stub
+
+
+def main(so_path):
+    suffix = ".so"
+    assert so_path.endswith(suffix), f"{so_path} should end with {suffix}"
+    resource = os.path.basename(so_path)[:-len(suffix)]
+
+    install_dir = os.path.abspath(os.path.join(os.path.dirname(paddle.__file__), "..", resource))
+    os.makedirs(install_dir, exist_ok=True)
+    install_so_path = os.path.join(install_dir, f"{resource}_pd_{suffix}")
+    shutil.copyfile(so_path, install_so_path)
+    CustomOpInfo.instance().add(resource, os.path.basename(install_so_path), install_so_path)
+    pyfile = os.path.join(install_dir, f"{resource}.py")
+    custom_write_stub(f"{resource}{suffix}", pyfile)
+
+    init_file = os.path.join(install_dir, "__init__.py")
+    with open(init_file, "w") as f:
+        f.write("import paddle\n")
+        f.write(f"from .{resource} import *")
+
+    print(f"Installed in {install_dir}")
+
+
+if __name__ == "__main__":
+    main(sys.argv[1])
diff --git a/example/low_precision_training/quantization/layers.py b/example/low_precision_training/quantization/layers.py
new file mode 100755
index 000000000..5de739cc8
--- /dev/null
+++ b/example/low_precision_training/quantization/layers.py
@@ -0,0 +1,310 @@
+import collections
+from itertools import repeat
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from .ops import forward_quantizer, quantized_linear_for_training, quantized_conv2d_for_training
+from .quantization_utils import parse_qtype
+from . import quantizer
+
+
+def _pair(x):
+    """Expand a scalar into a 2-tuple (counterpart of torch's _pair)."""
+    if isinstance(x, collections.abc.Iterable):
+        return tuple(x)
+    return tuple(repeat(x, 2))
+
+
+'''
+Quantization-aware Training
+'''
+class QModule(nn.Layer):
+    def __init__(self):
+        super(QModule, self).__init__()
+
+        self.calibrating = False
+        self.input_scale_list = []
+
+        # input quantization info
+        self.input_qtype = None
+        self.input_granularity = None
+        self.register_buffer('input_bitwidth', None)
+        self.register_buffer('input_scale', None)
+        self.input_quantizer = None
+
+        # weight quantization info
+        self.weight_qtype = None
+        self.weight_granularity = None
+        self.register_buffer('weight_bitwidth', None)
+        self.register_buffer('weight_scale', None)
+        self.weight_quantizer = None
+
+    def init(self, qparam):
+        # input quantizer; granularity maps onto the quantizer's `dims`
+        # argument (the quantizers in quantizer/ have no `axis` parameter)
+        device = self.weight.place
+        qtype_name, bitwidth = parse_qtype(qparam['input_qtype'])
+        self.input_qtype = qtype_name
+        self.input_granularity = qparam['input_granularity']
+        self.register_buffer('input_bitwidth', paddle.to_tensor(bitwidth, dtype='int32', place=device))
+        self.register_buffer('input_scale', paddle.to_tensor(1, dtype='float32', place=device))
+        self.input_quantizer = quantizer.__dict__[qtype_name + "_quantizer"](bitwidth, dims=self.input_granularity)
+
+        # weight quantizer
+        qtype_name, bitwidth = parse_qtype(qparam['weight_qtype'])
+        self.weight_qtype = qtype_name
+        self.weight_granularity = qparam['weight_granularity']
+        self.register_buffer('weight_bitwidth', paddle.to_tensor(bitwidth, dtype='int32', place=device))
+        self.register_buffer('weight_scale', paddle.to_tensor(1, dtype='float32', place=device))
+        self.weight_quantizer = quantizer.__dict__[qtype_name + "_quantizer"](bitwidth, signed=True, dims=self.weight_granularity)
+
+    def __setattr__(self, name, value):
+        # route the buffer names through paddle.to_tensor; Tensor.place is a
+        # property in Paddle, so placement is done via the `place` argument
+        if name in ('input_bitwidth', 'input_scale', 'weight_bitwidth', 'weight_scale'):
+            device = getattr(self, name).place
+            if name in ('input_bitwidth', 'weight_bitwidth'):
+                if isinstance(value, paddle.Tensor):
+                    del self._buffers[name]
+                    self._buffers[name] = paddle.to_tensor(value, place=device)
+                elif isinstance(value, int):
+                    del self._buffers[name]
+                    self._buffers[name] = paddle.to_tensor(value, dtype='int32', place=device)
+                else:
+                    raise TypeError
+            else:  # input_scale / weight_scale
+                if isinstance(value, paddle.Tensor):
+                    del self._buffers[name]
+                    self._buffers[name] = paddle.to_tensor(value, place=device)
+                elif isinstance(value, (int, float)):
+                    del self._buffers[name]
+                    self._buffers[name] = paddle.to_tensor(value, dtype='float32', place=device)
+                else:
+                    raise TypeError
+        else:
+            super(QModule, self).__setattr__(name, value)
+
+    def initialize_quantizer(self):
+        self.input_scale = paddle.stack(self.input_scale_list).mean(axis=0)
+        self.input_scale_list = []
+        self.input_quantizer.set_scale(self.input_scale)
+
+        # pass the Tensor itself, not .numpy(): the quantizers operate on
+        # paddle Tensors
+        weight_scale = self.weight_quantizer.calculate_quantization_scale(self.weight.detach())
+        self.weight_scale = weight_scale
+        self.weight_quantizer.set_scale(self.weight_scale)
+
+    def forward(self, input):
+        pass
+
+
+class QLinear(nn.Linear, QModule):
+    def __init__(self, input_features, output_features, bias=True):
+        # paddle.nn.Linear has no positional `bias` argument; use bias_attr
+        super(QLinear, self).__init__(input_features, output_features,
+                                      bias_attr=None if bias else False)
+
+    def forward(self, input):
+        if self.calibrating:
+            scale = self.input_quantizer.calculate_quantization_scale(input.detach())
+            self.input_scale_list.append(scale)
+            return super(QLinear, self).forward(input)
+        else:
+            input_q = forward_quantizer.apply(input, self.input_quantizer)
+            weight_q = forward_quantizer.apply(self.weight, self.weight_quantizer)
+
+            return paddle.nn.functional.linear(input_q, weight_q, self.bias)
+
+
+class QConv2d(nn.Conv2D, QModule):
+    def __init__(self, in_channels, out_channels, kernel_size,
+                 stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros'):
+        # paddle.nn.Conv2D takes bias_attr/padding_mode by keyword
+        super(QConv2d, self).__init__(in_channels, out_channels, kernel_size,
+                                      stride=stride, padding=padding, dilation=dilation,
+                                      groups=groups, padding_mode=padding_mode,
+                                      bias_attr=None if bias else False)
+
+    def forward(self, input):
+        if self.calibrating:
+            scale = self.input_quantizer.calculate_quantization_scale(input.detach())
+            self.input_scale_list.append(scale)
+            return super(QConv2d, self).forward(input)
+        else:
+            input_q = forward_quantizer.apply(input, self.input_quantizer)
+            weight_q = forward_quantizer.apply(self.weight, self.weight_quantizer)
+
+            if self._padding_mode != 'zeros':
+                return F.conv2d(F.pad(input_q, self._reversed_padding_repeated_twice,
+                                      mode=self._padding_mode),
+                                weight_q, self.bias, self._stride,
+                                _pair(0), self._dilation, self._groups)
+            return F.conv2d(input_q, weight_q, self.bias, self._stride,
+                            self._padding, self._dilation, self._groups)
+
+
+'''
+Training Quantization
'''
+class QTModule(nn.Layer):
+    def __init__(self):
+        super(QTModule, self).__init__()
+
+        self.enabled = False
+        # weight quantization info
+        self._hybrid_weight = False
+        self._hybrid_input = False
+        self._hybrid_grad = False
+
+    def init(self, qparam):
+        self.enabled = True
+        device = self.weight.place
+
+        # weight quantizer
+        if qparam.weight_forward == qparam.weight_backward:
+            # single quantizer
+            self._hybrid_weight = False
+            qtype_name, bitwidth = parse_qtype(qparam.weight_forward.qtype)
+            self.weight_forward_quantizer = quantizer.__dict__[qtype_name + "_quantizer"](bitwidth, signed=True,
+                                                scaling=qparam.weight_forward.scaling,
+                                                dims=qparam.weight_forward.granularity,
+                                                observer=qparam.weight_forward.observer,
stochastic=qparam.weight_forward.stochastic) + self.weight_backward_quantizer = self.weight_forward_quantizer + else: + # hybrid quantizer + self._hybrid_weight = True + qtype_name, bitwidth = parse_qtype(qparam.weight_forward.qtype) + self.weight_forward_quantizer = quantizer.__dict__[qtype_name + "_quantizer"](bitwidth, signed=True, + scaling=qparam.weight_forward.scaling, + dims=qparam.weight_forward.granularity, + observer=qparam.weight_forward.observer, + stochastic=qparam.weight_forward.stochastic) + qtype_name, bitwidth = parse_qtype(qparam.weight_backward.qtype) + self.weight_backward_quantizer = quantizer.__dict__[qtype_name + "_quantizer"](bitwidth, signed=True, + scaling=qparam.weight_backward.scaling, + dims=qparam.weight_backward.granularity, + observer=qparam.weight_backward.observer, + stochastic=qparam.weight_backward.stochastic) + + # input quantizer + if qparam.input_forward == qparam.input_backward: + # single quantizer + self._hybrid_input = False + qtype_name, bitwidth = parse_qtype(qparam.input_forward.qtype) + self.input_forward_quantizer = quantizer.__dict__[qtype_name + "_quantizer"](bitwidth, + scaling=qparam.input_forward.scaling, + dims=qparam.input_forward.granularity, + observer=qparam.input_forward.observer, + stochastic=qparam.input_forward.stochastic) + self.input_backward_quantizer = self.input_forward_quantizer + else: + # hybrid quantizer + self._hybrid_input = True + qtype_name, bitwidth = parse_qtype(qparam.input_forward.qtype) + self.input_forward_quantizer = quantizer.__dict__[qtype_name + "_quantizer"](bitwidth, + scaling=qparam.input_forward.scaling, + dims=qparam.input_forward.granularity, + observer=qparam.input_forward.observer, + stochastic=qparam.input_forward.stochastic) + qtype_name, bitwidth = parse_qtype(qparam.input_backward.qtype) + self.input_backward_quantizer = quantizer.__dict__[qtype_name + "_quantizer"](bitwidth, + scaling=qparam.input_backward.scaling, + dims=qparam.input_backward.granularity, + observer=qparam.input_backward.observer, + stochastic=qparam.input_backward.stochastic) + + # grad quantizer + if qparam.grad_input == qparam.grad_weight: + # single quantizer + self._hybrid_grad = False + qtype_name, bitwidth = parse_qtype(qparam.grad_input.qtype) + self.grad_input_quantizer = quantizer.__dict__[qtype_name + "_quantizer"](bitwidth, signed=True, + scaling=qparam.grad_input.scaling, + dims=qparam.grad_input.granularity, + observer=qparam.grad_input.observer, + stochastic=qparam.grad_input.stochastic) + self.grad_weight_quantizer = self.grad_input_quantizer + else: + # hybrid quantizer + self._hybrid_grad = True + qtype_name, bitwidth = parse_qtype(qparam.grad_input.qtype) + self.grad_input_quantizer = quantizer.__dict__[qtype_name + "_quantizer"](bitwidth, signed=True, + scaling=qparam.grad_input.scaling, + dims=qparam.grad_input.granularity, + observer=qparam.grad_input.observer, + stochastic=qparam.grad_input.stochastic) + qtype_name, bitwidth = parse_qtype(qparam.grad_weight.qtype) + self.grad_weight_quantizer = quantizer.__dict__[qtype_name + "_quantizer"](bitwidth, signed=True, + scaling=qparam.grad_weight.scaling, + dims=qparam.grad_weight.granularity, + observer=qparam.grad_weight.observer, + stochastic=qparam.grad_weight.stochastic) + + def enable(self): + self.enabled = True + + def disable(self): + self.enabled = False + + +class QTLinear(nn.Linear, QTModule): + def __init__(self, input_features, output_features, bias=True): + nn.Linear.__init__(self, input_features, output_features, bias) + 
+        QTModule.__init__(self)
+
+    def forward(self, input):
+        if self.enabled:
+            # NOTE: paddle.nn.Linear stores weight as [in_features, out_features],
+            # transposed with respect to torch.nn.Linear
+            return quantized_linear_for_training.apply(input, self.weight, self.bias,
+                        self._hybrid_weight, self.weight_forward_quantizer, self.weight_backward_quantizer,
+                        self._hybrid_input, self.input_forward_quantizer, self.input_backward_quantizer,
+                        self._hybrid_grad, self.grad_input_quantizer, self.grad_weight_quantizer)
+        else:
+            if self.training:
+                print("Warning: quantization is disabled!!!")
+            return super(QTLinear, self).forward(input)
+
+
+class QTConv2d(nn.Conv2D, QTModule):
+    def __init__(self, in_channels, out_channels, kernel_size,
+                 stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros'):
+        nn.Conv2D.__init__(self, in_channels, out_channels, kernel_size,
+                           stride=stride, padding=padding, dilation=dilation, groups=groups,
+                           bias_attr=bias, padding_mode=padding_mode)
+        QTModule.__init__(self)
+
+    def forward(self, input):
+        if self.enabled:
+            if self._padding_mode != 'zeros':
+                return quantized_conv2d_for_training.apply(
+                            F.pad(input, self._reversed_padding_repeated_twice, mode=self._padding_mode),
+                            self.weight, self.bias, self._stride, _pair(0), self._dilation, self._groups,
+                            self._hybrid_weight, self.weight_forward_quantizer, self.weight_backward_quantizer,
+                            self._hybrid_input, self.input_forward_quantizer, self.input_backward_quantizer,
+                            self._hybrid_grad, self.grad_input_quantizer, self.grad_weight_quantizer)
+            return quantized_conv2d_for_training.apply(input, self.weight, self.bias, self._stride,
+                        self._padding, self._dilation, self._groups,
+                        self._hybrid_weight, self.weight_forward_quantizer, self.weight_backward_quantizer,
+                        self._hybrid_input, self.input_forward_quantizer, self.input_backward_quantizer,
+                        self._hybrid_grad, self.grad_input_quantizer, self.grad_weight_quantizer)
+        else:
+            if self.training:
+                print("Warning: quantization is disabled!!!")
+            return super(QTConv2d, self).forward(input)
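A layer-level view of the QAT flow above, as a minimal sketch (the qparam dict mirrors what QATQuantizationManager builds per module below; bitwidths and shapes are illustrative):

```python
import paddle
from quantization.layers import QConv2d

layer = QConv2d(16, 32, 3, bias=True)
layer.init({'input_qtype': 'int8', 'input_granularity': 'per-layer',
            'weight_qtype': 'int8', 'weight_granularity': 'per-layer'})

layer.calibrating = True                    # record an input scale per batch
for _ in range(4):
    layer(paddle.randn([2, 16, 8, 8]))
layer.calibrating = False

layer.initialize_quantizer()                # average the scales and freeze them
y = layer(paddle.randn([2, 16, 8, 8]))      # quantized forward from here on
```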
diff --git a/example/low_precision_training/quantization/managers/__init__.py b/example/low_precision_training/quantization/managers/__init__.py
new file mode 100755
index 000000000..c22788597
--- /dev/null
+++ b/example/low_precision_training/quantization/managers/__init__.py
@@ -0,0 +1,17 @@
+import importlib
+import os
+
+from .. import registry
+from quantization.managers.base_quantization_manager import BaseQuantizationManager
+
+build_quantization_manager, register_quantization_manager, QUANTIZATION_MANAGER_REGISTRY = registry.setup_registry(
+    '--quantization-manager',
+    base_class=BaseQuantizationManager,
+    default=None,
+)
+
+# automatically import Python files in the quantization_managers/ directory
+for file in os.listdir(os.path.dirname(__file__)):
+    if file.endswith('.py') and not file.startswith('_'):
+        module = file[:file.find('.py')]
+        importlib.import_module('quantization.managers.' + module)
diff --git a/example/low_precision_training/quantization/managers/base_quantization_manager.py b/example/low_precision_training/quantization/managers/base_quantization_manager.py
new file mode 100755
index 000000000..c3ec093ba
--- /dev/null
+++ b/example/low_precision_training/quantization/managers/base_quantization_manager.py
@@ -0,0 +1,23 @@
+class Singleton(type):
+    _instances = {}
+
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
+        return cls._instances[cls]
+
+
+class BaseQuantizationManager(metaclass=Singleton):
+    def __init__(self, qconfig=None):
+        self.qconfig = qconfig
+
+    # def __enter__(self):
+    #     self.enable()
+    #     return self
+
+    # def __exit__(self, *args):
+    #     self.disable()
+
+    @classmethod
+    def build_quantization_manager(cls, qconfig):
+        """Create specific quantization_manager"""
+        return cls(qconfig)
\ No newline at end of file
diff --git a/example/low_precision_training/quantization/managers/qat_quantization_manager.py b/example/low_precision_training/quantization/managers/qat_quantization_manager.py
new file mode 100755
index 000000000..432044ab3
--- /dev/null
+++ b/example/low_precision_training/quantization/managers/qat_quantization_manager.py
@@ -0,0 +1,131 @@
+import paddle
+import paddle.nn as nn
+import abc
+import types
+import copy
+
+from .base_quantization_manager import BaseQuantizationManager
+from . import register_quantization_manager
+from ..layers import QLinear, QConv2d
+
+
+@register_quantization_manager('qat_quantization_manager')
+class QATQuantizationManager(BaseQuantizationManager):
+    def __init__(self, qconfig):
+        super(QATQuantizationManager, self).__init__(qconfig)
+        self.qconfig = qconfig
+
+    def prepare(self, model, inplace=True):
+        if not inplace:
+            model = copy.deepcopy(model)
+
+        # convert Conv2D -> QConv2d, Linear -> QLinear, etc.
+        self.convert_layers(model)
+
+        # set bitwidth information for each module
+        self.parse_quantization_params(model)
+
+        # attach a method that toggles calibration on every quantized layer
+        def calibrating(model, state):
+            for m in model.sublayers():
+                if isinstance(m, (QLinear, QConv2d)):
+                    m.calibrating = state
+        model.calibrating = types.MethodType(calibrating, model)
+
+        return model
+
+    def initialize_quantizer(self, model):
+        for m in model.sublayers():
+            if isinstance(m, (QLinear, QConv2d)):
+                m.initialize_quantizer()
+
+    def convert_layers(self, module):
+        # iterate direct children only; recursion happens in the else branch
+        for name, child in module.named_children():
+            if isinstance(child, (QLinear, QConv2d)):
+                print('Error! Already have QModules!!!')
+                continue
+
+            if isinstance(child, nn.Linear):
+                # paddle Linear weight is [in_features, out_features]
+                new_child = QLinear(child.weight.shape[0], child.weight.shape[1],
+                                    child.bias is not None)
+                new_child.weight = child.weight
+                if child.bias is not None:
+                    new_child.bias = child.bias
+                setattr(module, name, new_child)
+            elif isinstance(child, nn.Conv2D):
+                # conv weight is [out_c, in_c/groups, kh, kw]; use the layer's
+                # channel attributes rather than the weight shape
+                new_child = QConv2d(child._in_channels, child._out_channels,
+                                    child._kernel_size, child._stride, child._padding,
+                                    child._dilation, child._groups, child.bias is not None)
+                new_child.weight = child.weight
+                if child.bias is not None:
+                    new_child.bias = child.bias
+                setattr(module, name, new_child)
+            else:
+                self.convert_layers(child)
+
+    def parse_quantization_params(self, model):
+        # collect default quantization information for each module
+        module_list = []
+        for m_name, m in model.named_sublayers():
+            if isinstance(m, (nn.Linear, nn.Conv2D)):
+                # config for input activations and weights
+                qparam = {
+                    'name': m_name,
+                    'input_qtype': self.qconfig.activation_config.qtype,
+                    'input_granularity': self.qconfig.activation_config.granularity,
+                    'weight_qtype': self.qconfig.weight_config.qtype,
+                    'weight_granularity': self.qconfig.weight_config.granularity
+                }
+                module_list.append(qparam)
+
+        # Figure out non-default quantization config for specific modules
+        # according to module index/name
+        # For activations
+        if 'filters' in self.qconfig.activation_config:
+            for filter_item in self.qconfig.activation_config.filters:
+                if not isinstance(filter_item, dict):
+                    raise TypeError(f'filter item must be a dict, but got {type(filter_item)}')
+
+                if 'id' in filter_item:
+                    if 'qtype' in filter_item:
+                        module_list[filter_item['id']]['input_qtype'] = filter_item['qtype']
+                    if 'granularity' in filter_item:
+                        module_list[filter_item['id']]['input_granularity'] = filter_item['granularity']
+                elif 'name' in filter_item:
+                    if 'qtype' in filter_item:
+                        for m in module_list:
+                            if filter_item['name'] in m['name']:
+                                m['input_qtype'] = filter_item['qtype']
+                    if 'granularity' in filter_item:
+                        for m in module_list:
+                            if filter_item['name'] in m['name']:
+                                m['input_granularity'] = filter_item['granularity']
+                else:
+                    raise KeyError(f'only support filter with id or module name, but got {filter_item.keys()}')
+
+        # For weights
+        if 'filters' in self.qconfig.weight_config:
+            for filter_item in self.qconfig.weight_config.filters:
+                if not isinstance(filter_item, dict):
+                    raise TypeError(f'filter item must be a dict, but got {type(filter_item)}')
+
+                if 'id' in filter_item:
+                    if 'qtype' in filter_item:
+                        module_list[filter_item['id']]['weight_qtype'] = filter_item['qtype']
+                    if 'granularity' in filter_item:
+                        module_list[filter_item['id']]['weight_granularity'] = filter_item['granularity']
+                elif 'name' in filter_item:
+                    if 'qtype' in filter_item:
+                        for m in module_list:
+                            if filter_item['name'] in m['name']:
+                                m['weight_qtype'] = filter_item['qtype']
+                    if 'granularity' in filter_item:
+                        for m in module_list:
+                            if filter_item['name'] in m['name']:
+                                m['weight_granularity'] = filter_item['granularity']
+                else:
+                    raise KeyError(f'only support filter with id or module name, but got {filter_item.keys()}')
+
+        idx = 0
+        for m_name, m in model.named_sublayers():
+            if isinstance(m, (nn.Linear, nn.Conv2D)):
+                m.init(module_list[idx])
+                idx += 1
\ No newline at end of file
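Putting the pieces together, this is how the QAT manager is meant to be driven end to end. A minimal sketch (the toy model, bit-widths and batch shapes are assumptions; the config keys follow parse_quantization_params, and addict is already in requirements.txt):

```python
import paddle
import paddle.nn as nn
from addict import Dict
from quantization.managers.qat_quantization_manager import QATQuantizationManager

qconfig = Dict({
    'activation_config': {'qtype': 'int8', 'granularity': 'per-layer'},
    'weight_config':     {'qtype': 'int8', 'granularity': 'per-layer'},
})

model = nn.Sequential(nn.Conv2D(3, 16, 3), nn.ReLU(), nn.Conv2D(16, 16, 3))
manager = QATQuantizationManager(qconfig)
model = manager.prepare(model)

model.calibrating(True)                  # collect activation scales
for _ in range(8):
    model(paddle.randn([4, 3, 32, 32]))
model.calibrating(False)
manager.initialize_quantizer(model)      # freeze scales; training now runs quantized
```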
diff --git a/example/low_precision_training/quantization/managers/qt_quantization_manager.py b/example/low_precision_training/quantization/managers/qt_quantization_manager.py
new file mode 100755
index 000000000..9f73a33d0
--- /dev/null
+++ b/example/low_precision_training/quantization/managers/qt_quantization_manager.py
@@ -0,0 +1,158 @@
+import paddle
+import paddle.nn as nn
+import abc
+import types
+import copy
+from addict import Dict
+
+from .base_quantization_manager import BaseQuantizationManager
+from . import register_quantization_manager
+from ..layers import QTLinear, QTConv2d
+
+
+@register_quantization_manager('qt_quantization_manager')
+class QTQuantizationManager(BaseQuantizationManager):
+    def __init__(self, qconfig):
+        super(QTQuantizationManager, self).__init__(qconfig)
+        self.qconfig = qconfig
+
+    def enable_quantization(self, model):
+        for m in model.sublayers():
+            if isinstance(m, (QTLinear, QTConv2d)):
+                m.enabled = True
+
+    def disable_quantization(self, model):
+        for m in model.sublayers():
+            if isinstance(m, (QTLinear, QTConv2d)):
+                m.enabled = False
+
+    def prepare(self, model, inplace=True):
+        if not inplace:
+            model = copy.deepcopy(model)
+
+        # convert Conv2D -> QTConv2d, Linear -> QTLinear, etc.
+        self.convert_layers(model)
+
+        # set bitwidth information for each module
+        self.parse_quantization_params(model)
+
+        model.enable_quantization = types.MethodType(self.enable_quantization, model)
+        model.disable_quantization = types.MethodType(self.disable_quantization, model)
+
+        model.enable_quantization()
+
+        return model
+
+    def convert_layers(self, module):
+        for name, child in module.named_children():
+            if isinstance(child, (QTLinear, QTConv2d)):
+                print('Error! Already have QModules!!!')
+                continue
+
+            if isinstance(child, nn.Linear):
+                # paddle Linear weight is [in_features, out_features]; the ctor
+                # arguments only size placeholder parameters, which are
+                # overwritten with the original weight right below
+                new_child = QTLinear(child.weight.shape[0], child.weight.shape[1],
+                                     child.bias is not None)
+                new_child.weight = child.weight
+                if child.bias is not None:
+                    new_child.bias = child.bias
+                setattr(module, name, new_child)
+            elif isinstance(child, nn.Conv2D):
+                new_child = QTConv2d(child._in_channels, child._out_channels,
+                                     child._kernel_size, child._stride, child._padding,
+                                     child._dilation, child._groups, child.bias is not None)
+                new_child.weight = child.weight
+                if child.bias is not None:
+                    new_child.bias = child.bias
+                setattr(module, name, new_child)
+            else:
+                self.convert_layers(child)
+
+    def parse_quantization_params(self, model):
+        if 'weight_config' in self.qconfig:
+            if 'weight_forward_config' in self.qconfig or 'weight_backward_config' in self.qconfig:
+                raise KeyError('cannot use weight_config together with '
+                               'weight_forward_config or weight_backward_config')
+            weight_forward_config = weight_backward_config = self.qconfig.weight_config
+        else:
+            assert 'weight_forward_config' in self.qconfig and 'weight_backward_config' in self.qconfig
+            weight_forward_config = self.qconfig.weight_forward_config
+            weight_backward_config = self.qconfig.weight_backward_config
+
+        if 'input_config' in self.qconfig:
+            if 'input_forward_config' in self.qconfig or 'input_backward_config' in self.qconfig:
+                raise KeyError('cannot use input_config together with '
+                               'input_forward_config or input_backward_config')
+            input_forward_config = input_backward_config = self.qconfig.input_config
+        else:
+            assert 'input_forward_config' in self.qconfig and 'input_backward_config' in self.qconfig
+            input_forward_config = self.qconfig.input_forward_config
+            input_backward_config = self.qconfig.input_backward_config
+
+        if 'grad_config' in self.qconfig:
+            if 'grad_input_config' in self.qconfig or 'grad_weight_config' in self.qconfig:
+                raise KeyError('cannot use grad_config together with '
+                               'grad_input_config or grad_weight_config')
+            grad_input_config = grad_weight_config = self.qconfig.grad_config
+        else:
+            assert 'grad_input_config' in self.qconfig and 'grad_weight_config' in self.qconfig
+            grad_input_config = self.qconfig.grad_input_config
+            grad_weight_config = self.qconfig.grad_weight_config
+
+        var_configs = {
+            'weight_forward': weight_forward_config,
+            'weight_backward': weight_backward_config,
+            'input_forward': input_forward_config,
+            'input_backward': input_backward_config,
+            'grad_input': grad_input_config,
+            'grad_weight': grad_weight_config,
+        }
+
+        # collect default quantization information for each module
+        module_list = []
+        for m_name, m in model.named_sublayers():
+            if isinstance(m, (nn.Linear, nn.Conv2D)):
+                qparam = Dict()
+                qparam.name = m_name
+                for var, var_config in var_configs.items():
+                    if var_config.qtype == 'full':
+                        qparam[var] = Dict({
+                            'qtype': var_config.qtype,
+                            'granularity': None,
+                            'scaling': None,
+                            'observer': None,
+                            'stochastic': None})
+                    else:
+                        qparam[var] = Dict({
+                            'qtype': var_config.qtype,
+                            'granularity': var_config.granularity,
+                            'scaling': var_config.scaling,
+                            'observer': var_config.observer,
+                            'stochastic': var_config.stochastic})
+                module_list.append(qparam)
+
+        # Figure out non-default quantization config for specific modules
+        # according to module index/name
+        for var, var_config in var_configs.items():
+            if 'filters' in var_config:
+                for filter_item in var_config['filters']:
+                    if not isinstance(filter_item, dict):
+                        raise TypeError(f'filter item must be a dict, but got {type(filter_item)}')
+
+                    if 'id' in filter_item:
+                        for key in ['qtype', 'granularity', 'scaling', 'observer', 'stochastic']:
+                            if key in filter_item:
+                                module_list[filter_item['id']][var][key] = filter_item[key]
+                    elif 'name' in filter_item:
+                        for key in ['qtype', 'granularity', 'scaling', 'observer', 'stochastic']:
+                            if key in filter_item:
+                                for m in module_list:
+                                    if filter_item['name'] in m['name']:
+                                        m[var][key] = filter_item[key]
+                    else:
+                        raise KeyError('only support filter with id or module name')
+
+        idx = 0
+        for m_name, m in model.named_sublayers():
+            if isinstance(m, (nn.Linear, nn.Conv2D)):
+                m.init(module_list[idx])
+                idx += 1
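The next file, ops.py, implements the low-precision arithmetic as paddle.autograd.PyLayer subclasses. Two conventions are worth stating before the code. First, forward_quantizer is a straight-through estimator: with scale $s$ and integer bounds $q_{\min}$, $q_{\max}$,

$$\hat{x} = s\cdot\mathrm{clip}\big(\mathrm{round}(x/s),\,q_{\min},\,q_{\max}\big),\qquad \frac{\partial \mathcal{L}}{\partial x}\approx\frac{\partial \mathcal{L}}{\partial \hat{x}},$$

i.e. rounding and clipping are treated as the identity when differentiating. Second, in the *_for_training layers the incoming gradient is quantized along two separate paths, once for the input-gradient product and once for the weight-gradient product (optionally with stochastic rounding), which is why every layer carries a (grad_input, grad_weight) quantizer pair.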
diff --git a/example/low_precision_training/quantization/ops.py b/example/low_precision_training/quantization/ops.py
new file mode 100755
index 000000000..c7a3568d1
--- /dev/null
+++ b/example/low_precision_training/quantization/ops.py
@@ -0,0 +1,148 @@
+import paddle
+import paddle.nn.functional as F
+import cudnn_convolution_custom
+
+
+class forward_quantizer(paddle.autograd.PyLayer):
+    # QAT: quantize on the forward pass, straight-through on the backward pass
+    @staticmethod
+    def forward(ctx, tensor, q):
+        return q(tensor)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # the quantizer argument is a plain Python object, so the single
+        # tensor input gets the single returned gradient
+        return grad_output
+
+
+class quantized_linear_for_training(paddle.autograd.PyLayer):
+    @staticmethod
+    def forward(ctx, input, weight, bias,
+                hw, wfq, wbq,
+                hi, ifq, ibq,
+                hg, gxq, gwq):
+
+        # weight quantization (separate forward/backward tensors when a
+        # hybrid quantizer pair is configured)
+        q_weight_forward = wfq(weight)
+        if hw:
+            q_weight_backward = wbq(weight)
+        else:
+            q_weight_backward = q_weight_forward
+
+        # input quantization
+        q_input_forward = ifq(input)
+        if hi:
+            q_input_backward = ibq(input)
+        else:
+            q_input_backward = q_input_forward
+
+        # save tensors for backward
+        ctx.save_for_backward(q_weight_backward, bias, q_input_backward)
+        if bias is None:
+            b_grad = None
+        else:
+            b_grad = bias.stop_gradient
+        ctx.saved = hg, gxq, gwq, weight.stop_gradient, input.stop_gradient, b_grad
+
+        # forward linear operation; paddle Linear weights are [in, out],
+        # so no transpose is needed
+        output = paddle.matmul(q_input_forward, q_weight_forward)
+        if bias is not None:
+            output += paddle.unsqueeze(bias, axis=0).expand_as(output)
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        hg, gxq, gwq, w_grad, i_grad, b_grad = ctx.saved
+        weight, bias, input = ctx.saved_tensor()
+
+        del ctx.saved
+
+        # dual-path quantization for grad_output
+        q_grad_output_for_x = gxq(grad_output)
+        if hg:
+            q_grad_output_for_w = gwq(grad_output)
+        else:
+            q_grad_output_for_w = q_grad_output_for_x
+
+        # grad calculation
+        grad_input = grad_weight = grad_bias = None
+        if not i_grad:
+            # [N, out] x [in, out]^T -> [N, in]
+            grad_input = paddle.matmul(q_grad_output_for_x, weight, transpose_y=True)
+        if not w_grad:
+            # [N, in]^T x [N, out] -> [in, out], matching the weight layout
+            grad_weight = paddle.matmul(input, q_grad_output_for_w, transpose_x=True)
+        if bias is not None and not b_grad:
+            grad_bias = paddle.sum(grad_output, axis=0)
+            return grad_input, grad_weight, grad_bias
+
+        return grad_input, grad_weight
+
+
+class quantized_conv2d_for_training(paddle.autograd.PyLayer):
+
+    @staticmethod
+    def forward(ctx, input, weight, bias, stride, padding, dilation, groups,
+                hw, wfq, wbq,
+                hi, ifq, ibq,
+                hg, gxq, gwq):
+
+        # weight quantization
+        q_weight_forward = wfq(weight)
+        if hw:
+            q_weight_backward = wbq(weight)
+        else:
+            q_weight_backward = q_weight_forward
+
+        # input quantization
+        q_input_forward = ifq(input)
+        if hi:
+            q_input_backward = ibq(input)
+        else:
+            q_input_backward = q_input_forward
+
+        # save tensors and parameters for backward
+        ctx.save_for_backward(q_weight_backward, bias, q_input_backward)
+        if bias is None:
+            b_grad = None
+        else:
+            b_grad = bias.stop_gradient
+        ctx.saved = stride, padding, dilation, groups, hg, gxq, gwq, weight.stop_gradient, input.stop_gradient, b_grad
+
+        output = F.conv2d(q_input_forward, q_weight_forward, bias=bias, stride=stride,
+                          padding=padding, dilation=dilation, groups=groups)
+
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        stride, padding, dilation, groups, hg, gxq, gwq, w_grad, i_grad, b_grad = ctx.saved
+        weight, bias, input = ctx.saved_tensor()
+
+        del ctx.saved
+
+        # dual-path quantization for grad_output
+        q_grad_output_for_x = gxq(grad_output)
+        if hg:
+            q_grad_output_for_w = gwq(grad_output)
+        else:
+            q_grad_output_for_w = q_grad_output_for_x
+
+        # gradient back-propagation through the custom cuDNN kernels; both
+        # calls share the layer's stride/padding/dilation/groups
+        grad_input = grad_weight = grad_bias = None
+        if not i_grad:
+            grad_input = cudnn_convolution_custom.convolution_bwd(
+                input, weight, q_grad_output_for_x, stride, [padding, padding], dilation, groups)[0]
+        if not w_grad:
+            grad_weight = cudnn_convolution_custom.convolution_bwd(
+                input, weight, q_grad_output_for_w, stride, [padding, padding], dilation, groups)[1]
+        if bias is not None and not b_grad:
+            grad_bias = paddle.sum(grad_output, axis=[0, 2, 3])
+            return grad_input, grad_weight, grad_bias
+
+        return grad_input, grad_weight
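Because all quantizers degenerate to the identity for the 'full' qtype, the custom linear PyLayer can be checked cheaply against paddle.nn.functional.linear. A sketch (names come from ops.py and quantizer/dummy_quantizer.py; shapes are illustrative):

```python
# With identity quantizers the custom PyLayer must reproduce F.linear exactly.
import paddle
from quantization.ops import quantized_linear_for_training
from quantization.quantizer import full_quantizer

q = full_quantizer()
x = paddle.randn([4, 8]);  x.stop_gradient = False
w = paddle.randn([8, 16]); w.stop_gradient = False
b = paddle.zeros([16]);    b.stop_gradient = False

y = quantized_linear_for_training.apply(
    x, w, b, False, q, q, False, q, q, False, q, q)
y.mean().backward()

ref = paddle.nn.functional.linear(x.detach(), w.detach(), b.detach())
print(paddle.allclose(y, ref))   # expect True
```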
diff --git a/example/low_precision_training/quantization/qconfig.py b/example/low_precision_training/quantization/qconfig.py
new file mode 100755
index 000000000..457730cf6
--- /dev/null
+++ b/example/low_precision_training/quantization/qconfig.py
@@ -0,0 +1,22 @@
+class QParam(object):
+    def __init__(
+        self,
+        qtype: str = 'fp',
+        bitwidth: int = 32,
+        granularity: str = 'per-layer',
+        scale=None
+    ):
+        self.qtype = qtype
+        self.bitwidth = bitwidth
+        self.granularity = granularity
+        self.scale = scale
+
+
+class QConfig(object):
+
+    def __init__(self):
+        self.forward_weight = None
+        self.forward_input = None
+        self.backward_weight = None
+        self.backward_grad_x_weight = None
+        self.backward_input = None
+        self.backward_grad_x_input = None
\ No newline at end of file
diff --git a/example/low_precision_training/quantization/quantization_utils.py b/example/low_precision_training/quantization/quantization_utils.py
new file mode 100755
index 000000000..b6f9650a1
--- /dev/null
+++ b/example/low_precision_training/quantization/quantization_utils.py
@@ -0,0 +1,93 @@
+import math
+import paddle
+
+
+def parse_qtype(qtype_str):
+    qtype = qtype_str.rstrip('1234567890')
+    bitwidth = qtype_str[len(qtype):]
+    if bitwidth == '':
+        bitwidth = 0
+    elif bitwidth.isdigit():
+        bitwidth = int(bitwidth)
+    else:
+        raise TypeError(f'dtype error for {qtype_str}')
+    return qtype, bitwidth
+
+
+def stochastic_round(tensor):
+    return paddle.sign(tensor) * paddle.floor(paddle.abs(tensor) + paddle.rand(tensor.shape))
+
+
+def custom_round(tensor, stochastic):
+    if stochastic:
+        return paddle.sign(tensor) * paddle.floor(paddle.abs(tensor) + paddle.rand(tensor.shape))
+    else:
+        return paddle.round(tensor)
+
+
+def quantize_by_mse(tensor, bitwidth, quant_type='int', quant_granularity='per-layer', axis=None, max_iter=50):
+    """Quantize each layer by minimizing the MSE between the tensor and its
+    quantized counterpart (alternating updates of the scale alpha)."""
+    if quant_type == 'full':
+        return paddle.to_tensor(1, dtype='float32'), tensor
+    assert int(bitwidth) == bitwidth, 'bitwidth must be integer'
+    if bitwidth >= 2:
+        if quant_type == 'int':
+            MAX_VAL = math.pow(2., bitwidth - 1) - 1.
+            MIN_VAL = -MAX_VAL
+        elif quant_type == 'uint':
+            MIN_VAL = 0
+            MAX_VAL = math.pow(2, bitwidth) - 1
+        else:
+            raise RuntimeError(f"invalid quant_type: {quant_type}")
+
+        if quant_granularity == 'per-layer':
+            size = tensor.shape
+            flatten_tensor = paddle.reshape(tensor, [-1])
+            alpha_old = -1.0
+            alpha = paddle.max(paddle.abs(flatten_tensor)) / MAX_VAL
+            eps = 1e-6
+            for it in range(max_iter):
+                flatten_tensor_q = paddle.clip(paddle.round(flatten_tensor / alpha), MIN_VAL, MAX_VAL)
+                alpha_old = alpha
+                alpha = paddle.dot(flatten_tensor, flatten_tensor_q) / paddle.dot(flatten_tensor_q, flatten_tensor_q)
+                # stop once the relative change of alpha is small enough
+                if paddle.abs((alpha - alpha_old) / alpha) < eps:
+                    break
+            quantized_tensor = paddle.reshape(flatten_tensor_q * alpha, size)
+
+        elif quant_granularity == 'per-channel':
+            size = tensor.shape
+            flatten_tensor = paddle.reshape(tensor, [size[0], -1])
+            alpha = paddle.max(paddle.abs(flatten_tensor), axis=1) / MAX_VAL
+            alpha_old = paddle.ones_like(alpha) * -1.0
+            eps = 1e-6
+            for it in range(max_iter):
+                flatten_tensor_q = paddle.clip(paddle.round(flatten_tensor / alpha[:, None]), MIN_VAL, MAX_VAL)
+                alpha_old = alpha
+                alpha = paddle.sum(flatten_tensor * flatten_tensor_q, axis=1) / paddle.sum(flatten_tensor_q * flatten_tensor_q, axis=1)
+                # stop once every channel's alpha has converged
+                if paddle.sum(paddle.abs((alpha - alpha_old) / alpha) >= eps) == 0:
+                    break
+            quantized_tensor = paddle.reshape(flatten_tensor_q * alpha[:, None], size)
+
+        else:
+            raise RuntimeError(f"invalid quant_granularity: {quant_granularity}")
+
+    elif bitwidth == 1:
+        assert quant_type == 'int'
+        MIN_VAL = -1.
+        MAX_VAL = 1.
+
+        if quant_granularity == 'per-layer':
+            alpha = paddle.mean(paddle.abs(tensor))
+            tensor_q = paddle.sign(tensor)
+            quantized_tensor = tensor_q * alpha
+
+        elif quant_granularity == 'per-channel':
+            size = tensor.shape
+            flatten_tensor = paddle.reshape(tensor, [size[0], -1])
+            alpha = paddle.mean(paddle.abs(flatten_tensor), axis=1)
+            flatten_tensor_q = paddle.sign(flatten_tensor)
+            quantized_tensor = paddle.reshape(flatten_tensor_q * alpha[:, None], size)
+
+        else:
+            raise RuntimeError(f"invalid quant_granularity: {quant_granularity}")
+
+    else:
+        raise RuntimeError(f"invalid bitwidth: {bitwidth}")
+
+    return alpha, quantized_tensor
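A quick way to exercise quantize_by_mse (shapes and bit-width are illustrative):

```python
import paddle
from quantization.quantization_utils import quantize_by_mse

w = paddle.randn([64, 128])
alpha, w_q = quantize_by_mse(w, 4, quant_type='int', quant_granularity='per-channel')
err = ((w - w_q) ** 2).mean() / (w ** 2).mean()
print(alpha.shape, float(err))   # one scale per row, small relative MSE
```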
diff --git a/example/low_precision_training/quantization/quantizer/__init__.py b/example/low_precision_training/quantization/quantizer/__init__.py
new file mode 100755
index 000000000..5dffae55b
--- /dev/null
+++ b/example/low_precision_training/quantization/quantizer/__init__.py
@@ -0,0 +1,3 @@
+from .dummy_quantizer import dummy_quantizer, full_quantizer
+from .int_quantizer import int_quantizer
+
diff --git a/example/low_precision_training/quantization/quantizer/dummy_quantizer.py b/example/low_precision_training/quantization/quantizer/dummy_quantizer.py
new file mode 100755
index 000000000..8d6c094e0
--- /dev/null
+++ b/example/low_precision_training/quantization/quantizer/dummy_quantizer.py
@@ -0,0 +1,22 @@
+import paddle
+
+
+class DummyQuantizer:
+    """Identity quantizer used for the 'full' (full-precision) qtype."""
+
+    def __call__(self, tensor):
+        return tensor
+
+    def __repr__(self):
+        return 'DummyQuantizer - fp32'
+
+    def calculate_quantization_scale(self, tensor, max_iter=50):
+        # PaddlePaddle equivalent of torch.tensor(1.0).to(tensor.device)
+        return paddle.to_tensor(1.0, place=tensor.place)
+
+    def set_scale(self, scale):
+        self.scale = scale
+
+
+def dummy_quantizer(*args, **kwargs):
+    return DummyQuantizer()
+
+
+def full_quantizer(*args, **kwargs):
+    return DummyQuantizer()
\ No newline at end of file
diff --git a/example/low_precision_training/quantization/quantizer/int_quantizer.py b/example/low_precision_training/quantization/quantizer/int_quantizer.py
new file mode 100755
index 000000000..b0c81ddf9
--- /dev/null
+++ b/example/low_precision_training/quantization/quantizer/int_quantizer.py
@@ -0,0 +1,232 @@
+import paddle
+from ..quantization_utils import custom_round
+
+
+class IntQuantizer:
+    r"""Integer quantizer.
+    Args:
+        bitwidth (int): The bit-width for quantization
+        signed (bool, optional): quantize to a signed or unsigned range;
+            inferred from the first tensor when left as None
+        scaling (str): quantization scale type. Support: ``fixed``, ``learned``, ``dynamic``
+        dims (None, int, tuple of ints or str): the dimensions to be assigned with a set of scales
+        observer (str): quantization metric to calculate scale. Support: ``minmax``, ``mse``, ``cosine``
+        stochastic (bool): use stochastic instead of nearest rounding
+    """
+    def __init__(self, bitwidth, signed=None, scaling="fixed", dims=None, observer="minmax", stochastic=False):
+        self.bitwidth = bitwidth
+        self.signed = signed
+        assert scaling in ["fixed", "learned", "dynamic"]
+        self.scaling = scaling
+        if dims is None:
+            self.dims = None
+        elif isinstance(dims, int):
+            self.dims = (dims,)
+        elif isinstance(dims, (tuple, list)):
+            self.dims = tuple(dims)
+        elif isinstance(dims, str):
+            self.dims = self._parse_readable_dims(dims)
+        else:
+            raise TypeError(f'dims must be None, int, str, list or tuple, but got {type(dims)}')
+        assert observer in ["minmax", "mse", "cosine"]
+        self.observer = observer
+        self.stochastic = stochastic
+
+        if self.scaling == "fixed":
+            self.fixed_scale = None
+
+        if self.signed is True:
+            self.lower_bound = -2 ** (self.bitwidth - 1)
+            self.upper_bound = -self.lower_bound - 1
+        elif self.signed is False:
+            self.upper_bound = (1 << self.bitwidth) - 1
+            self.lower_bound = 0
+        else:
+            self.upper_bound = self.lower_bound = None
+
+    def _if_is_signed(self, tensor):
+        if self.signed is None:
+            if tensor.min() < 0:
+                self.signed = True
+                self.lower_bound = -2 ** (self.bitwidth - 1)
+                self.upper_bound = -self.lower_bound - 1
+            else:
+                self.signed = False
+                self.upper_bound = (1 << self.bitwidth) - 1
+                self.lower_bound = 0
+
+    def _parse_readable_dims(self, str_dims: str):
+        dims = []
+        dim_map = {'n': 0, 'c': 1, 'h': 2, 'w': 3}
+        str_dims = str_dims.lower()
+        if str_dims in ['per-layer', 'per-tensor', 'none']:
+            return None
+        try:
+            for str_dim in str_dims:
+                dims.append(dim_map[str_dim])
+        except KeyError:
+            raise AttributeError(f"quantization granularity only support NCHW, but got '{str_dim}'")
+        return tuple(dims)
+
+    def set_scale(self, scale):
+        assert self.scaling == "fixed"
+        self.fixed_scale = scale
+
+    def calculate_quantization_scale(self, tensor, eps=1e-6, max_iter=50):
+        if self.observer == "minmax":
+            return self.minmax_quantize_with_dynamic_scale(tensor)[1]
+        elif self.observer == "mse":
+            return self.mse_quantize_with_dynamic_scale(tensor, eps, max_iter)[1]
+        else:
+            assert False
+
+    def movedim_to_perm(self, dims):
+        # forward and inverse permutations that bring `dims` to the front
+        # (the torch version used movedim; paddle.transpose needs full perms)
+        if dims == (0,):
+            return (0, 1, 2, 3), (0, 1, 2, 3)
+        elif dims == (1,):
+            return (1, 0, 2, 3), (1, 0, 2, 3)
+        elif dims == (0, 1):
+            return (0, 1, 2, 3), (0, 1, 2, 3)
+        elif dims == (0, 2, 3):
+            return (0, 2, 3, 1), (0, 3, 1, 2)
+        else:
+            raise TypeError(f'dims {dims} is unexpected!')
+
+    def quantize_with_fixed_scale(self, tensor):
+        self._if_is_signed(tensor)
+
+        if self.dims is None:
+            # per-layer quantization
+            quantized_tensor = custom_round(tensor / self.fixed_scale, self.stochastic).clip(
+                self.lower_bound, self.upper_bound) * self.fixed_scale
+        else:
+            tensor_shape = tensor.shape
+
+            num_scales = 1
+            for i in self.dims:
+                num_scales *= tensor_shape[i]
+
+            permute_tensor = paddle.transpose(tensor, self.movedim_to_perm(self.dims)[0]).contiguous()
+            permute_tensor_shape = permute_tensor.shape
+            flatten_tensor = permute_tensor.reshape([num_scales, -1])
+
+            flatten_tensor_q = custom_round(flatten_tensor / self.fixed_scale[:, None], self.stochastic).clip(
+                self.lower_bound, self.upper_bound)
+            quantized_tensor = (flatten_tensor_q * self.fixed_scale[:, None]).reshape(
+                permute_tensor_shape).transpose(self.movedim_to_perm(self.dims)[1]).contiguous()
+
+        return quantized_tensor
+
+    def quantize_with_learned_scale(self, tensor):
+        raise NotImplementedError('learned scaling is not implemented yet')
+
+    def minmax_quantize_with_dynamic_scale(self, tensor):
+        self._if_is_signed(tensor)
+
+        if self.dims is None:
+            # per-layer quantization
+            alpha = paddle.abs(tensor.flatten()).max() / self.upper_bound
+            alpha = paddle.clip(alpha, min=1e-12)
+            quantized_tensor = custom_round(tensor / alpha, self.stochastic).clip(
+                self.lower_bound, self.upper_bound) * alpha
+        else:
+            tensor_shape = tensor.shape
+
+            num_scales = 1
+            for i in self.dims:
+                num_scales *= tensor_shape[i]
+
+            permute_tensor = paddle.transpose(tensor, self.movedim_to_perm(self.dims)[0]).contiguous()
+            permute_tensor_shape = permute_tensor.shape
+            flatten_tensor = permute_tensor.reshape([num_scales, -1])
+
+            alpha = flatten_tensor.abs().max(axis=1) / self.upper_bound
+            alpha = paddle.clip(alpha, min=1e-12)
+
+            flatten_tensor_q = custom_round(flatten_tensor / alpha[:, None], self.stochastic).clip(
+                self.lower_bound, self.upper_bound)
+            quantized_tensor = (flatten_tensor_q * alpha[:, None]).reshape(
+                permute_tensor_shape).transpose(self.movedim_to_perm(self.dims)[1]).contiguous()
+
+        return quantized_tensor, alpha
+
+    def mse_quantize_with_dynamic_scale(self, tensor, eps=1e-6, max_iter=50):
+        self._if_is_signed(tensor)
+
+        if self.dims is None:
+            # per-layer quantization
+            flatten_tensor = tensor.flatten()
+            alpha = flatten_tensor.abs().max() / self.upper_bound
+            alpha_old = alpha * 1.2
+            for it in range(max_iter):
+                flatten_tensor_q = (flatten_tensor / alpha).round().clip(self.lower_bound, self.upper_bound)
+                alpha_old = alpha
+                alpha = flatten_tensor.dot(flatten_tensor_q) / flatten_tensor_q.dot(flatten_tensor_q)
+                if paddle.abs((alpha - alpha_old) / alpha).item() <= eps:
+                    break
+            quantized_tensor = custom_round(tensor / alpha, self.stochastic).clip(
+                self.lower_bound, self.upper_bound) * alpha
+        else:
+            tensor_shape = tensor.shape
+
+            num_scales = 1
+            for i in self.dims:
+                num_scales *= tensor_shape[i]
+
+            permute_tensor = paddle.transpose(tensor, self.movedim_to_perm(self.dims)[0]).contiguous()
+            permute_tensor_shape = permute_tensor.shape
+
+            # use reshape here (torch's view has no direct paddle equivalent)
+            flatten_tensor = permute_tensor.reshape([num_scales, -1])
+
+            alpha = paddle.abs(flatten_tensor).max(axis=1) / self.upper_bound
+            alpha_old = alpha * 1.2
+
+            for it in range(max_iter):
+                flatten_tensor_q = (flatten_tensor / alpha[:, None]).round().clip(self.lower_bound, self.upper_bound)
+                alpha_old = alpha
+                alpha = (flatten_tensor * flatten_tensor_q).sum(axis=1) / (flatten_tensor_q * flatten_tensor_q).sum(axis=1)
+                if ((alpha - alpha_old).norm() / alpha_old.norm()).item() <= eps:
+                    break
+
+            flatten_tensor_q = custom_round(flatten_tensor / alpha[:, None], self.stochastic).clip(
+                self.lower_bound, self.upper_bound)
+            quantized_tensor = (flatten_tensor_q * alpha[:, None]).reshape(
+                permute_tensor_shape).transpose(self.movedim_to_perm(self.dims)[1]).contiguous()
+
+        return quantized_tensor, alpha
+
+    def __call__(self, tensor):
+        if self.scaling == "fixed":
+            return self.quantize_with_fixed_scale(tensor)
+        elif self.scaling == "learned":
+            return self.quantize_with_learned_scale(tensor)
+        elif self.scaling == "dynamic":
+            if self.observer == "minmax":
+                return self.minmax_quantize_with_dynamic_scale(tensor)[0]
+            elif self.observer == "mse":
+                return self.mse_quantize_with_dynamic_scale(tensor, max_iter=1)[0]
+            else:
+                assert False
+        else:
+            assert False
+
+
+def int_quantizer(bitwidth, signed=None, scaling="fixed", dims=None, observer="minmax", stochastic=False):
+    return IntQuantizer(bitwidth, signed, scaling, dims, observer, stochastic)
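And a corresponding check for IntQuantizer in isolation; dims='n' assigns one scale per leading dimension, e.g. per output channel of a conv weight:

```python
import paddle
from quantization.quantizer.int_quantizer import int_quantizer

q = int_quantizer(8, signed=True, scaling="dynamic", dims='n', observer="minmax")
w = paddle.randn([32, 16, 3, 3])
w_q = q(w)                               # dequantized int8 approximation
print(float((w - w_q).abs().max()))      # bounded by half a quantization step
```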
diff --git a/example/low_precision_training/quantization/registry.py b/example/low_precision_training/quantization/registry.py
new file mode 100755
index 000000000..718faf772
--- /dev/null
+++ b/example/low_precision_training/quantization/registry.py
@@ -0,0 +1,57 @@
+REGISTRIES = {}
+
+
+def setup_registry(
+    registry_name,
+    base_class=None,
+    default=None
+):
+    assert registry_name.startswith('--')
+    registry_name = registry_name[2:].replace('-', '_')
+
+    REGISTRY = {}
+    REGISTRY_CLASS_NAMES = set()
+
+    # maintain a registry of all registries
+    if registry_name in REGISTRIES:
+        raise RuntimeError('registry: {} already exists'.format(registry_name))
+
+    REGISTRIES[registry_name] = {
+        'registry': REGISTRY,
+        'default': default,
+    }
+
+    def build_x(args, *extra_args, **extra_kwargs):
+        choice = getattr(args, registry_name, None)
+        if choice is None:
+            return None
+        cls = REGISTRY[choice]
+        if hasattr(cls, 'build_' + registry_name):
+            builder = getattr(cls, 'build_' + registry_name)
+        else:
+            builder = cls
+
+        return builder(args, *extra_args, **extra_kwargs)
+
+    def register_x(name):
+
+        def register_x_cls(cls):
+            if name in REGISTRY:
+                raise ValueError('Cannot register duplicate {} ({})'.format(registry_name, name))
+
+            if cls.__name__ in REGISTRY_CLASS_NAMES:
+                raise ValueError(
+                    'Cannot register {} with duplicate class name ({})'.format(
+                        registry_name, cls.__name__,
+                    )
+                )
+
+            if base_class is not None and not issubclass(cls, base_class):
+                raise ValueError('{} must extend {}'.format(cls.__name__, base_class.__name__))
+
+            REGISTRY[name] = cls
+            REGISTRY_CLASS_NAMES.add(cls.__name__)
+            return cls
+
+        return register_x_cls
+
+    return build_x, register_x, REGISTRY
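registry.py is a trimmed-down fairseq-style registry. Hypothetical usage, mirroring how managers/__init__.py wires up '--quantization-manager' (the names here are made up for illustration):

```python
from argparse import Namespace
from quantization import registry

build_thing, register_thing, THING_REGISTRY = registry.setup_registry(
    '--thing', base_class=None, default=None)

@register_thing('my_thing')
class MyThing:
    def __init__(self, args):
        self.args = args

obj = build_thing(Namespace(thing='my_thing'))   # instantiates MyThing
```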
diff --git a/example/low_precision_training/quantization/setup.py b/example/low_precision_training/quantization/setup.py
new file mode 100755
index 000000000..5fc873c9f
--- /dev/null
+++ b/example/low_precision_training/quantization/setup.py
@@ -0,0 +1,23 @@
+# Paddle port of the original PyTorch cpp_extension build for the custom
+# cuDNN convolution op (torch.utils.cpp_extension -> paddle.utils.cpp_extension).
+from paddle.utils.cpp_extension import CUDAExtension, setup
+
+setup(
+    name='cudnn_convolution_custom',
+    ext_modules=CUDAExtension(
+        sources=['cudnn_convolution.cpp']
+    )
+)
diff --git a/example/low_precision_training/quantization/utils.py b/example/low_precision_training/quantization/utils.py
new file mode 100755
index 000000000..91675c83a
--- /dev/null
+++ b/example/low_precision_training/quantization/utils.py
@@ -0,0 +1,52 @@
+import os.path as osp
+import warnings
+from importlib import import_module
+
+
+def check_file_exist(filename, msg_tmpl='file "{}" does not exist'):
+    if not osp.isfile(filename):
+        raise FileNotFoundError(msg_tmpl.format(filename))
+
+
+def import_modules_from_strings(imports, allow_failed_imports=False):
+    """Import modules from the given list of strings.
+    Args:
+        imports (list | str | None): The given module names to be imported.
+        allow_failed_imports (bool): If True, the failed imports will return
+            None. Otherwise, an ImportError is raised. Default: False.
+    Returns:
+        list[module] | module | None: The imported modules.
+    Examples:
+        >>> osp, sys = import_modules_from_strings(
+        ...     ['os.path', 'sys'])
+        >>> import os.path as osp_
+        >>> import sys as sys_
+        >>> assert osp == osp_
+        >>> assert sys == sys_
+    """
+    if not imports:
+        return
+    single_import = False
+    if isinstance(imports, str):
+        single_import = True
+        imports = [imports]
+    if not isinstance(imports, list):
+        raise TypeError(
+            f'custom_imports must be a list but got type {type(imports)}')
+    imported = []
+    for imp in imports:
+        if not isinstance(imp, str):
+            raise TypeError(
+                f'{imp} is of type {type(imp)} and cannot be imported.')
+        try:
+            imported_tmp = import_module(imp)
+        except ImportError:
+            if allow_failed_imports:
+                warnings.warn(f'{imp} failed to import and is ignored.',
+                              UserWarning)
+                imported_tmp = None
+            else:
+                raise
+        imported.append(imported_tmp)
+    if single_import:
+        imported = imported[0]
+    return imported
\ No newline at end of file
diff --git a/example/low_precision_training/requirements.txt b/example/low_precision_training/requirements.txt
new file mode 100755
index 000000000..48b32caba
--- /dev/null
+++ b/example/low_precision_training/requirements.txt
@@ -0,0 +1,213 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: linux-64
+_libgcc_mutex=0.1=conda_forge
+_openmp_mutex=4.5=2_gnu
+_sysroot_linux-64_curr_repodata_hack=3=haa98f57_10
+accelerate=0.30.1=pypi_0
+addict=2.4.0=pypi_0
+aiohttp=3.9.5=pypi_0
+aiosignal=1.3.1=pypi_0
+albumentations=0.4.3=pypi_0
+anyio=4.5.0=pypi_0
+astor=0.8.1=pypi_0
+async-timeout=4.0.3=pypi_0
+attrs=23.2.0=pypi_0
+binutils_impl_linux-64=2.38=h2a08ee3_1
+binutils_linux-64=2.38.0=hc2dff05_0
+blas=1.0=mkl
+blobfile=2.1.1=pypi_0
+brotli-python=1.0.9=py310h6a678d5_7
+bzip2=1.0.8=h5eee18b_5
+ca-certificates=2024.8.30=hbcca054_0
+certifi=2024.8.30=pyhd8ed1ab_0
+charset-normalizer=2.0.4=pyhd3eb1b0_0
+click=8.1.7=pypi_0
+clip=1.0=pypi_0
+contourpy=1.3.0=pypi_0
+cuda-cudart=12.1.105=0
+cuda-cupti=12.1.105=0
+cuda-libraries=12.1.0=0
+cuda-nvrtc=12.1.105=0
+cuda-nvtx=12.1.105=0
+cuda-opencl=12.4.127=0
+cuda-runtime=12.1.0=0
+cuda-version=12.6=h7480c83_3
+cycler=0.12.1=pypi_0
+cython=3.0.10=pypi_0
+datasets=2.19.1=pypi_0
+decorator=5.1.1=pypi_0
+diffusers=0.28.0.dev0=pypi_0
+dill=0.3.8=pypi_0
+docker-pycreds=0.4.0=pypi_0 +easydict=1.13=pypi_0 +einops=0.7.0=pypi_0 +exceptiongroup=1.2.2=pypi_0 +ffmpeg=4.3=hf484d3e_0 +filelock=3.13.1=py310h06a4308_0 +flash-attn=0.2.8=pypi_0 +fonttools=4.54.1=pypi_0 +freetype=2.12.1=h4a9f257_0 +frozenlist=1.4.1=pypi_0 +fsspec=2024.3.1=pypi_0 +ftfy=6.2.0=pypi_0 +gcc_impl_linux-64=11.2.0=h1234567_1 +gcc_linux-64=11.2.0=h5c386dc_0 +gitdb=4.0.11=pypi_0 +gitpython=3.1.43=pypi_0 +gmp=6.2.1=h295c915_3 +gmpy2=2.1.2=py310heeb90bb_0 +gnutls=3.6.15=he1e5248_0 +gxx_impl_linux-64=11.2.0=h1234567_1 +gxx_linux-64=11.2.0=hc2dff05_0 +h11=0.14.0=pypi_0 +httpcore=1.0.5=pypi_0 +httpx=0.27.2=pypi_0 +huggingface-hub=0.23.0=pypi_0 +idna=3.4=py310h06a4308_0 +imageio=2.34.1=pypi_0 +imgaug=0.2.6=pypi_0 +importlib-metadata=7.1.0=pypi_0 +intel-openmp=2023.1.0=hdb19cb5_46306 +jinja2=3.1.3=py310h06a4308_0 +joblib=1.4.2=pypi_0 +jpeg=9e=h5eee18b_1 +kernel-headers_linux-64=3.10.0=h57e8cba_10 +kiwisolver=1.4.7=pypi_0 +lame=3.100=h7b6447c_0 +lazy-loader=0.4=pypi_0 +lcms2=2.12=h3be6417_0 +ld_impl_linux-64=2.38=h1181459_1 +lerc=3.0=h295c915_0 +libcublas=12.1.0.26=0 +libcufft=11.0.2.4=0 +libcufile=1.9.1.3=0 +libcurand=10.3.5.147=0 +libcusolver=11.4.4.55=0 +libcusparse=12.0.2.55=0 +libdeflate=1.17=h5eee18b_1 +libffi=3.4.4=h6a678d5_0 +libgcc=14.2.0=h77fa898_1 +libgcc-devel_linux-64=11.2.0=h1234567_1 +libgcc-ng=14.2.0=h69a702a_1 +libgfortran-ng=7.5.0=ha8ba4b0_17 +libgfortran4=7.5.0=ha8ba4b0_17 +libgomp=14.2.0=h77fa898_1 +libiconv=1.16=h7f8727e_2 +libidn2=2.3.4=h5eee18b_0 +libjpeg-turbo=2.0.0=h9bf148f_0 +libnpp=12.0.2.50=0 +libnvjitlink=12.1.105=0 +libnvjpeg=12.1.1.14=0 +libpng=1.6.39=h5eee18b_0 +libstdcxx=14.2.0=hc0a3c3a_1 +libstdcxx-devel_linux-64=11.2.0=h1234567_1 +libstdcxx-ng=11.2.0=h1234567_1 +libtasn1=4.19.0=h5eee18b_0 +libtiff=4.5.1=h6a678d5_0 +libunistring=0.9.10=h27cfd23_0 +libuuid=1.41.5=h5eee18b_0 +libwebp-base=1.3.2=h5eee18b_0 +llvm-openmp=14.0.6=h9e868ea_0 +lmdb=1.4.1=pypi_0 +lxml=4.9.4=pypi_0 +lz4-c=1.9.4=h6a678d5_0 +markdown-it-py=3.0.0=pypi_0 +markupsafe=2.1.3=py310h5eee18b_0 +matplotlib=3.9.2=pypi_0 +mdurl=0.1.2=pypi_0 +mkl=2023.1.0=h213fc3f_46344 +mkl-service=2.4.0=py310h5eee18b_1 +mkl_fft=1.3.8=py310h5eee18b_0 +mkl_random=1.2.4=py310hdb19cb5_0 +mmcv=2.2.0=pypi_0 +mmcv-full=1.7.2=pypi_0 +mmengine=0.10.5=pypi_0 +mpc=1.1.0=h10f8cd9_1 +mpfr=4.0.2=hb69a4c5_1 +mpi=1.0=mpich +mpi4py=3.1.4=py310hfc96bbd_0 +mpich=3.3.2=hc856adb_0 +mpmath=1.3.0=py310h06a4308_0 +multidict=6.0.5=pypi_0 +multiprocess=0.70.16=pypi_0 +nccl=2.23.4.1=h2b5d15b_3 +ncurses=6.4=h6a678d5_0 +nettle=3.7.3=hbbd107a_1 +networkx=3.1=py310h06a4308_0 +ninja=1.11.1.1=pypi_0 +numpy=1.26.4=py310h5f9d8c6_0 +numpy-base=1.26.4=py310hb5e798b_0 +nvidia-nccl=2.9.8=pypi_0 +nvidia-pyindex=1.0.9=pypi_0 +opencv-python=4.10.0.84=pypi_0 +opencv-python-headless=4.9.0.80=pypi_0 +openh264=2.1.1=h4ff587b_0 +openjpeg=2.4.0=h3ad879b_0 +openssl=3.4.0=hb9d3cd8_0 +opt-einsum=3.3.0=pypi_0 +packaging=24.0=pypi_0 +paddlepaddle-gpu=2.6.2=pypi_0 +pandas=2.2.2=pypi_0 +pillow=10.2.0=py310h5eee18b_0 +pip=23.3.1=py310h06a4308_0 +piq=0.7.0=pypi_0 +platformdirs=4.2.1=pypi_0 +protobuf=4.25.3=pypi_0 +psutil=5.9.8=pypi_0 +pyarrow=16.0.0=pypi_0 +pyarrow-hotfix=0.6=pypi_0 +pycryptodomex=3.20.0=pypi_0 +pygments=2.18.0=pypi_0 +pyparsing=3.1.4=pypi_0 +pysocks=1.7.1=py310h06a4308_0 +python=3.10.14=h955ad1f_0 +python-dateutil=2.9.0.post0=pypi_0 +pytorch=2.3.0=py3.10_cuda12.1_cudnn8.9.2_0 +pytorch-cuda=12.1=ha16c6d3_5 +pytorch-fid=0.3.0=pypi_0 +pytorch-mutex=1.0=cuda +pytz=2024.1=pypi_0 +pyyaml=6.0.1=py310h5eee18b_0 +readline=8.2=h5eee18b_0 
+regex=2024.4.16=pypi_0 +requests=2.31.0=py310h06a4308_1 +rich=13.8.1=pypi_0 +safetensors=0.4.3=pypi_0 +scikit-image=0.23.2=pypi_0 +scikit-learn=1.5.2=pypi_0 +scipy=1.13.0=pypi_0 +sentry-sdk=2.1.1=pypi_0 +setproctitle=1.3.3=pypi_0 +setuptools=75.3.0=pypi_0 +six=1.16.0=pypi_0 +smmap=5.0.1=pypi_0 +sniffio=1.3.1=pypi_0 +sqlite=3.41.2=h5eee18b_0 +sympy=1.12=py310h06a4308_0 +sysroot_linux-64=2.17=h57e8cba_10 +tbb=2021.8.0=hdb19cb5_0 +termcolor=2.4.0=pypi_0 +threadpoolctl=3.5.0=pypi_0 +tifffile=2024.4.24=pypi_0 +tk=8.6.12=h1ccaba5_0 +tomli=2.0.1=pypi_0 +torch-fidelity=0.3.0=pypi_0 +torchaudio=2.3.0=py310_cu121 +torchtriton=2.3.0=py310 +torchvision=0.18.0=py310_cu121 +tqdm=4.66.2=pypi_0 +typing_extensions=4.9.0=py310h06a4308_1 +tzdata=2024.1=pypi_0 +urllib3=2.1.0=py310h06a4308_1 +wandb=0.17.0=pypi_0 +wcwidth=0.2.13=pypi_0 +wheel=0.41.2=py310h06a4308_0 +xxhash=3.4.1=pypi_0 +xz=5.4.6=h5eee18b_0 +yaml=0.2.5=h7b6447c_0 +yapf=0.40.2=pypi_0 +yarl=1.9.4=pypi_0 +zipp=3.18.1=pypi_0 +zlib=1.2.13=h5eee18b_0 +zstd=1.5.5=hc292b87_0 diff --git a/example/low_precision_training/tools/__init__.py b/example/low_precision_training/tools/__init__.py new file mode 100755 index 000000000..0519ecba6 --- /dev/null +++ b/example/low_precision_training/tools/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/example/low_precision_training/tools/eval_qt.py b/example/low_precision_training/tools/eval_qt.py new file mode 100755 index 000000000..7a52b276b --- /dev/null +++ b/example/low_precision_training/tools/eval_qt.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+
+# XXX: avoid triggering error on DCU machines
+import tarfile
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
+
+from ppcls.utils import config
+from ppcls.engine.engine import Engine_qt
+
+### quantization
+from quantization import Config
+
+if __name__ == "__main__":
+    args = config.parse_args()
+    config = config.get_config(
+        args.config, overrides=args.override, show=False)
+
+    ########## qt
+    config.eval_quant = True
+    config.upbn = False
+    config.qconfigdir = '../qt_config/qt_W4_mse_det.yaml'
+    # config.qconfigdir = '../qt_config/qt_W4_minmax_det__A4_mse_det__G4_minmax_sto.yaml'
+    # config.qconfigdir = '../qt_config/qt_W4_mse_det__A4_mse_det__G4_minmax_sto.yaml'
+    config.qconfig = Config.fromfile(config.qconfigdir)
+    ##########
+
+    engine = Engine_qt(config, mode="eval")
+    engine.eval()
diff --git a/example/low_precision_training/tools/eval_qt.sh b/example/low_precision_training/tools/eval_qt.sh
new file mode 100755
index 000000000..d41822dea
--- /dev/null
+++ b/example/low_precision_training/tools/eval_qt.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+# for single-card eval
+# python3.7 tools/eval.py -c ./ppcls/configs/ImageNet/ResNet/ResNet50.yaml
+
+# for multi-card eval
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle.distributed.launch --gpus="0,1,2,3" eval_qt.py -c ../ppcls/configs/ImageNet/ResNet/ResNet18_custom.yaml
+
diff --git a/example/low_precision_training/tools/train.py b/example/low_precision_training/tools/train.py
new file mode 100755
index 000000000..fa6e36bc6
--- /dev/null
+++ b/example/low_precision_training/tools/train.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+import yaml
+
+# XXX: avoid triggering error on DCU machines
+import tarfile
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
+
+from ppcls.utils import config, convert_to_dict
+from ppcls.engine.engine import Engine
+
+if __name__ == "__main__":
+    args = config.parse_args()
+    config = config.get_config(args.config, overrides=args.override, show=False)
+    config.profiler_options = args.profiler_options
+    uniform_output_enabled = config["Global"].get("uniform_output_enabled",
+                                                  False)
+    if uniform_output_enabled:
+        train_results_path = os.path.join(config["Global"]["output_dir"],
+                                          "train_results.json")
+        if os.path.exists(train_results_path):
+            try:
+                os.remove(train_results_path)
+            except OSError:
+                pass
+        config_dict = convert_to_dict(config)
+        with open(
+                os.path.join(config["Global"]["output_dir"], "config.yaml"),
+                "w") as f:
+            yaml.dump(config_dict, f)
+    engine = Engine(config, mode="train")
+    engine.train()
diff --git a/example/low_precision_training/tools/train.sh b/example/low_precision_training/tools/train.sh
new file mode 100755
index 000000000..3e1922f09
--- /dev/null
+++ b/example/low_precision_training/tools/train.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+# for single-card training
+# python tools/train.py -c ./ppcls/configs/ImageNet/ResNet/ResNet50.yaml
+
+# for multi-card training
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle.distributed.launch --gpus="0,1,2,3" train.py -c ../ppcls/configs/ImageNet/ResNet/ResNet18.yaml
diff --git a/example/low_precision_training/tools/train_qat.py b/example/low_precision_training/tools/train_qat.py
new file mode 100755
index 000000000..8d8ae4f98
--- /dev/null
+++ b/example/low_precision_training/tools/train_qat.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+import yaml
+
+# XXX: avoid triggering error on DCU machines
+import tarfile
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
+
+from ppcls.utils import config, convert_to_dict
+from ppcls.engine.engine import Engine_qat
+
+### quantization
+from quantization import Config
+
+if __name__ == "__main__":
+    args = config.parse_args()
+    config = config.get_config(args.config, overrides=args.override, show=False)
+    config.profiler_options = args.profiler_options
+
+    ########### qt
+    # config.qconfigdir = '../qt_config/qt_W4_minmax_det__A4_mse_det__G4_minmax_sto.yaml'
+    config.qconfigdir = '../qt_config/qt_W4_mse_det__A4_mse_det__G4_minmax_sto.yaml'
+    config.qconfig = Config.fromfile(config.qconfigdir)
+    ###########
+
+    uniform_output_enabled = config["Global"].get("uniform_output_enabled",
+                                                  False)
+    if uniform_output_enabled:
+        train_results_path = os.path.join(config["Global"]["output_dir"],
+                                          "train_results.json")
+        if os.path.exists(train_results_path):
+            try:
+                os.remove(train_results_path)
+            except OSError:
+                pass
+        config_dict = convert_to_dict(config)
+        with open(
+                os.path.join(config["Global"]["output_dir"], "config.yaml"),
+                "w") as f:
+            yaml.dump(config_dict, f)
+    engine = Engine_qat(config, mode="train")
+    engine.train()
diff --git a/example/low_precision_training/tools/train_qt.py b/example/low_precision_training/tools/train_qt.py
new file mode 100755
index 000000000..cba9c3cca
--- /dev/null
+++ b/example/low_precision_training/tools/train_qt.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+import yaml
+
+# XXX: avoid triggering error on DCU machines
+import tarfile
+
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
+
+from ppcls.utils import config, convert_to_dict
+from ppcls.engine.engine import Engine_qt
+
+### quantization
+from quantization import Config
+
+if __name__ == "__main__":
+    args = config.parse_args()
+    config = config.get_config(args.config, overrides=args.override, show=False)
+    config.profiler_options = args.profiler_options
+
+    ########### qt
+    config.eval_quant = False
+    config.upbn = False
+    # config.qconfigdir = '../qt_config/qt_W4_minmax_det__A4_mse_det__G4_minmax_sto.yaml'
+    config.qconfigdir = '../qt_config/qt_W4_mse_det__A4_mse_det__G4_minmax_sto.yaml'
+    config.qconfig = Config.fromfile(config.qconfigdir)
+    ###########
+
+    uniform_output_enabled = config["Global"].get("uniform_output_enabled",
+                                                  False)
+    if uniform_output_enabled:
+        train_results_path = os.path.join(config["Global"]["output_dir"],
+                                          "train_results.json")
+        if os.path.exists(train_results_path):
+            try:
+                os.remove(train_results_path)
+            except OSError:
+                pass
+        config_dict = convert_to_dict(config)
+        with open(
+                os.path.join(config["Global"]["output_dir"], "config.yaml"),
+                "w") as f:
+            yaml.dump(config_dict, f)
+    engine = Engine_qt(config, mode="train")
+    engine.train()
diff --git a/example/low_precision_training/tools/train_qt.sh b/example/low_precision_training/tools/train_qt.sh
new file mode 100755
index 000000000..97741ea27
--- /dev/null
+++ b/example/low_precision_training/tools/train_qt.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+# for single-card training
+# python tools/train.py -c ./ppcls/configs/ImageNet/ResNet/ResNet50.yaml
+
+# for multi-card training
+export CUDA_VISIBLE_DEVICES=4,5,6,7
+python -m paddle.distributed.launch --gpus="4,5,6,7" train_qt.py -c ../ppcls/configs/ImageNet/ResNet/ResNet18_torch.yaml
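All three quantized entry points (train_qt.py, train_qat.py, eval_qt.py) share the same wiring: parse the ppcls config, attach a quantization Config loaded from a YAML file, then hand everything to the quantized engine. A compact, hedged restatement of that pattern, with a hypothetical quantization config path in place of the hard-coded ones above:

```python
from ppcls.utils import config as config_util
from ppcls.engine.engine import Engine_qt
from quantization import Config

args = config_util.parse_args()
cfg = config_util.get_config(args.config, overrides=args.override, show=False)

# attach the quantization settings before building the engine
cfg.eval_quant = False
cfg.upbn = False
cfg.qconfig = Config.fromfile("../qt_config/my_qt.yaml")  # hypothetical path

engine = Engine_qt(cfg, mode="train")
engine.train()
```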